# Fix column names 

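This notebook analyses inconsistent column names across the four Excel sheets (pathogenic, benign, VUS, exon5) and renames them so that they are consistent among the sheets. Summary of the renames performed: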

```
pathogenic: 
"Location in  Genome release  37 (hg19)" (orig) -> "Location in Genome release 37 (hg19)" (new)
"Location in Genome release  38 (hg38)" (orig) -> "Location in Genome release 38 (hg38)" (new)
"Protein or mRNA variants" (orig) -> "Protein or mRNA Variants" (new)
" Functional outcome (MLCL/CL ratio)" (orig) -> "Functional outcome (MLCL/CL ratio)" (new)
"Taffazin Functional motifs " (orig) -> "Taffazin Functional motifs" (new)

benign:
"Location in  Genome release 37 (hg19)" (orig) -> "Location in Genome release 37 (hg19)" (new)
"Splicing prediction" (orig) -> "Splicing Prediction" (new)

vus:
"Location in  Genome release 37 (hg19)" (orig) -> "Location in Genome release 37 (hg19)" (new)

exon5:
"Genome Assembly Release 37" (orig) -> "Location in Genome release 37 (hg19)" (new)
"Genome Assembly Release 38" (orig) -> "Location in Genome release 38 (hg38)" (new)
```

In [1]:
import pandas as pd
import os
import helpers

In [2]:
database_folder = '../intermediate_pipeline_db_versions'

In [3]:
# Prefix of the input CSV files (version 00000); the timestamp part is hard-coded for now.
input_path_prefix = f'{database_folder}/00000_2024-01-18-20-00-05-801504_Human-TAFAZZIN-Variants-Database_' 
# TODO: derive this prefix automatically in the next notebooks instead of hard-coding it

# Prefix for the files written by this notebook, produced by the project helper
# (judging by the cell output it carries a new version number and a timestamp).
output_path_prefix = helpers.create_database_output_path_prefix()

version number: 00010
database output path prefix: ../intermediate_pipeline_db_versions/00010_2024-01-18-20-05-26-311470_Human-TAFAZZIN-Variants-Database_


# Load data

In [4]:
# Read the four variant sheets, one CSV file per sheet, from the input version.
df_pathogenic = pd.read_csv(f'{input_path_prefix}pathogenic.csv')
df_vus = pd.read_csv(f'{input_path_prefix}vus.csv')
df_exon5 = pd.read_csv(f'{input_path_prefix}exon5.csv')
df_benign = pd.read_csv(f'{input_path_prefix}benign.csv')

In [5]:
print(df_pathogenic.shape)
display(df_pathogenic.head(3))

(406, 16)


Unnamed: 0,Location,Location in Genome release 37 (hg19),Location in Genome release 38 (hg38),Protein Variant Type,Impact of Variant,DNA Modifications,Protein or mRNA variants,Functional outcome (MLCL/CL ratio),Taffazin Functional motifs,Method of Validation,References,Source,Additional variants in other genes,Location and Order of Discovery,Notes,Unnamed: 15
0,Exon 1,X:153640189,X:154411852,Frameshift,,c.9_10dupG,p.His4Alafs*130,MLCL/CL elevated,,,Ref. 1 (Pat.1); Ref. 80; Ref. 113,,,1-1,,
1,Exon 1,X:153640197 - 153640198,X: 154411860 - 154411861,Frameshift,,c.18_22dup,p.Pro8fs*,,,,Ref. 140; Ref.83,ClinVar,,1-12,,
2,Exon 1,X:153640219_241,X:154411882_904,Frameshift,,c.39_60del22,p.Pro14Alafs*19,MLCL/CL elevated,,,Ref. 95 (Pat. 1),,"Mitochondrial: m.1555A>G in 12S rRNA, homoplasmic",1-11,,


In [6]:
print(df_vus.shape)
display(df_vus.head(3))

(126, 12)


Unnamed: 0,Location,Location in Genome release 37 (hg19),Location in Genome release 38 (hg38),DNA Modifications,Protein or mRNA Variants,References & population frequency,Source,SIFT prediction,PolyPhen2 prediction,Amino acid conservation & comments,Additional variants in other genes,Notes
0,Exon 1,X:153640193,X:154411856,c.13G>T,p.Val5Leu,gnomad exomes; Ashk Jewish female 5/99652,ClinVar; LMM 2011; GeneDX 2017; Invitae 2020;...,tolerated; 0.14,0.984,Vertebrates 100% Val; invertebrates have Ile,,
1,Exon 1,X:153640197_198,X:154411860_861,c.17_18insA,fs*,Ref. 57,,,,,,Not in ClinVar
2,Exon 1,X:153640207,X:154411870,c.27C>G,p.Phe9Leu,,ClinVar; Invitae 2020; Ambry 2020,deleterious,benign,vertebrates 100%,,Not in ExAC


In [7]:
print(df_benign.shape)
display(df_benign.head(3))

(178, 13)


Unnamed: 0,Location,Location in Genome release 37 (hg19),Location in Genome release 38 (hg38),DNA Modifications,Protein or mRNA Variants,References & population frequency,Source,Amino acid conservation & comments,SIFT prediction,PolyPhen2 prediction,Splicing prediction,Additional variants in other genes,Notes
0,5'UTR,X:153640060-153640061,X:154411724-154411725,c.-119 (or '-121_-119) insert/del T,,ExAC 5786/6746 alleles; 3249 homozy; 1111 hemizy,ClinVar,,,,,,Benign
1,5'UTR,X:153640093,X:154411756,c.-88G > C,,ExAC 28/6688; 13 hemizyg; Ref. 4; Ref. 14,ClinVar Jun 2016,,,,,,
2,5'UTR,X:153640097,X:154411760,c.-84C>G,,,,,,,,,


In [8]:
print(df_exon5.shape)
display(df_exon5.head(3))

(11, 13)


Unnamed: 0,Location,Classification,Genome Assembly Release 37,Genome Assembly Release 38,DNA Modifications,Protein or mRNA Variants,References & population frequency,Source,Amino acid conservation & comments,SIFT prediction,PolyPhen2 prediction,Splicing Prediction,Notes
0,Exon 5,VUS,X:153642485,X:154414148_52,c.418_422del ACAGGinsA,p.Arg142ThrfsX41,ExAC 1/79840; 1 hemizyg (LOW CONFIDENCE),,,,,,Not in ClinVar
1,Exon 5,VUS,X:153642486,X:154414149,c.419C>T,p.Thr140Ile,Ref. 83,ClinVar Jan 2014,Not all primates,Tolerated; 0.36,0.0,Acceptor much reduced; donor small reduction,VUS; not in ExAC
2,Exon 5,VUS,X:153642492_95,X:154414155_58,c.425_428delGGCA or c.419_422delCAGG,p.Arg142ThrfsX41,Ref. 83,ClinVar Feb 2017,Should only manifest in FL variant,,,,Likely path. Var ID 423852; Not in ExAC


# Inspect column names

In [9]:
# Flat list of every column name, sheet by sheet (names shared between sheets appear repeatedly).
columns_all = [
    column_name
    for sheet in (df_pathogenic, df_benign, df_vus, df_exon5)
    for column_name in sheet.columns
]

In [10]:
# A name used by all four sheets occurs exactly four times in columns_all; keep the rest.
columns_not_in_all = [
    name
    for name in set(columns_all)
    if columns_all.count(name) != 4
]

In [11]:
print('Column names, which are not present in all 4 sheets:')
sorted(columns_not_in_all)

Column names, which are not present in all 4 sheets:


[' Functional outcome (MLCL/CL ratio)',
 'Additional variants in other genes',
 'Amino acid conservation & comments',
 'Classification',
 'Genome Assembly Release 37',
 'Genome Assembly Release 38',
 'Impact of Variant',
 'Location and Order of Discovery',
 'Location in  Genome release  37 (hg19)',
 'Location in  Genome release 37 (hg19)',
 'Location in Genome release  38 (hg38)',
 'Location in Genome release 38 (hg38)',
 'Method of Validation',
 'PolyPhen2 prediction',
 'Protein Variant Type',
 'Protein or mRNA Variants',
 'Protein or mRNA variants',
 'References',
 'References & population frequency',
 'SIFT prediction',
 'Splicing Prediction',
 'Splicing prediction',
 'Taffazin Functional motifs ',
 'Unnamed: 15']

### Column names that are missing in some sheets but seem to be OK:

--> no action needed

In [12]:
# Columns that legitimately exist in only some of the sheets; they are filtered out of
# `columns_not_in_all` and need no renaming.
# NOTE: the first entry is the cleaned name. The raw pathogenic column still has a
# leading space at this point, so this entry filters nothing here (the list shrinks
# 24 -> 18, i.e. by six names); the raw name is removed further down, after its rename.
columns_missing_seem_ok = ['Functional outcome (MLCL/CL ratio)', # only in df_pathogenic
    'Additional variants in other genes', # not in exon5
    'Amino acid conservation & comments', # not in df_pathogenic
    'Classification', # only in exon5
    'Impact of Variant', # only in df_pathogenic
    'Location and Order of Discovery', # only in df_pathogenic
    'Method of Validation'] # only in df_pathogenic

In [13]:
len(columns_not_in_all)

24

In [14]:
columns_not_in_all = [x for x in columns_not_in_all if x not in columns_missing_seem_ok]

In [15]:
len(columns_not_in_all)

18

### Column names that are missing in some sheets, where renaming is not enough and more parsing will be needed:

--> TODO: work left to be done in the next notebooks

In [16]:
columns_missing_more_parsing_needed = ['PolyPhen2 prediction', # column not in df_pathogenic, but this info can be extracted from 'Method of Validation'
    'SIFT prediction', # column not in df_pathogenic, but this info can be extracted from 'Method of Validation'
    'Protein Variant Type', # only in df_pathogenic, should be added also to other sheets
    'References', # only in df_pathogenic, should be made consistent with other sheets, where similar column contains population frequency, see below
    'References & population frequency'] # not in df_pathogenic, see above

In [17]:
columns_not_in_all = [x for x in columns_not_in_all if x not in columns_missing_more_parsing_needed]

In [18]:
len(columns_not_in_all)

13

# Rename columns:

#### ' Functional outcome (MLCL/CL ratio)':

reason: typo: space at the beginning

In [19]:
new_col_name = 'Functional outcome (MLCL/CL ratio)'

In [20]:
columns_all.count(' Functional outcome (MLCL/CL ratio)')

1

In [21]:
' Functional outcome (MLCL/CL ratio)' in df_pathogenic.columns

True

In [22]:
# Drop the accidental leading space from the column name.
df_pathogenic = df_pathogenic.rename(
    columns={' Functional outcome (MLCL/CL ratio)': new_col_name.strip()})

In [23]:
assert new_col_name in df_pathogenic.columns

In [24]:
columns_not_in_all.remove(' Functional outcome (MLCL/CL ratio)')

In [25]:
len(columns_not_in_all)

12

#### 'Genome Assembly Release 37'
#### 'Location in  Genome release  37 (hg19)'
#### 'Location in  Genome release 37 (hg19)'

reason: typo: names inconsistent, spaces in names

In [26]:
new_col_name = 'Location in Genome release 37 (hg19)' # new name with fixed spaces

In [27]:
columns_all.count('Genome Assembly Release 37')

1

In [28]:
'Genome Assembly Release 37' in df_exon5

True

In [29]:
# exon5 labels this column differently; give it the common name.
df_exon5 = df_exon5.rename(columns={'Genome Assembly Release 37': new_col_name})

In [30]:
columns_all.count('Location in  Genome release  37 (hg19)')

1

In [31]:
# The pathogenic header has two doubled spaces; replace it with the common name.
df_pathogenic = df_pathogenic.rename(
    columns={'Location in  Genome release  37 (hg19)': new_col_name})

In [32]:
columns_all.count('Location in  Genome release 37 (hg19)')

2

In [33]:
'Location in  Genome release 37 (hg19)' in df_vus

True

In [34]:
'Location in  Genome release 37 (hg19)' in df_benign

True

In [35]:
# The benign header has one doubled space; replace it with the common name.
df_benign = df_benign.rename(
    columns={'Location in  Genome release 37 (hg19)': new_col_name})

In [36]:
# The vus header has one doubled space; replace it with the common name.
df_vus = df_vus.rename(
    columns={'Location in  Genome release 37 (hg19)': new_col_name})

In [37]:
assert new_col_name in df_exon5

In [38]:
assert new_col_name in df_pathogenic

In [39]:
assert new_col_name in df_benign.columns

In [40]:
assert new_col_name in df_vus.columns

In [41]:
columns_not_in_all.remove('Genome Assembly Release 37')

In [42]:
columns_not_in_all.remove('Location in  Genome release  37 (hg19)')

In [43]:
columns_not_in_all.remove('Location in  Genome release 37 (hg19)')

In [44]:
len(columns_not_in_all)

9

#### 'Genome Assembly Release 38'
#### 'Location in Genome release  38 (hg38)'
#### 'Location in Genome release 38 (hg38)'

reason: typo: names inconsistent, spaces in names

In [45]:
new_col_name = 'Location in Genome release 38 (hg38)'

In [46]:
columns_all.count('Genome Assembly Release 38')

1

In [47]:
'Genome Assembly Release 38' in df_exon5

True

In [48]:
# exon5 labels this column differently; give it the common name.
df_exon5 = df_exon5.rename(columns={'Genome Assembly Release 38': new_col_name})

In [49]:
columns_all.count('Location in Genome release  38 (hg38)')

1

In [50]:
'Location in Genome release  38 (hg38)' in df_pathogenic

True

In [51]:
# The pathogenic header has a doubled space before "38"; replace it with the common name.
df_pathogenic = df_pathogenic.rename(
    columns={'Location in Genome release  38 (hg38)': new_col_name})

In [52]:
assert new_col_name in df_exon5

In [53]:
assert new_col_name in df_pathogenic

In [54]:
assert new_col_name in df_benign

In [55]:
assert new_col_name in df_vus

In [56]:
columns_not_in_all.remove('Genome Assembly Release 38')

In [57]:
columns_not_in_all.remove('Location in Genome release  38 (hg38)')

In [58]:
columns_not_in_all.remove('Location in Genome release 38 (hg38)')

In [59]:
len(columns_not_in_all)

6

#### 'Protein or mRNA Variants'
#### 'Protein or mRNA variants'

reason: typo: names inconsistent, capitalization differs ('Variants' vs. 'variants')

In [60]:
new_col_name = 'Protein or mRNA Variants'

In [61]:
assert new_col_name in df_benign

In [62]:
assert new_col_name in df_vus

In [63]:
assert new_col_name in df_exon5

In [64]:
# Only the pathogenic sheet uses the lower-case "variants" spelling.
df_pathogenic = df_pathogenic.rename(
    columns={'Protein or mRNA variants': new_col_name})

In [65]:
assert new_col_name in df_pathogenic

In [66]:
columns_not_in_all.remove('Protein or mRNA Variants')

In [67]:
columns_not_in_all.remove('Protein or mRNA variants')

In [68]:
len(columns_not_in_all)

4

#### 'Taffazin Functional motifs '

reason: space at the end of the column name

In [69]:
columns_all.count('Taffazin Functional motifs ')

1

In [70]:
'Taffazin Functional motifs ' in df_pathogenic

True

In [71]:
new_col_name = 'Taffazin Functional motifs'

In [72]:
# Drop the trailing space from the column name.
df_pathogenic = df_pathogenic.rename(
    columns={'Taffazin Functional motifs ': new_col_name})

In [73]:
assert new_col_name in df_pathogenic

In [74]:
columns_not_in_all.remove('Taffazin Functional motifs ')

In [75]:
len(columns_not_in_all)

3

#### 'Splicing Prediction'
#### 'Splicing prediction'

reason: inconsistent column names

TODO why is this column not in pathogenic and vus?

In [76]:
columns_all.count('Splicing prediction')

1

In [77]:
columns_all.count('Splicing Prediction')

1

In [78]:
'Splicing prediction' in df_benign

True

In [79]:
'Splicing Prediction' in df_exon5

True

In [80]:
new_col_name = 'Splicing Prediction'

In [81]:
# Use the capitalised spelling that exon5 already has.
df_benign = df_benign.rename(columns={'Splicing prediction': new_col_name})

In [82]:
assert new_col_name in df_benign

In [83]:
assert new_col_name in df_exon5

In [84]:
columns_not_in_all.remove('Splicing Prediction')

In [85]:
columns_not_in_all.remove('Splicing prediction')

In [86]:
len(columns_not_in_all)

1

#### 'Unnamed: 15' column

reason: the column is empty except for a single space in one row, apparently inserted by mistake

TODO: remove the 'Unnamed: 15' column in the next notebook

In [87]:
'Unnamed: 15' in df_pathogenic

True

In [88]:
columns_all.count('Unnamed: 15')

1

In [89]:
df_pathogenic['Unnamed: 15'].value_counts()

     1
Name: Unnamed: 15, dtype: int64

In [90]:
# First non-null value found in the stray column.
df_pathogenic.loc[df_pathogenic['Unnamed: 15'].notna(), 'Unnamed: 15'].iloc[0]

' '

In [91]:
# TODO in next notebook
# df_pathogenic.drop(['Unnamed: 15'], axis=1, inplace=True)
# assert not 'Unnamed: 15' in df_pathogenic
# columns_not_in_all.remove('Unnamed: 15')
# assert len(columns_not_in_all) == 0, 'all columns should have been handled by now'

# Save data

In [92]:
# Write all four renamed sheets as CSV files under the new output prefix.
# NOTE(review): judging by the printed paths, the helper seems to derive each file
# suffix from the keyword name without its "df_" part — confirm in helpers.
helpers.save_output_as_csv(output_path_prefix,
             df_pathogenic=df_pathogenic,
             df_benign=df_benign,
             df_vus=df_vus,
             df_exon5=df_exon5)

Dataframe of shape (406, 16) saved to ../intermediate_pipeline_db_versions/00010_2024-01-18-20-05-26-311470_Human-TAFAZZIN-Variants-Database_pathogenic.csv
Dataframe of shape (178, 13) saved to ../intermediate_pipeline_db_versions/00010_2024-01-18-20-05-26-311470_Human-TAFAZZIN-Variants-Database_benign.csv
Dataframe of shape (126, 12) saved to ../intermediate_pipeline_db_versions/00010_2024-01-18-20-05-26-311470_Human-TAFAZZIN-Variants-Database_vus.csv
Dataframe of shape (11, 13) saved to ../intermediate_pipeline_db_versions/00010_2024-01-18-20-05-26-311470_Human-TAFAZZIN-Variants-Database_exon5.csv


# Load what was saved and compare with original version

In [93]:
! diff {output_path_prefix}vus.csv "{input_path_prefix}vus.csv" 

1c1
< Location,Location in Genome release 37 (hg19),Location in Genome release 38 (hg38),DNA Modifications,Protein or mRNA Variants,References & population frequency,Source,SIFT prediction,PolyPhen2 prediction,Amino acid conservation & comments,Additional variants in other genes,Notes
---
> Location,Location in  Genome release 37 (hg19),Location in Genome release 38 (hg38),DNA Modifications,Protein or mRNA Variants,References & population frequency,Source,SIFT prediction,PolyPhen2 prediction,Amino acid conservation & comments,Additional variants in other genes,Notes


In [94]:
! diff {output_path_prefix}vus.csv "{input_path_prefix}vus.csv" 

1c1
< Location,Location in Genome release 37 (hg19),Location in Genome release 38 (hg38),DNA Modifications,Protein or mRNA Variants,References & population frequency,Source,SIFT prediction,PolyPhen2 prediction,Amino acid conservation & comments,Additional variants in other genes,Notes
---
> Location,Location in  Genome release 37 (hg19),Location in Genome release 38 (hg38),DNA Modifications,Protein or mRNA Variants,References & population frequency,Source,SIFT prediction,PolyPhen2 prediction,Amino acid conservation & comments,Additional variants in other genes,Notes


In [95]:
! diff {output_path_prefix}benign.csv "{input_path_prefix}benign.csv" 

1c1
< Location,Location in Genome release 37 (hg19),Location in Genome release 38 (hg38),DNA Modifications,Protein or mRNA Variants,References & population frequency,Source,Amino acid conservation & comments,SIFT prediction,PolyPhen2 prediction,Splicing Prediction,Additional variants in other genes,Notes
---
> Location,Location in  Genome release 37 (hg19),Location in Genome release 38 (hg38),DNA Modifications,Protein or mRNA Variants,References & population frequency,Source,Amino acid conservation & comments,SIFT prediction,PolyPhen2 prediction,Splicing prediction,Additional variants in other genes,Notes


In [96]:
! diff {output_path_prefix}pathogenic.csv "{input_path_prefix}pathogenic.csv" 

1c1
< Location,Location in Genome release 37 (hg19),Location in Genome release 38 (hg38),Protein Variant Type,Impact of Variant,DNA Modifications,Protein or mRNA Variants,Functional outcome (MLCL/CL ratio),Taffazin Functional motifs,Method of Validation,References,Source,Additional variants in other genes,Location and Order of Discovery,Notes,Unnamed: 15
---
> Location,Location in  Genome release  37 (hg19),Location in Genome release  38 (hg38),Protein Variant Type,Impact of Variant,DNA Modifications,Protein or mRNA variants, Functional outcome (MLCL/CL ratio),Taffazin Functional motifs ,Method of Validation,References,Source,Additional variants in other genes,Location and Order of Discovery,Notes,Unnamed: 15


In [97]:
# Drop the in-memory working frames so the comparison below can only use the files written to disk.
del df_pathogenic, df_vus, df_benign, df_exon5

### Load original vs new data

In [98]:
# Re-read the untouched input files.
df_pathogenic_orig = pd.read_csv(f'{input_path_prefix}pathogenic.csv')
df_vus_orig = pd.read_csv(f'{input_path_prefix}vus.csv')
df_exon5_orig = pd.read_csv(f'{input_path_prefix}exon5.csv')
df_benign_orig = pd.read_csv(f'{input_path_prefix}benign.csv')

In [99]:
# Read back the files this notebook has just written.
df_pathogenic_new = pd.read_csv(f'{output_path_prefix}pathogenic.csv')
df_vus_new = pd.read_csv(f'{output_path_prefix}vus.csv')
df_exon5_new = pd.read_csv(f'{output_path_prefix}exon5.csv')
df_benign_new = pd.read_csv(f'{output_path_prefix}benign.csv')

#### Show column renaming:

In [100]:
def print_diffs(df, df_orig):
    """Print every position at which the column names of two frames differ.

    Columns are compared by position, not by name. One line is printed per
    differing position; nothing is printed when the headers are identical.

    Args:
        df: frame with the new column names.
        df_orig: frame with the original column names.

    If the frames have a different number of columns, the surplus columns of
    the longer frame are reported too, instead of raising IndexError or being
    silently ignored.
    """
    new_cols = list(df.columns)
    orig_cols = list(df_orig.columns)

    # Positions that exist in both frames.
    for orig_name, new_name in zip(orig_cols, new_cols):
        if new_name != orig_name:
            print(f'difference: "{orig_name}" (orig) -> "{new_name}" (new)')

    # Positions that exist in only one frame (at most one of these loops runs).
    n_common = min(len(new_cols), len(orig_cols))
    for orig_name in orig_cols[n_common:]:
        print(f'difference: "{orig_name}" (orig) -> no column at this position (new)')
    for new_name in new_cols[n_common:]:
        print(f'difference: no column at this position (orig) -> "{new_name}" (new)')

In [101]:
print_diffs(df_pathogenic_new, df_pathogenic_orig)

difference: "Location in  Genome release  37 (hg19)" (orig) -> "Location in Genome release 37 (hg19)" (new)
difference: "Location in Genome release  38 (hg38)" (orig) -> "Location in Genome release 38 (hg38)" (new)
difference: "Protein or mRNA variants" (orig) -> "Protein or mRNA Variants" (new)
difference: " Functional outcome (MLCL/CL ratio)" (orig) -> "Functional outcome (MLCL/CL ratio)" (new)
difference: "Taffazin Functional motifs " (orig) -> "Taffazin Functional motifs" (new)


In [102]:
print_diffs(df_benign_new, df_benign_orig)

difference: "Location in  Genome release 37 (hg19)" (orig) -> "Location in Genome release 37 (hg19)" (new)
difference: "Splicing prediction" (orig) -> "Splicing Prediction" (new)


In [103]:
print_diffs(df_vus_new, df_vus_orig)

difference: "Location in  Genome release 37 (hg19)" (orig) -> "Location in Genome release 37 (hg19)" (new)


In [104]:
print_diffs(df_exon5_new, df_exon5_orig)

difference: "Genome Assembly Release 37" (orig) -> "Location in Genome release 37 (hg19)" (new)
difference: "Genome Assembly Release 38" (orig) -> "Location in Genome release 38 (hg38)" (new)


#### Show that nothing else changed:

In [105]:
# Compare the data only: put the new header on a copy of the original, so that
# df_benign_orig keeps its own header and the column-diff cells stay re-runnable.
df_benign_orig_renamed = df_benign_orig.copy()
df_benign_orig_renamed.columns = df_benign_new.columns
assert df_benign_new.equals(df_benign_orig_renamed), 'benign: data changed beyond column names'

In [106]:
# Compare the data only: put the new header on a copy of the original, so that
# df_vus_orig keeps its own header and the column-diff cells stay re-runnable.
df_vus_orig_renamed = df_vus_orig.copy()
df_vus_orig_renamed.columns = df_vus_new.columns
assert df_vus_new.equals(df_vus_orig_renamed), 'vus: data changed beyond column names'

In [107]:
# Compare the data only: put the new header on a copy of the original, so that
# df_exon5_orig keeps its own header and the column-diff cells stay re-runnable.
df_exon5_orig_renamed = df_exon5_orig.copy()
df_exon5_orig_renamed.columns = df_exon5_new.columns
assert df_exon5_new.equals(df_exon5_orig_renamed), 'exon5: data changed beyond column names'

In [108]:
# Compare the data only: put the new header on a copy of the original, so that
# df_pathogenic_orig keeps its own header and the column-diff cells stay re-runnable.
df_pathogenic_orig_renamed = df_pathogenic_orig.copy()
df_pathogenic_orig_renamed.columns = df_pathogenic_new.columns
assert df_pathogenic_new.equals(df_pathogenic_orig_renamed), 'pathogenic: data changed beyond column names'

# Show new dataframes

In [109]:
df_pathogenic_new

Unnamed: 0,Location,Location in Genome release 37 (hg19),Location in Genome release 38 (hg38),Protein Variant Type,Impact of Variant,DNA Modifications,Protein or mRNA Variants,Functional outcome (MLCL/CL ratio),Taffazin Functional motifs,Method of Validation,References,Source,Additional variants in other genes,Location and Order of Discovery,Notes,Unnamed: 15
0,Exon 1,X:153640189,X:154411852,Frameshift,,c.9_10dupG,p.His4Alafs*130,MLCL/CL elevated,,,Ref. 1 (Pat.1); Ref. 80; Ref. 113,,,1-1,,
1,Exon 1,X:153640197 - 153640198,X: 154411860 - 154411861,Frameshift,,c.18_22dup,p.Pro8fs*,,,,Ref. 140; Ref.83,ClinVar,,1-12,,
2,Exon 1,X:153640219_241,X:154411882_904,Frameshift,,c.39_60del22,p.Pro14Alafs*19,MLCL/CL elevated,,,Ref. 95 (Pat. 1),,"Mitochondrial: m.1555A>G in 12S rRNA, homoplasmic",1-11,,
3,Exon 1,X:153640231,X:154411894,Nonsense,,c.51G>A,p.Trp17*,MLCL/CL elevated,,,Ref. 5; Ref. 80,,,1-2,,
4,Exon 1,X:153640231,X:154411894,Nonsense,,c.51G>A,p.Trp17*,,,,Ref. 14; 119,,,1-3,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
401,Large Deletion,,,Deletion,Null,partial deletion,incomplete description,MLCL/CL elevated,,,"Ref. 70, Ref. 85",,,D-21,,
402,Large Deletion,,,Deletion,Null,NM_006730.2(DNASE1L1)c.-517_NM_000116.3(TAZ)c....,Ex1_Ex5 del,MLCL/CL elevated,,,Ref. 81,,,D-22a,brother of below,
403,Large Deletion,,,Deletion,Null,NM_000116.3(TAZ)c.-72_109+51del,Ex 1 del,,,,Ref. 81,,,D-22b,brother of above,
404,Large Deletion,,,Deletion,Null,complete deletion,X: 153640161-153649363,,,,Ref. 83,ClinVar,,D-28,,


In [110]:
df_exon5_new

Unnamed: 0,Location,Classification,Location in Genome release 37 (hg19),Location in Genome release 38 (hg38),DNA Modifications,Protein or mRNA Variants,References & population frequency,Source,Amino acid conservation & comments,SIFT prediction,PolyPhen2 prediction,Splicing Prediction,Notes
0,Exon 5,VUS,X:153642485,X:154414148_52,c.418_422del ACAGGinsA,p.Arg142ThrfsX41,ExAC 1/79840; 1 hemizyg (LOW CONFIDENCE),,,,,,Not in ClinVar
1,Exon 5,VUS,X:153642486,X:154414149,c.419C>T,p.Thr140Ile,Ref. 83,ClinVar Jan 2014,Not all primates,Tolerated; 0.36,0.0,Acceptor much reduced; donor small reduction,VUS; not in ExAC
2,Exon 5,VUS,X:153642492_95,X:154414155_58,c.425_428delGGCA or c.419_422delCAGG,p.Arg142ThrfsX41,Ref. 83,ClinVar Feb 2017,Should only manifest in FL variant,,,,Likely path. Var ID 423852; Not in ExAC
3,Exon 5,BENIGN,,X:154413944 - 154414116,,,,,,,,,MIR repetitive DNA 173 bp
4,Exon 5,BENIGN,X:153642438,X:154414101,c.371G>A,p.Gly124Glu,ExAC A=0.0000164,,,,,Significantly reduced acceptor score,
5,Exon 5,BENIGN,X:153642450,X:154414113,c.383T>C,p.Phe128Ser,ExAC 521/78935; 98 hemizyg; 9 homozyg,ClinVar 2015,would only affect higher primates,All tolerated; 0.73,0.01,Lousy acceptor improved; donor slight reduction,benign on ClinVar
6,Exon 5,BENIGN,X:153642472,X:154414135,c.405A>G,p.Lys135=,Ref. 4,,,,,ex5 poor acceptor is worse; donor reduced,
7,Exon 5,BENIGN,X:153642474,X:154414137,c.407G>T,p.Gly136Val,ExAC 1/80705; 0 hemizyg,,100% conserved in primates,Tolerated; 0.13,0.867,Poor acceptor score reduced; donor some reduction,
8,Exon 5,BENIGN,X:153642504,X:154414167,c.437G>T,p.Gly146Val,ExAC 1/77810; 0 hemizyg,,100% in primates except Callicebus,Tolerated; 0.09,0.05,Acceptor minimal improvement; donor noted redu...,
9,Exon 5,BENIGN,X:153642509,X:154414172,c.442G>A,p.Gly148Arg,ExAC 1/76588; 0 hemizyg,,100% primates but missing in Saimiri,All tolerated; 0.03,0.0,Both acceptor and donor scores much reduced,


In [111]:
df_benign_new

Unnamed: 0,Location,Location in Genome release 37 (hg19),Location in Genome release 38 (hg38),DNA Modifications,Protein or mRNA Variants,References & population frequency,Source,Amino acid conservation & comments,SIFT prediction,PolyPhen2 prediction,Splicing Prediction,Additional variants in other genes,Notes
0,5'UTR,X:153640060-153640061,X:154411724-154411725,c.-119 (or '-121_-119) insert/del T,,ExAC 5786/6746 alleles; 3249 homozy; 1111 hemizy,ClinVar,,,,,,Benign
1,5'UTR,X:153640093,X:154411756,c.-88G > C,,ExAC 28/6688; 13 hemizyg; Ref. 4; Ref. 14,ClinVar Jun 2016,,,,,,
2,5'UTR,X:153640097,X:154411760,c.-84C>G,,,,,,,,,
3,5'UTR,X:153640102 - 153640103,X:154411765,c.-79_-78insG,,,,,,,,,
4,5'UTR,X:153640107,X:154411770,c.-74C>A,,ExAC A=0.000037,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,Exon 11,X:153649325,X:154420986,c.861C>A,p.His287Gln,gnomad exomes; East Asian females 1/115416,,mammals highly conserved; other verts not cons...,Tolerated; 0.56,benign,Acceptor =,,
174,Exon 11,X:153649337,X:154420998,c.873G>A,p.Gly291=,"gnomad exomes & genomic; Eur (non-Finn), Lati...",ClinVar Mar 2012,,same codon as rodent,,Acceptor score =,,Benign; reported by multiple labs
175,Exon 11,X:153649338,X:154420999,c.874A>G,p.Arg292Gly,ExAC 1/79087; 0 hemizyg,,mammals highly conserved; other verts not cons...,Not tolerated; 0,0.014,Splicing score not affected,,
176,3'UTR,X:153649368,X:154421029,25 b after Term C>A,,ExAC 1/66337; 1 hemi.,,,,,,,


In [112]:
df_vus_new

Unnamed: 0,Location,Location in Genome release 37 (hg19),Location in Genome release 38 (hg38),DNA Modifications,Protein or mRNA Variants,References & population frequency,Source,SIFT prediction,PolyPhen2 prediction,Amino acid conservation & comments,Additional variants in other genes,Notes
0,Exon 1,X:153640193,X:154411856,c.13G>T,p.Val5Leu,gnomad exomes; Ashk Jewish female 5/99652,ClinVar; LMM 2011; GeneDX 2017; Invitae 2020;...,tolerated; 0.14,0.984,Vertebrates 100% Val; invertebrates have Ile,,
1,Exon 1,X:153640197_198,X:154411860_861,c.17_18insA,fs*,Ref. 57,,,,,,Not in ClinVar
2,Exon 1,X:153640207,X:154411870,c.27C>G,p.Phe9Leu,,ClinVar; Invitae 2020; Ambry 2020,deleterious,benign,vertebrates 100%,,Not in ExAC
3,Exon 1,X:153640209,X:154411872,c.29C>G,p.Pro10Arg,Ref. 125; Ref. 126,"ClinVar Jul 2019; Klaasen lab 2019, Charite Un...",deleterious,,vertebrates 100%,,
4,Exon 1,"X:153,640,218",X:154411881,c.38C>T,p.Pro13Leu,gnomad exomes; East Asian male 1/54864,,deleterious,possibly damaging,vertebrates 98 %,,Not in ClinVar
...,...,...,...,...,...,...,...,...,...,...,...,...
121,Exon 11,X:153649313,X:154420974,c.849G>C,p.Gln283His,,ClinVar; Invitae 2019,likely tolerated ?,likely tolerated,Primates & rodents 100%; other verts variable,,
122,Exon 11,X:153649314,X:154420975,c.850C>T,p.Leu284Phe,gnomad exomes; Eur (non-Finn) male 1/67181,ClinVar; GeneDx 2015,deleterious; low confidence,probably damaging,Vertebrates 100%,,
123,Exon 11,X:153649336,X:154420997,c.872G>A,p.Gly291Glu,gnomad exomes; South Asian female 1/115350,,tolerated; low confidence,benign,"Primates, rodents 100%.",,Not in ClinVar
124,Exon 11,X:153649338,X:154420999,c.874A>G,p.Arg292Gly,gnomad exomes; Latino female 1/115350,,deleterious; low confidence,probably damaging,mammals highly conserved; other verts not cons...,,Not in ClinVar
