# Transform original Excel data to CSV

In [1]:
import pandas as pd
import os
import helpers

In [2]:
# Source workbook: the original TAFAZZIN variants database, expected in `database_original`.
input_path = '../database_original/[Data Only] Human TAFAZZIN Variants Database_v07-20-2023.xlsx'
# Common prefix for every CSV written by this run; the helper prints the version
# number and the prefix it produced (see the cell output).
# NOTE(review): whether the helper also creates the output directory is not visible here — confirm in `helpers`.
output_path_prefix = helpers.create_database_output_path_prefix()

version number: 00000
database output path prefix: ../intermediate_pipeline_db_versions/00000_2024-01-18-17-55-32-824395_Human-TAFAZZIN-Variants-Database_


# Load Excel data

In [3]:
# Fail fast when the source workbook is missing: stopping here with the
# download hint is clearer than letting the next cell crash while opening it.
if not os.path.exists(input_path):
    raise FileNotFoundError(
        f'Original database not found at {input_path!r}. '
        'Please download the original database into `database_original` folder from '
        'https://drive.google.com/drive/folders/1O2MKa5FHsvq3hyjOVSsOZf37xkwKYAJ8'
    )

In [4]:
# Open the workbook once; `xls` is reused by the following cell to read the individual sheets.
xls = pd.ExcelFile(input_path)
# Sheet names carry a version suffix (see output), so they are matched by keyword rather than hard-coded.
sheet_names = xls.sheet_names
print(sheet_names)

['1.PATHOGENIC LIKELY_v07062023', '2.VUS_v01222023', '3.BENIGN_v09012021', '4.EXON 5_v01012020']


In [5]:
def _load_single_sheet(workbook, all_sheet_names, keyword):
    """Find the one sheet whose name contains `keyword` and load it.

    Parameters
    ----------
    workbook : pd.ExcelFile
        Open workbook to read the sheet from.
    all_sheet_names : list of str
        Names of all sheets in the workbook.
    keyword : str
        Case-sensitive substring that must identify exactly one sheet.

    Returns
    -------
    tuple (list of str, pd.DataFrame)
        The list of matching sheet names (always of length one) and the
        loaded sheet.

    Raises
    ------
    AssertionError
        If zero or more than one sheet name contains `keyword`.
    """
    matching = [name for name in all_sheet_names if keyword in name]
    assert len(matching) == 1, (
        f'we expect just one "{keyword}" sheet, found {matching}')
    return matching, pd.read_excel(workbook, matching[0])


# The `*_sheet_names` lists are kept because later cells re-read the same
# sheets from a fresh workbook handle using `<name>_sheet_names[0]`.
pathogenic_sheet_names, df_pathogenic = _load_single_sheet(xls, sheet_names, 'PATHOGENIC')
vus_sheet_names, df_vus = _load_single_sheet(xls, sheet_names, 'VUS')
benign_sheet_names, df_benign = _load_single_sheet(xls, sheet_names, 'BENIGN')
exon5_sheet_names, df_exon5 = _load_single_sheet(xls, sheet_names, 'EXON 5')

In [6]:
# Sanity check: dimensions and first rows of the pathogenic sheet as loaded from Excel.
print(df_pathogenic.shape)
df_pathogenic.head(3)

(406, 16)


Unnamed: 0,Location,Location in Genome release 37 (hg19),Location in Genome release 38 (hg38),Protein Variant Type,Impact of Variant,DNA Modifications,Protein or mRNA variants,Functional outcome (MLCL/CL ratio),Taffazin Functional motifs,Method of Validation,References,Source,Additional variants in other genes,Location and Order of Discovery,Notes,Unnamed: 15
0,Exon 1,X:153640189,X:154411852,Frameshift,,c.9_10dupG,p.His4Alafs*130,MLCL/CL elevated,,,Ref. 1 (Pat.1); Ref. 80; Ref. 113,,,1-1,,
1,Exon 1,X:153640197 - 153640198,X: 154411860 - 154411861,Frameshift,,c.18_22dup,p.Pro8fs*,,,,Ref. 140; Ref.83,ClinVar,,1-12,,
2,Exon 1,X:153640219_241,X:154411882_904,Frameshift,,c.39_60del22,p.Pro14Alafs*19,MLCL/CL elevated,,,Ref. 95 (Pat. 1),,"Mitochondrial: m.1555A>G in 12S rRNA, homoplasmic",1-11,,


In [7]:
# Sanity check: dimensions and first rows of the VUS sheet as loaded from Excel.
print(df_vus.shape)
df_vus.head(3)

(126, 12)


Unnamed: 0,Location,Location in Genome release 37 (hg19),Location in Genome release 38 (hg38),DNA Modifications,Protein or mRNA Variants,References & population frequency,Source,SIFT prediction,PolyPhen2 prediction,Amino acid conservation & comments,Additional variants in other genes,Notes
0,Exon 1,X:153640193,X:154411856,c.13G>T,p.Val5Leu,gnomad exomes; Ashk Jewish female 5/99652,ClinVar; LMM 2011; GeneDX 2017; Invitae 2020;...,tolerated; 0.14,0.984,Vertebrates 100% Val; invertebrates have Ile,,
1,Exon 1,X:153640197_198,X:154411860_861,c.17_18insA,fs*,Ref. 57,,,,,,Not in ClinVar
2,Exon 1,X:153640207,X:154411870,c.27C>G,p.Phe9Leu,,ClinVar; Invitae 2020; Ambry 2020,deleterious,benign,vertebrates 100%,,Not in ExAC


In [8]:
# Sanity check: dimensions and first rows of the benign sheet as loaded from Excel.
print(df_benign.shape)
df_benign.head(3)

(178, 13)


Unnamed: 0,Location,Location in Genome release 37 (hg19),Location in Genome release 38 (hg38),DNA Modifications,Protein or mRNA Variants,References & population frequency,Source,Amino acid conservation & comments,SIFT prediction,PolyPhen2 prediction,Splicing prediction,Additional variants in other genes,Notes
0,5'UTR,X:153640060-153640061,X:154411724-154411725,c.-119 (or '-121_-119) insert/del T,,ExAC 5786/6746 alleles; 3249 homozy; 1111 hemizy,ClinVar,,,,,,Benign
1,5'UTR,X:153640093,X:154411756,c.-88G > C,,ExAC 28/6688; 13 hemizyg; Ref. 4; Ref. 14,ClinVar Jun 2016,,,,,,
2,5'UTR,X:153640097,X:154411760,c.-84C>G,,,,,,,,,


In [9]:
# Sanity check: dimensions and first rows of the exon 5 sheet as loaded from Excel.
print(df_exon5.shape)
df_exon5.head(3)

(11, 13)


Unnamed: 0,Location,Classification,Genome Assembly Release 37,Genome Assembly Release 38,DNA Modifications,Protein or mRNA Variants,References & population frequency,Source,Amino acid conservation & comments,SIFT prediction,PolyPhen2 prediction,Splicing Prediction,Notes
0,Exon 5,VUS,X:153642485,X:154414148_52,c.418_422del ACAGGinsA,p.Arg142ThrfsX41,ExAC 1/79840; 1 hemizyg (LOW CONFIDENCE),,,,,,Not in ClinVar
1,Exon 5,VUS,X:153642486,X:154414149,c.419C>T,p.Thr140Ile,Ref. 83,ClinVar Jan 2014,Not all primates,Tolerated; 0.36,0.0,Acceptor much reduced; donor small reduction,VUS; not in ExAC
2,Exon 5,VUS,X:153642492_95,X:154414155_58,c.425_428delGGCA or c.419_422delCAGG,p.Arg142ThrfsX41,Ref. 83,ClinVar Feb 2017,Should only manifest in FL variant,,,,Likely path. Var ID 423852; Not in ExAC


# Save as CSVs

In [10]:
# Write one CSV per sheet; every frame is passed as a `df_<label>` keyword.
# NOTE(review): per the cell output the files are named `<prefix><label>.csv` — confirm in `helpers`.
helpers.save_output_as_csv(
    output_path_prefix,
    df_pathogenic=df_pathogenic,
    df_benign=df_benign,
    df_vus=df_vus,
    df_exon5=df_exon5,
)

Dataframe of shape (406, 16) saved to ../intermediate_pipeline_db_versions/00000_2024-01-18-17-55-32-824395_Human-TAFAZZIN-Variants-Database_pathogenic.csv
Dataframe of shape (178, 13) saved to ../intermediate_pipeline_db_versions/00000_2024-01-18-17-55-32-824395_Human-TAFAZZIN-Variants-Database_benign.csv
Dataframe of shape (126, 12) saved to ../intermediate_pipeline_db_versions/00000_2024-01-18-17-55-32-824395_Human-TAFAZZIN-Variants-Database_vus.csv
Dataframe of shape (11, 13) saved to ../intermediate_pipeline_db_versions/00000_2024-01-18-17-55-32-824395_Human-TAFAZZIN-Variants-Database_exon5.csv


# Load and compare old and new versions

In [11]:
# Drop the in-memory frames so that the comparison can only use data
# re-loaded from disk (the saved CSVs versus the original Excel workbook).
del df_pathogenic, df_benign, df_vus, df_exon5

In [12]:
# Re-open the workbook from disk and list its sheets again. The context
# manager closes the file handle as soon as the names have been read, because
# nothing else in this cell needs the open workbook.
with pd.ExcelFile(input_path) as xls:
    sheet_names = xls.sheet_names
print(sheet_names)

['1.PATHOGENIC LIKELY_v07062023', '2.VUS_v01222023', '3.BENIGN_v09012021', '4.EXON 5_v01012020']


In [13]:
# Re-read every sheet straight from the original workbook; these `*_orig`
# frames are the reference side of the round-trip comparison. The sheet names
# were resolved when the workbook was first loaded. The context manager makes
# sure the workbook handle is closed once all four sheets are in memory.
with pd.ExcelFile(input_path) as xls_orig:
    df_pathogenic_orig = pd.read_excel(xls_orig, pathogenic_sheet_names[0])
    df_vus_orig = pd.read_excel(xls_orig, vus_sheet_names[0])
    df_benign_orig = pd.read_excel(xls_orig, benign_sheet_names[0])
    df_exon5_orig = pd.read_excel(xls_orig, exon5_sheet_names[0])

In [14]:
# Load the CSVs that were just written; these `*_new` frames are the
# round-tripped side of the comparison. Files are read in the listed order.
df_pathogenic_new, df_vus_new, df_exon5_new, df_benign_new = (
    pd.read_csv(f'{output_path_prefix}{label}.csv')
    for label in ('pathogenic', 'vus', 'exon5', 'benign')
)

In [15]:
# Round-trip check: the pathogenic CSV must reproduce the Excel sheet exactly
# (same shape, values and dtypes).
assert df_pathogenic_new.equals(df_pathogenic_orig), (
    'pathogenic CSV does not match the original Excel sheet')

In [16]:
# Round-trip check: the exon 5 CSV must reproduce the Excel sheet exactly
# (same shape, values and dtypes).
assert df_exon5_new.equals(df_exon5_orig), (
    'exon5 CSV does not match the original Excel sheet')

### For benign and VUS, one mixed-type column (`PolyPhen2 prediction`) is loaded with different types:

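(This is OK and expected: the column mixes numbers and text, so `read_excel` keeps the numeric cells as floats, while `read_csv` reads every non-empty cell of the column as a string. The values themselves are unchanged.)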

In [17]:
# List every cell where the CSV round-trip differs from the original sheet.
benign_diff = df_benign_new.compare(df_benign_orig)

# Enforce the "expected difference" claim instead of relying on eyeballing:
# only the mixed-type PolyPhen2 column may differ, and only by type
# (string from CSV vs. float from Excel), never by value.
benign_diff_columns = set(benign_diff.columns.get_level_values(0))
assert benign_diff_columns <= {'PolyPhen2 prediction'}, (
    f'unexpected differences in columns: {sorted(benign_diff_columns)}')
for diff_column in benign_diff_columns:
    diff_as_text = benign_diff[diff_column].astype(str)
    assert (diff_as_text['self'] == diff_as_text['other']).all(), (
        f'values in {diff_column!r} differ beyond their type')

benign_diff

Unnamed: 0_level_0,PolyPhen2 prediction,PolyPhen2 prediction
Unnamed: 0_level_1,self,other
20,0.018,0.018
21,0.991,0.991
23,0.994,0.994
24,0.996,0.996
25,0.996,0.996
26,0.996,0.996
33,0.991,0.991
77,0.01,0.01
80,0.867,0.867
85,0.05,0.05


In [18]:
# Value at row position 20 as read back from the CSV.
df_benign_new['PolyPhen2 prediction'].iat[20]

'0.018'

In [19]:
# Same row position as read from the original Excel sheet.
df_benign_orig['PolyPhen2 prediction'].iat[20]

0.018

In [20]:
# Distribution of the mixed-type column in the original Excel data (numbers and free text).
df_benign_orig['PolyPhen2 prediction'].value_counts()

benign           13
0.996             3
0.991             2
0.018             1
0.994             1
benign  0.209     1
0.01              1
0.867             1
0.05              1
0.0               1
0.006             1
0.696             1
0.014             1
Name: PolyPhen2 prediction, dtype: int64

In [21]:
# Same distribution in the CSV round-trip, for side-by-side comparison with the original.
df_benign_new['PolyPhen2 prediction'].value_counts()

benign           13
0.996             3
0.991             2
0.018             1
0.994             1
benign  0.209     1
0.01              1
0.867             1
0.05              1
0.0               1
0.006             1
0.696             1
0.014             1
Name: PolyPhen2 prediction, dtype: int64

In [22]:
# List every cell where the CSV round-trip differs from the original sheet.
vus_diff = df_vus_new.compare(df_vus_orig)

# Enforce the "expected difference" claim instead of relying on eyeballing:
# only the mixed-type PolyPhen2 column may differ, and only by type
# (string from CSV vs. float from Excel), never by value.
vus_diff_columns = set(vus_diff.columns.get_level_values(0))
assert vus_diff_columns <= {'PolyPhen2 prediction'}, (
    f'unexpected differences in columns: {sorted(vus_diff_columns)}')
for diff_column in vus_diff_columns:
    diff_as_text = vus_diff[diff_column].astype(str)
    assert (diff_as_text['self'] == diff_as_text['other']).all(), (
        f'values in {diff_column!r} differ beyond their type')

vus_diff

Unnamed: 0_level_0,PolyPhen2 prediction,PolyPhen2 prediction
Unnamed: 0_level_1,self,other
0,0.984,0.984
79,0.63,0.63
80,0.999,0.999
81,0.998,0.998
97,0.993,0.993
119,0.997,0.997


In [23]:
# Value at row position 79 as read back from the CSV.
df_vus_new['PolyPhen2 prediction'].iat[79]

'0.63'

In [24]:
# Same row position as read from the original Excel sheet.
df_vus_orig['PolyPhen2 prediction'].iat[79]

0.63

In [25]:
# Distribution of the mixed-type column in the CSV round-trip (numbers and free text).
df_vus_new['PolyPhen2 prediction'].value_counts()

benign                                            26
probably damaging                                 12
possibly damaging                                  6
likely tolerated                                   2
0.984                                              1
0.63                                               1
benirgn                                            1
SIFT, PolyPhen-2, Align-GVGD suggest tolerated     1
damaging                                           1
0.993                                              1
possibly damaging 0.942                            1
Possibly damaging: 0.918??  Benign??               1
0.998                                              1
0.999                                              1
likely benign                                      1
probably-damaging:0.989                            1
probably damaging 0.998                            1
benign  0                                          1
p.LOF high                                    

In [26]:
# Same distribution in the original Excel data, for side-by-side comparison with the CSV.
df_vus_orig['PolyPhen2 prediction'].value_counts()

benign                                            26
probably damaging                                 12
possibly damaging                                  6
likely tolerated                                   2
0.984                                              1
0.63                                               1
benirgn                                            1
SIFT, PolyPhen-2, Align-GVGD suggest tolerated     1
damaging                                           1
0.993                                              1
possibly damaging 0.942                            1
Possibly damaging: 0.918??  Benign??               1
0.998                                              1
0.999                                              1
likely benign                                      1
probably-damaging:0.989                            1
probably damaging 0.998                            1
benign  0                                          1
p.LOF high                                    

# Show new dataframes

In [27]:
# Benign variants as re-loaded from the saved CSV (the display shows only the first and last rows).
df_benign_new

Unnamed: 0,Location,Location in Genome release 37 (hg19),Location in Genome release 38 (hg38),DNA Modifications,Protein or mRNA Variants,References & population frequency,Source,Amino acid conservation & comments,SIFT prediction,PolyPhen2 prediction,Splicing prediction,Additional variants in other genes,Notes
0,5'UTR,X:153640060-153640061,X:154411724-154411725,c.-119 (or '-121_-119) insert/del T,,ExAC 5786/6746 alleles; 3249 homozy; 1111 hemizy,ClinVar,,,,,,Benign
1,5'UTR,X:153640093,X:154411756,c.-88G > C,,ExAC 28/6688; 13 hemizyg; Ref. 4; Ref. 14,ClinVar Jun 2016,,,,,,
2,5'UTR,X:153640097,X:154411760,c.-84C>G,,,,,,,,,
3,5'UTR,X:153640102 - 153640103,X:154411765,c.-79_-78insG,,,,,,,,,
4,5'UTR,X:153640107,X:154411770,c.-74C>A,,ExAC A=0.000037,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,Exon 11,X:153649325,X:154420986,c.861C>A,p.His287Gln,gnomad exomes; East Asian females 1/115416,,mammals highly conserved; other verts not cons...,Tolerated; 0.56,benign,Acceptor =,,
174,Exon 11,X:153649337,X:154420998,c.873G>A,p.Gly291=,"gnomad exomes & genomic; Eur (non-Finn), Lati...",ClinVar Mar 2012,,same codon as rodent,,Acceptor score =,,Benign; reported by multiple labs
175,Exon 11,X:153649338,X:154420999,c.874A>G,p.Arg292Gly,ExAC 1/79087; 0 hemizyg,,mammals highly conserved; other verts not cons...,Not tolerated; 0,0.014,Splicing score not affected,,
176,3'UTR,X:153649368,X:154421029,25 b after Term C>A,,ExAC 1/66337; 1 hemi.,,,,,,,


In [28]:
# Pathogenic variants as re-loaded from the saved CSV (the display shows only the first and last rows).
df_pathogenic_new

Unnamed: 0,Location,Location in Genome release 37 (hg19),Location in Genome release 38 (hg38),Protein Variant Type,Impact of Variant,DNA Modifications,Protein or mRNA variants,Functional outcome (MLCL/CL ratio),Taffazin Functional motifs,Method of Validation,References,Source,Additional variants in other genes,Location and Order of Discovery,Notes,Unnamed: 15
0,Exon 1,X:153640189,X:154411852,Frameshift,,c.9_10dupG,p.His4Alafs*130,MLCL/CL elevated,,,Ref. 1 (Pat.1); Ref. 80; Ref. 113,,,1-1,,
1,Exon 1,X:153640197 - 153640198,X: 154411860 - 154411861,Frameshift,,c.18_22dup,p.Pro8fs*,,,,Ref. 140; Ref.83,ClinVar,,1-12,,
2,Exon 1,X:153640219_241,X:154411882_904,Frameshift,,c.39_60del22,p.Pro14Alafs*19,MLCL/CL elevated,,,Ref. 95 (Pat. 1),,"Mitochondrial: m.1555A>G in 12S rRNA, homoplasmic",1-11,,
3,Exon 1,X:153640231,X:154411894,Nonsense,,c.51G>A,p.Trp17*,MLCL/CL elevated,,,Ref. 5; Ref. 80,,,1-2,,
4,Exon 1,X:153640231,X:154411894,Nonsense,,c.51G>A,p.Trp17*,,,,Ref. 14; 119,,,1-3,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
401,Large Deletion,,,Deletion,Null,partial deletion,incomplete description,MLCL/CL elevated,,,"Ref. 70, Ref. 85",,,D-21,,
402,Large Deletion,,,Deletion,Null,NM_006730.2(DNASE1L1)c.-517_NM_000116.3(TAZ)c....,Ex1_Ex5 del,MLCL/CL elevated,,,Ref. 81,,,D-22a,brother of below,
403,Large Deletion,,,Deletion,Null,NM_000116.3(TAZ)c.-72_109+51del,Ex 1 del,,,,Ref. 81,,,D-22b,brother of above,
404,Large Deletion,,,Deletion,Null,complete deletion,X: 153640161-153649363,,,,Ref. 83,ClinVar,,D-28,,


In [29]:
# VUS variants as re-loaded from the saved CSV (the display shows only the first and last rows).
df_vus_new

Unnamed: 0,Location,Location in Genome release 37 (hg19),Location in Genome release 38 (hg38),DNA Modifications,Protein or mRNA Variants,References & population frequency,Source,SIFT prediction,PolyPhen2 prediction,Amino acid conservation & comments,Additional variants in other genes,Notes
0,Exon 1,X:153640193,X:154411856,c.13G>T,p.Val5Leu,gnomad exomes; Ashk Jewish female 5/99652,ClinVar; LMM 2011; GeneDX 2017; Invitae 2020;...,tolerated; 0.14,0.984,Vertebrates 100% Val; invertebrates have Ile,,
1,Exon 1,X:153640197_198,X:154411860_861,c.17_18insA,fs*,Ref. 57,,,,,,Not in ClinVar
2,Exon 1,X:153640207,X:154411870,c.27C>G,p.Phe9Leu,,ClinVar; Invitae 2020; Ambry 2020,deleterious,benign,vertebrates 100%,,Not in ExAC
3,Exon 1,X:153640209,X:154411872,c.29C>G,p.Pro10Arg,Ref. 125; Ref. 126,"ClinVar Jul 2019; Klaasen lab 2019, Charite Un...",deleterious,,vertebrates 100%,,
4,Exon 1,"X:153,640,218",X:154411881,c.38C>T,p.Pro13Leu,gnomad exomes; East Asian male 1/54864,,deleterious,possibly damaging,vertebrates 98 %,,Not in ClinVar
...,...,...,...,...,...,...,...,...,...,...,...,...
121,Exon 11,X:153649313,X:154420974,c.849G>C,p.Gln283His,,ClinVar; Invitae 2019,likely tolerated ?,likely tolerated,Primates & rodents 100%; other verts variable,,
122,Exon 11,X:153649314,X:154420975,c.850C>T,p.Leu284Phe,gnomad exomes; Eur (non-Finn) male 1/67181,ClinVar; GeneDx 2015,deleterious; low confidence,probably damaging,Vertebrates 100%,,
123,Exon 11,X:153649336,X:154420997,c.872G>A,p.Gly291Glu,gnomad exomes; South Asian female 1/115350,,tolerated; low confidence,benign,"Primates, rodents 100%.",,Not in ClinVar
124,Exon 11,X:153649338,X:154420999,c.874A>G,p.Arg292Gly,gnomad exomes; Latino female 1/115350,,deleterious; low confidence,probably damaging,mammals highly conserved; other verts not cons...,,Not in ClinVar


In [30]:
# Exon 5 variants as re-loaded from the saved CSV.
df_exon5_new

Unnamed: 0,Location,Classification,Genome Assembly Release 37,Genome Assembly Release 38,DNA Modifications,Protein or mRNA Variants,References & population frequency,Source,Amino acid conservation & comments,SIFT prediction,PolyPhen2 prediction,Splicing Prediction,Notes
0,Exon 5,VUS,X:153642485,X:154414148_52,c.418_422del ACAGGinsA,p.Arg142ThrfsX41,ExAC 1/79840; 1 hemizyg (LOW CONFIDENCE),,,,,,Not in ClinVar
1,Exon 5,VUS,X:153642486,X:154414149,c.419C>T,p.Thr140Ile,Ref. 83,ClinVar Jan 2014,Not all primates,Tolerated; 0.36,0.0,Acceptor much reduced; donor small reduction,VUS; not in ExAC
2,Exon 5,VUS,X:153642492_95,X:154414155_58,c.425_428delGGCA or c.419_422delCAGG,p.Arg142ThrfsX41,Ref. 83,ClinVar Feb 2017,Should only manifest in FL variant,,,,Likely path. Var ID 423852; Not in ExAC
3,Exon 5,BENIGN,,X:154413944 - 154414116,,,,,,,,,MIR repetitive DNA 173 bp
4,Exon 5,BENIGN,X:153642438,X:154414101,c.371G>A,p.Gly124Glu,ExAC A=0.0000164,,,,,Significantly reduced acceptor score,
5,Exon 5,BENIGN,X:153642450,X:154414113,c.383T>C,p.Phe128Ser,ExAC 521/78935; 98 hemizyg; 9 homozyg,ClinVar 2015,would only affect higher primates,All tolerated; 0.73,0.01,Lousy acceptor improved; donor slight reduction,benign on ClinVar
6,Exon 5,BENIGN,X:153642472,X:154414135,c.405A>G,p.Lys135=,Ref. 4,,,,,ex5 poor acceptor is worse; donor reduced,
7,Exon 5,BENIGN,X:153642474,X:154414137,c.407G>T,p.Gly136Val,ExAC 1/80705; 0 hemizyg,,100% conserved in primates,Tolerated; 0.13,0.867,Poor acceptor score reduced; donor some reduction,
8,Exon 5,BENIGN,X:153642504,X:154414167,c.437G>T,p.Gly146Val,ExAC 1/77810; 0 hemizyg,,100% in primates except Callicebus,Tolerated; 0.09,0.05,Acceptor minimal improvement; donor noted redu...,
9,Exon 5,BENIGN,X:153642509,X:154414172,c.442G>A,p.Gly148Arg,ExAC 1/76588; 0 hemizyg,,100% primates but missing in Saimiri,All tolerated; 0.03,0.0,Both acceptor and donor scores much reduced,
