# Transform the original Excel data to CSV

In [2]:
import pandas as pd
import os
import helpers

In [3]:
# Location of the original Excel workbook, relative to this notebook.
input_path = '../database_original/[Data Only] Human TAFAZZIN Variants Database_v07-20-2023.xlsx'
# Prefix used for every CSV written and re-read further down.
# NOTE(review): the helper appears to derive a version number from this
# notebook's file name (it asserts the part before the underscore has
# length 4) — confirm in helpers.
output_path_prefix = helpers.create_database_output_path_prefix()

AssertionError: version_number (= beginning of filename of this Python notebook before the underscore) must have length 4, now is 00000

# Load Excel data

In [None]:
# Point the user at the download location when the workbook is not there yet.
# Only a hint is printed; execution is not stopped here.
input_file_present = os.path.exists(input_path)
if not input_file_present:
    print('Please download the original database into `database_original` folder from https://drive.google.com/drive/folders/1O2MKa5FHsvq3hyjOVSsOZf37xkwKYAJ8 ')

In [None]:
# Open the workbook once; `xls` and `sheet_names` are reused by the cell that
# loads the individual sheets.
xls = pd.ExcelFile(input_path)
sheet_names = xls.sheet_names
print(sheet_names)

In [None]:
def _load_unique_sheet(workbook, all_sheet_names, keyword):
    """Find the single sheet whose name contains *keyword* and load it.

    Args:
        workbook: Open ``pd.ExcelFile`` to read the sheet from.
        all_sheet_names: Sheet names of ``workbook``.
        keyword: Substring that identifies the wanted sheet.

    Returns:
        Tuple ``(matching_names, dataframe)`` where ``matching_names`` is the
        one-element list of sheet names containing ``keyword``.

    Raises:
        AssertionError: If zero or more than one sheet name matches.
    """
    matching_names = [name for name in all_sheet_names if keyword in name]
    assert len(matching_names) == 1, f'we expect just one "{keyword}" sheet'
    return matching_names, pd.read_excel(workbook, matching_names[0])


# The *_sheet_names lists stay module-level because the comparison cells
# re-read the same sheets through them.
pathogenic_sheet_names, df_pathogenic = _load_unique_sheet(xls, sheet_names, 'PATHOGENIC')
vus_sheet_names, df_vus = _load_unique_sheet(xls, sheet_names, 'VUS')
benign_sheet_names, df_benign = _load_unique_sheet(xls, sheet_names, 'BENIGN')
exon5_sheet_names, df_exon5 = _load_unique_sheet(xls, sheet_names, 'EXON 5')

In [None]:
# Print the shape and preview the first three rows of the pathogenic sheet.
print(df_pathogenic.shape)
df_pathogenic.head(3)

In [None]:
# Print the shape and preview the first three rows of the VUS sheet.
print(df_vus.shape)
df_vus.head(3)

In [None]:
# Print the shape and preview the first three rows of the benign sheet.
print(df_benign.shape)
df_benign.head(3)

In [None]:
# Print the shape and preview the first three rows of the exon 5 sheet.
print(df_exon5.shape)
df_exon5.head(3)

# Save as CSV files

In [None]:
# Write the four dataframes to disk through the project helper.
# NOTE(review): the keyword names presumably determine the file names (the
# comparison cells read back `pathogenic.csv`, `vus.csv`, `exon5.csv` and
# `benign.csv` under the same prefix) — confirm in helpers.
helpers.save_output_as_csv(output_path_prefix,
             df_pathogenic=df_pathogenic,
             df_benign=df_benign,
             df_vus=df_vus,
             df_exon5=df_exon5)

# Load and compare old and new versions

In [None]:
# Drop the in-memory dataframes so the comparison can only use data re-read
# from disk (the saved CSV files and the original Excel workbook).
del df_pathogenic, df_benign, df_vus, df_exon5

In [None]:
# Re-open the workbook and list its sheets again before the comparison.
xls = pd.ExcelFile(input_path)
sheet_names = xls.sheet_names
print(sheet_names)

In [None]:
# Re-read the original sheets straight from the Excel workbook so they can be
# compared against the CSV round trip. The context manager closes the file
# handle as soon as the four sheets are loaded.
with pd.ExcelFile(input_path) as xls_orig:
    df_pathogenic_orig = pd.read_excel(xls_orig, pathogenic_sheet_names[0])
    df_vus_orig = pd.read_excel(xls_orig, vus_sheet_names[0])
    df_benign_orig = pd.read_excel(xls_orig, benign_sheet_names[0])
    df_exon5_orig = pd.read_excel(xls_orig, exon5_sheet_names[0])

In [None]:
def _read_saved_csv(file_suffix):
    """Load one of the saved CSV files located under ``output_path_prefix``."""
    return pd.read_csv(output_path_prefix + file_suffix)


# Read back the CSV files written by the save cell.
df_pathogenic_new = _read_saved_csv('pathogenic.csv')
df_vus_new = _read_saved_csv('vus.csv')
df_exon5_new = _read_saved_csv('exon5.csv')
df_benign_new = _read_saved_csv('benign.csv')

In [None]:
# The CSV round trip of the pathogenic sheet must equal the Excel original.
assert df_pathogenic_new.equals(df_pathogenic_orig)

In [None]:
# The CSV round trip of the exon 5 sheet must equal the Excel original.
assert df_exon5_new.equals(df_exon5_orig)

### For benign and VUS, one mixed-type column is loaded with different types:

(This is OK and expected; it is probably just caused by differences between how the CSV and Excel loading libraries parse values.)

In [None]:
# Show where the CSV round trip of the benign sheet differs from the Excel original.
df_benign_new.compare(df_benign_orig)

In [None]:
# 'PolyPhen2 prediction' at row position 20, as read back from the CSV.
df_benign_new['PolyPhen2 prediction'].iloc[20]

In [None]:
# 'PolyPhen2 prediction' at row position 20, as read from the Excel original.
df_benign_orig['PolyPhen2 prediction'].iloc[20]

In [None]:
# Frequency of each 'PolyPhen2 prediction' value in the Excel original (benign).
df_benign_orig['PolyPhen2 prediction'].value_counts()

In [None]:
# Frequency of each 'PolyPhen2 prediction' value in the CSV round trip (benign).
df_benign_new['PolyPhen2 prediction'].value_counts()

In [None]:
# Show where the CSV round trip of the VUS sheet differs from the Excel original.
df_vus_new.compare(df_vus_orig)

In [None]:
# 'PolyPhen2 prediction' at row position 79, as read back from the CSV.
df_vus_new['PolyPhen2 prediction'].iloc[79]

In [None]:
# 'PolyPhen2 prediction' at row position 79, as read from the Excel original.
df_vus_orig['PolyPhen2 prediction'].iloc[79]

In [None]:
# Frequency of each 'PolyPhen2 prediction' value in the CSV round trip (VUS).
df_vus_new['PolyPhen2 prediction'].value_counts()

In [None]:
# Frequency of each 'PolyPhen2 prediction' value in the Excel original (VUS).
df_vus_orig['PolyPhen2 prediction'].value_counts()

# Show new dataframes

In [1]:
# Display the benign dataframe as read back from the CSV. Requires the cell
# that defines `df_benign_new` to have run in the same kernel session.
df_benign_new

NameError: name 'df_benign_new' is not defined