In [1]:
import pandas as pd
import numpy as np

In [44]:
df_uniprotid = pd.read_csv('./data/metas/uniprotid_uniprot_mapping.csv')
df_uniprotid

Unnamed: 0,UniProtKB-AC,ID
0,P31946,1433B_HUMAN
1,P62258,1433E_HUMAN
2,Q04917,1433F_HUMAN
3,P61981,1433G_HUMAN
4,P31947,1433S_HUMAN
...,...,...
207044,Q5ZEY9,Q5ZEY9_HUMAN
207045,V9H1E3,V9H1E3_HUMAN
207046,B4E0L3,B4E0L3_HUMAN
207047,Q59FC1,Q59FC1_HUMAN


In [45]:
# use uniparc to trace non-redundant ids
df_uniparc = pd.read_csv('./data/metas/unipac_uniprot_mapping.csv')
df_uniparc

Unnamed: 0,UniProtKB-AC,ID
0,P31946-1,UPI000013C714
1,P31946-2,UPI000059C8F6
2,P31946,UPI000013C714
3,P62258-2,UPI00001E6021
4,P62258-1,UPI0000021A46
...,...,...
239679,Q5ZEY9,UPI0000450360
239680,V9H1E3,UPI000011E201
239681,B4E0L3,UPI00017A8581
239682,Q59FC1,UPI00004F6D35


In [46]:
df_intermediate = pd.merge(df_uniprotid, df_uniparc, how='left', on='UniProtKB-AC')
df_intermediate

Unnamed: 0,UniProtKB-AC,ID_x,ID_y
0,P31946,1433B_HUMAN,UPI000013C714
1,P62258,1433E_HUMAN,UPI0000021A46
2,Q04917,1433F_HUMAN,UPI000013CC64
3,P61981,1433G_HUMAN,UPI000000106B
4,P31947,1433S_HUMAN,UPI000004D117
...,...,...,...
207044,Q5ZEY9,Q5ZEY9_HUMAN,UPI0000450360
207045,V9H1E3,V9H1E3_HUMAN,UPI000011E201
207046,B4E0L3,B4E0L3_HUMAN,UPI00017A8581
207047,Q59FC1,Q59FC1_HUMAN,UPI00004F6D35


In [47]:
df_clinvar_organized = pd.read_csv('./data/metas/final_clinvar_variant_uniprot_id_mapping.csv')

In [48]:
cared_uniprotids = pd.DataFrame(df_clinvar_organized['UniProtKB-AC'].unique(), columns=['UniProtKB-AC'])
cared_uniprotids

Unnamed: 0,UniProtKB-AC
0,Q92610
1,Q96CU9
2,Q8TB37
3,Q30201
4,Q9P2L0-2
...,...
9039,Q92621
9040,P03999
9041,P10619
9042,P23760


In [49]:
# all have mappings, no NAs
clinvar_cared_uniparc_mapping = pd.merge(cared_uniprotids, df_uniparc, how='left', on='UniProtKB-AC')
clinvar_cared_uniparc_mapping

Unnamed: 0,UniProtKB-AC,ID
0,Q92610,UPI000013E5FC
1,Q96CU9,UPI0000037C04
2,Q8TB37,UPI00003669AB
3,Q30201,UPI0000001700
4,Q9P2L0-2,UPI00001AF317
...,...,...
9039,Q92621,UPI00001D74D8
9040,P03999,UPI000000014B
9041,P10619,UPI00001323E1
9042,P23760,UPI0000131369


In [50]:
df_clinvar_cared_uniprotkbid_mapping = pd.merge(clinvar_cared_uniparc_mapping, df_intermediate, how='left', left_on='ID', right_on='ID_y')
df_clinvar_cared_uniprotkbid_mapping

Unnamed: 0,UniProtKB-AC_x,ID,UniProtKB-AC_y,ID_x,ID_y
0,Q92610,UPI000013E5FC,Q92610,ZN592_HUMAN,UPI000013E5FC
1,Q96CU9,UPI0000037C04,Q96CU9,FXRD1_HUMAN,UPI0000037C04
2,Q8TB37,UPI00003669AB,Q8TB37,NUBPL_HUMAN,UPI00003669AB
3,Q8TB37,UPI00003669AB,X5D2R5,X5D2R5_HUMAN,UPI00003669AB
4,Q30201,UPI0000001700,Q30201,HFE_HUMAN,UPI0000001700
...,...,...,...,...,...
11064,P03999,UPI000000014B,Q0PJU0,Q0PJU0_HUMAN,UPI000000014B
11065,P10619,UPI00001323E1,P10619,PPGB_HUMAN,UPI00001323E1
11066,P23760,UPI0000131369,P23760,PAX3_HUMAN,UPI0000131369
11067,B0ZBE2,UPI0000125B13,P01019,ANGT_HUMAN,UPI0000125B13


In [51]:
df_clinvar_cared_uniprotkbid_mapping[df_clinvar_cared_uniprotkbid_mapping['ID_x'] == 'A1AT_HUMAN']

Unnamed: 0,UniProtKB-AC_x,ID,UniProtKB-AC_y,ID_x,ID_y
2229,E9KL23,UPI000000CBEC,P01009,A1AT_HUMAN,UPI000000CBEC


In [52]:
# don't care about NaN (isoforms)
df_clinvar_cared_uniprotkbid_mapping.dropna(inplace=True)
df_clinvar_cared_uniprotkbid_mapping.drop(['ID', 'ID_y', 'UniProtKB-AC_y'], axis=1, inplace=True)
df_clinvar_cared_uniprotkbid_mapping.rename(columns={'UniProtKB-AC_x': 'UniProtKB-AC', 'ID_x':'UniProtKB-ID'}, inplace=True)
df_clinvar_cared_uniprotkbid_mapping

Unnamed: 0,UniProtKB-AC,UniProtKB-ID
0,Q92610,ZN592_HUMAN
1,Q96CU9,FXRD1_HUMAN
2,Q8TB37,NUBPL_HUMAN
3,Q8TB37,X5D2R5_HUMAN
4,Q30201,HFE_HUMAN
...,...,...
11064,P03999,Q0PJU0_HUMAN
11065,P10619,PPGB_HUMAN
11066,P23760,PAX3_HUMAN
11067,B0ZBE2,ANGT_HUMAN


In [53]:
# duplicated IDs exists
df_clinvar_cared_uniprotkbid_mapping[df_clinvar_cared_uniprotkbid_mapping.duplicated(subset=['UniProtKB-AC'], keep=False)].sort_values('UniProtKB-AC')

Unnamed: 0,UniProtKB-AC,UniProtKB-ID
9668,A0A023T787,RBM8A_HUMAN
9669,A0A023T787,A0A023T787_HUMAN
10721,A0A024QYR3,A0A024QYR3_HUMAN
10720,A0A024QYR3,TM9S4_HUMAN
1739,A0A024QYT5,PAI1_HUMAN
...,...,...
6296,Q9UI43,MRM2_HUMAN
9527,Q9Y2Y9,X5DNR2_HUMAN
9526,Q9Y2Y9,KLF13_HUMAN
10147,Q9Y6N8,X5D8X5_HUMAN


In [61]:
# get the ones EVE used
import os
uniprotkb_id_list = []
path = './data/eve_download/variant_files/'
for file in os.listdir(path):
    uniprotkb_id_list.append(os.path.splitext(file)[0])
assert len(uniprotkb_id_list) == 3212

In [62]:
final_mapping_df = df_clinvar_cared_uniprotkbid_mapping.loc[df_clinvar_cared_uniprotkbid_mapping['UniProtKB-ID'].isin(uniprotkb_id_list)]
final_mapping_df

Unnamed: 0,UniProtKB-AC,UniProtKB-ID
1,Q96CU9,FXRD1_HUMAN
2,Q8TB37,NUBPL_HUMAN
4,Q30201,HFE_HUMAN
7,Q86XE5,HOGA1_HUMAN
9,O95876,FRITZ_HUMAN
...,...,...
11061,Q8N2C7,UNC80_HUMAN
11062,Q92621,NU205_HUMAN
11065,P10619,PPGB_HUMAN
11066,P23760,PAX3_HUMAN


In [56]:
# Checked, these three are the exactly same proteins
final_mapping_df[final_mapping_df.duplicated(subset=['UniProtKB-AC'], keep=False)].sort_values('UniProtKB-AC')

Unnamed: 0,UniProtKB-AC,UniProtKB-ID
2676,B4DJ51,CALM1_HUMAN
2677,B4DJ51,CALM2_HUMAN
2678,B4DJ51,CALM3_HUMAN


In [57]:
final_mapping_df.drop_duplicates(subset=['UniProtKB-AC'],inplace=True)
final_mapping_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_mapping_df.drop_duplicates(subset=['UniProtKB-AC'],inplace=True)


Unnamed: 0,UniProtKB-AC,UniProtKB-ID
1,Q96CU9,FXRD1_HUMAN
2,Q8TB37,NUBPL_HUMAN
4,Q30201,HFE_HUMAN
7,Q86XE5,HOGA1_HUMAN
9,O95876,FRITZ_HUMAN
...,...,...
11061,Q8N2C7,UNC80_HUMAN
11062,Q92621,NU205_HUMAN
11065,P10619,PPGB_HUMAN
11066,P23760,PAX3_HUMAN


In [58]:
# always double check for duplicates
final_mapping_df[final_mapping_df.duplicated(subset=['UniProtKB-ID'], keep=False)].sort_values('UniProtKB-ID')

Unnamed: 0,UniProtKB-AC,UniProtKB-ID


In [59]:
final_mapping_df.to_csv('./data/metas/clinvar_variant_uniprotid_uniprotkb_mapping.csv', index=False)

In [63]:
# unmapped entries
set1 = set(uniprotkb_id_list)
set2 = set(final_mapping_df['UniProtKB-ID'].tolist())
unique_to_list1 = set1 - set2
unique_to_list2 = set2 - set1
print('unique to eve list', unique_to_list1)
print('unique to final mapping (should be none)', unique_to_list2)

unique to eve list {'TECT1_HUMAN', 'MYH14_HUMAN', 'PIGP_HUMAN', 'INAVA_HUMAN', 'CACP_HUMAN', 'KMT2A_HUMAN', 'G6PD_HUMAN', 'MUTYH_HUMAN', 'MATN4_HUMAN', 'HECW2_HUMAN', 'ZP2_HUMAN', 'G6PC_HUMAN', 'H6ST2_HUMAN', 'KNL1_HUMAN', 'NCYM_HUMAN', 'COIA1_HUMAN', 'FGF12_HUMAN', 'PUS3_HUMAN', 'MTO1_HUMAN', 'ASCC1_HUMAN', 'ANK1_HUMAN', 'SON_HUMAN', 'NALP7_HUMAN', 'KCNC1_HUMAN', 'MEF2A_HUMAN', 'LRMDA_HUMAN', 'MMP23_HUMAN', 'HGNAT_HUMAN', 'PKP1_HUMAN', 'CGAT1_HUMAN', 'ZEB1_HUMAN', 'KLRG1_HUMAN', 'FZR1_HUMAN', 'ANC2_HUMAN', 'TRRAP_HUMAN', 'OXA1L_HUMAN', 'DYR1A_HUMAN', 'KCD11_HUMAN', 'MCPH1_HUMAN', 'PEX5_HUMAN', 'CFA91_HUMAN', 'SEM3D_HUMAN', 'GBRB2_HUMAN', 'DFFB_HUMAN', 'FGF8_HUMAN', 'ATD3A_HUMAN', 'MPP3_HUMAN', 'WT1_HUMAN', 'CTBP1_HUMAN', 'GRIA2_HUMAN', 'NLS1_HUMAN', 'RBX2_HUMAN', 'WWTR1_HUMAN', 'DMD_HUMAN', 'APRIO_HUMAN', 'UD110_HUMAN', 'TNNT3_HUMAN', 'STIL_HUMAN', 'LAP2A_HUMAN', '.ipynb_checkpoints', 'MYO6_HUMAN', 'GNAL_HUMAN', 'CCD50_HUMAN', 'DYL1_HUMAN', 'SYCC_HUMAN', 'CMC1_HUMAN', 'UBP11_HUMAN', '

In [65]:
set3 = set(df_uniprotid['ID'])
unique_to_list3 = set1 - set3
unique_to_list3

{'.ipynb_checkpoints',
 'AFG2H_HUMAN',
 'BRCA1_BRCT_HUMAN',
 'BRCA1_RING_HUMAN',
 'CH037_HUMAN',
 'CL065_HUMAN',
 'CMC1_HUMAN',
 'CMC2_HUMAN',
 'DDX58_HUMAN',
 'G6PC_HUMAN',
 'GLCM_HUMAN',
 'K1109_HUMAN',
 'MLRS_HUMAN',
 'S22A5_HUMAN',
 'SPYA_HUMAN',
 'TILB_HUMAN',
 'TRUA_HUMAN'}

In [67]:
print(len(unique_to_list3)-1)

16


In [66]:
df_uniprotid[df_uniprotid['ID'] == 'BRCA1_BRCT_HUMAN']

Unnamed: 0,UniProtKB-AC,ID
