In [1]:
import pandas as pd
import glob
from rdkit.DataStructs.cDataStructs import ConvertToNumpyArray
import os
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem

In [2]:
csv_list = glob.glob('*_sub_location.csv')
csv_list

['chembl_sub_location.csv',
 'drug_central_sub_location.csv',
 'pharos_sub_location.csv']

In [3]:
df = pd.concat([pd.read_csv(csv_file) for csv_file in csv_list])
df.dropna(subset=['location'],inplace=True)
df

Unnamed: 0,uni_prot_id,location
0,P04053,Nucleus
1,P02766,"Secreted, Cytoplasm"
2,P61026,Cytoplasmic vesicle membrane Lipid-anchor Cyto...
3,P40429,Cytoplasm
4,P49685,Cell membrane Multi-pass membrane protein
...,...,...
952,Q9Y5Y4,Cell membrane Multi-pass membrane protein
953,P26439,Endoplasmic reticulum membrane Single-pass mem...
954,P36888,"Membrane Single-pass type I membrane protein, ..."
955,Q5JUK3,Cell membrane Multi-pass membrane protein


In [4]:
df[df['location'].str.contains('synapse')].loc[101]['location']

'Cell membrane Peripheral membrane protein, Membrane Clathrin-coated pit, Presynapse'

In [5]:
df[~df['location'].str.contains('synapse')]

Unnamed: 0,uni_prot_id,location
0,P04053,Nucleus
1,P02766,"Secreted, Cytoplasm"
2,P61026,Cytoplasmic vesicle membrane Lipid-anchor Cyto...
3,P40429,Cytoplasm
4,P49685,Cell membrane Multi-pass membrane protein
...,...,...
952,Q9Y5Y4,Cell membrane Multi-pass membrane protein
953,P26439,Endoplasmic reticulum membrane Single-pass mem...
954,P36888,"Membrane Single-pass type I membrane protein, ..."
955,Q5JUK3,Cell membrane Multi-pass membrane protein


In [6]:
df_synapse = df[df['location'].str.contains('synapse')]

In [7]:
def all_synapse(s):
    words = s.split(',')
    return all('synapse' in word.lower() for word in words)

df_synapse.loc[df_synapse['location'].apply(all_synapse), 'location'] = np.nan
df_synapse

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_synapse.loc[df_synapse['location'].apply(all_synapse), 'location'] = np.nan


Unnamed: 0,uni_prot_id,location
94,O15056,"Cytoplasm, Cell membrane, Membrane raft, Presy..."
471,Q09470,"Cell membrane Multi-pass membrane protein, Mem..."
496,P42261,"Cell membrane Multi-pass membrane protein, End..."
1353,P35626,
1380,Q9C0H9,"Cytoplasm, Cytoplasm Cytoskeleton, Cell projec..."
1392,Q2M2I8,"Cell membrane Peripheral membrane protein, Mem..."
1694,P28223,"Cell membrane Multi-pass membrane protein, Cel..."
1958,P78352,"Cell membrane Lipid-anchor Cytoplasmic side, P..."
2556,P21554,"Cell membrane Multi-pass membrane protein, Mem..."
2726,P30531,"Cell membrane Multi-pass membrane protein, Pre..."


In [8]:
df_pharos = pd.read_csv('pharos.csv')
df_pharos = df_pharos[['smiles','uniprot']]
df_pharos

Unnamed: 0,smiles,uniprot
0,CCCCC1(CCCC)CN(C2=CC=CC=C2)C2=C(C=C(OCC(=O)N[C...,Q12908
1,CC(C)(CO)C1=CC2=C(C=C(F)C(NC(=O)C3(CC3)C3=CC=C...,P13569
2,,P13569
3,CC(C)(C)C1=CC(=C(NC(=O)C2=CNC3=CC=CC=C3C2=O)C=...,P13569
4,CC1=CC=C(NC(=O)C2(CC2)C2=CC=C3OC(F)(F)OC3=C2)N...,P13569
...,...,...
3862,[H][C@]12C[C@]([H])([C@@H](OC)[C@](C)(O1)N1C3=...,Q16513
3863,[H][C@]12C[C@]([H])([C@@H](OC)[C@](C)(O1)N1C3=...,Q9UHD2
3864,CCOC1=CC2=NC=C(C#N)C(NC3=CC=C(OCC4=CC=CC=N4)C(...,O00506
3865,CCOC1=CC2=NC=C(C#N)C(NC3=CC=C(OCC4=CC=CC=N4)C(...,Q9P289


In [9]:
df_chembl = pd.read_csv('chembl.csv')
df_chembl = df_chembl[['SMILES','UniProt_ID']]
df_chembl = df_chembl.rename(columns={'SMILES': 'smiles', 'UniProt_ID': 'uniprot'})
df_chembl

Unnamed: 0,smiles,uniprot
0,CC(=N)NCCSCC[C@H](N)C(=O)O,P35228
1,C[C@@H]1O[C@H](C[N+](C)(C)C)C[C@H]1O,P20309
2,C[C@@H]1O[C@H](C[N+](C)(C)C)C[C@H]1O,P08172
3,O=C1c2ccccc2C(=O)c2c1ccc(O)c2O,Q92731
4,O=C1c2ccccc2C(=O)c2c1ccc(O)c2O,P03372
...,...,...
394540,CCn1c(C(=O)N(C2CC2)C2CC2)cc2c3c(ncn3C)c(Nc3cc(...,Q05397
394541,CCn1c(C(=O)N(C2CC2)C2CC2)cc2c3c(ncn3C)c(Nc3cc(...,P07949
394542,CCn1c(C(=O)N(C2CC2)C2CC2)cc2c3c(ncn3C)c(Nc3cc(...,P12931
394543,CCn1c(C(=O)N(C2CC2)C2CC2)cc2c3c(ncn3C)c(Nc3cc(...,Q9Y6E0


In [10]:
df_drug_central = pd.read_csv('drug_central/tchem_drugs_05122020.tsv',sep='\t')
df_drug_central = df_drug_central[['smiles','uniprot']]
df_drug_central

Unnamed: 0,smiles,uniprot
0,COC1=C(OCCCN2CCOCC2)C=CC2=C1N=C(NC(=O)C1=CN=C(...,P42338
1,O=C1NC2=C(C=C1)C=CC(OCCCCN1CCN(CC1)C1=C3C=CSC3...,P21917
2,O=C1NC2=C(C=C1)C=CC(OCCCCN1CCN(CC1)C1=C3C=CSC3...,P50406
3,NC1=NC=NC2=C1N=CN2[C@@H]1O[C@H](COP(O)(=O)OP(O...,P51575
4,NC1=NC=NC2=C1N=CN2[C@@H]1O[C@H](COP(O)(=O)OP(O...,Q99571
...,...,...
638,CCOC1=CC2=NC=C(C#N)C(NC3=CC=C(OCC4=CC=CC=N4)C(...,Q9Y6E0
639,CCOC1=CC2=NC=C(C#N)C(NC3=CC=C(OCC4=CC=CC=N4)C(...,Q8IVH8
640,COC1=CC(NC2=NC(NC3=CC=C4OC(C)(C)C(=O)N(COP(O)(...,P0DMS8
641,CCN(C1CCOCC1)C1=CC(=CC(C(=O)NCC2=C(C)C=C(C)NC2...,Q92800


In [11]:
df_all = pd.concat([df_chembl, df_pharos, df_drug_central])
df_all = (
    df_all.dropna(subset=['smiles'])
    .drop_duplicates()
    .merge(df, left_on='uniprot', right_on='uni_prot_id', how='left')
    .drop_duplicates()
)
mask = (df_all['uni_prot_id'] != 'P35626') & (df_all['location'].notna()) & (~df_all['location'].str.contains('synapse', case=False, na=False))
df_all = df_all[mask]
df_all['location'] = df_all['location'].str.replace(', ', ',').str.lower().str.split(',')

In [12]:
df_all = pd.concat([df_chembl,df_pharos,df_drug_central])
df_all.dropna(subset=['smiles'],inplace=True)
df_all.drop_duplicates(inplace=True)
df_all = df_all.merge(df, left_on='uniprot', right_on='uni_prot_id', how='left')
df_all.drop_duplicates(inplace = True)
df_all = df_all[df_all['uni_prot_id']!='P35626']
df_all.dropna(subset=['location'],inplace=True)
df_all = df_all[df_all['location']!= np.nan]
df_all = df_all[~df_all['location'].str.contains('synapse', case=False)]
df_all['location'] = df_all['location'].str.replace(', ', ',').str.lower()
df_all['location'] = df_all['location'].str.split(',')
df_all

Unnamed: 0,smiles,uniprot,uni_prot_id,location
0,CC(=N)NCCSCC[C@H](N)C(=O)O,P35228,P35228,[cytoplasm cytosol]
1,C[C@@H]1O[C@H](C[N+](C)(C)C)C[C@H]1O,P20309,P20309,"[cell membrane multi-pass membrane protein, po..."
3,C[C@@H]1O[C@H](C[N+](C)(C)C)C[C@H]1O,P08172,P08172,"[cell membrane multi-pass membrane protein, po..."
5,O=C1c2ccccc2C(=O)c2c1ccc(O)c2O,Q92731,Q92731,[nucleus]
7,O=C1c2ccccc2C(=O)c2c1ccc(O)c2O,P03372,P03372,"[nucleus, cytoplasm, cell membrane peripheral ..."
...,...,...,...,...
675170,CCOC1=CC2=NC=C(C#N)C(NC3=CC=C(OCC4=CC=CC=N4)C(...,Q9P289,Q9P289,"[cytoplasm, golgi apparatus]"
675173,CCOC1=CC2=NC=C(C#N)C(NC3=CC=C(OCC4=CC=CC=N4)C(...,Q9Y6E0,Q9Y6E0,"[cytoplasm, nucleus, membrane]"
675176,COC1=CC(NC2=NC(NC3=CC=C4OC(C)(C)C(=O)N(COP(O)(...,P0DMS8,P0DMS8,[cell membrane multi-pass membrane protein]
675179,CCN(C1CCOCC1)C1=CC(=CC(C(=O)NCC2=C(C)C=C(C)NC2...,Q92800,Q92800,[nucleus]


In [13]:
df_smiles = pd.DataFrame(df_all['smiles'])

# If you want 'smiles' as the column name in df_smiles
df_smiles.columns = ['smiles']
df_smiles.drop_duplicates(inplace=True)
df_smiles.to_csv('dataset1.csv', index=False)

In [14]:
old_label=False
file_name = 'new-label-all-final-LV.csv'
if old_label:
    df_label = pd.read_csv(file_name,header=None)
else:
    df_label = pd.read_csv(file_name,header=None)
    df_label = df_label.T.reset_index(drop=True)

df_label = df_label.applymap(lambda s: s.lower() if type(s) == str else s)

label_dict = {}
for label_index, row in enumerate(df_label.itertuples(index=False)):
    for location in row:
        if not isinstance(location, float):
            label_dict[location] = label_index

label_dict = {key.strip(): value for key, value in label_dict.items()}

In [15]:
df_label.to_csv('label-all-new.csv',index=False, header=False)

In [16]:
def replace_from_dict(lst, replace_dict):
    return [replace_dict.get(item, item) for item in lst]

In [17]:
df_all['location'] = df_all['location'].apply(replace_from_dict, args=(label_dict,))
df_all

Unnamed: 0,smiles,uniprot,uni_prot_id,location
0,CC(=N)NCCSCC[C@H](N)C(=O)O,P35228,P35228,[5]
1,C[C@@H]1O[C@H](C[N+](C)(C)C)C[C@H]1O,P20309,P20309,"[16, 16, 16, 10]"
3,C[C@@H]1O[C@H](C[N+](C)(C)C)C[C@H]1O,P08172,P08172,"[16, 16]"
5,O=C1c2ccccc2C(=O)c2c1ccc(O)c2O,Q92731,Q92731,[28]
7,O=C1c2ccccc2C(=O)c2c1ccc(O)c2O,P03372,P03372,"[28, 5, 16, 28, 5, 16, 16, 28, 11, 30]"
...,...,...,...,...
675170,CCOC1=CC2=NC=C(C#N)C(NC3=CC=C(OCC4=CC=CC=N4)C(...,Q9P289,Q9P289,"[5, 11]"
675173,CCOC1=CC2=NC=C(C#N)C(NC3=CC=C(OCC4=CC=CC=N4)C(...,Q9Y6E0,Q9Y6E0,"[5, 28, 30]"
675176,COC1=CC(NC2=NC(NC3=CC=C4OC(C)(C)C(=O)N(COP(O)(...,P0DMS8,P0DMS8,[16]
675179,CCN(C1CCOCC1)C1=CC(=CC(C(=O)NCC2=C(C)C=C(C)NC2...,Q92800,Q92800,[28]


In [18]:
df_all['location'] = df_all['location'].apply(lambda lst: [x for x in lst if not isinstance(x, str)])
df_all

Unnamed: 0,smiles,uniprot,uni_prot_id,location
0,CC(=N)NCCSCC[C@H](N)C(=O)O,P35228,P35228,[5]
1,C[C@@H]1O[C@H](C[N+](C)(C)C)C[C@H]1O,P20309,P20309,"[16, 16, 16, 10]"
3,C[C@@H]1O[C@H](C[N+](C)(C)C)C[C@H]1O,P08172,P08172,"[16, 16]"
5,O=C1c2ccccc2C(=O)c2c1ccc(O)c2O,Q92731,Q92731,[28]
7,O=C1c2ccccc2C(=O)c2c1ccc(O)c2O,P03372,P03372,"[28, 5, 16, 28, 5, 16, 16, 28, 11, 30]"
...,...,...,...,...
675170,CCOC1=CC2=NC=C(C#N)C(NC3=CC=C(OCC4=CC=CC=N4)C(...,Q9P289,Q9P289,"[5, 11]"
675173,CCOC1=CC2=NC=C(C#N)C(NC3=CC=C(OCC4=CC=CC=N4)C(...,Q9Y6E0,Q9Y6E0,"[5, 28, 30]"
675176,COC1=CC(NC2=NC(NC3=CC=C4OC(C)(C)C(=O)N(COP(O)(...,P0DMS8,P0DMS8,[16]
675179,CCN(C1CCOCC1)C1=CC(=CC(C(=O)NCC2=C(C)C=C(C)NC2...,Q92800,Q92800,[28]


In [19]:
df_all = df_all[df_all['location'].apply(lambda x: len(x) > 0)]
df_all

Unnamed: 0,smiles,uniprot,uni_prot_id,location
0,CC(=N)NCCSCC[C@H](N)C(=O)O,P35228,P35228,[5]
1,C[C@@H]1O[C@H](C[N+](C)(C)C)C[C@H]1O,P20309,P20309,"[16, 16, 16, 10]"
3,C[C@@H]1O[C@H](C[N+](C)(C)C)C[C@H]1O,P08172,P08172,"[16, 16]"
5,O=C1c2ccccc2C(=O)c2c1ccc(O)c2O,Q92731,Q92731,[28]
7,O=C1c2ccccc2C(=O)c2c1ccc(O)c2O,P03372,P03372,"[28, 5, 16, 28, 5, 16, 16, 28, 11, 30]"
...,...,...,...,...
675170,CCOC1=CC2=NC=C(C#N)C(NC3=CC=C(OCC4=CC=CC=N4)C(...,Q9P289,Q9P289,"[5, 11]"
675173,CCOC1=CC2=NC=C(C#N)C(NC3=CC=C(OCC4=CC=CC=N4)C(...,Q9Y6E0,Q9Y6E0,"[5, 28, 30]"
675176,COC1=CC(NC2=NC(NC3=CC=C4OC(C)(C)C(=O)N(COP(O)(...,P0DMS8,P0DMS8,[16]
675179,CCN(C1CCOCC1)C1=CC(=CC(C(=O)NCC2=C(C)C=C(C)NC2...,Q92800,Q92800,[28]


In [20]:
df_all.drop(['uniprot','uni_prot_id'],axis=1,inplace=True)
df_all['location'] = df_all['location'].apply(tuple)
df_all.drop_duplicates(inplace=True)
df_all

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_all.drop(['uniprot','uni_prot_id'],axis=1,inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_all['location'] = df_all['location'].apply(tuple)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_all.drop_duplicates(inplace=True)


Unnamed: 0,smiles,location
0,CC(=N)NCCSCC[C@H](N)C(=O)O,"(5,)"
1,C[C@@H]1O[C@H](C[N+](C)(C)C)C[C@H]1O,"(16, 16, 16, 10)"
3,C[C@@H]1O[C@H](C[N+](C)(C)C)C[C@H]1O,"(16, 16)"
5,O=C1c2ccccc2C(=O)c2c1ccc(O)c2O,"(28,)"
7,O=C1c2ccccc2C(=O)c2c1ccc(O)c2O,"(28, 5, 16, 28, 5, 16, 16, 28, 11, 30)"
...,...,...
675167,CCOC1=CC2=NC=C(C#N)C(NC3=CC=C(OCC4=CC=CC=N4)C(...,"(5, 11)"
675173,CCOC1=CC2=NC=C(C#N)C(NC3=CC=C(OCC4=CC=CC=N4)C(...,"(5, 28, 30)"
675176,COC1=CC(NC2=NC(NC3=CC=C4OC(C)(C)C(=O)N(COP(O)(...,"(16,)"
675179,CCN(C1CCOCC1)C1=CC(=CC(C(=O)NCC2=C(C)C=C(C)NC2...,"(28,)"


In [21]:
df_final = df_all[['smiles','location']]
df_final.to_csv('final_with_smiles_location_LV.csv',index=False)
df_final

Unnamed: 0,smiles,location
0,CC(=N)NCCSCC[C@H](N)C(=O)O,"(5,)"
1,C[C@@H]1O[C@H](C[N+](C)(C)C)C[C@H]1O,"(16, 16, 16, 10)"
3,C[C@@H]1O[C@H](C[N+](C)(C)C)C[C@H]1O,"(16, 16)"
5,O=C1c2ccccc2C(=O)c2c1ccc(O)c2O,"(28,)"
7,O=C1c2ccccc2C(=O)c2c1ccc(O)c2O,"(28, 5, 16, 28, 5, 16, 16, 28, 11, 30)"
...,...,...
675167,CCOC1=CC2=NC=C(C#N)C(NC3=CC=C(OCC4=CC=CC=N4)C(...,"(5, 11)"
675173,CCOC1=CC2=NC=C(C#N)C(NC3=CC=C(OCC4=CC=CC=N4)C(...,"(5, 28, 30)"
675176,COC1=CC(NC2=NC(NC3=CC=C4OC(C)(C)C(=O)N(COP(O)(...,"(16,)"
675179,CCN(C1CCOCC1)C1=CC(=CC(C(=O)NCC2=C(C)C=C(C)NC2...,"(28,)"


In [22]:
max_values = df_final['location'].apply(lambda x: max(x) if isinstance(x, tuple) else None)
max_value = max_values.max()
max_value

33

In [23]:
min_values = df_final['location'].apply(lambda x: min(x) if isinstance(x, tuple) else None)
min_value = min_values.min()
min_value

1

In [24]:
directory = "final_label_to_train_LV"
df_dict = {}
for i in range(max_value+1): 
    df_dict[i] = df_final[df_final['location'].apply(lambda x: i in x)].copy()
    df_dict[i]['label'] = i
    df_dict[i]= df_dict[i][['smiles','label']]
    if not os.path.exists(directory):
        os.makedirs(directory)
    df_dict[i].to_csv(f'{directory}/00_label_{i:02}.csv'.format(i),index=False)

In [25]:
df_label

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,33,34,35,36,37,38,39,40,41,42
0,actin_filaments,actin cytoplasmic 1,actin cytoplasmic 2,gamma-enteric smooth muscle,actin,alpha skeletal muscle,,,,,...,,,,,,,,,,
1,cell_junctions,cell junction,cell junction adherens junction,cell junction desmosome,cell junction focal adhesion,cell junction gap junction,cell junction hemidesmosome,cell junction paranodal septate junction,cell junction tight junction,,...,,,,,,,,,,
2,centrosome,cytoplasm cytoskeleton microtubule organizing ...,cytoplasm cytoskeleton microtubule organizing ...,,,,,,,,...,,,,,,,,,,
3,cytoplasmic_bodies,cytoplasm p-body,,,,,,,,,...,,,,,,,,,,
4,cytoskeleton,cytoplasm cytoskeleton,cytoplasm cytoskeleton cilium axoneme,cytoplasm cytoskeleton cilium basal body,cytoplasm cytoskeleton flagellum axoneme,cytoplasm cytoskeleton spindle,cytoplasm cytoskeleton spindle pole,cytoplasm cytoskeleton stress fiber,dynein axonemal particle,,...,,,,,,,,,,
5,cytosol,cytolytic granule,cytolytic granule membrane,cytoplasm,cytoplasm cell cortex,cytoplasm cytosol,cytoplasm perinuclear region,cytoplasm stress granule,cytoplasmic granule,inflammasome,...,,,,,,,,,,
6,endoplasmic_reticulum,endoplasmic reticulum,endoplasmic reticulum-golgi intermediate compa...,microsome,rough endoplasmic reticulum,sarcoplasmic reticulum,,,,,...,,,,,,,,,,
7,endosome,endosome,endosome multivesicular body,early endosome,late endosome,recycling endosome,,,,,...,,,,,,,,,,
8,endosome_membrane,endosome membrane,endosome membrane multi-pass membrane protein,endosome membrane peripheral membrane protein,endosome membrane single-pass type i membrane ...,endosome membrane single-pass type ii membrane...,early endosome membrane,early endosome membrane multi-pass membrane pr...,early endosome membrane peripheral membrane pr...,early endosome membrane peripheral membrane pr...,...,,,,,,,,,,
9,er_lumen,endoplasmic reticulum lumen,sarcoplasmic reticulum lumen,,,,,,,,...,,,,,,,,,,


In [26]:
new_df_label = pd.DataFrame()
new_df_label['sublocation'] = df_label[[0]].copy()
new_df_label['label'] = df_label.index
new_df_label

Unnamed: 0,sublocation,label
0,actin_filaments,0
1,cell_junctions,1
2,centrosome,2
3,cytoplasmic_bodies,3
4,cytoskeleton,4
5,cytosol,5
6,endoplasmic_reticulum,6
7,endosome,7
8,endosome_membrane,8
9,er_lumen,9


In [27]:
new_df_label.to_csv('label_subloc_LV.csv', index=False)

In [28]:
df_label[[0]]

Unnamed: 0,0
0,actin_filaments
1,cell_junctions
2,centrosome
3,cytoplasmic_bodies
4,cytoskeleton
5,cytosol
6,endoplasmic_reticulum
7,endosome
8,endosome_membrane
9,er_lumen
