In [1]:
from io import StringIO 
import sys
from rdkit import Chem
Chem.WrapLogs() 

from rdkit.Chem import PandasTools 
PandasTools.RenderImagesInAllDataFrames(images=True)  
import pandas as pd 
pd.set_option('display.max_columns', None)
#pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None) 
import math, json
def show(df, n=5):
    """Display the first ``n`` rows of ``df`` as an HTML table.

    Args:
        df: object exposing ``head(n)`` whose result exposes ``to_html()``.
        n: number of leading rows to render (default 5).

    Returns:
        Whatever ``display`` returns.
    """
    # NOTE(review): `HTML` is not imported in the cells visible here, and `display`
    # is only available implicitly under IPython -- confirm that
    # `from IPython.display import HTML, display` runs before this helper is used.
    preview_markup = df.head(n).to_html()
    return display(HTML(preview_markup))
# Load the pickled table and keep only rows that have both an InChIKey and a
# warhead_name value.
df = pd.read_pickle('data/df_latest.pkl')
df = df.dropna(subset=['InChIKey', 'warhead_name'])

# Mapping table from data/sifts.matched.csv; the three columns used here are
# read as text and the frame is re-indexed by adduct_id.
sifts_text_columns = ['unp_accessionid', 'unp_resnum', 'adduct_id']
df_sifts = pd.read_csv("data/sifts.matched.csv",
                       dtype=dict.fromkeys(sifts_text_columns, str))
df_sifts = df_sifts[sifts_text_columns].set_index('adduct_id')

# In-place overwrite of df with the non-NA values of df_sifts, aligned on index.
# NOTE(review): this presumes df is indexed by adduct id -- confirm.
df.update(df_sifts)


def _text_or_fallback(primary, fallback):
    """Build a row function returning row[primary] when it is a str, else row[fallback]."""
    def pick(row):
        value = row[primary]
        return value if isinstance(value, str) else row[fallback]
    return pick


# Rows without a string accession / residue number fall back to the PDB values.
df['unp_accessionid'] = df.apply(_text_or_fallback('unp_accessionid', 'pdb_id'), axis=1)
df['unp_resnum'] = df.apply(_text_or_fallback('unp_resnum', 'res_num'), axis=1)

# SMARTS query matching any atom with atomic number 52, 83 or 84 (Te, Bi, Po).
achor = Chem.MolFromSmarts('[#52,#83,#84]')

# Replacement fragment (as SMILES) for each "<residue>_<atom name>" key; the keys
# are looked up as f'{res_name}_{res_atom_name}' when the adduct InChIKey is built.
_replacement_smiles = (
    ('ASP_OD1', '[O][At]'),
    ('ASP_OD2', '[O][At]'),
    ('CYS_SG', '[S][At]'),
    ('GLU_OE1', '[O][At]'),
    ('GLU_OE2', '[O][At]'),
    ('HIS_ND1', '[N+]1=CNC=C1([At])'),
    ('HIS_NE2', '[N]1C=C([At])[NH+]=C1'),
    ('LYS_NZ', '[N][At]'),
    ('MET_SD', '[S+](C)[At]'),
    ('THR_OG1', '[O][At]'),
    ('SER_OG', '[O][At]'),
    ('TYR_OH', '[O][At]'),
)
# One freshly parsed molecule per key, in the order listed above.
sub = {site: Chem.MolFromSmiles(smiles) for site, smiles in _replacement_smiles}

'''
sio = sys.stderr = StringIO() 
Chem.MolToInchiKey(a) 
error = sio.getvalue() 
'''

def _adduct_inchikey(row):
    """InChIKey of the row's adduct after swapping the anchor match for the residue fragment.

    The fragment is chosen from ``sub`` by the row's res_name and res_atom_name;
    the first molecule returned by ``Chem.ReplaceSubstructs`` is used.
    """
    replaced = Chem.ReplaceSubstructs(
        row.adduct_pdb,
        achor,
        sub[f'{row.res_name}_{row.res_atom_name}'],
    )
    return Chem.MolToInchiKey(replaced[0])


df['adduct_InChIKey'] = df.apply(_adduct_inchikey, axis=1)

def _res_quad(row):
    """Underscore-joined pdb_id, chain_id, res_num and res_name of a row."""
    return row.pdb_id + '_' + row.chain_id + '_' + row.res_num + '_' + row.res_name


def _binder_quad(row):
    """Underscore-joined pdb_id, bond-record fields 12 and 16, and binder_id_in_adduct."""
    fields = row.covalent_bond_record.split(',')
    return row.pdb_id + '_' + fields[12] + '_' + fields[16] + '_' + row.binder_id_in_adduct


def _bond_record_field(position):
    """Build a row function returning one comma-separated field of covalent_bond_record."""
    return lambda row: row.covalent_bond_record.split(',')[position]


df['res_quad'] = df.apply(_res_quad, axis=1)
df['binder_quad'] = df.apply(_binder_quad, axis=1)

# Columns copied verbatim from a single position of the comma-separated bond record.
for column, position in (
    ('binder_atom_alt', 19),
    ('res_atom_alt', 9),
    ('binder_chain_id', 12),
    ('binder_num', 16),
):
    df[column] = df.apply(_bond_record_field(position), axis=1)
    
# Annotate each row with the DrugBank id and groups recorded for its InChIKey.
# The CSV is parsed once and both lookup tables are built from the same frame;
# when an InChIKey occurs several times in the file the last occurrence wins.
# InChIKeys absent from the file yield NaN.
drugbank = pd.read_csv('data/drugbank.csv')
for target_column, source_column in (('drugbank_id', 'drugbank_id'),
                                     ('drugbank_groups', 'groups')):
    mapping = dict(zip(drugbank['InChIKey'], drugbank[source_column]))
    df[target_column] = df['InChIKey'].map(lambda key: mapping.get(key, math.nan))

def MolToSmiles(mol, s):
    """Return a SMILES string for ``mol``, preferring an already-known value.

    Args:
        mol: molecule handed to ``Chem.MolToSmiles`` when ``s`` is not a string.
        s: existing SMILES value; returned unchanged when it is a ``str``.

    Returns:
        ``s`` if it is a string, otherwise the non-empty SMILES generated from ``mol``.

    Raises:
        RuntimeError: if ``Chem.MolToSmiles`` returns an empty value.
    """
    if isinstance(s, str):
        return s
    generated = Chem.MolToSmiles(mol)
    if not generated:
        # A bare `raise` with no active exception also ends up as RuntimeError,
        # but without any hint of what failed; keep the type, add the reason.
        raise RuntimeError("Chem.MolToSmiles returned an empty SMILES string")
    return generated
def _binder_smiles(row):
    """SMILES for the row's binder, or None when binder_mol holds a float."""
    if isinstance(row.binder_mol, float):
        return None
    return MolToSmiles(row.binder_mol, row.binder_smiles)


def _adduct_smiles(row):
    """SMILES for the row's adduct, reusing adduct_smiles when it is already a string."""
    return MolToSmiles(row.adduct_pdb, row.adduct_smiles)


df['binder_smiles'] = df.apply(_binder_smiles, axis=1)
df['adduct_smiles'] = df.apply(_adduct_smiles, axis=1)

# Previously assigned identifiers; entries are looked up by df's index label.
with open('data/db_id.json', 'r') as fr:
    db_id = json.load(fr)


def _known_record_id(row):
    """record_id stored in db_id for this row's index label, or NaN if there is none."""
    if row.name not in db_id:
        return math.nan
    return db_id[row.name]['record_id']


df['record_id'] = df.apply(_known_record_id, axis=1)

def record_id_generator():
    """Yield new record ids ("CBR" followed by a zero-padded 6-digit number) forever.

    Numbering continues after the largest numeric suffix found among the
    ``record_id`` values of the module-level ``db_id`` mapping. When ``db_id``
    is empty, numbering starts at CBR000001 instead of failing in ``max()``.

    Yields:
        str: the next unused record id.
    """
    last_used = max(
        (int(entry['record_id'][3:]) for entry in db_id.values()),
        default=0,
    )
    while True:
        last_used += 1
        yield f"CBR{last_used:0>6}"
r_gen = record_id_generator()
# Keep existing string ids; draw a new id for every other row, in row order.
# The generator is advanced in a plain comprehension rather than inside
# DataFrame.apply: apply gives no guarantee about how often the function is
# called (older pandas evaluates the first row twice), which would skip an id.
df['record_id'] = [
    record_id if isinstance(record_id, str) else next(r_gen)
    for record_id in df['record_id']
]

# InChIKey -> binder_id for every entry already present in db_id. A descriptive
# name is used instead of `_`, which IPython reserves for the last cell output.
binder_id_by_inchikey = {entry['InChIKey']: entry['binder_id'] for entry in db_id.values()}
# Rows whose InChIKey has no known binder id get NaN.
df['binder_id'] = df['InChIKey'].map(lambda key: binder_id_by_inchikey.get(key, math.nan))

def covalent_binder_id_generator():
    """Yield new binder ids ("CB" followed by a zero-padded 7-digit number) forever.

    Numbering continues after the largest numeric suffix found among the
    ``binder_id`` values of the module-level ``db_id`` mapping. When ``db_id``
    is empty, numbering starts at CB0000001 instead of failing in ``max()``.

    Yields:
        str: the next unused binder id.
    """
    last_used = max(
        (int(entry['binder_id'][2:]) for entry in db_id.values()),
        default=0,
    )
    while True:
        last_used += 1
        yield f"CB{last_used:0>7}"

cb_gen = covalent_binder_id_generator()

# One new binder id per distinct InChIKey that has none yet, in order of first appearance.
lacks_binder_id = df['InChIKey'].notna() & df['binder_id'].isna()
binder_id_dict = {}
for inchikey in df[lacks_binder_id]['InChIKey'].unique():
    binder_id_dict[inchikey] = next(cb_gen)


def _resolved_binder_id(row):
    """Existing binder_id, or the newly generated one when binder_id is a float and InChIKey a str."""
    if isinstance(row.binder_id, float) and isinstance(row.InChIKey, str):
        return binder_id_dict[row.InChIKey]
    return row.binder_id


df['binder_id'] = df.apply(_resolved_binder_id, axis=1)

# Full amino-acid name -> three-letter code.
res_map_long_to_short = {
    "Alanine": "ALA",
    "Arginine": "ARG",
    "Asparagine": "ASN",
    "Aspartic Acid": "ASP",
    "Cysteine": "CYS",
    "Glutamine": "GLN",
    "Glutamic Acid": "GLU",
    "Glycine": "GLY",
    "Histidine": "HIS",
    "Isoleucine": "ILE",
    "Leucine": "LEU",
    "Lysine": "LYS",
    "Methionine": "MET",
    "Phenylalanine": "PHE",
    "Proline": "PRO",
    "Serine": "SER",
    "Threonine": "THR",
    "Tryptophan": "TRP",
    "Tyrosine": "TYR",
    "Valine": "VAL",
}
# (residue code, atom name) pairs; same combinations as the keys of `sub`.
Acceptable_Nucleophile = [
    ('ASP', 'OD1'), ('ASP', 'OD2'),
    ('CYS', 'SG'),
    ('GLU', 'OE1'), ('GLU', 'OE2'),
    ('HIS', 'ND1'), ('HIS', 'NE2'),
    ('LYS', 'NZ'),
    ('MET', 'SD'),
    ('THR', 'OG1'),
    ('SER', 'OG'),
    ('TYR', 'OH'),
]
# Inverse lookup: three-letter code -> full name.
res_map_short_to_long = {
    short_name: long_name for long_name, short_name in res_map_long_to_short.items()
}
# A res_name missing from the map raises KeyError.
df['full_residue_name'] = df.apply(lambda row: res_map_short_to_long[row.res_name], axis=1)


# Cell output: distinct InChIKey count plus the distinct binder types and recovery strategies.
(
    df['InChIKey'].nunique(),
    df['binder_type'].unique(),
    df['recovery_strategy'].unique(),
)

(2201,
 array(['inhibitor', 'substrate', 'binder', 'linker', 'probe',
        'intermediate', 'stabilizer', 'activator', 'substrate analogue',
        'marker', 'product', 'modifier', 'metabolite', 'protector',
        'product analogue'], dtype=object),
 array(['manual'], dtype=object))

In [2]:
# Number of rows with a missing value in each SMILES / molecule column.
checked_columns = ('binder_smiles', 'adduct_smiles', 'binder_mol', 'adduct_pdb')
print(*(len(df[df[column].isna()]) for column in checked_columns))

# Round-trip every binder and adduct molecule through SMILES and sanitize the
# result; nothing is stored, so any exception raised here simply stops the cell.
for row_label, row in df.iterrows():
    for mol_column in ('binder_mol', 'adduct_pdb'):
        roundtrip = Chem.MolFromSmiles(Chem.MolToSmiles(row[mol_column]))
        Chem.SanitizeMol(roundtrip)

0 0 0 0


In [3]:
def _smiles_roundtrip(mol):
    """Write ``mol`` to SMILES and parse that SMILES back into a molecule."""
    return Chem.MolFromSmiles(Chem.MolToSmiles(mol))


def _per_row(column, transform):
    """Build a row function that applies ``transform`` to ``row[column]``."""
    return lambda row: transform(row[column])


# Replace both molecule columns with their SMILES round-tripped versions ...
for mol_column in ('binder_mol', 'adduct_pdb'):
    df[mol_column] = df.apply(_per_row(mol_column, _smiles_roundtrip), axis=1)

# ... then regenerate the SMILES columns from the replaced molecules.
for mol_column, smiles_column in (('binder_mol', 'binder_smiles'),
                                  ('adduct_pdb', 'adduct_smiles')):
    df[smiles_column] = df.apply(_per_row(mol_column, Chem.MolToSmiles), axis=1)

df.to_pickle('data/df_processed.pkl')