# Variables and Modules

In [1]:
import time, shlex, subprocess, os, pickle, pathlib, csv, datetime  # datetime.datetime.fromtimestamp(1566330422.9590576)
from multiprocessing import Pool
from urllib.request import urlopen 
import urllib.error 
  
from data.pub import env  
# Project-specific settings, each read off a freshly constructed `env()` object
# (imported from data.pub above).
Get_DOI           = env().Get_DOI            # used as the executable in getDoiFromCIF's subprocess call
doi_pkl           = env().doi_pkl            # pickle caching the PDB id -> DOI mapping
rcsb_cif_dir      = env().rcsb_cif_dir       # root directory of the mmCIF files read by getDoiFromCIF
adduct_cif_dir    = env().adduct_cif_dir     # directory whose file names start with the 4-character PDB id
rcsb_webpage_pkl  = env().rcsb_webpage_pkl   # pickle caching downloaded RCSB structure pages
enw_from_rcsb_pkl = env().enw_from_rcsb_pkl  # pickle caching downloaded RCSB citation records



# Get DOI so we can manually curate the ligand

### Let's get DOI from CIF first (we have 5189 CIF in total)

In [2]:
# Start from the cached PDB-id -> DOI mapping when a non-empty pickle exists,
# otherwise from an empty mapping.
DOI_collector = {}
if os.path.isfile(doi_pkl) and os.stat(doi_pkl).st_size != 0:
    with open(doi_pkl, 'rb') as fr:
        DOI_collector = pickle.load(fr)

# PDB ids (lower-cased, as used in the CIF paths) of the adduct files whose
# 4-character prefix has no DOI recorded yet.
pdb_ids = {
    name[:4].lower()
    for name in os.listdir(adduct_cif_dir)
    if name[:4] not in DOI_collector
}
   
def getDoiFromCIF(pdbid):
    """Run the external ``Get_DOI`` tool on one mmCIF file and return its stdout as text.

    The file is looked up at ``<rcsb_cif_dir>/<pdbid[1:3]>/<pdbid>.cif``.
    stderr is captured but discarded, and the exit status is not checked.
    """
    subdir = pdbid[1:3]
    cif_path = f'{rcsb_cif_dir}/{subdir}/{pdbid}.cif'
    completed = subprocess.run(
        args=[Get_DOI, cif_path],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    raw_output = completed.stdout
    return raw_output.decode()
# Smoke test on one pending id. On a re-run where every id is already cached,
# `pdb_ids` is empty, so guard instead of raising IndexError.
getDoiFromCIF(next(iter(pdb_ids))) if pdb_ids else None

'3BHL|Structural Studies of Early Events in Catalysis by Thymidylate Synthase|||'

In [3]:
nCores = 2
with Pool(nCores) as p:  # mine the CIF files on `nCores` worker processes
    mining_result = p.map(getDoiFromCIF, pdb_ids, nCores)  # third argument is the chunksize

# Each result is a '|'-separated record; field 0 is the PDB id and field 4 the DOI
# (see the sample output of the previous cell). Records with fewer than five
# fields (e.g. empty stdout when the tool failed) are skipped instead of raising
# IndexError, and only values that look like a DOI ('10.' prefix) are kept.
DOI_collector.update({
    fields[0]: fields[4]
    for fields in (entry.split('|') for entry in mining_result)
    if len(fields) > 4 and fields[4][:3] == '10.'
})
# Manually curated entries, applied after the mined ones so they take precedence.
# Values are either a DOI ('10.' prefix) or, where no DOI is listed, a plain URL;
# the later `URL` helper passes non-'10.' values through unchanged.
DOI_collector.update({'6QQI': '10.1107/S205225251900616X', # problem in CIF, manually checked
            '6QQK': '10.1107/S205225251900616X', 
            '6QSA': '10.1107/S205225251900616X',
            '1GRQ': '10.1002/pro.101508',
            '1B54': '10.1107/s0907444902018012',
            '1FCD': '10.1126/science.7939681',
            '1BMQ': '10.1248/cpb.47.11',
            '1DKA': '10.1126/science.8342040',
            '1PYG': '10.1126/science.1962195',
            '1BTC': '10.1021/bi00078a006',
            '1YGP': '10.1126/science.273.5281.1539',
            '4EF9': '10.2174/1381612811319140011',
            '1IDC': '10.1126/science.7761851',
            '4EF8': '10.2174/1381612811319140011',
            '2DKB': '10.1126/science.8342040',
            '1CGH': '10.1002/j.1460-2075.1996.tb00933.x',
            '1AUT': '10.1002/j.1460-2075.1996.tb01073.x',
            '1TEM': '10.1021/ja9609718',
            '1LPS': '10.1021/ja00087a002',
            '1ZOV': '10.2183/pjab.81.220',
            '5SWH': '10.1016/j.molcel.2019.02.003',
            '5TEH': '10.1016/j.molcel.2019.02.003',
            '1PPD': '10.1107/S010876738409930X',
            '5NHN': '10.1038/s42004-019-0185-5',
            '3F8A': '10.1002/cplu.201100082',
            '1AXR': '10.1002/hlca.19980810507',
            '2YPN': '10.1107/S0909049599006342',
            '1LPM': '10.1021/ja00087a002',
            '1TPP': '10.1107/S010876818300275X',
            '5B13': '10.1111/jfbc.12301',
            '6NXD': '10.1038/s41598-019-46432-0',
            '1BT5': '10.1021/ja9818001',
            '1GPB': '10.1142/1319',
            '1CV8': '10.1107/s0108767378097081',
            '1AU0': '10.1021/ja972204u',
            '1Z62': '10.2174/1570180054405839',
            '2ZP0': '10.2174/1570180053765200',
            '6NXC': '10.1038/s41598-019-46432-0',
            '4EST': '10.1021/ja00191a039', 
            '1CPI': '10.1021/ja00146a007', 
            '1AMN': '10.1021/ja952232h', 
            '1SRE': '10.1021/ja00086a004',
            '1AU2': '10.1021/ja972204u',
            '1QGL': '10.1021/ja991729e',
            '1DUI': '10.1016/S0022-0248(00)00250-5',
            '1SUE': '10.1016/S0022-0248(98)00536-3',
            '1TYN': '10.1021/ja00079a065',
            '5SYS': '10.1016/j.molcel.2019.02.003',
            '1SRH': '10.1021/ja00086a004',
            '1PPB': '10.1002/j.1460-2075.1989.tb08511.x',
            '1BGO': '10.1021/ja981171v',
            '1JIM': '10.1021/ja00299a063',
            '1L26': '10.1126/science.3277275',
            '1SRI': '10.1021/ja00086a004',
            '1QFI': '10.1515/znb-1999-0518',
            '148L': '10.1126/science.8266098',
            '1BAV': '10.1021/ja963234k',
            '1XY2': '10.1126/science.3008332',
            '1XY1': '10.1126/science.3008332',
            '1BUL': '10.1021/ja9817996',
            '1PPP': '10.1042/bj2870797', 
            '1GW1': '10.1002/1521-3773(20020802)41:15<2824::AID-ANIE2824>3.0.CO;2-G',
            '207L': '10.1093/oxfordjournals.jbchem.a021445',
            '1QDQ': '10.1093/oxfordjournals.jbchem.a022651',
            '1PE6': 'https://www.jbc.org/content/266/22/14771',
            '1ETB': 'https://www.jbc.org/content/268/4/2416',
            '6CHA': 'https://www.jbc.org/content/262/16/7737',
            '9RUB': 'https://www.jbc.org/content/266/19/12604',
            '7ADH': 'https://www.jbc.org/content/258/9/5537',
            '1ETA': 'https://www.jbc.org/content/268/4/2416',
            '1DWE': 'https://www.jbc.org/content/266/30/20085',
            '6HP8': 'https://ediss.sub.uni-hamburg.de/volltexte/2019/9702/',
            '1G2W': 'https://link.springer.com/chapter/10.1007/978-3-0348-8397-9_56',
            '4POK': 'https://www.semanticscholar.org/paper/BNP7787-Forms-Novel-Covalent-Adducts-on-Human-and-Parker-Nienaber/3951c4e5582c7c043d4370a3cc49a263aa00f48f',
            '4POL': 'https://www.semanticscholar.org/paper/BNP7787-Forms-Novel-Covalent-Adducts-on-Human-and-Parker-Nienaber/3951c4e5582c7c043d4370a3cc49a263aa00f48f',
            '4POM': 'https://www.semanticscholar.org/paper/BNP7787-Forms-Novel-Covalent-Adducts-on-Human-and-Parker-Nienaber/3951c4e5582c7c043d4370a3cc49a263aa00f48f',
            '3PRK': 'https://www.jbc.org/content/266/26/17695.full.pdf',   
            '1ARC': 'https://www.jbc.org/content/264/7/3832.full.pdf',
            '1ETN': 'https://www.jbc.org/content/266/9/5934.full.pdf',
            '5KMC': 'https://conservancy.umn.edu/handle/11299/182796', 
            '6TD0': '10.1039/C9MD00557A',
            '6TD1': '10.1039/C9MD00557A',
            '5NI3': '10.1038/s42004-019-0185-5',
            '6VBD': '10.1074/jbc.RA120.012617', 
            '6KD5': '10.1101/2020.06.12.149229',      
            '6W8W': '10.1038/s41467-020-17531-8',         
            '6U0J': '10.1101/2020.05.15.098798',      
            '6WP8': '10.1074/jbc.RA120.014118',       
            '6XL3': '10.1074/jbc.RA120.014118',  
            '6YPT': '10.1074/jbc.RA120.014628',
            '7JVF': '10.1039/D0SC01651A',
            
           })

# Persist the merged (cached + mined + curated) mapping.
with open(doi_pkl, 'wb') as fw: pickle.dump(DOI_collector, fw) 

In [4]:
# 4-character PDB ids of the adduct files that still have no entry in DOI_collector.
noDoiFromCIF = {
    name[:4]
    for name in os.listdir(adduct_cif_dir)
    if name[:4] not in DOI_collector
}

##### OK, all CIFs belong to one group. We got 4503 DOIs; `noDoiFromCIF` stores the 682 PDB ids that might not have a DOI

#### Get citation file from RCSB

In [5]:
# Load previously downloaded citation records (PDB id -> record text), if any.
if os.path.isfile(enw_from_rcsb_pkl) and os.stat(enw_from_rcsb_pkl).st_size != 0:
    with open(enw_from_rcsb_pkl, 'rb') as fr:
        enw = pickle.load(fr)
else:
    enw = {}


def rcsbEndNote(pdbid):
    """Download and decode the RCSB 'medlineExplore' citation record for one PDB id."""
    url = 'https://www.rcsb.org/pdb/explore/medlineExplore.do?structureId=' + pdbid
    with urlopen(url) as response:  # close the connection once the body is read
        return response.read().decode()


try:
    for pdbid in noDoiFromCIF:
        if pdbid in enw:
            continue
        time.sleep(0.3)  # throttle requests to the RCSB server
        try:
            enw[pdbid] = rcsbEndNote(pdbid)
        except urllib.error.HTTPError as e:
            # Only `urllib.error` is imported, so the exception class must be
            # referenced through the module (a bare `HTTPError` is a NameError).
            print(pdbid, e)
finally:
    # Persist whatever was fetched, even if the loop was interrupted.
    with open(enw_from_rcsb_pkl, 'wb') as fw:
        pickle.dump(enw, fw)

##### From the 682 PDB ids in `noDoiFromCIF` we got 682 new citations. Each new citation is actually an ENW file

In [6]:
def _parse_enw(record):
    """Parse one downloaded citation record into ``{tag: [value, ...]}``.

    A line starting with six spaces continues the previous value (glued on
    without a separator, as before). Any other non-empty line is read as a tag
    in columns 0-3 and a value from column 5 on. Every occurrence of a repeated
    tag is kept, in file order.
    """
    fields = {}
    tag = None
    for line in record.split('\n'):
        if not line:
            continue
        if line[:6] == '      ':
            if tag is not None:  # ignore a continuation with nothing to continue
                fields[tag][-1] += line.strip()
            continue
        tag = line[:4].strip()
        fields.setdefault(tag, []).append(line[5:].strip())
    return fields


def _doi_from_enw(fields):
    """Return the first usable '[doi]' value from the LID, then AID entries, or None.

    Entries are scanned last-to-first so that the entry the previous
    "last one wins" logic would have picked is still preferred. Values with the
    '10.2210/' prefix are rejected (presumably the PDB entry's own DOI rather
    than the paper's -- confirm).
    """
    for tag in ('LID', 'AID'):
        for value in reversed(fields.get(tag, [])):
            if value.find('[doi]') > -1:
                doi = value.replace('[doi]', '').strip()
                if doi[:8] != '10.2210/':
                    return doi
    return None


noDoiFromENW = []

for pdb in noDoiFromCIF:
    cit = enw.get(pdb)
    if cit is None:
        # The download failed for this entry; fall through to the next lookup stage.
        noDoiFromENW.append(pdb)
        continue
    doi = _doi_from_enw(_parse_enw(cit))
    if doi is None:
        noDoiFromENW.append(pdb)
    else:
        DOI_collector.update({pdb: doi})

with open(doi_pkl, 'wb') as fw:
    pickle.dump(DOI_collector, fw)

##### There are still 468 PDB ids for which no DOI could be found at this point. Let's check the RCSB website directly to see whether they are marked as to_be_published

In [7]:
import bs4, re, time

# Load previously downloaded RCSB structure pages (PDB id -> HTML text), if any.
if os.path.isfile(rcsb_webpage_pkl) and os.stat(rcsb_webpage_pkl).st_size != 0:
    with open(rcsb_webpage_pkl, 'rb') as fr:
        webpages = pickle.load(fr)
else:
    webpages = {}


def getWebpage(pdbid):
    """Download and decode the RCSB structure page for one PDB id."""
    with urlopen('https://www.rcsb.org/structure/' + pdbid) as response:
        return response.read().decode()


try:
    for pdbid in noDoiFromENW:
        if pdbid in webpages:
            continue
        time.sleep(0.3)  # throttle requests to the RCSB server
        webpages[pdbid] = getWebpage(pdbid)
finally:
    # Save the pages fetched so far even when a request fails part-way through;
    # the error itself still propagates.
    with open(rcsb_webpage_pkl, 'wb') as fw:
        pickle.dump(webpages, fw)

In [8]:
class myException(Exception):
    """Signals that a downloaded RCSB page did not have the expected citation layout."""

    def __init__(self, msg=None):
        super().__init__(msg)
# Scrape the primary-citation DOI from each cached RCSB page.
# Entries that cannot be resolved get a placeholder in `invalid_DOI`.
invalid_DOI = {}
for pdbid in noDoiFromENW:
    try:
        citation = webpages[pdbid]
        soup = bs4.BeautifulSoup(citation, 'lxml')
        primarycitation = soup.find_all(id="primarycitation")
        if len(primarycitation) != 1:
            raise myException('primarycitation')
        if primarycitation[0].text[-16:] == 'To be published.':
            invalid_DOI.update({pdbid: 'To be published.'})
            continue
        primarycitation = primarycitation[0].find_all(id="pubmedDOI")
        if len(primarycitation) != 1:
            raise myException('pubmedDOI')
        DOI = primarycitation[0].text[9:]  # drops a 9-character prefix (presumably a 'DOI:' label -- confirm)
        # Check the scraped DOI string; the original sliced the DOI_collector
        # dict here, which raised TypeError and discarded every scraped DOI.
        if DOI[:8] == '10.2210/':
            raise myException('10.2210')
        DOI_collector.update({pdbid: DOI})
    except Exception as e:
        print(f"pls curate this manually '{pdbid}':'',", e, f'http://www.rcsb.org/structure/{pdbid}')
        invalid_DOI.update({pdbid: 'Inaccessable work.'})

# The placeholders are merged only after the dump, so they are not persisted.
with open(doi_pkl, 'wb') as fw:
    pickle.dump(DOI_collector, fw)
DOI_collector.update(invalid_DOI)

pls curate this manually '6QVM':'', pubmedDOI http://www.rcsb.org/structure/6QVM
pls curate this manually '1EWP':'', pubmedDOI http://www.rcsb.org/structure/1EWP
pls curate this manually '6P69':'', pubmedDOI http://www.rcsb.org/structure/6P69
pls curate this manually '1UAZ':'', pubmedDOI http://www.rcsb.org/structure/1UAZ


### All 5189 involved articles are classified except the entries flagged above for manual curation

In [9]:
from rdkit import Chem
import os, math, pickle, pandas as pd
# Paths for the second half of the notebook, again read from `env()`.
adduct_pkl     = env().adduct_pkl       # pickle holding the `adduct` mapping loaded below
adduct_cif_dir = env().adduct_cif_dir   # re-read; same attribute as in the first cell
covalent_csv   = env().covalent_csv     # comma-separated records parsed later into `covRec`
# `adduct` is iterated later with `.items()`, mapping adduct name -> `adduct_pdb` value.
# NOTE(review): pickle.load executes arbitrary code; only load trusted files.
with open(adduct_pkl, 'rb') as fr: adduct = pickle.load(fr)
len(adduct)

11179

### Remove adducts related to fake covalent bonds

In [10]:
# One file name per line; surrounding whitespace is stripped.
with open('data/faulty_adduct.txt', 'r') as fr:
    faulty_adduct = [line.strip() for line in fr]

# Delete every file in the adduct directory whose name is listed as faulty.
for cif in os.scandir(adduct_cif_dir):
    if cif.name not in faulty_adduct:
        continue
    os.remove(cif.path)

# Build up a Dataframe

In [11]:
# Empty table with one row per file currently in the adduct directory.
columns = [
    'adduct_smiles',
    'adduct_pdb',
    'warhead',
    'ligand_smiles',
    'ligand_pdb',
    'common_name',
    'reaction_type',
    'recovery_strategy',
    'ligand_type',
    'note',
]
df = pd.DataFrame(index=os.listdir(adduct_cif_dir), columns=columns)
df.index.name = 'adduct_name'

#### Update with recovered adduct pdb

In [12]:
# Fill `adduct_pdb` one entry at a time from the `adduct` mapping.
# DataFrame.update aligns on the index and only touches labels already in df.
for name, pdb in adduct.items():
    df.update(pd.DataFrame(data={'adduct_pdb':pdb}, index=[name])) 

#### Update with curated knowledge 

In [13]:
import ast
# Each non-empty line of the knowledge base is a Python dict literal describing
# one curated adduct; its 'adduct_name' value selects the row to update.
with open('data/knowledge_base.txt', 'r') as fr:
    for line in fr:
        line = line.strip()
        if not line:
            continue  # tolerate blank lines, which would crash literal_eval
        dict_ = ast.literal_eval(line)
        name = dict_['adduct_name']
        dict_['recovery_strategy'] = 'manual'
        df.update(pd.DataFrame(data=dict_, index=[name]))

In [14]:
# Turn curated SMILES strings into RDKit molecules: ligand first, then adduct.
for name, row in df.iterrows():
    dict_ = row.to_dict()
    for smiles_col, mol_col in (('ligand_smiles', 'ligand_pdb'),
                                ('adduct_smiles', 'adduct_pdb')):
        smiles = dict_[smiles_col]
        if not isinstance(smiles, str):
            continue  # nothing curated for this column
        pdb = Chem.MolFromSmiles(smiles)
        if not pdb:
            print(name, dict_)
            # A bare `raise` here has no active exception and would surface as an
            # unrelated RuntimeError; raise something that names the problem.
            raise ValueError(f'RDKit could not parse {smiles_col} {smiles!r} for {name!r}')
        dict_[mol_col] = pdb
        df.update(pd.DataFrame(data=dict_, index=[name]))

### Update knowledge base 

# Begin

In [15]:
# Map '<selected fields joined by _>_cif' -> the stripped raw record line.
covRec = {}
with open(covalent_csv, 'r') as fr:
    for entry in fr:
        entries = entry.split(',')
        unique = [entries[i] for i in (0, 2, 4, 6, 7, 8, 9, 12, 14, 16, 17, 18, 19)]
        key = '_'.join(unique) + '_cif'
        covRec[key] = entry.strip()

In [16]:
# `display` is imported from the public IPython.display module; importing it
# from IPython.core.display is deprecated.
from IPython.display import display, HTML, SVG
display(HTML("<style>.container { width:100% !important; }</style>"))

import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem, PandasTools
pd.set_option('display.max_colwidth', None)
from rdkit.Chem.Draw import IPythonConsole
PandasTools.RenderImagesInAllDataFrames(images=True)

# The row label (adduct file name) is an underscore-separated key; split it once
# per row and read the fields by position instead of re-splitting per column.
name_parts = [adduct_name.split('_') for adduct_name in df.index]
df['covalent_record'      ] = [covRec[adduct_name] for adduct_name in df.index]
df['ligand_name'          ] = [parts[8] for parts in name_parts]
df['ligand_bond_atom_name'] = [parts[11] for parts in name_parts]
df['aa_name'              ] = [parts[2] for parts in name_parts]
df['aa_bond_atom_name'    ] = [parts[5] for parts in name_parts]
df['doi'] = [DOI_collector[parts[0]] for parts in name_parts]  # field 0 is the PDB id
df = df.sort_values(by='doi')


def URL(doi):
    """Return a doi.org link for values starting with '10.', else the value unchanged."""
    return 'http://doi.org/' + doi if doi[:3] == '10.' else doi


df['url'] = df['doi'].map(URL)

In [17]:
# Keep only rows whose 'doi' value starts with '10' or with 'http'.
starts_with_doi = df['doi'].str.match('10')
starts_with_url = df['doi'].str.match('http')
df = df[starts_with_doi | starts_with_url]

In [18]:
def ToSmiles(m, s):
    """Return a SMILES string for a row, preferring an existing string.

    Parameters:
        m: the row's molecule cell (an RDKit molecule, or NaN when missing).
        s: the row's current SMILES cell.

    Returns:
        ``s`` unchanged when it is already a string; otherwise the SMILES that
        ``Chem.MolToSmiles`` generates for ``m``; ``math.nan`` when ``m`` is NaN
        or cannot be converted.
    """
    if isinstance(s, str):
        return s
    try:
        if math.isnan(m):
            return math.nan
    except TypeError:
        pass  # not a number: fall through and treat it as a molecule
    try:
        return Chem.MolToSmiles(m)
    except Exception:  # conversion failed; report as missing rather than crash
        return math.nan

df['ligand_smiles'] = df.apply(lambda r: ToSmiles(r.ligand_pdb, r.ligand_smiles), axis=1)

# ligand code -> set of SMILES seen for that code (canonicalised when RDKit can parse them)
code2smiles = {}
for index, ligand_name in df['ligand_name'].items():
    smiles = df.loc[index, 'ligand_smiles']
    if not isinstance(smiles, str):
        continue
    try:
        smiles = Chem.MolToSmiles(Chem.MolFromSmiles(smiles))
    except Exception:
        # Unparsable SMILES: keep the raw string. `Exception` (not a bare
        # except) so that Ctrl-C still interrupts the loop.
        pass
    code2smiles.setdefault(ligand_name, set()).add(smiles)

isna  = df[df['ligand_type'].isna()]
notna = df[df['ligand_type'].notna()]
print(len(notna))

4459


In [19]:
# Propagate manual curation: an unlabelled row inherits warhead / ligand /
# reaction / type from the first manually curated row that has the same ligand
# code, ligand bond atom, residue name and residue bond atom.
for index, row in isna.iterrows():
    row = row.to_dict()
    ligand_name           = row['ligand_name']
    ligand_bond_atom_name = row['ligand_bond_atom_name']
    aa_name               = row['aa_name']
    aa_bond_atom_name     = row['aa_bond_atom_name']
    tmp_ = notna[(notna['ligand_name'] == ligand_name)
                 & (notna['ligand_bond_atom_name'] == ligand_bond_atom_name)
                 & (notna['aa_name'] == aa_name)
                 & (notna['aa_bond_atom_name'] == aa_bond_atom_name)
                 & (notna['recovery_strategy'] == 'manual')]
    if len(tmp_):
        warhead, ligand_pdb, reaction_type, ligand_type = df.loc[
            tmp_.index[0], ['warhead', 'ligand_pdb', 'reaction_type', 'ligand_type']]
        row['warhead'] = warhead
        row['ligand_pdb'] = ligand_pdb
        row['ligand_type'] = ligand_type
        row['reaction_type'] = reaction_type
        row['recovery_strategy'] = 'adduct_matching'
        if ligand_name in code2smiles:
            smiles = code2smiles[ligand_name]
            # Sorted so the note is reproducible; set iteration order is not stable across runs.
            row['note'] = '.'.join(sorted(smiles))
        df.update(pd.DataFrame(data=row, index=[index]))
print(len(df[df['ligand_type'].notna()]))

7725


In [20]:
def subSearching(m, s):
    """Return True when molecule ``m`` contains the SMARTS pattern ``s``.

    Missing cells (``None`` or NaN) and any other plain number yield ``False``,
    so the result can always be used as a boolean mask.
    """
    if m is None:
        return False
    try:
        if math.isnan(m):
            return False
    except TypeError:
        # Not a number: treat it as an RDKit molecule.
        return m.HasSubstructMatch(Chem.MolFromSmarts(s))
    return False  # a non-NaN number is not a molecule either (was an implicit None)

## Boron

In [21]:
# Boron-containing adducts, handled in four passes over the still-unlabelled rows.
# NOTE(review): `[At]` looks like the placeholder atom for the protein attachment point -- confirm.
df_remaining = df[df['ligand_type'].isna()]      # for those curated, the ligand type is no longer NaN
df_boronic = df_remaining[df_remaining.apply(lambda r: subSearching(r.adduct_pdb, '[At]~[#5]'), axis=1)]
print(len(df_boronic)) 

# Pass 0: give four-coordinate boron a -1 formal charge. The molecule object is
# edited in place and no df.update is issued here.
for index, row in df_boronic.iterrows(): 
    row = row.to_dict()
    mol = row['adduct_pdb']
    At, B = mol.GetSubstructMatch(Chem.MolFromSmarts("[At]~[#5]")) 
    B = mol.GetAtomWithIdx(B)
    if B.GetDegree() == 4:
        B.SetFormalCharge(-1)
    B.UpdatePropertyCache() 

# Pass 1: boron inside a ring -> delete the At atom and label as Cyclic_Boronic_Acid.
tmp = df_remaining[df_remaining.apply(lambda r: subSearching(r.adduct_pdb, '[At]~[#5;R]'), axis=1)]
#display(tmp)
for index, row in tmp.iterrows(): 
    row = row.to_dict()
    mol = row['adduct_pdb']
    mol = AllChem.DeleteSubstructs(mol, Chem.MolFromSmarts("[At]"))  
    Chem.GetSymmSSSR(mol)  # result unused; presumably called to (re)initialise ring info -- confirm
    row['ligand_pdb'    ] = mol
    row['recovery_strategy'] = 'automated'
    row['ligand_type'      ] = 'inhibitor' 
    row['reaction_type'    ] = 'Ring-opening' 
    row['warhead'          ] = 'Cyclic_Boronic_Acid'    
    df.update( pd.DataFrame(data=row, index=[index]) )  

# Pass 2: boron carrying two oxygens plus one more neighbour -> delete At, label Boronic_Acid.
df_remaining = df[df['ligand_type'].isna()]      # for those curated, the ligand type is no longer NaN
tmp = df_remaining[df_remaining.apply(lambda r: subSearching(r.adduct_pdb, '[At]~[#5](O)(O)*'), axis=1)]
#display(tmp)

for index, row in tmp.iterrows(): 
    row = row.to_dict()
    mol = row['adduct_pdb']
    mol = AllChem.DeleteSubstructs(mol, Chem.MolFromSmarts("[At]"))  
    row['ligand_pdb'    ] = mol
    row['recovery_strategy'] = 'automated'
    row['ligand_type'      ] = 'inhibitor' 
    row['reaction_type'    ] = 'Addition' 
    row['warhead'          ] = 'Boronic_Acid'    
    df.update( pd.DataFrame(data=row, index=[index]) )

# Pass 3: three-coordinate boron -> work on a copy and turn the At atom into oxygen (Z=8).
df_remaining = df[df['ligand_type'].isna()]      # for those curated, the ligand type is no longer NaN
tmp = df_remaining[df_remaining.apply(lambda r: subSearching(r.adduct_pdb, '[At]~[#5D3]'), axis=1)]
#display(tmp)

for index, row in tmp.iterrows(): 
    row = row.to_dict()
    mol = row['adduct_pdb']
    mol = Chem.Mol(mol)
    At  = mol.GetSubstructMatch(Chem.MolFromSmarts("[At]"))[0]
    mol.GetAtomWithIdx(At).SetAtomicNum(8) 
    row['ligand_pdb'    ] = mol
    row['recovery_strategy'] = 'automated'
    row['ligand_type'      ] = 'inhibitor' 
    row['reaction_type'    ] = 'Addition' 
    row['warhead'          ] = 'Boronic_Acid'    
    df.update( pd.DataFrame(data=row, index=[index]) )   

# Sanity check: number of boron adducts still unlabelled after the three passes.
df_remaining = df[df['ligand_type'].isna()]      # for those curated, the ligand type is no longer NaN
tmp = df_remaining[df_remaining.apply(lambda r: subSearching(r.adduct_pdb, '[At]~[#5]'), axis=1)]
print(len(tmp))

372
0


# β-lactam

In [22]:
# Rows without a ligand_type have not been curated or recovered yet.
df_remaining = df.loc[df['ligand_type'].isna()]
# NOTE(review): `Series >= Mol` presumably relies on RDKit PandasTools'
# substructure operator -- confirm.
df_beta_lactam = df_remaining.loc[
    df_remaining['adduct_pdb'] >= Chem.MolFromSmarts('[At]C(=O)CCN=[C;R][C;R]')
]
print(len(df_beta_lactam))

38


In [23]:
# Rebuild the beta-lactam ligand from adducts matching '[At]C(=O)CCN=[C;R][C;R]'.
tmp = df_beta_lactam
for index, row in tmp.iterrows():
    #if index != '3M6H_SER_84_OG_P0C5C1_2RG_308_CAA.pdb' : continue
    row = row.to_dict()
    mol = row['adduct_pdb'] 
    # Match order follows the SMARTS: At, acyl C, (O, C, C skipped), N, imine C, next ring C.
    At, C1, _, _, _, N, C2, C3 = mol.GetSubstructMatch(Chem.MolFromSmarts('[At]C(=O)CCN=[C;R][C;R]')) 
    # Move the double bond from N=C2 to C2=C3; these edits are made on the stored adduct molecule itself.
    mol.GetBondBetweenAtoms(N, C2).SetBondType(Chem.rdchem.BondType.SINGLE)  
    mol.GetBondBetweenAtoms(C2, C3).SetBondType(Chem.rdchem.BondType.DOUBLE)
    mol.GetAtomWithIdx(C3).SetNumExplicitHs(0)
    edMol = Chem.EditableMol(mol)
    edMol.AddBond(C1, N, Chem.rdchem.BondType.SINGLE)   # close the ring: acyl carbon back onto N
    edMol.RemoveAtom(At)                                # drop the At placeholder
    mol = edMol.GetMol()  
    row['ligand_pdb'       ] = mol
    row['warhead'          ] = 'β-lactam' 
    row['ligand_type'      ] = 'inhibitor' 
    row['recovery_strategy'] = 'automated'
    row['reaction_type'    ] = 'Ring-opening'  
    df.update( pd.DataFrame(data=row, index=[index]) )  

In [24]:
# Second beta-lactam pass over whatever is still unlabelled, with the more
# general pattern (no imine requirement).
df_remaining = df.loc[df['ligand_type'].isna()]
df_beta_lactam = df_remaining.loc[
    df_remaining['adduct_pdb'] >= Chem.MolFromSmarts('[At]C(=O)CCN')
]
print(len(df_beta_lactam))

270


In [25]:
# Rebuild the beta-lactam ligand from adducts matching '[At]C(=O)CCN'.
tmp = df_beta_lactam
for index, row in tmp.iterrows():
    row = row.to_dict()
    mol = row['adduct_pdb'] 
    # Match order follows the SMARTS: At, acyl C, (O, C, C skipped), N.
    At, C, _, _, _, N = mol.GetSubstructMatch(Chem.MolFromSmarts('[At]C(=O)CCN')) 
    edMol = Chem.EditableMol(mol)
    edMol.AddBond(C, N, Chem.rdchem.BondType.SINGLE)   # re-close the four-membered ring
    edMol.RemoveAtom(At)                               # drop the At placeholder
    mol = edMol.GetMol()
    row['ligand_pdb'    ] = mol
    row['warhead'          ] = 'β-lactam' 
    row['ligand_type'      ] = 'inhibitor' 
    row['recovery_strategy'] = 'automated'
    row['reaction_type'    ] = 'Ring-opening' 
    df.update( pd.DataFrame(data=row, index=[index]) )  

# β-lactone

In [26]:
# Still-unlabelled rows whose adduct matches the ring-opened beta-lactone pattern.
df_remaining = df.loc[df['ligand_type'].isna()]
df_beta_lactone = df_remaining.loc[
    df_remaining['adduct_pdb'] >= Chem.MolFromSmarts('[At]C(=O)CCO')
]
print(len(df_beta_lactone))

79


In [27]:
# Rebuild the beta-lactone ligand from adducts matching '[At]C(=O)CCO'.
tmp = df_beta_lactone
for index, row in tmp.iterrows():
    row = row.to_dict()
    mol = row['adduct_pdb'] 
    # Match order follows the SMARTS: At, acyl C, (O, C, C skipped), terminal O.
    At, C, _, _, _, O = mol.GetSubstructMatch(Chem.MolFromSmarts('[At]C(=O)CCO')) 
    edMol = Chem.EditableMol(mol)
    edMol.AddBond(C, O, Chem.rdchem.BondType.SINGLE)   # re-close the ring through the oxygen
    edMol.RemoveAtom(At)                               # drop the At placeholder
    mol = edMol.GetMol()
    row['ligand_pdb'    ] = mol
    row['warhead'          ] = 'β-lactone' 
    row['ligand_type'      ] = 'inhibitor' 
    row['recovery_strategy'] = 'automated' 
    row['reaction_type'    ] = 'Ring-opening' 
    df.update( pd.DataFrame(data=row, index=[index]) )  

# Epoxy

In [28]:
# Still-unlabelled rows whose adduct matches the ring-opened epoxide pattern.
df_remaining = df.loc[df['ligand_type'].isna()]
df_epoxy = df_remaining.loc[
    df_remaining['adduct_pdb'] >= Chem.MolFromSmarts('[At]C(C=O)C(O)C=O')
]
print(len(df_epoxy))

71


In [29]:
# Rebuild the epoxide ligand from adducts matching '[At]C(C=O)C(O)C=O'.
tmp = df_epoxy
for index, row in tmp.iterrows():
    try:
        row = row.to_dict()
        mol = row['adduct_pdb']
        # Match order follows the SMARTS: At, attacked C, (C, O skipped), C, hydroxyl O, (C, O skipped).
        At, C, _, _, _, O, _, _ = mol.GetSubstructMatch(Chem.MolFromSmarts('[At]C(C=O)C(O)C=O'))
        edMol = Chem.EditableMol(mol)
        edMol.AddBond(C, O, Chem.rdchem.BondType.SINGLE)  # re-close the three-membered ring
        edMol.RemoveAtom(At)                              # drop the At placeholder
        mol = edMol.GetMol()
        row['ligand_pdb'    ] = mol
        row['warhead'          ] = 'Epoxy'
        row['ligand_type'      ] = 'inhibitor'
        row['reaction_type'    ] = 'Ring-opening'
        row['recovery_strategy'] = 'automated'
        df.update( pd.DataFrame(data=row, index=[index]) )
    except Exception as e:
        # Still best-effort (the row simply stays unlabelled), but report the
        # failure instead of hiding it behind a bare `except: pass`.
        print(f"epoxy recovery failed for '{index}':", e)

## Conjugated_Pi_Bond

In [30]:
# Still-unlabelled rows whose adduct matches At-C~C-C=O (carbonyl-conjugated pattern).
df_remaining = df.loc[df['ligand_type'].isna()]
df_conjugated_pi_bond = df_remaining.loc[
    df_remaining['adduct_pdb'] >= Chem.MolFromSmarts('[At]C~CC=O')
]
print(len(df_conjugated_pi_bond))

709


In [31]:
# Rebuild the unsaturated ligand from adducts matching '[At]C~CC=O'.
tmp = df_conjugated_pi_bond
for index, row in tmp.iterrows(): 
    row = row.to_dict()
    mol = row['adduct_pdb'] 
    # Match order follows the SMARTS: At, C bonded to At (Cb), next C (Ca), carbonyl C, O.
    At, Cb, Ca, _, _ = mol.GetSubstructMatch(Chem.MolFromSmarts('[At]C~CC=O')) 
    # The stored adduct molecule gets Ca-Cb set to SINGLE in place;
    # the ligand is then built on a copy where that bond becomes DOUBLE.
    mol.GetBondBetweenAtoms(Ca, Cb).SetBondType(Chem.rdchem.BondType.SINGLE)
    mol = Chem.Mol(mol)
    mol.GetBondBetweenAtoms(Ca, Cb).SetBondType(Chem.rdchem.BondType.DOUBLE)
    mol.GetAtomWithIdx(Cb).SetNumExplicitHs(0)
    Chem.SanitizeMol(mol, sanitizeOps=Chem.SANITIZE_SETHYBRIDIZATION)
    Ca = mol.GetAtomWithIdx(Ca)   # Ca is rebound from atom index to Atom object here
    Ca.SetNumExplicitHs(0)
    edMol = Chem.EditableMol(mol) 
    edMol.RemoveAtom(At)          # drop the At placeholder
    mol = edMol.GetMol()
    row['ligand_pdb'    ] = mol
    row['warhead'          ] = 'Conjugated_Pi_Bond' 
    row['ligand_type'      ] = 'inhibitor' 
    row['recovery_strategy'] = 'automated'
    row['reaction_type'    ] = 'Addition'  
    df.update( pd.DataFrame(data=row, index=[index]) )  

## Nitrile

In [32]:
# Still-unlabelled rows whose adduct has At on a three-connected carbon that
# also carries a terminal nitrogen.
df_remaining = df.loc[df['ligand_type'].isna()]
df_nitrile = df_remaining.loc[
    df_remaining['adduct_pdb'] >= Chem.MolFromSmarts('[At]~[CD3](~[ND1])')
]
len(df_nitrile)

147

In [33]:
# Rebuild the nitrile ligand from adducts matching '[At]~[CD3](~[ND1])'.
tmp = df_nitrile
for index, row in tmp.iterrows():
    row = row.to_dict()
    mol = row['adduct_pdb']
    # Match order follows the SMARTS: At, carbon, terminal nitrogen.
    At, C, N = mol.GetSubstructMatch(Chem.MolFromSmarts('[At]~[CD3](~[ND1])'))
    # The stored adduct molecule is normalised in place to C=N with a single C-At bond ...
    mol.GetBondBetweenAtoms(C, N).SetBondType(Chem.rdchem.BondType.DOUBLE)  
    mol.GetBondBetweenAtoms(C, At).SetBondType(Chem.rdchem.BondType.SINGLE)  
    mol.GetAtomWithIdx(C).SetNumExplicitHs(0)
    # ... and the ligand is built on a copy where C-N becomes a triple bond.
    mol = Chem.Mol(mol) 
    mol.GetBondBetweenAtoms(C, N).SetBondType(Chem.rdchem.BondType.TRIPLE)  
    edMol = Chem.EditableMol(mol)
    edMol.RemoveAtom(At)   # drop the At placeholder
    mol = edMol.GetMol() 
    Chem.SanitizeMol(mol, sanitizeOps=Chem.SANITIZE_SETHYBRIDIZATION)
    # Round-trip through SMILES; MolFromSmiles returns None if the SMILES cannot be parsed.
    mol = Chem.MolFromSmiles(Chem.MolToSmiles(mol))
    row['ligand_pdb'    ] = mol
    row['warhead'          ] = 'Nitrile' 
    row['ligand_type'      ] = 'inhibitor' 
    row['recovery_strategy'] = 'automated'
    row['reaction_type'    ] = 'Addition'  
    df.update( pd.DataFrame(data=row, index=[index])) 

## Conjugated_Pi_Bond (S=O-conjugated pattern)

In [34]:
# Still-unlabelled rows whose adduct matches At-C~C-S=O (the sulfur analogue of
# the earlier carbonyl-conjugated pattern).
df_remaining = df.loc[df['ligand_type'].isna()]
df_conjugated_pi_bond = df_remaining.loc[
    df_remaining['adduct_pdb'] >= Chem.MolFromSmarts('[At]C~CS=O')
]
print(len(df_conjugated_pi_bond))

172


In [35]:
# Rebuild the unsaturated ligand from adducts matching '[At]C~CS=O'.
tmp = df_conjugated_pi_bond
for index, row in tmp.iterrows():
    row = row.to_dict()
    mol = row['adduct_pdb']
    # Match order follows the SMARTS: At, C bonded to At (Cb), next C (Ca), S, O.
    At, Cb, Ca, _, _ = mol.GetSubstructMatch(Chem.MolFromSmarts('[At]C~CS=O'))
    # Unlike the C=O variant, the bond is set to DOUBLE directly on the stored molecule (no copy).
    mol.GetBondBetweenAtoms(Cb, Ca).SetBondType(Chem.rdchem.BondType.DOUBLE)  
    edMol = Chem.EditableMol(mol)
    edMol.RemoveAtom(At)   # drop the At placeholder
    mol = edMol.GetMol() 
    Chem.SanitizeMol(mol, sanitizeOps=Chem.SANITIZE_SETHYBRIDIZATION)
    row['ligand_pdb'    ] = mol
    row['warhead'          ] = 'Conjugated_Pi_Bond'
    row['ligand_type'      ] = 'inhibitor' 
    row['recovery_strategy'] = 'automated' 
    row['reaction_type'    ] = 'Addition' 
    df.update( pd.DataFrame(data=row, index=[index])) 

## Ketone

In [36]:
# Still-unlabelled rows where At sits on a carbon bearing a terminal oxygen and
# two further carbon neighbours.
df_remaining = df.loc[df['ligand_type'].isna()]
df_ketone = df_remaining.loc[
    df_remaining['adduct_pdb'] >= Chem.MolFromSmarts('[At][#6](~[OD1])([#6])[#6]')
]
len(df_ketone)

390

In [37]:
# Rebuild the ketone ligand from adducts matching '[At][#6](~[OD1])([#6])[#6]'.
# The working molecule is named `adduct_mol` so the loop no longer rebinds the
# global `adduct` dict loaded from adduct_pkl (which broke re-running earlier cells).
# The order of the RDKit edits is unchanged.
tmp = df_ketone
for index, row in tmp.iterrows():
    row = row.to_dict()
    adduct_mol = row['adduct_pdb']
    # Match order follows the SMARTS: At, carbon, terminal O, then the two carbon neighbours.
    At, C, O, _, _ = adduct_mol.GetSubstructMatch(Chem.MolFromSmarts('[At][#6](~[OD1])([#6])[#6]'))
    # Adduct form: C-O single bond, stored after a SMILES round-trip.
    adduct_mol.GetBondBetweenAtoms(C, O).SetBondType(Chem.rdchem.BondType.SINGLE)
    row['adduct_pdb'] = Chem.MolFromSmiles(Chem.MolToSmiles(adduct_mol))
    # Ligand form: restore C=O and remove the At placeholder.
    adduct_mol.GetAtomWithIdx(C).SetNumExplicitHs(0)
    adduct_mol.GetBondBetweenAtoms(C, O).SetBondType(Chem.rdchem.BondType.DOUBLE)
    edMol = Chem.EditableMol(adduct_mol)
    edMol.RemoveAtom(At)
    inhibitor = edMol.GetMol()
    Chem.SanitizeMol(inhibitor, sanitizeOps=Chem.SANITIZE_SETHYBRIDIZATION)
    row['ligand_pdb'    ] = inhibitor
    row['warhead'          ] = 'Ketone'
    row['ligand_type'      ] = 'inhibitor'
    row['recovery_strategy'] = 'automated'
    row['reaction_type'    ] = 'Addition'
    df.update( pd.DataFrame(data=row, index=[index]))

## Diazabicyclooctane

In [38]:
# Still-unlabelled rows whose adduct matches the ring-opened diazabicyclooctane pattern.
df_remaining = df.loc[df['ligand_type'].isna()]
df_diazabicyclooctane = df_remaining.loc[
    df_remaining['adduct_pdb'] >= Chem.MolFromSmarts('[At][#6](~[OD1])NCCNOS')
]
len(df_diazabicyclooctane), len(df_remaining), len(df)

(25, 177, 10150)

In [39]:
# Rebuild the bicyclic ligand from adducts matching '[At][#6](~[OD1])NCCNOS'.
tmp = df_diazabicyclooctane
for index, row in tmp.iterrows():
    row = row.to_dict()
    mol = row['adduct_pdb']
    # Match order follows the SMARTS: At, C, O, N, C, C, N, O, S -- the N picked is the second one (bonded to O).
    At, C, _, _, _, _, N, _, _ = mol.GetSubstructMatch(Chem.MolFromSmarts('[At][#6](~[OD1])NCCNOS'))
    #mol.GetAtomWithIdx(C).SetNumExplicitHs(0)   
    edMol = Chem.EditableMol(mol)
    edMol.AddBond(C, N, Chem.rdchem.BondType.SINGLE)   # re-close the ring between the carbonyl C and that N
    edMol.RemoveAtom(At)                               # drop the At placeholder
    mol = edMol.GetMol() 
    Chem.SanitizeMol(mol, sanitizeOps=Chem.SANITIZE_SETHYBRIDIZATION)
    row['ligand_pdb'    ] = mol
    row['warhead'          ] = 'γ-lactam'
    row['note'          ] = 'Diazabicyclooctane' 
    row['ligand_type'      ] = 'inhibitor' 
    row['recovery_strategy'] = 'automated' 
    row['reaction_type'    ] = 'Ring-opening' 
    # Unlike the other recovery cells, this one also fills ligand_smiles immediately.
    row['ligand_smiles'    ] = Chem.MolToSmiles(mol)
    df.update( pd.DataFrame(data=row, index=[index]))  

## Schiff Base

In [40]:
# Still-unlabelled rows where At is double-bonded to a three-connected carbon.
df_remaining = df.loc[df['ligand_type'].isna()]
df_schiff_base = df_remaining.loc[
    df_remaining['adduct_pdb'] >= Chem.MolFromSmarts('[At]=[#6;D3]')
]
print(len(df_schiff_base), len(df_remaining), len(df))

124 152 10150


In [41]:
# Rebuild the carbonyl ligand from Schiff-base adducts matching '[At]=[#6;D3]'.
tmp = df_schiff_base
for index, row in tmp.iterrows():
    row = row.to_dict()
    mol = row['adduct_pdb']
    At, = mol.GetSubstructMatch(Chem.MolFromSmarts('[At]'))
    # The +1 charge is set on the stored adduct molecule in place.
    mol.GetAtomWithIdx(At).SetFormalCharge(1) 
    # NOTE(review): the copy is taken after the charge is set, so the oxygen
    # created below appears to keep formal charge +1 -- confirm this is intended.
    mol = Chem.Mol(mol)
    mol.GetAtomWithIdx(At).SetAtomicNum(8)   # turn the At placeholder into oxygen
    row['ligand_pdb'    ] = mol
    row['warhead'          ] = 'Carbonyl_Oxygen'
    row['ligand_type'      ] = 'inhibitor' 
    row['recovery_strategy'] = 'automated' 
    row['note'             ] = 'Schiff Base'
    row['reaction_type'    ] = 'Addition-Elimination' 
    df.update( pd.DataFrame(data=row, index=[index]))  

## Isothiocyanate

In [42]:
# Still-unlabelled rows where At sits on a three-connected carbon bonded to
# both a sulfur and a nitrogen.
df_remaining = df.loc[df['ligand_type'].isna()]
df_isothiocyanate = df_remaining.loc[
    df_remaining['adduct_pdb'] >= Chem.MolFromSmarts('[At]~[#6;D3](~[#16])~[#7]')
]
print(len(df_isothiocyanate), len(df_remaining), len(df))
#display(df_isothiocyanate)
#df_remaining

28 28 10150


In [43]:
# Rebuild the isothiocyanate ligand from adducts matching '[At]~[#6;D3](~[#16])~[#7]'.
tmp = df_isothiocyanate
for index, row in tmp.iterrows():
    row = row.to_dict()
    mol = row['adduct_pdb']
    # Match order follows the SMARTS: At, carbon, sulfur, nitrogen.
    At, C, S, N = mol.GetSubstructMatch(Chem.MolFromSmarts('[At]~[#6;D3](~[#16])~[#7]'))
    #mol.GetAtomWithIdx(C).SetNumExplicitHs(0)   
    # The stored adduct molecule is normalised in place to C=N / C-S ...
    mol.GetBondBetweenAtoms(C, N).SetBondType(Chem.rdchem.BondType.DOUBLE) 
    mol.GetBondBetweenAtoms(C, S).SetBondType(Chem.rdchem.BondType.SINGLE)  
    # ... and the ligand is built on a copy where C-S becomes a double bond as well.
    mol = Chem.Mol(mol)
    mol.GetBondBetweenAtoms(C, S).SetBondType(Chem.rdchem.BondType.DOUBLE)   
    edMol = Chem.EditableMol(mol)
    edMol.RemoveAtom(At)   # drop the At placeholder
    mol = edMol.GetMol() 
    Chem.SanitizeMol(mol, sanitizeOps=Chem.SANITIZE_SETHYBRIDIZATION)
    row['ligand_pdb'    ] = mol
    row['warhead'          ] = 'Isothiocyanate'
    row['ligand_type'      ] = 'inhibitor' 
    row['recovery_strategy'] = 'automated'
    row['reaction_type'    ] = 'Addition'  
    df.update( pd.DataFrame(data=row, index=[index])) 

## AdductMatch

In [44]:
# Refresh the SMILES column now that the automated recoveries have filled ligand_pdb.
df['ligand_smiles'] = df.apply(lambda r: ToSmiles(r.ligand_pdb, r.ligand_smiles), axis=1)

# ligand code -> set of SMILES seen for that code (canonicalised when RDKit can parse them)
code2smiles = {}
for index, ligand_name in df['ligand_name'].items():
    smiles = df.loc[index, 'ligand_smiles']
    if not isinstance(smiles, str):
        continue
    try:
        smiles = Chem.MolToSmiles(Chem.MolFromSmiles(smiles))
    except Exception:
        # Unparsable SMILES: keep the raw string. `Exception` (not a bare
        # except) so that Ctrl-C still interrupts the loop.
        pass
    code2smiles.setdefault(ligand_name, set()).add(smiles)

isna = df[df['ligand_type'].isna()]
notna = df[df['ligand_type'].notna()]
print(len(notna))

# Second round of propagation from manually curated rows (same rule as the
# first adduct-matching cell).
for index, row in isna.iterrows():
    row = row.to_dict()
    ligand_name           = row['ligand_name']
    ligand_bond_atom_name = row['ligand_bond_atom_name']
    aa_name               = row['aa_name']
    aa_bond_atom_name     = row['aa_bond_atom_name']
    tmp_ = notna[(notna['ligand_name'] == ligand_name)
                 & (notna['ligand_bond_atom_name'] == ligand_bond_atom_name)
                 & (notna['aa_name'] == aa_name)
                 & (notna['aa_bond_atom_name'] == aa_bond_atom_name)
                 & (notna['recovery_strategy'] == 'manual')]
    if len(tmp_):
        warhead, ligand_pdb, reaction_type, ligand_type = df.loc[
            tmp_.index[0], ['warhead', 'ligand_pdb', 'reaction_type', 'ligand_type']]
        row['recovery_strategy'] = 'adduct_matching'
        row['warhead'] = warhead
        row['ligand_pdb'] = ligand_pdb
        row['reaction_type'] = reaction_type
        row['ligand_type'] = ligand_type
        if ligand_name in code2smiles:
            smiles = code2smiles[ligand_name]
            # Sorted so the note is reproducible; set iteration order is not stable across runs.
            row['note'] = '.'.join(sorted(smiles))
        df.update(pd.DataFrame(data=row, index=[index]))
print(len(df[df['ligand_type'].notna()]), len(df[df['ligand_type'].isna()]))

RDKit ERROR: [01:49:35] Explicit valence for atom # 11 N, 4, is greater than permitted


10150
10150 0


In [45]:
# Number of rows that still have no ligand_type.
int(df['ligand_type'].isna().sum())

0

In [46]:
# Persist the fully annotated table (including the RDKit molecule columns) as a pickle.
df.to_pickle('data/df.pkl')  