In [1]:
from data.pub import env, MolFromCpdCifFile, downloader, RN, AN, BN, WhatIsAtomInside, WhatIsBondInside
# Locations used throughout the notebook, read from the project's env() settings object.
# adduct_cif_dir : directory scanned for adduct cif files
# cpd_sdf_dir    : directory holding '<cpd_id>_model.sdf' files
# cpd_cif_dir    : directory where compound cif files are downloaded and read
# cpd_ref_pkl    : pickle file caching the `Reference` mapping
# cpd_mol_pkl    : pickle file caching `MolFromCpdCifFile_collector`
# adduct_pkl     : pickle file receiving the recovered adducts (`Passed`)
adduct_cif_dir         = env().adduct_cif_dir  
cpd_sdf_dir            = env().cpd_sdf_dir 
cpd_cif_dir            = env().cpd_cif_dir   
cpd_ref_pkl            = env().cpd_ref_pkl  
cpd_mol_pkl            = env().cpd_mol_pkl
adduct_pkl             = env().adduct_pkl  
import urllib
from IPython.display import SVG, HTML
import io, os, sys, pickle 
from rdkit import Chem
from rdkit.Chem import AllChem 
from rdkit.Chem.Draw import IPythonConsole  



### Download compound_cif according to adduct_cif

In [2]:
# Fetch the ligand (compound) cif for every adduct cif file.
# Adduct file names are '_'-separated; field 8 is the compound id,
# e.g. '4QZ2_K_THR_1_?_OG1_?_K_04C_301_?_C9_?_cif' -> '04C'.
collector = []
for cif in os.scandir(adduct_cif_dir):
    cpd_id = cif.name.split('_')[8]
    cif_basename = f'{cpd_id}.cif'
    collector.append(cif_basename)
    downloader(
        f'http://files.rcsb.org/ligands/view/{cif_basename}',
        f'{cpd_cif_dir}/{cif_basename}',
    )

# Files present on disk that no adduct asked for, and the number of distinct compounds.
expected_files = set(collector)
unexpected_files = set(os.listdir(cpd_cif_dir)) - expected_files
print(unexpected_files, len(expected_files))

set() 2516


### Download reference, which doesn't have pdb information like atom name

In [3]:
# Load the cached mapping {compound id: reference molecule} written by a previous run.
# NOTE(review): pickle.load can execute arbitrary code; only load a cache file you trust.
with open(cpd_ref_pkl, 'rb') as fr: Reference = pickle.load(file=fr)

import urllib, bs4  
def getMol(cpd_id):
    """Build a reference molecule for `cpd_id` from its RCSB ligand web page.

    The page is parsed for the elements with id ``chemicalIsomeric`` (isomeric
    SMILES) and ``chemicalInChI``. SMILES is tried first; InChI is used as a
    fallback whenever SMILES is missing from the page or cannot be parsed.

    Parameters
    ----------
    cpd_id : str
        Compound (ligand) identifier used in the RCSB URL.

    Returns
    -------
    The molecule returned by ``Chem.MolFromSmiles`` / ``Chem.MolFromInchi``,
    or ``None`` (after printing a message) when neither representation works.

    Raises
    ------
    Network errors from ``urlopen`` and attribute errors from unexpected page
    markup propagate to the caller.
    """
    # `import urllib` alone does not guarantee the `request` submodule is loaded.
    import urllib.request

    # Close the HTTP response deterministically instead of leaking the socket.
    with urllib.request.urlopen(f'http://www.rcsb.org/ligand/{cpd_id}') as response:
        html = response.read().decode()
    soup = bs4.BeautifulSoup(html, 'lxml')
    isomeric_smiles = soup.find(id="chemicalIsomeric")
    inchi           = soup.find(id="chemicalInChI"   )
    if isomeric_smiles:
        mol = Chem.MolFromSmiles(isomeric_smiles.td.text)
        if mol: return mol
    # InChI fallback is independent of whether a SMILES element was present.
    if inchi:
        mol = Chem.MolFromInchi(inchi.td.text)
        if mol: return mol
    print(f'Failed to get Mol for {cpd_id}')
    return None

import time

# Fetch a reference molecule for every compound cif that has no usable cached entry.
# Compounds for which no molecule could be obtained (exception OR a None result)
# are collected in `failed_cpds` so the SDF fallback in the next cell can handle them.
failed_cpds = []
for cif in os.scandir(cpd_cif_dir):
    cpd_id = cif.name[:-4]  # strip the '.cif' suffix
    # A cached None is not a usable reference: retry it instead of skipping.
    if Reference.get(cpd_id) is not None: continue
    try:
        fetched_mol = getMol(cpd_id)
    except Exception as e:
        fetched_mol = None
        print(cpd_id, e)
    if fetched_mol is None:
        # Never store None in Reference: it would be pickled and later break
        # consumers that expect a molecule for every key.
        failed_cpds.append(cpd_id)
    else:
        Reference[cpd_id] = fetched_mol
    time.sleep(1)  # be polite to the remote server

In [4]:
# Fallback for compounds whose web lookup failed: take the first record of the
# local '<cpd_id>_model.sdf' file and store it without conformers.
for cpd_id in failed_cpds:
    sdf = Chem.SDMolSupplier(f'{cpd_sdf_dir}/{cpd_id}_model.sdf')[0]
    if not sdf:
        print(cpd_id)  # still unresolved
        continue
    sdf.RemoveAllConformers()
    Reference[cpd_id] = sdf

import pickle
with open(cpd_ref_pkl, 'wb') as fw:
    pickle.dump(Reference, file=fw)

# Both differences are empty when Reference and the cif directory list the same compounds.
new_reference = {cif.name[:-4] for cif in os.scandir(cpd_cif_dir)}
set(Reference.keys()).difference(new_reference), set(new_reference).difference(Reference.keys())

(set(), set())

### Read in compound_cif 

In [5]:
# Start from the cached collector and parse only the compound cif files not yet in it.
with open(cpd_mol_pkl, 'rb') as fr:
    MolFromCpdCifFile_collector = pickle.load(fr)

for cif in os.scandir(cpd_cif_dir):
    cpd_id = cif.name[:-4]  # strip the '.cif' suffix
    if cpd_id not in MolFromCpdCifFile_collector:
        try:
            MolFromCpdCifFile_collector[cpd_id] = MolFromCpdCifFile(cif.path)
        except Exception as e:
            # Report and move on; the compound simply stays out of the collector.
            print(cpd_id, e)

### Failed in 5 cases

### Compare the MolFromCpdCifFile with SMILES from RCSB

In [6]:
# Split compounds by whether the cif-derived molecule and the hydrogen-stripped
# reference are substructures of each other (i.e. both describe the same graph).
sdf_equal_pdb, sdf_not_equal_pdb = [], []
for cpd_id, mol in MolFromCpdCifFile_collector.items():
    ref = Chem.RemoveHs(Reference[cpd_id])
    is_same_structure = mol.HasSubstructMatch(ref) and ref.HasSubstructMatch(mol)
    bucket = sdf_equal_pdb if is_same_structure else sdf_not_equal_pdb
    bucket.append(cpd_id)
print(len(sdf_equal_pdb), len(sdf_not_equal_pdb))

2496 20


### Again, some cases failed, but that is acceptable.

In [7]:
# Compounds already checked by hand; only mismatches outside this list are shown.
Inspected = ['IF6', 'JQG', '1G1', '4EY', 'NO', '2NO', 'RCY', '3GE', 'RXR', 'MLX', '1G6', 'QT4', 'P5F', 'R7A', 'E07', 'R1F', '5BF', 'HUJ', 'R1B', 'V1A']
for cpd_id in (c for c in sdf_not_equal_pdb if c not in Inspected):
    print(cpd_id)
    # Our molecule first, then the labeled depiction served by RCSB for comparison.
    display(MolFromCpdCifFile_collector[cpd_id])
    svg_url = f'https://cdn.rcsb.org/images/ccd/labeled/{cpd_id[0]}/{cpd_id}.svg'
    svg_text = urllib.request.urlopen(svg_url).read().decode()
    display(SVG(svg_text))

In [8]:
# Persist the collector so later runs can skip compounds that are already parsed.
with open(cpd_mol_pkl, 'wb') as fw: pickle.dump(MolFromCpdCifFile_collector, file=fw)

### Now, all cpd are in the MolFromCpdCifFile_collector

In [9]:
from rdkit.Chem import Draw, rdDepictor
from PIL import Image, ImageChops, ImageOps 

Draw.DrawingOptions.dotsPerAngstrom = 40  # default is 30, bigger for fewer clash
def targetbbox(pdb, size):
    """Return the bounding box of the drawn molecule inside a `size` canvas.

    The molecule is rendered with Draw.MolToImage and compared against a plain
    white image of the same size; the box of differing pixels is returned as
    reported by PIL's getbbox() (left, upper, right, lower).
    """
    rendered = Draw.MolToImage(pdb, size=size)
    white_canvas = Image.new('RGBA', size, '#ffffff')  # mode, size, color string
    return ImageChops.difference(rendered, white_canvas).getbbox()

def drawEachAtomName(tPDB, size=(4000,2000)):  
    """Return SVG text of `tPDB` with each atom labelled '<atom index>_<AN(atom)>'.

    The canvas is twice the bounding box measured by targetbbox(); atoms whose
    AN() equals name_list[5]+'_'+name_list[7][:-4] are highlighted.

    NOTE(review): `name_list` is not a parameter; it is read from the enclosing
    (notebook) namespace at call time, so this function fails unless a cell that
    assigns `name_list` has been run first.
    NOTE(review): a second `def drawEachAtomName` further down in this cell
    rebinds the name, so this version is not the one later cells call.
    """
    bbox = targetbbox(tPDB, size=size) # bbox: left, upper, right, and lower 
    target_size = (bbox[2]-bbox[0])*2, (bbox[3]-bbox[1])*2   
    drawer = Draw.MolDraw2DSVG(target_size[0], target_size[1])  
    opts = drawer.drawOptions()
    # Indices of the atoms to highlight (see the note on `name_list` above).
    ligEndAtomIdx = [atom.GetIdx() for atom in tPDB.GetAtoms() if AN(atom)==name_list[5]+'_'+name_list[7][:-4]]
    # List comprehension used only for its side effect: fill the per-atom label map.
    [opts.atomLabels.__setitem__(atom.GetIdx(), str(atom.GetIdx())+'_'+AN(atom)) for atom in tPDB.GetAtoms()]  
    drawer.SetFontSize(0.4)  
    rdDepictor.Compute2DCoords(tPDB)       
    drawer.DrawMolecule(tPDB, highlightAtoms=ligEndAtomIdx) 
    drawer.FinishDrawing()   
    # Drop the 'svg:' namespace prefix from the generated markup.
    return drawer.GetDrawingText().replace(u'svg:',u'')

def RN(atom):
    """Residue Name: the atom's monomer-info residue name, whitespace-stripped."""
    return atom.GetMonomerInfo().GetResidueName().strip()


def AN(atom):
    """Atom Name: '<residue name>_<atom name>' built from the atom's monomer info."""
    return RN(atom) + '_' + atom.GetMonomerInfo().GetName().strip()


def WhatIsAtomInside(pdb):
    """Set of AN() names of every atom in `pdb` whose symbol is not 'H'."""
    return {AN(atom) for atom in pdb.GetAtoms() if atom.GetSymbol() != 'H'}

def drawEachAtomName(mol, size=(1000,1000)):
    """Return SVG text of `mol` drawn with every atom labelled by AN(atom).

    `size` is the (width, height) of the SVG canvas. 2D coordinates are
    (re)computed on `mol` itself before drawing.
    """
    drawer = Draw.MolDraw2DSVG(size[0], size[1])
    opts = drawer.drawOptions()
    for a in mol.GetAtoms():
        opts.atomLabels[a.GetIdx()] = AN(a)
    drawer.SetFontSize(0.6)
    rdDepictor.Compute2DCoords(mol)
    drawer.DrawMolecule(mol)
    drawer.FinishDrawing()
    svg_markup = drawer.GetDrawingText()
    # Drop the 'svg:' namespace prefix from the generated markup.
    return svg_markup.replace(u'svg:', u'')

## 2D Adduct Recovery 

In [10]:
%%html 
<div style="height:600px;width: 750px;"><?xml version="1.0"?><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" viewBox="71.75 60.51 363 286.5" preserveAspectRatio="xMidYMid" width="100%" height="100%"><clipPath id="238"><rect x="0.5" y="0.5" width="539" height="719" /></clipPath><g id="drawing"><rect x="71.75" y="60.51" width="100%" height="100%" fill="rgb(255, 255, 255)" /><g id="page" clip-path="url(#238)"><g id="Dash"><path d="M237.47 116.7 L237.47 116.7 L236.97 115.83 L234.63 117.18 L234.63 117.76 L234.63 118.34 Z" stroke="rgb(191, 191, 191)" stroke-width="0" fill-rule="evenodd" stroke-linejoin="miter" fill="rgb(191, 191, 191)" shape-rendering="geometricPrecision" /><path d="M258.27 104.69 L258.27 104.69 L257.77 103.82 L260.11 102.47 L260.61 102.76 L260.61 103.34 Z" stroke="rgb(191, 191, 191)" stroke-width="0" fill-rule="evenodd" stroke-linejoin="miter" fill="rgb(191, 191, 191)" shape-rendering="geometricPrecision" /><path d="M239.76 115.38 L239.76 115.38 L242.1 114.03 L241.6 113.16 L239.26 114.51 Z" stroke="rgb(191, 191, 191)" stroke-width="0" fill-rule="evenodd" stroke-linejoin="miter" fill="rgb(191, 191, 191)" shape-rendering="geometricPrecision" /><path d="M244.39 112.7 L244.39 112.7 L246.72 111.35 L246.22 110.49 L243.89 111.84 Z" stroke="rgb(191, 191, 191)" stroke-width="0" fill-rule="evenodd" stroke-linejoin="miter" fill="rgb(191, 191, 191)" shape-rendering="geometricPrecision" /><path d="M249.02 110.03 L249.02 110.03 L251.35 108.68 L250.85 107.82 L248.52 109.17 Z" stroke="rgb(191, 191, 191)" stroke-width="0" fill-rule="evenodd" stroke-linejoin="miter" fill="rgb(191, 191, 191)" shape-rendering="geometricPrecision" /><path d="M253.64 107.36 L253.64 107.36 L255.98 106.01 L255.48 105.14 L253.14 106.49 Z" stroke="rgb(191, 191, 191)" stroke-width="0" fill-rule="evenodd" stroke-linejoin="miter" fill="rgb(191, 191, 191)" shape-rendering="geometricPrecision" /></g><g id="Dash"><path d="M262.95 104.69 L262.95 104.69 L263.45 103.82 
L261.11 102.47 L260.61 102.76 L260.61 103.34 Z" stroke="rgb(191, 191, 191)" stroke-width="0" fill-rule="evenodd" stroke-linejoin="miter" fill="rgb(191, 191, 191)" shape-rendering="geometricPrecision" /><path d="M284 116.84 L284 116.84 L284.5 115.98 L286.84 117.33 L286.59 117.76 L286.34 118.19 Z" stroke="rgb(191, 191, 191)" stroke-width="0" fill-rule="evenodd" stroke-linejoin="miter" fill="rgb(191, 191, 191)" shape-rendering="geometricPrecision" /><path d="M265.29 106.04 L265.29 106.04 L267.63 107.39 L268.13 106.52 L265.79 105.17 Z" stroke="rgb(191, 191, 191)" stroke-width="0" fill-rule="evenodd" stroke-linejoin="miter" fill="rgb(191, 191, 191)" shape-rendering="geometricPrecision" /><path d="M269.97 108.74 L269.97 108.74 L272.3 110.09 L272.8 109.22 L270.47 107.87 Z" stroke="rgb(191, 191, 191)" stroke-width="0" fill-rule="evenodd" stroke-linejoin="miter" fill="rgb(191, 191, 191)" shape-rendering="geometricPrecision" /><path d="M274.65 111.44 L274.65 111.44 L276.98 112.79 L277.48 111.92 L275.15 110.57 Z" stroke="rgb(191, 191, 191)" stroke-width="0" fill-rule="evenodd" stroke-linejoin="miter" fill="rgb(191, 191, 191)" shape-rendering="geometricPrecision" /><path d="M279.32 114.14 L279.32 114.14 L281.66 115.49 L282.16 114.63 L279.82 113.28 Z" stroke="rgb(191, 191, 191)" stroke-width="0" fill-rule="evenodd" stroke-linejoin="miter" fill="rgb(191, 191, 191)" shape-rendering="geometricPrecision" /></g><g id="Dash"><path d="M261.11 99.77 L261.11 99.77 L260.11 99.77 L260.11 102.47 L260.61 102.76 L261.11 102.47 Z" stroke="rgb(191, 191, 191)" stroke-width="0" fill-rule="evenodd" stroke-linejoin="miter" fill="rgb(191, 191, 191)" shape-rendering="geometricPrecision" /><path d="M261.11 75.46 L261.11 75.46 L260.11 75.46 L260.11 72.76 L260.61 72.76 L261.11 72.76 Z" stroke="rgb(191, 191, 191)" stroke-width="0" fill-rule="evenodd" stroke-linejoin="miter" fill="rgb(191, 191, 191)" shape-rendering="geometricPrecision" /><path d="M261.11 97.07 L261.11 97.07 L261.11 94.37 L260.11 94.37 
L260.11 97.07 Z" stroke="rgb(191, 191, 191)" stroke-width="0" fill-rule="evenodd" stroke-linejoin="miter" fill="rgb(191, 191, 191)" shape-rendering="geometricPrecision" /><path d="M261.11 91.67 L261.11 91.67 L261.11 88.97 L260.11 88.97 L260.11 91.67 Z" stroke="rgb(191, 191, 191)" stroke-width="0" fill-rule="evenodd" stroke-linejoin="miter" fill="rgb(191, 191, 191)" shape-rendering="geometricPrecision" /><path d="M261.11 86.26 L261.11 86.26 L261.11 83.56 L260.11 83.56 L260.11 86.26 Z" stroke="rgb(191, 191, 191)" stroke-width="0" fill-rule="evenodd" stroke-linejoin="miter" fill="rgb(191, 191, 191)" shape-rendering="geometricPrecision" /><path d="M261.11 80.86 L261.11 80.86 L261.11 78.16 L260.11 78.16 L260.11 80.86 Z" stroke="rgb(191, 191, 191)" stroke-width="0" fill-rule="evenodd" stroke-linejoin="miter" fill="rgb(191, 191, 191)" shape-rendering="geometricPrecision" /></g><g id="Dash"><path d="M232.29 115.83 L232.29 115.83 L231.79 116.7 L234.63 118.34 L234.63 117.76 L234.63 117.18 Z" stroke="rgb(191, 191, 191)" stroke-width="0" fill-rule="evenodd" stroke-linejoin="miter" fill="rgb(191, 191, 191)" shape-rendering="geometricPrecision" /><path d="M211.23 103.68 L211.23 103.68 L210.73 104.54 L208.39 103.19 L208.64 102.76 L208.89 102.33 Z" stroke="rgb(191, 191, 191)" stroke-width="0" fill-rule="evenodd" stroke-linejoin="miter" fill="rgb(191, 191, 191)" shape-rendering="geometricPrecision" /><path d="M229.95 114.48 L229.95 114.48 L227.61 113.13 L227.11 114 L229.45 115.35 Z" stroke="rgb(191, 191, 191)" stroke-width="0" fill-rule="evenodd" stroke-linejoin="miter" fill="rgb(191, 191, 191)" shape-rendering="geometricPrecision" /><path d="M225.27 111.78 L225.27 111.78 L222.93 110.43 L222.43 111.3 L224.77 112.65 Z" stroke="rgb(191, 191, 191)" stroke-width="0" fill-rule="evenodd" stroke-linejoin="miter" fill="rgb(191, 191, 191)" shape-rendering="geometricPrecision" /><path d="M220.59 109.08 L220.59 109.08 L218.25 107.73 L217.75 108.6 L220.09 109.95 Z" stroke="rgb(191, 191, 191)" 
stroke-width="0" fill-rule="evenodd" stroke-linejoin="miter" fill="rgb(191, 191, 191)" shape-rendering="geometricPrecision" /><path d="M215.91 106.38 L215.91 106.38 L213.57 105.03 L213.07 105.89 L215.41 107.24 Z" stroke="rgb(191, 191, 191)" stroke-width="0" fill-rule="evenodd" stroke-linejoin="miter" fill="rgb(191, 191, 191)" shape-rendering="geometricPrecision" /></g><g><text xml:space="preserve" text-anchor="start" text-rendering="geometricPrecision"><tspan x="340.02" y="229.82" font-family="Arial" font-size="10" fill="rgb(0, 0, 0)">C</tspan><tspan font-family="Arial" dy="2.26" font-size="7.5" fill="rgb(255, 0, 0)">XYZ_C1</tspan></text></g><g><text xml:space="preserve" text-anchor="start" text-rendering="geometricPrecision"><tspan x="366.01" y="214.82" font-family="Arial" font-size="10" fill="rgb(0, 0, 0)">C</tspan><tspan font-family="Arial" dy="2.26" font-size="7.5" fill="rgb(255, 0, 0)">XYZ_C2</tspan></text></g><g id="Solid"><path d="M348.78 223.52 L348.78 223.52 L348.53 223.09 L348.28 222.66 L364.62 213.22 L364.87 213.66 L365.12 214.09 Z" stroke="rgb(0, 0, 0)" stroke-width="0" fill-rule="evenodd" stroke-linejoin="miter" fill="rgb(0, 0, 0)" shape-rendering="geometricPrecision" /></g><g><text xml:space="preserve" text-anchor="start" text-rendering="geometricPrecision"><tspan x="391.99" y="229.82" font-family="Arial" font-size="10" fill="rgb(0, 0, 0)">C</tspan><tspan font-family="Arial" dy="2.26" font-size="7.5" fill="rgb(255, 0, 0)">XYZ_C3</tspan></text></g><g id="Solid"><path d="M381.1 218.13 L381.1 218.13 L381.35 217.7 L381.6 217.27 L391.22 222.82 L390.97 223.25 L390.72 223.68 Z" stroke="rgb(0, 0, 0)" stroke-width="0" fill-rule="evenodd" stroke-linejoin="miter" fill="rgb(0, 0, 0)" shape-rendering="geometricPrecision" /></g><g><text xml:space="preserve" text-anchor="start" text-rendering="geometricPrecision"><tspan x="365.71" y="184.82" font-family="Arial" font-size="10" fill="rgb(0, 0, 0)">O</tspan><tspan font-family="Arial" dy="2.26" font-size="7.5" 
fill="rgb(255, 0, 0)">XYZ_O1</tspan></text></g><g id="Solid"><line x1="367.81" y1="205.6" x2="367.81" y2="188.7" stroke="rgb(0, 0, 0)" stroke-width="1" shape-rendering="auto" /><path d="M370.91 205.6 L370.91 205.6 L371.91 205.6 L371.91 188.7 L370.91 188.7 Z" stroke="rgb(0, 0, 0)" stroke-width="0" fill-rule="evenodd" stroke-linejoin="miter" fill="rgb(0, 0, 0)" shape-rendering="geometricPrecision" /><path d="M367.31 205.6 L367.31 205.6 L368.31 205.6 L368.31 188.7 L367.31 188.7 Z" stroke="rgb(0, 0, 0)" stroke-width="0" fill-rule="evenodd" stroke-linejoin="miter" fill="rgb(0, 0, 0)" shape-rendering="geometricPrecision" /></g><g><text xml:space="preserve" text-anchor="end" text-rendering="geometricPrecision"><tspan x="318.74" y="214.82" font-family="Arial" font-size="10" fill="rgb(0, 0, 0)">Cl</tspan></text></g><g id="Solid"><path d="M339.32 222.85 L339.32 222.85 L339.07 223.29 L338.82 223.72 L319.58 212.62 L319.83 212.19 L320.08 211.75 Z" stroke="rgb(0, 0, 0)" stroke-width="0" fill-rule="evenodd" stroke-linejoin="miter" fill="rgb(0, 0, 0)" shape-rendering="geometricPrecision" /></g><g><text xml:space="preserve" text-anchor="start" text-rendering="geometricPrecision"><tspan x="205.74" y="105.41" font-family="Arial" font-size="10" fill="rgb(0, 0, 0)">O</tspan></text></g><g id="Rectangle"><path d="M175.25 61.51 L320.25 61.51 C323.56 61.51 326.25 64.2 326.25 67.51 L326.25 126.51 C326.25 129.82 323.56 132.51 320.25 132.51 L175.25 132.51 C171.94 132.51 169.25 129.82 169.25 126.51 L169.25 67.51 C169.25 64.2 171.94 61.51 175.25 61.51 " stroke="rgb(0, 0, 0)" stroke-width="1" fill-rule="evenodd" stroke-dasharray="2.7" stroke-linejoin="bevel" fill="none" shape-rendering="geometricPrecision" /></g><g><text xml:space="preserve" text-anchor="start" text-rendering="geometricPrecision"><tspan x="188.63" y="77.26" font-family="Arial" font-size="10" fill="rgb(0, 0, 0)">adduct.cif</tspan></text></g><g><text xml:space="preserve" text-anchor="start" 
text-rendering="geometricPrecision"><tspan x="231.25" y="122.14" font-family="Arial" font-size="10" fill="rgb(0, 0, 0)">C</tspan><tspan font-family="Arial" dy="2.26" font-size="7.5" fill="rgb(255, 0, 0)">XYZ_C1</tspan></text></g><g><text xml:space="preserve" text-anchor="start" text-rendering="geometricPrecision"><tspan x="283.75" y="122.64" font-family="Arial" font-size="10" fill="rgb(0, 0, 0)">C</tspan><tspan font-family="Arial" dy="2.26" font-size="7.5" fill="rgb(255, 0, 0)">XYZ_C3</tspan></text></g><g><text xml:space="preserve" text-anchor="start" text-rendering="geometricPrecision"><tspan x="257.38" y="107.51" font-family="Arial" font-size="10" fill="rgb(0, 0, 0)">C</tspan><tspan font-family="Arial" dy="2.26" font-size="7.5" fill="rgb(255, 0, 0)">XYZ_C2</tspan></text></g><g><text xml:space="preserve" text-anchor="start" text-rendering="geometricPrecision"><tspan x="256.75" y="76.51" font-family="Arial" font-size="10" fill="rgb(0, 0, 0)">O</tspan><tspan font-family="Arial" dy="2.26" font-size="7.5" fill="rgb(255, 0, 0)">XYZ_O1</tspan></text></g><g><text xml:space="preserve" text-anchor="start" text-rendering="geometricPrecision"><tspan x="285.25" y="191.67" font-family="Arial" font-size="10" fill="rgb(0, 0, 0)">compound2.cif</tspan></text></g><g id="Rectangle"><path d="M282.75 170.67 L427.75 170.67 C431.06 170.67 433.75 173.36 433.75 176.67 L433.75 235.67 C433.75 238.98 431.06 241.67 427.75 241.67 L282.75 241.67 C279.44 241.67 276.75 238.98 276.75 235.67 L276.75 176.67 C276.75 173.36 279.44 170.67 282.75 170.67 " stroke="rgb(0, 0, 0)" stroke-width="1" fill-rule="evenodd" stroke-dasharray="2.7" stroke-linejoin="bevel" fill="none" shape-rendering="geometricPrecision" /></g><g><text xml:space="preserve" text-anchor="start" text-rendering="geometricPrecision"><tspan x="282.75" y="216.17" font-family="Arial" dy="2.26" font-size="7.5" fill="rgb(255, 0, 0)">XYZ_Cl</tspan></text></g><g><text xml:space="preserve" text-anchor="start" 
text-rendering="geometricPrecision"><tspan x="136.02" y="230.15" font-family="Arial" font-size="10" fill="rgb(0, 0, 0)">C</tspan></text></g><g id="Solid"><path d="M144.78 223.85 L144.78 223.85 L144.53 223.42 L144.28 222.99 L160.72 213.5 L160.97 213.93 L161.22 214.36 Z" stroke="rgb(0, 0, 0)" stroke-width="0" fill-rule="evenodd" stroke-linejoin="miter" fill="rgb(0, 0, 0)" shape-rendering="geometricPrecision" /></g><g><text xml:space="preserve" text-anchor="start" text-rendering="geometricPrecision"><tspan x="162.01" y="215.15" font-family="Arial" font-size="10" fill="rgb(0, 0, 0)">C</tspan><tspan font-family="Arial" dy="2.26" font-size="7.5" fill="rgb(255, 0, 0)">XYZ_C2</tspan></text></g><g id="Solid"><path d="M175.48 217.53 L175.48 217.53 L175.73 217.09 L175.98 216.66 L187.32 223.21 L187.07 223.64 L186.82 224.07 Z" stroke="rgb(0, 0, 0)" stroke-width="0" fill-rule="evenodd" stroke-linejoin="miter" fill="rgb(0, 0, 0)" shape-rendering="geometricPrecision" /></g><g><text xml:space="preserve" text-anchor="start" text-rendering="geometricPrecision"><tspan x="187.99" y="230.15" font-family="Arial" font-size="10" fill="rgb(0, 0, 0)">C</tspan><tspan font-family="Arial" dy="2.26" font-size="7.5" fill="rgb(255, 0, 0)">XYZ_C3</tspan></text></g><g id="Solid"><line x1="163.81" y1="205.9" x2="163.81" y2="189" stroke="rgb(0, 0, 0)" stroke-width="1" shape-rendering="auto" /><path d="M166.91 205.9 L166.91 205.9 L167.91 205.9 L167.91 189 L166.91 189 Z" stroke="rgb(0, 0, 0)" stroke-width="0" fill-rule="evenodd" stroke-linejoin="miter" fill="rgb(0, 0, 0)" shape-rendering="geometricPrecision" /><path d="M163.31 205.9 L163.31 205.9 L164.31 205.9 L164.31 189 L163.31 189 Z" stroke="rgb(0, 0, 0)" stroke-width="0" fill-rule="evenodd" stroke-linejoin="miter" fill="rgb(0, 0, 0)" shape-rendering="geometricPrecision" /></g><g><text xml:space="preserve" text-anchor="start" text-rendering="geometricPrecision"><tspan x="161.71" y="185.15" font-family="Arial" font-size="10" fill="rgb(0, 0, 
0)">O</tspan><tspan font-family="Arial" dy="2.26" font-size="7.5" fill="rgb(255, 0, 0)">XYZ_O1</tspan></text></g><g><text xml:space="preserve" text-anchor="start" text-rendering="geometricPrecision"><tspan x="81.25" y="192" font-family="Arial" font-size="10" fill="rgb(0, 0, 0)">compound1.cif</tspan></text></g><g id="Rectangle"><path d="M78.75 171 L223.75 171 C227.06 171 229.75 173.69 229.75 177 L229.75 236 C229.75 239.31 227.06 242 223.75 242 L78.75 242 C75.44 242 72.75 239.31 72.75 236 L72.75 177 C72.75 173.69 75.44 171 78.75 171 " stroke="rgb(0, 0, 0)" stroke-width="1" fill-rule="evenodd" stroke-dasharray="2.7" stroke-linejoin="bevel" fill="none" shape-rendering="geometricPrecision" /></g><g><text xml:space="preserve" text-anchor="start" text-rendering="geometricPrecision"><tspan x="107.75" y="231" font-family="Arial" dy="2.26" font-size="7.5" fill="rgb(255, 0, 0)">XYZ_C1</tspan></text></g><g><text xml:space="preserve" text-anchor="start" text-rendering="geometricPrecision"><tspan x="175.25" y="107.51" font-family="Arial" dy="2.26" font-size="7.5" fill="rgb(255, 0, 0)">SER_OG</tspan></text></g><g id="Hollow"><path d="M155.99 247.76 L164.52 256.28 L168.76 252.04 L168.76 269.01 L151.79 269.01 L156.03 264.77 L147.51 256.24 Z" stroke="rgb(0, 0, 0)" stroke-width="1" fill-rule="evenodd" stroke-linejoin="miter" fill="none" shape-rendering="geometricPrecision" /></g><g id="Rectangle"><path d="M179.15 275.01 L324.15 275.01 C327.46 275.01 330.15 277.7 330.15 281.01 L330.15 340.01 C330.15 343.32 327.46 346.01 324.15 346.01 L179.15 346.01 C175.84 346.01 173.15 343.32 173.15 340.01 L173.15 281.01 C173.15 277.7 175.84 275.01 179.15 275.01 " stroke="rgb(0, 0, 0)" stroke-width="1" fill-rule="evenodd" stroke-dasharray="2.7" stroke-linejoin="bevel" fill="none" shape-rendering="geometricPrecision" /></g><g><text xml:space="preserve" text-anchor="middle" text-rendering="geometricPrecision"><tspan x="217.78" y="290.26" font-family="Arial" font-size="10" fill="rgb(0, 0, 
0)">recovered</tspan></text></g><g><text xml:space="preserve" text-anchor="start" text-rendering="geometricPrecision"><tspan x="203.32" y="301.7" font-family="Arial" font-size="10" fill="rgb(0, 0, 0)">adduct</tspan></text></g><g id="Hollow"><path d="M160.05 149.83 L164.29 154.07 L164.29 137.1 L147.32 137.1 L151.56 141.34 L145.58 147.32 L141.34 143.08 L141.34 160.05 L158.31 160.05 L154.07 155.81 L160.05 149.83 Z" stroke="rgb(0, 0, 0)" stroke-width="1" fill-rule="evenodd" stroke-linejoin="miter" fill="none" shape-rendering="geometricPrecision" /></g><g id="Hollow"><path d="M344.36 143 L348.6 138.75 L331.63 138.75 L331.63 155.72 L335.87 151.48 L341.77 157.38 L337.53 161.63 L354.5 161.63 L354.5 144.66 L350.26 148.9 L344.36 143 Z" stroke="rgb(0, 0, 0)" stroke-width="1" fill-rule="evenodd" stroke-linejoin="miter" fill="none" shape-rendering="geometricPrecision" /></g><g><text xml:space="preserve" text-anchor="start" text-rendering="geometricPrecision"><tspan x="96.92" y="264.49" font-family="Arial" font-size="12" fill="rgb(0, 0, 0)">Addition</tspan></text></g><g><text xml:space="preserve" text-anchor="start" text-rendering="geometricPrecision"><tspan x="361.66" y="264.49" font-family="Arial" font-size="12" fill="rgb(0, 0, 0)">Substitution</tspan></text></g><g><text xml:space="preserve" text-anchor="start" text-rendering="geometricPrecision"><tspan x="236.13" y="335.72" font-family="Arial" font-size="10" fill="rgb(0, 0, 0)">C</tspan><tspan font-family="Arial" dy="2.26" font-size="7.5" fill="rgb(255, 0, 0)">XYZ_C1</tspan></text></g><g id="Solid"><path d="M244.88 329.42 L244.88 329.42 L244.63 328.99 L244.38 328.56 L260.72 319.12 L260.97 319.56 L261.22 319.99 Z" stroke="rgb(0, 0, 0)" stroke-width="0" fill-rule="evenodd" stroke-linejoin="miter" fill="rgb(0, 0, 0)" shape-rendering="geometricPrecision" /></g><g><text xml:space="preserve" text-anchor="start" text-rendering="geometricPrecision"><tspan x="262.11" y="320.72" font-family="Arial" font-size="10" fill="rgb(0, 0, 
0)">C</tspan><tspan font-family="Arial" dy="2.26" font-size="7.5" fill="rgb(255, 0, 0)">XYZ_C2</tspan></text></g><g><text xml:space="preserve" text-anchor="start" text-rendering="geometricPrecision"><tspan x="288.09" y="335.72" font-family="Arial" font-size="10" fill="rgb(0, 0, 0)">C</tspan><tspan font-family="Arial" dy="2.26" font-size="7.5" fill="rgb(255, 0, 0)">XYZ_C3</tspan></text></g><g id="Solid"><path d="M277.2 324.03 L277.2 324.03 L277.45 323.6 L277.7 323.17 L287.32 328.72 L287.07 329.15 L286.82 329.58 Z" stroke="rgb(0, 0, 0)" stroke-width="0" fill-rule="evenodd" stroke-linejoin="miter" fill="rgb(0, 0, 0)" shape-rendering="geometricPrecision" /></g><g id="Solid"><line x1="263.91" y1="311.5" x2="263.91" y2="294.6" stroke="rgb(0, 0, 0)" stroke-width="1" shape-rendering="auto" /><path d="M267.01 311.5 L267.01 311.5 L268.01 311.5 L268.01 294.6 L267.01 294.6 Z" stroke="rgb(0, 0, 0)" stroke-width="0" fill-rule="evenodd" stroke-linejoin="miter" fill="rgb(0, 0, 0)" shape-rendering="geometricPrecision" /><path d="M263.41 311.5 L263.41 311.5 L264.41 311.5 L264.41 294.6 L263.41 294.6 Z" stroke="rgb(0, 0, 0)" stroke-width="0" fill-rule="evenodd" stroke-linejoin="miter" fill="rgb(0, 0, 0)" shape-rendering="geometricPrecision" /></g><g><text xml:space="preserve" text-anchor="start" text-rendering="geometricPrecision"><tspan x="261.81" y="290.72" font-family="Arial" font-size="10" fill="rgb(0, 0, 0)">O</tspan><tspan font-family="Arial" dy="2.26" font-size="7.5" fill="rgb(255, 0, 0)">XYZ_O1</tspan></text></g><g><text xml:space="preserve" text-anchor="start" text-rendering="geometricPrecision"><tspan x="209.85" y="320.72" font-family="Arial" font-size="10" fill="rgb(0, 0, 0)">O</tspan></text></g><g id="Solid"><path d="M235.42 328.75 L235.42 328.75 L235.17 329.19 L234.92 329.62 L218.58 320.19 L218.83 319.75 L219.08 319.32 Z" stroke="rgb(0, 0, 0)" stroke-width="0" fill-rule="evenodd" stroke-linejoin="miter" fill="rgb(0, 0, 0)" shape-rendering="geometricPrecision" 
/></g><g><text xml:space="preserve" text-anchor="start" text-rendering="geometricPrecision"><tspan x="178.85" y="322.07" font-family="Arial" dy="2.26" font-size="7.5" fill="rgb(255, 0, 0)">SER_OG</tspan></text></g><g id="Hollow"><path d="M352.91 256.33 L344.39 264.86 L348.63 269.1 L331.66 269.1 L331.66 252.13 L335.9 256.37 L344.43 247.85 Z" stroke="rgb(0, 0, 0)" stroke-width="1" fill-rule="evenodd" stroke-linejoin="miter" fill="none" shape-rendering="geometricPrecision" /></g></g></g></svg></div> 

### Strategy
##### This is a fictional scenario but it conveys the gist of our strategy. <br>
* Compounds 1&2 cif are the product form of 
* Files which one can get from RCSB PDB are compounds 1&2 cif and mmcif (from which we extracted adduct.cif).  
* The 2D structure of compounds 1 (residue and modulator) can be rebuilt based on the compound cif since compound cif contains both atom info and bond info (see compounds 1&2 cif above).  
* However, 2D structure of adduct cannot be rebuilt based on adduct.cif (see above) since it only contains atom info like residue/modulator name and atom name. 
* It is a fact that an adduct is produced by a reaction between residue and modulator.
* Therefore, one can reproduce the 2D adduct by combining the 2D structures of the modulator and the residue.
* Generally speaking, the adduct contains either all atoms from the modulator (addition) or all atoms except those belonging to the leaving group (substitution). 
* One can infer the reaction mechanism by comparison of atoms in modulator with those in adduct.
* Add an additional atom SER_OG to XYZ_C1 when the mechanism is addition, and replace the missing atom XYZ_Cl with SER_OG when the mechanism is substitution. 

In [11]:
from CifFile import ReadCif  
# Classify every adduct by comparing the neighbours of the ligand-side bonding atom
# in the compound template with the atoms actually present in the adduct cif.
need_Additional_At = []   # no neighbour missing     -> addition
need_Subst_with_At = {}   # exactly one missing      -> substitution; value = atom to substitute
manualCuration     = []   # more than one missing    -> needs manual curation
for adduct_cif in os.scandir(adduct_cif_dir):
    #     0 1   2 3 4   5 6 7   8  9 10 11 12
    # '4QZ2_K_THR_1_?_OG1_?_K_04C_301_?_C9_?_cif'
    name_list = adduct_cif.name.split('_')
    cpd_id = name_list[8]
    LSCBEA = f'{cpd_id}_{name_list[11]}'  # ligand-side-covalent-bond-end-atom

    # Atom names present in the adduct, as '<residue>_<atom>'.
    with open(adduct_cif.path) as fr:
        block = ReadCif(fr).first_block()
        auth_comp_id = block['_atom_site.auth_comp_id']
        auth_atom_id = block['_atom_site.auth_atom_id']
        atom_name_in_adduct = [resn+'_'+atmn for resn, atmn in zip(auth_comp_id, auth_atom_id)]

    # Template generated from the compound cif; start from the bonding atom on the ligand.
    template = MolFromCpdCifFile_collector[cpd_id]
    ligEndAtom_template = [atom for atom in template.GetAtoms() if AN(atom) == LSCBEA][0]
    atom_name_around_ligand_end_atom_in_template = {AN(atom) for atom in ligEndAtom_template.GetNeighbors()}
    more_atom_in_template = atom_name_around_ligand_end_atom_in_template.difference(set(atom_name_in_adduct))

    missing_count = len(more_atom_in_template)
    if missing_count == 0:
        need_Additional_At.append(adduct_cif.name)
    elif missing_count == 1:
        need_Subst_with_At[adduct_cif.name] = more_atom_in_template.pop()
    else:
        manualCuration.append(adduct_cif.name)
        #display(SVG(drawEachAtomName(template)))

In [12]:
# Sizes of the three categories, their sum, and the number of adduct cif files;
# the last two values agree when every file was classified.
len(need_Additional_At), len(need_Subst_with_At), len(manualCuration), len(need_Additional_At)+len(need_Subst_with_At)+len(manualCuration), len(os.listdir(adduct_cif_dir))

(7310, 4127, 120, 11557, 11557)

### Add a At

In [13]:
# Addition case: attach a placeholder astatine atom (atomic number 85) by a single
# bond to the ligand-side bonding atom of each template.
Passed, Failed = {}, []

# Capture stderr only while the loop runs and restore it afterwards; leaving
# sys.stderr pointed at a StringIO would silence every later cell in the kernel.
sio = io.StringIO()
_saved_stderr = sys.stderr
sys.stderr = sio
try:
    for cif_name in need_Additional_At:
        #                   0 1   2 3 4   5 6 7   8  9 10 11 12
        #if cif_name != '4QZ2_K_THR_1_?_OG1_?_K_04C_301_?_C9_?_cif': continue
        name_list = cif_name.split('_')
        LSCBEA = name_list[8]+'_'+name_list[11] # ligand-side-covalent-bond-end-atom
        template  = MolFromCpdCifFile_collector[name_list[8]]
        ligEndAtomIdx = [atom.GetIdx() for atom in template.GetAtoms() if AN(atom)==LSCBEA][0]
        # EditableMol works on a copy; the cached template is left untouched.
        ed = Chem.EditableMol(template)
        aaEndAtomIdx = ed.AddAtom(Chem.Atom(85))
        ed.AddBond(ligEndAtomIdx, aaEndAtomIdx, order=Chem.rdchem.BondType.SINGLE)
        template = ed.GetMol()
        try:
            # Drawing is used as a validity check: a molecule that cannot be
            # rendered raises and is recorded as failed.
            Chem.Draw.MolToImage(template)
            template.RemoveAllConformers()
            Passed[cif_name] = template
        except Exception:
            #print(cif_name, e)
            Failed.append(cif_name)
finally:
    sys.stderr = _saved_stderr

In [14]:
# Number of adducts recovered so far and number that failed.
len(Passed), len(Failed)

(7061, 249)

### Substitute One Atom

In [15]:
# Substitution case: turn the leaving atom into a placeholder astatine (atomic
# number 85), cut every bond it has except the one to the ligand-side bonding atom,
# and drop the fragments that no longer contain the astatine.
at_query = Chem.MolFromSmarts('[At]')  # compiled once, reused for every adduct

# Capture stderr only while the loop runs and restore it afterwards; leaving
# sys.stderr pointed at a StringIO would silence every later cell in the kernel.
sio = io.StringIO()
_saved_stderr = sys.stderr
sys.stderr = sio
try:
    for cif_name, name_of_atom_to_be_substituted in need_Subst_with_At.items():
        #               0 1   2 3 4   5 6 7   8  9 10 11 12
        #if cif_name != '4QZ2_K_THR_1_?_OG1_?_K_04C_301_?_C9_?_cif': continue
        name_list = cif_name.split('_')
        LSCBEA = name_list[8]+'_'+name_list[11] # ligand-side-covalent-bond-end-atom
        # Work on a copy so the cached template is not modified.
        template = Chem.Mol(MolFromCpdCifFile_collector[name_list[8]])
        #display(SVG(drawEachAtomName(template)))
        assumedAminoAcidEndAtom = [atom for atom in template.GetAtoms() if AN(atom) == name_of_atom_to_be_substituted][0] # Atom In tPDB
        assumedAminoAcidEndAtom.SetAtomicNum(85)
        try:
            # Neighbours of the astatine other than the ligand-side bonding atom.
            atom_idxs = [atom.GetIdx() for atom in assumedAminoAcidEndAtom.GetNeighbors() if AN(atom)!=LSCBEA]
            if atom_idxs:
                #print(atom_idxs)
                at_atom_idx = assumedAminoAcidEndAtom.GetIdx()
                edMol = Chem.EditableMol(template)
                for neighbor_idx in sorted(atom_idxs, reverse=True):
                    edMol.RemoveBond(neighbor_idx, at_atom_idx)
                template = edMol.GetMol()

                # Remove every fragment that does not contain the astatine.
                # Atoms are removed from the highest index down so the remaining
                # indices stay valid during removal.
                AtIdx = template.GetSubstructMatch(at_query)[0]
                detached_fragments = [frag for frag in Chem.GetMolFrags(template) if AtIdx not in frag]
                atom_idxs_to_remove = sorted([j for frag in detached_fragments for j in frag], reverse=True)
                edMol = Chem.EditableMol(template)
                for atom_idx in atom_idxs_to_remove:
                    edMol.RemoveAtom(atom_idx)
                template = edMol.GetMol()

                # Atom indices changed after removal; look the astatine up again.
                AtIdx = template.GetSubstructMatch(at_query)[0]
                assumedAminoAcidEndAtom = template.GetAtomWithIdx(AtIdx)
                if assumedAminoAcidEndAtom.GetIsAromatic(): assumedAminoAcidEndAtom.SetIsAromatic(False) # ValueError: Sanitization error: non-ring atom 14 marked aromatic
                Chem.SanitizeMol(template)
            Passed[cif_name]=template
        except Exception:
            #print(cif_name, e)
            Failed.append(cif_name)
finally:
    sys.stderr = _saved_stderr

In [16]:
# Final tally; the second line compares the total with the number of adduct cif files.
print(len(Passed), len(Failed), len(manualCuration))
print(len(Passed)+ len(Failed)+ len(manualCuration), len(os.listdir(adduct_cif_dir)))

11179 258 120
11557 11557


In [17]:
# Persist the recovered adducts, keyed by adduct cif file name.
with open(adduct_pkl, 'wb') as fw: pickle.dump(Passed, file=fw)  

### Summary
* We tried to read the compound cif downloaded from RCSB PDB as the template. 
* Based on the template, we tried to recover the 2D structure of each adduct via two strategies. Although we encountered some failures, we got pretty good results.