In [1]:
from rdkit import Chem
import pandas as pd
from rdkit.Chem import PandasTools
from tqdm import tqdm
import torch

In [2]:
def mol_with_atom_index( mol ):
    atoms = mol.GetNumAtoms()
    for idx in range( atoms ):
        mol.GetAtomWithIdx( idx ).SetProp( 'molAtomMapNumber', str( mol.GetAtomWithIdx( idx ).GetIdx() ) )
    return mol

In [3]:
novartis_original = PandasTools.LoadSDF('../data/novartis/00_novartis_testdata.sdf')

In [4]:
novartis_final = torch.load('../data/novartis/novartis_processed.pt')

In [5]:
novartis_original.shape, novartis_final.shape

((280, 7), (274, 12))

In [7]:
novartis_final.head()

Unnamed: 0,exp_id,pka,center,prot_smiles,deprot_smiles,prot_charge,deprot_charge,prot_mol,deprot_mol,acid_base_type,acid_base_check,acid_base_string,ROMol
0,mol0,8.55,13,Brc1ccc(C2CN3C=CSC3=[NH+]2)cc1,Brc1ccc(C2CN3C=CSC3=N2)cc1,1,0,<rdkit.Chem.PropertyMol.PropertyMol object at ...,<rdkit.Chem.PropertyMol.PropertyMol object at ...,basic,consistent,B,<rdkit.Chem.PropertyMol.PropertyMol object at ...
1,mol1,5.5,4,C/C(=C\c1cc(C)cc[nH+]1)[C@@H]1C[C@@H]2O[C@]2(C...,C/C(=C\c1cc(C)ccn1)[C@@H]1C[C@@H]2O[C@]2(C)CCC...,1,0,<rdkit.Chem.PropertyMol.PropertyMol object at ...,<rdkit.Chem.PropertyMol.PropertyMol object at ...,basic,consistent,B,<rdkit.Chem.PropertyMol.PropertyMol object at ...
2,mol2,4.5,5,C/C(=C\c1ccc(C)c[nH+]1)[C@@H]1C[C@@H]2O[C@]2(C...,C/C(=C\c1ccc(C)cn1)[C@@H]1C[C@@H]2O[C@]2(C)CCC...,1,0,<rdkit.Chem.PropertyMol.PropertyMol object at ...,<rdkit.Chem.PropertyMol.PropertyMol object at ...,basic,consistent,B,<rdkit.Chem.PropertyMol.PropertyMol object at ...
3,mol3,5.5,6,C/C(=C\c1cccc(C)[nH+]1)[C@@H]1C[C@@H]2O[C@]2(C...,C/C(=C\c1cccc(C)n1)[C@@H]1C[C@@H]2O[C@]2(C)CCC...,1,0,<rdkit.Chem.PropertyMol.PropertyMol object at ...,<rdkit.Chem.PropertyMol.PropertyMol object at ...,basic,consistent,B,<rdkit.Chem.PropertyMol.PropertyMol object at ...
4,mol4,5.0,32,C/C(=C\c1cccc[nH+]1)[C@@H]1C[C@@H]2O[C@]2(C)CC...,C/C(=C\c1ccccn1)[C@@H]1C[C@@H]2O[C@]2(C)CCC[C@...,1,0,<rdkit.Chem.PropertyMol.PropertyMol object at ...,<rdkit.Chem.PropertyMol.PropertyMol object at ...,basic,consistent,B,<rdkit.Chem.PropertyMol.PropertyMol object at ...


In [6]:
use_id = [int(i.split('mol')[1]) for i in novartis_final['exp_id']]

not_use_id = list(set(range(novartis_original.shape[0])) - set(use_id))

In [7]:
len(not_use_id)

6

In [8]:
exp_makeup = novartis_original.loc[not_use_id, :]

In [9]:
prot_mols = []
deprot_mols = []
for idx, pka_type, mol, center in zip(exp_makeup.index, exp_makeup['marvin_pKa_type'], exp_makeup['ROMol'], exp_makeup['marvin_atom']):
    mol_changed = Chem.RWMol(mol)
    atom = mol_changed.GetAtomWithIdx(int(center))
    charge = atom.GetFormalCharge()
    Ex_Hs = atom.GetNumExplicitHs()
    Tot_Hs = atom.GetTotalNumHs()
   
    if pka_type == 'basic':
        if charge == 0:
            print('here')
            deprot_mols.append(mol)
            atom.SetFormalCharge(charge + 1)
            atom.SetNumExplicitHs(Ex_Hs + 1)
            prot_mols.append(mol_changed)
            
        elif charge == 1:
            prot_mols.append(mol)
            atom.SetFormalCharge(charge - 1)
            atom.SetNumExplicitHs(Ex_Hs - 1)
            deprot_mols.append(mol_changed)
    
    
    if pka_type == 'acidic':
        if charge == 0:
            prot_mols.append(mol)
            atom.SetFormalCharge(charge - 1)
            atom.SetNumExplicitHs(Ex_Hs - 1)
            deprot_mols.append(mol_changed)
        elif charge == -1:
            deprot_mols.append(mol)
            atom.SetFormalCharge(charge + 1)
            atom.SetNumExplicitHs(Ex_Hs + 1)
            prot_mols.append(mol_changed)
    atom.UpdatePropertyCache()
    #break
            

here
here
here
here


In [10]:
exp_makeup['prot_mol'] = prot_mols
exp_makeup['deprot_mol'] = deprot_mols


In [11]:
from rdkit.Chem.MolStandardize import rdMolStandardize
un = rdMolStandardize.Uncharger()

from collections import defaultdict

from molvs import standardize_smiles

In [13]:
acid_base_smarts = pd.read_csv('../data/pka_smarts/smarts_pattern.tsv', delimiter='\t')

mol_acid_base = defaultdict(list)
for idx, center, prot_mol, deprot_mol in zip(exp_makeup.index, exp_makeup['marvin_atom'], exp_makeup['prot_mol'], exp_makeup['deprot_mol']):
    #p = Chem.MolFromSmiles(standardize_smiles(Chem.MolToSmiles(mol_with_atom_index(deprot_mol))))
    #mol = Chem.MolFromSmiles(smi)
    #p = deprot_mol
    prot_mol = un.uncharge(prot_mol)
    #mol = Chem.MolFromSmiles(Chem.MolToSmiles(mol_with_atom_index(mol)))
    mol = Chem.AddHs(prot_mol)
    for smart, a in zip(acid_base_smarts['    SMARTS'], acid_base_smarts['Acid_or_base']):
        if mol.HasSubstructMatch(Chem.MolFromSmarts(smart)):
            matches = sum(mol.GetSubstructMatches(Chem.MolFromSmarts(smart)), ())
            #print(smart, matches)
            center = int(center)
            if center in matches:
                #print('here')
                if deprot_mol.GetAtomWithIdx(center).GetFormalCharge() == -1: 
                    if a == 'A':
                        mol_acid_base[idx].append([smart, a])
                        break
                elif deprot_mol.GetAtomWithIdx(center).GetFormalCharge() == 0:
                    if a == 'B':
                        mol_acid_base[idx].append([smart, a])
                        break
                        

[18:46:29] Running Uncharger
[18:46:29] Running Uncharger
[18:46:29] Running Uncharger
[18:46:29] Running Uncharger
[18:46:29] Running Uncharger
[18:46:29] Running Uncharger


In [14]:
len(mol_acid_base)

6

In [15]:
c = 0 
use_i = []
for i in list(mol_acid_base.keys()):
    if exp_makeup.loc[i, 'marvin_pKa_type'] == 'acidic' and mol_acid_base[i][0][1] == 'A':
        c += 1
        use_i.append(i)
    elif exp_makeup.loc[i, 'marvin_pKa_type'] == 'basic' and mol_acid_base[i][0][1] == 'B':
        c += 1
        use_i.append(i)
        

In [16]:
c

6

In [17]:
exp_makeup['acid_base_string'] = str

In [18]:
exp_makeup.loc[list(mol_acid_base.keys()), 'acid_base_string'] = [i[0][1] for i in list(mol_acid_base.values())]

In [19]:
ROMol = []
#acid_base_num = []
for i in exp_makeup.index:
    acid_base = exp_makeup.loc[i, 'acid_base_string']
    #acid_base_string.append(acid_base)
    if acid_base == 'A':
        ROMol.append(exp_makeup.loc[i, 'prot_mol'])
    elif acid_base == 'B':
        ROMol.append(exp_makeup.loc[i, 'deprot_mol'])

In [20]:
exp_makeup['ROMol_rev'] = ROMol

In [21]:
exp_makeup['exp_id'] = [f'mol{i}' for i in exp_makeup.index]

In [22]:
from rdkit.Chem.Descriptors import ExactMolWt, HeavyAtomCount, NumHAcceptors, NumHDonors, MolLogP, NumRotatableBonds

In [23]:
mw, hac, nhd, nha, logp, nrb = [], [], [], [], [], []
for mol in tqdm(exp_makeup['ROMol']):
    mw.append(ExactMolWt(mol))
    hac.append(HeavyAtomCount(mol))
    nhd.append(NumHDonors(mol))
    nha.append(NumHAcceptors(mol))
    logp.append(MolLogP(mol))
    nrb.append(NumRotatableBonds(mol))

100%|██████████| 6/6 [00:00<00:00, 1413.10it/s]


In [24]:
for i,j in zip(['MolWt', 'HeavyAtomCount', 'NumHDonors', 'NumHAcceptors', 'MolLogP', 'NumRotatableBonds'], [mw, hac, nhd, nha, logp, nrb]):
    exp_makeup[i] = j

In [26]:
exp_makeup_rev = exp_makeup[['exp_id', 'pKa', 'marvin_atom', 'prot_mol', 'deprot_mol', 'marvin_pKa_type',  'acid_base_string', 'ROMol_rev', 'MolWt', 'HeavyAtomCount', 'NumHDonors', 'NumHAcceptors', 'MolLogP',
       'NumRotatableBonds']]

In [27]:
exp_makeup_rev.rename(columns={'pKa': 'pka', 'marvin_atom': 'center', 'marvin_pKa_type': 'acid_base_type', 'ROMol_rev': 'ROMol'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exp_makeup_rev.rename(columns={'pKa': 'pka', 'marvin_atom': 'center', 'marvin_pKa_type': 'acid_base_type', 'ROMol_rev': 'ROMol'}, inplace=True)


In [28]:
from rdkit.Chem.rdmolops import GetFormalCharge

In [29]:
exp_makeup_rev['prot_charge'] = exp_makeup_rev['prot_mol'].apply(lambda x: GetFormalCharge(x))

exp_makeup_rev['deprot_charge'] = exp_makeup_rev['deprot_mol'].apply(lambda x: GetFormalCharge(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exp_makeup_rev['prot_charge'] = exp_makeup_rev['prot_mol'].apply(lambda x: GetFormalCharge(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exp_makeup_rev['deprot_charge'] = exp_makeup_rev['deprot_mol'].apply(lambda x: GetFormalCharge(x))


In [30]:
sum(exp_makeup_rev['prot_charge'] > exp_makeup_rev['deprot_charge'])

6

In [31]:
exp_makeup_rev['prot_smiles'] = exp_makeup_rev['prot_mol'].apply(lambda x: Chem.MolToSmiles(x))

exp_makeup_rev['deprot_smiles'] = exp_makeup_rev['deprot_mol'].apply(lambda x: Chem.MolToSmiles(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exp_makeup_rev['prot_smiles'] = exp_makeup_rev['prot_mol'].apply(lambda x: Chem.MolToSmiles(x))


In [32]:
exp_all = pd.concat([novartis_final, exp_makeup_rev[novartis_final.columns]])

In [33]:
exp_all.shape

(280, 12)

In [35]:
c = 0 
smi = []
for mol in exp_all['ROMol']:
    try:
        smi.append(mol.GetProp('mol-smiles'))
    except:
        c += 1
        smi.append(Chem.MolToSmiles(Chem.MolFromInchi(Chem.MolToInchi(mol))))
        
#exp_all['SMILES'] = exp_all['ROMol'].apply(lambda x:Chem.MolToSmiles(Chem.MolFromInchi(Chem.MolToInchi(x))))

In [36]:
c

6

In [37]:
len(smi)

280

In [38]:
exp_all['SMILES'] = smi

In [39]:
len(set(exp_all['SMILES']))

280

In [40]:
from rdkit.Chem.Descriptors import ExactMolWt, HeavyAtomCount, NumHAcceptors, NumHDonors, MolLogP, NumRotatableBonds

In [41]:
mw, hac, nhd, nha, logp, nrb = [], [], [], [], [], []
for mol in tqdm(exp_all['ROMol']):
    mw.append(ExactMolWt(mol))
    hac.append(HeavyAtomCount(mol))
    nhd.append(NumHDonors(mol))
    nha.append(NumHAcceptors(mol))
    logp.append(MolLogP(mol))
    nrb.append(NumRotatableBonds(mol))

100%|██████████| 280/280 [00:00<00:00, 2380.81it/s]


In [42]:
for i,j in zip(['MolWt', 'HeavyAtomCount', 'NumHDonors', 'NumHAcceptors', 'MolLogP', 'NumRotatableBonds'], [mw, hac, nhd, nha, logp, nrb]):
    exp_all[i] = j

In [44]:
exp_all['id'] = exp_all['exp_id'].apply(lambda x: int(x.split('mol')[1]))

In [45]:
exp_all = exp_all.sort_values('id')[['exp_id', 'pka', 'center', 'prot_smiles', 'deprot_smiles',
       'prot_charge', 'deprot_charge', 'prot_mol', 'deprot_mol',
       'acid_base_type', 'acid_base_string', 'ROMol',
       'MolWt', 'HeavyAtomCount', 'NumHDonors', 'NumHAcceptors', 'MolLogP',
       'NumRotatableBonds', 'SMILES']]

In [46]:
exp_all = exp_all.reset_index(drop=True)

In [47]:
torch.save(exp_all, '../data/novartis/novartis_processed_all.pt')