In [1]:
from rdkit import Chem
import pandas as pd
from rdkit.Chem import PandasTools
import pickle 
from tqdm import tqdm
import torch

In [2]:
def mol_with_atom_index( mol ):
    """Label every atom of ``mol`` with its own index as an atom-map number.

    The ``molAtomMapNumber`` property of each atom is set to the string form
    of that atom's index. The molecule is modified in place and the same
    object is returned.
    """
    for atom in mol.GetAtoms():
        atom.SetProp( 'molAtomMapNumber', str( atom.GetIdx() ) )
    return mol

In [3]:
# Read the literature test set from SDF into a DataFrame (one row per record;
# the preview below shows the SD fields plus an RDKit ``ROMol`` column).
lit_original = PandasTools.LoadSDF('../data/liter/00_AvLiLuMoVe_testdata.sdf')

In [4]:
# Number of records read from the SDF file.
len(lit_original)

123

In [5]:
# Preview the first rows to see which columns the SDF provides.
lit_original.head()

Unnamed: 0,pKa,marvin_pKa,marvin_atom,marvin_pKa_type,ISO_SMI,ID,ROMol
0,7.42,8.6,10,basic,C#CC[NH+](C)[C@H](C)Cc1ccc(F)cc1,353,<rdkit.Chem.rdchem.Mol object at 0x16a8db200>
1,9.5,9.86,9,basic,CC(=O)CC1CCCC[NH2+]1,181,<rdkit.Chem.rdchem.Mol object at 0x16a8db350>
2,9.1,9.05,6,basic,CC(=O)CCC[NH+](C)C,177,<rdkit.Chem.rdchem.Mol object at 0x16a8db2e0>
3,3.6,4.09,8,acidic,CC(=O)OC(CC(=O)[O-])C[N+](C)(C)C,587,<rdkit.Chem.rdchem.Mol object at 0x16a8db3c0>
4,9.4,9.78,8,basic,CC(=O)OCCCC[NH+](C)C,190,<rdkit.Chem.rdchem.Mol object at 0x16a8db430>


In [6]:
# Load the pickled molecule container. Later cells index it by experiment id
# and read the 'mols' entry of each item.
# NOTE: pickle.load can execute arbitrary code -- only load files from a trusted source.
with open('../data/liter/04_AvLiLuMoVe_testdata_mols.pkl', 'rb') as f:
    raw_lit = pickle.load(f)

In [7]:
# Number of entries in the pickled molecule container.
len(raw_lit)

123

In [8]:
# Load the pickled per-record data objects (presumably PyTorch Geometric
# ``Data`` objects, judging by the file name -- TODO confirm). The next cells
# read their smiles/charge/reference-value/id/reaction-center attributes.
# NOTE: pickle.load can execute arbitrary code -- only load files from a trusted source.
with open('../data/liter/05_AvLiLuMoVe_testdata_pyg_data.pkl', 'rb') as f:
    lit = pickle.load(f)

In [9]:
# Number of data objects loaded from the pickle.
len(lit)

123

In [10]:
# Pull the per-record attributes of ``lit`` into parallel lists, one list per
# future DataFrame column.
prot_smiles, deprot_smiles = [], []
prot_charges, deprot_charges = [], []
pka, chembl_ids, centers = [], [], []

# (target list, attribute name) pairs for attributes that are copied verbatim.
plain_fields = (
    (prot_smiles, 'smiles_prop'),
    (deprot_smiles, 'smiles_deprop'),
    (prot_charges, 'charge_prot'),
    (deprot_charges, 'charge_deprot'),
    (chembl_ids, 'chembl_id'),
    (centers, 'reaction_center'),
)

for data in tqdm(lit):
    for target, attribute in plain_fields:
        target.append(getattr(data, attribute))
    # The reference value is unwrapped to a plain Python scalar via .item().
    pka.append(data.reference_value.item())


100%|██████████| 123/123 [00:00<00:00, 13545.29it/s]


In [61]:
# Assemble the collected lists into one table; dict order fixes column order.
exp_columns = {
    'exp_id': chembl_ids,
    'pka': pka,
    'center': centers,
    'prot_smiles': prot_smiles,
    'deprot_smiles': deprot_smiles,
    'prot_charge': prot_charges,
    'deprot_charge': deprot_charges,
}
exp_df = pd.DataFrame(exp_columns)

In [62]:
# Flatten the molecule pairs stored in ``raw_lit`` into one list, visiting each
# experiment id only the first time it occurs in ``exp_df``.
#   unstack_mols -- [first, second] element of every stored molecule entry
#   p            -- 'pKa_number' property of each entry's first molecule
#   c            -- running total of entries seen
unstack_mols, p = [], []
seen = set()
c = 0
for id_ in tqdm(exp_df['exp_id']):
    if id_ in seen:
        continue
    seen.add(id_)
    mols = raw_lit[id_]['mols']
    c += len(mols)
    unstack_mols.extend([pair[0], pair[1]] for pair in mols)
    p.extend(pair[0].GetProp('pKa_number') for pair in mols)

100%|██████████| 123/123 [00:00<00:00, 130047.74it/s]


In [63]:
# Attach the flattened molecule pairs to the table, row by row in list order.
# mol_with_atom_index tags each atom with its index in place and returns the
# same molecule object.
for column, position in (('prot_mol', 0), ('deprot_mol', 1)):
    exp_df[column] = [mol_with_atom_index(pair[position]) for pair in unstack_mols]

In [64]:
# Row labels whose stored protonated charge is NOT greater than the stored
# deprotonated charge (i.e. rows violating the expected charge ordering).
charge_idx = [
    idx
    for idx, prot_charge, deprot_charge in zip(exp_df.index, exp_df['prot_charge'], exp_df['deprot_charge'])
    if not prot_charge > deprot_charge
]

In [65]:
# How many rows violate the stored-charge ordering (0 in the recorded run).
len(charge_idx)

0

In [66]:
from rdkit.Chem.rdmolops import GetFormalCharge
# Same ordering check as for the stored charges, but computed from the RDKit
# molecules: keep the row labels where the formal charge of 'prot_mol' is NOT
# greater than that of 'deprot_mol'.
mol_idx = [
    idx
    for idx, prot_mol, deprot_mol in zip(exp_df.index, exp_df['prot_mol'], exp_df['deprot_mol'])
    if not GetFormalCharge(prot_mol) > GetFormalCharge(deprot_mol)
]

In [67]:
# How many rows have their molecule pair in the wrong order (7 in the recorded run).
len(mol_idx)

7

In [68]:
# Swap 'prot_mol' and 'deprot_mol' for the flagged rows. The right-hand side is
# converted to a bare array so pandas assigns by position instead of
# re-aligning on column names (which would undo the swap).
# NOTE: re-running this cell without recomputing mol_idx swaps the rows back.
swap_columns = ['prot_mol', 'deprot_mol']
swapped_values = exp_df.loc[mol_idx, swap_columns[::-1]].to_numpy()
exp_df.loc[mol_idx, swap_columns] = swapped_values

In [69]:
from rdkit.Chem.rdmolops import GetFormalCharge
# Re-run the formal-charge ordering check after the swap; any row label that
# still ends up here has 'prot_mol' charge <= 'deprot_mol' charge.
mol_idx = [
    idx
    for idx, prot_mol, deprot_mol in zip(exp_df.index, exp_df['prot_mol'], exp_df['deprot_mol'])
    if not GetFormalCharge(prot_mol) > GetFormalCharge(deprot_mol)
]

In [70]:
# Rows still in the wrong order after the swap (0 in the recorded run).
len(mol_idx)

0

In [71]:
def _marvin_type_for(exp_id):
    """Look up 'marvin_pKa_type' in ``lit_original`` for one experiment id.

    The integer following 'mol' in ``exp_id`` is used as a row *label* of
    ``lit_original`` (presumably its default 0..n-1 index -- TODO confirm).
    """
    row_label = int(exp_id.split('mol')[1])
    return lit_original.loc[row_label, 'marvin_pKa_type']


exp_df['acid_base_type'] = exp_df['exp_id'].apply(_marvin_type_for)

In [72]:
from rdkit.Chem.MolStandardize import rdMolStandardize
# Shared Uncharger instance; the SMARTS classification cell calls un.uncharge()
# on each 'prot_mol' before matching.
un = rdMolStandardize.Uncharger()

from collections import defaultdict

from molvs import standardize_smiles

In [73]:
# Tab-separated table of SMARTS patterns. Later code reads the columns
# '    SMARTS' (the header really carries four leading spaces) and 'Acid_or_base'.
acid_base_smarts = pd.read_csv('../data/pka_smarts/smarts_pattern.tsv', delimiter='\t')

In [49]:
# Classify each reaction centre as acid ('A') or base ('B') using the SMARTS table.
#
# For every row, the uncharged, H-explicit form of 'prot_mol' is matched against
# the patterns in table order. The first pattern that (a) covers the reaction
# centre atom and (b) agrees with the centre's formal charge in 'deprot_mol'
# (-1 -> 'A', 0 -> 'B') is recorded as [smarts, label] under the row's position.
# Rows without any agreeing pattern get no entry in ``mol_acid_base``.
#
# NOTE: keys are row *positions* (0..n-1); later cells use them with
# exp_df.loc, which is only equivalent while exp_df keeps its default index.

# Compile every SMARTS once up front instead of twice per pattern per molecule.
compiled_patterns = [
    (smart, Chem.MolFromSmarts(smart), a)
    for smart, a in zip(acid_base_smarts['    SMARTS'], acid_base_smarts['Acid_or_base'])
]

mol_acid_base = defaultdict(list)
for idx, center, prot_mol, deprot_mol in zip(range(exp_df.shape[0]), exp_df['center'], exp_df['prot_mol'], exp_df['deprot_mol']):
    center = int(center)
    # Neutralise, then add explicit hydrogens so patterns that mention H can match.
    mol = Chem.AddHs(un.uncharge(prot_mol))
    for smart, pattern, a in compiled_patterns:
        # Every atom index taking part in any match of this pattern.
        matched_atoms = {atom_idx for match in mol.GetSubstructMatches(pattern) for atom_idx in match}
        if center not in matched_atoms:
            continue
        center_charge = deprot_mol.GetAtomWithIdx(center).GetFormalCharge()
        if (center_charge == -1 and a == 'A') or (center_charge == 0 and a == 'B'):
            mol_acid_base[idx].append([smart, a])
            break
                        

In [50]:
# Number of rows that received a SMARTS-based label (121 in the recorded run).
len(mol_acid_base)

121

In [74]:
# Keep the SMARTS-labelled rows whose label agrees with the Marvin annotation
# ('acidic' <-> 'A', 'basic' <-> 'B'); ``c`` is the number of such rows.
expected_label = {'acidic': 'A', 'basic': 'B'}
use_i = [
    i
    for i in list(mol_acid_base.keys())
    if expected_label.get(exp_df.loc[i, 'acid_base_type']) == mol_acid_base[i][0][1]
]
c = len(use_i)
        

In [75]:
# SMARTS-labelled rows whose label disagrees with the Marvin annotation.
not_use_i = set(list(mol_acid_base.keys())) - set(use_i)

In [76]:
# Count of disagreeing rows (0 in the recorded run).
len(not_use_i)

0

In [77]:
# Rows that received no SMARTS-based label at all. This set is iterated later
# to build ``res`` and converted with list(...) for the assignment, so it must
# not be modified in between (both rely on the same iteration order).
not_use_i1 = set(exp_df.index) - set(list(mol_acid_base.keys()))

In [78]:
# Count of rows without a SMARTS-based label (2 in the recorded run).
len(not_use_i1)

2

In [79]:
# Create the label column empty. None marks "not labelled yet"; the original
# placeholder was the builtin type object ``str``, which would show up as a
# bogus value/group for any row that never receives a label.
exp_df['acid_base_string'] = None

In [80]:
# Write the SMARTS-derived label ('A'/'B', second item of the first recorded
# [smarts, label] pair) into the rows that were matched.
matched_rows = list(mol_acid_base.keys())
matched_labels = [mol_acid_base[row][0][1] for row in matched_rows]
exp_df.loc[matched_rows, 'acid_base_string'] = matched_labels

In [81]:
# Fallback labelling for rows without a SMARTS match, based only on the formal
# charge of the reaction-centre atom before/after deprotonation:
#   prot  0 -> deprot -1  => 'A'
#   prot +1 -> deprot  0  => 'B'
# ``res`` is built in the iteration order of ``not_use_i1`` and must contain
# exactly one label per row, because the following assignment pairs it with
# list(not_use_i1). Any other charge pattern is therefore reported explicitly
# instead of silently producing a shorter list.
res = []
for i in not_use_i1:
    print(i)
    prot_mol = exp_df.loc[i, 'prot_mol']
    deprot_mol = exp_df.loc[i, 'deprot_mol']
    center = int(exp_df.loc[i, 'center'])

    prot_center_charge = prot_mol.GetAtomWithIdx(center).GetFormalCharge()
    deprot_center_charge = deprot_mol.GetAtomWithIdx(center).GetFormalCharge()

    if prot_center_charge == 0 and deprot_center_charge == -1:
        res.append('A')
    elif prot_center_charge == 1 and deprot_center_charge == 0:
        res.append('B')
    else:
        raise ValueError(
            f'Row {i}: cannot classify reaction center {center} with formal charges '
            f'{prot_center_charge} (prot) -> {deprot_center_charge} (deprot)'
        )

97
90


In [84]:
# Store the charge-based fallback labels. ``res`` was filled while iterating
# ``not_use_i1``; list(not_use_i1) yields the same order as long as the set was
# not modified in between, so labels and rows line up.
exp_df.loc[list(not_use_i1), 'acid_base_string'] = res

In [85]:
# Choose one representative molecule per row: 'prot_mol' for rows labelled 'A',
# 'deprot_mol' for rows labelled 'B'.
# The list is later assigned to exp_df positionally, so every row must add
# exactly one molecule; an unexpected label raises instead of being skipped
# (skipping would shift all following molecules onto the wrong rows).
ROMol = []
for i in exp_df.index:
    acid_base = exp_df.loc[i, 'acid_base_string']
    print(acid_base)
    if acid_base == 'A':
        ROMol.append(exp_df.loc[i, 'prot_mol'])
    elif acid_base == 'B':
        ROMol.append(exp_df.loc[i, 'deprot_mol'])
    else:
        raise ValueError(f"Row {i}: unexpected acid_base_string {acid_base!r} (expected 'A' or 'B')")

B
B
B
A
B
B
B
A
B
B
A
A
B
B
B
A
B
B
B
A
B
B
B
B
A
B
A
A
A
A
B
B
B
B
B
B
B
B
B
B
B
B
B
B
B
B
B
A
B
B
B
B
B
B
A
B
A
A
A
B
B
B
B
B
B
B
B
B
B
B
B
B
B
B
B
B
B
B
B
B
A
A
B
B
B
B
B
B
A
B
A
B
B
B
B
B
B
A
B
B
B
B
A
A
A
A
A
A
A
B
B
B
B
B
B
B
B
B
B
B
B
B
B


In [86]:
# Should equal the number of rows in exp_df (123 in the recorded run).
len(ROMol)

123

In [87]:
# Attach the selected molecules; the list is assigned by position, one per row.
exp_df['ROMol'] = ROMol

In [88]:
from rdkit.Chem.Descriptors import ExactMolWt, HeavyAtomCount, NumHAcceptors, NumHDonors, MolLogP, NumRotatableBonds

In [89]:
# Compute six RDKit descriptors for every selected molecule, one list each.
# NOTE(review): none of these lists is used again in the cells shown here --
# confirm whether they were meant to be stored in exp_df.
mw, hac, nhd, nha, logp, nrb = [], [], [], [], [], []
descriptor_targets = (
    (mw, ExactMolWt),
    (hac, HeavyAtomCount),
    (nhd, NumHDonors),
    (nha, NumHAcceptors),
    (logp, MolLogP),
    (nrb, NumRotatableBonds),
)
for mol in tqdm(exp_df['ROMol']):
    for values, descriptor in descriptor_targets:
        values.append(descriptor(mol))

100%|██████████| 123/123 [00:00<00:00, 1953.71it/s]


In [90]:
# Rows per derived label ('A'/'B').
exp_df.groupby('acid_base_string').count()

Unnamed: 0_level_0,exp_id,pka,center,prot_smiles,deprot_smiles,prot_charge,deprot_charge,prot_mol,deprot_mol,acid_base_type,ROMol
acid_base_string,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
A,28,28,28,28,28,28,28,28,28,28,28
B,95,95,95,95,95,95,95,95,95,95,95


In [91]:
# Rows per Marvin annotation ('acidic'/'basic'), for comparison with the
# derived labels above (the recorded counts differ by two rows).
exp_df.groupby('acid_base_type').count()

Unnamed: 0_level_0,exp_id,pka,center,prot_smiles,deprot_smiles,prot_charge,deprot_charge,prot_mol,deprot_mol,acid_base_string,ROMol
acid_base_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
acidic,26,26,26,26,26,26,26,26,26,26,26
basic,97,97,97,97,97,97,97,97,97,97,97


In [92]:
def _stored_smiles(mol):
    """Return the 'mol-smiles' property stored on the molecule."""
    return mol.GetProp('mol-smiles')


exp_df['SMILES'] = exp_df['ROMol'].apply(_stored_smiles)

In [93]:
# Number of distinct SMILES strings; equals the row count when there are no duplicates.
len(set(exp_df['SMILES']))

123

In [94]:
# Persist the processed table. torch.save pickles arbitrary Python objects, so
# the file holds the whole DataFrame including its RDKit molecule columns.
# NOTE(review): recent PyTorch versions may need torch.load(..., weights_only=False)
# to read a non-tensor object like this back -- confirm for the version in use.
torch.save(exp_df, '../data/liter/literature_processed.pt')