In [1]:
from os import listdir
import pickle
import math
import random

import pandas as pd
import numpy as np
from tqdm import tqdm
from itertools import product

from rdkit import Chem
from rdkit.Chem import ChemicalFeatures, MolFromSmiles, MolToSmiles, AllChem
from rdkit import Geometry
from rdkit.Chem.Pharm3D import Pharmacophore, EmbedLib
from rdkit.RDPaths import RDDataDir
import os.path

from sklearn.neighbors import KernelDensity
from sklearn.model_selection import GridSearchCV
from scipy.stats import entropy
from scipy.spatial.distance import jensenshannon as jsd
from scipy.stats import ks_2samp

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rcParams['figure.dpi'] = 200
%matplotlib inline

from ipywidgets import interact

from frag_funcs import return_random_dataframe, return_pcore_dataframe, get_pair_distances, get_trip_distances

# Feature factory built from the BaseFeatures.fdef definitions shipped in RDKit's data directory.
fdefFile = os.path.join(RDDataDir, 'BaseFeatures.fdef')
featFactory = ChemicalFeatures.BuildFeatureFactory(fdefFile)

# Precomputed inputs, unpickled from the working directory.
# NOTE(review): pickle.load can execute arbitrary code - only load files you trust.
with open('frag_pair_distance_dict.pickle', 'rb') as pair_fh:
    frag_pair_distance_dict = pickle.load(pair_fh)

with open('kde_dict_opt.pickle', 'rb') as kde_fh:
    kde_dict = pickle.load(kde_fh)

In [2]:
# Pharmacophore families used for the pairwise distance analysis.
interesting_pcores = ['Donor', 'Acceptor', 'Aromatic']

# Activity table, sorted by ascending f_avg_IC50.
df = pd.read_csv('data/activity_data-3.csv').sort_values(by='f_avg_IC50')
# df = df[df['f_avg_IC50']>3]

# One single-element list per row, holding the molecule parsed from that row's SMILES
# (MolFromSmiles yields None for strings RDKit cannot parse).
hits = [[MolFromSmiles(smiles)] for smiles in df['SMILES']]
    
# Fixed seed for RDKit's stochastic embedding, so the conformers (and every score
# derived from them) are reproducible after a kernel restart.
EMBED_SEED = 42

# hit_mols[i] stays None when molecule i could not be processed.
hit_mols = [None]*len(hits)
for i,hit in tqdm(enumerate(hits), total=len(hits)):
    try:
        # Rebuild the molecule from its SMILES string (raises if parsing failed upstream).
        mol = MolFromSmiles(MolToSmiles(hit[0]))

        # Single unconstrained 3D embedding: hydrogens are added for the embedding
        # and removed again afterwards.
        mol = Chem.AddHs(mol)
        # EmbedMolecule reports failure through a negative conformer id rather than
        # raising, so turn that into an explicit error.
        if AllChem.EmbedMolecule(mol, randomSeed=EMBED_SEED) < 0:
            raise ValueError('conformer embedding failed')

        mol = Chem.RemoveHs(mol)

        # Fetch the coordinate array once instead of once per atom.
        conf = mol.GetConformer()
        positions = conf.GetPositions()

        # Record layout: [mol, [symbol, xyz], [symbol, xyz], ...]
        mol_data = [mol]
        for atom, position in zip(mol.GetAtoms(), positions):
            mol_data.append([atom.GetSymbol(), position])
        hit_mols[i] = mol_data
    except Exception as ex:
        # Best-effort: report the failure and leave hit_mols[i] as None.
        print(ex)
        print(str(i)+' failed')
        continue
hit_df = return_pcore_dataframe(hit_mols, interesting_pcores, hit=False)

# hit_pairs is consumed by position later on, so iterate the molecule ids in a
# deterministic (sorted) order instead of relying on set iteration order.
# NOTE(review): presumably mol_id is the molecule's index in hit_mols - confirm in frag_funcs.
mol_ids = sorted(set(hit_df['mol_id']))

hit_pairs = [None]*len(mol_ids)
for j,i in tqdm(enumerate(mol_ids), total=len(hit_pairs)):
    # Maps 'CoreA-CoreB' -> result of get_pair_distances for molecule i.
    hit_pair_individual = {}

    for pcore_pair in product(interesting_pcores,repeat=2):
        core_a,core_b = pcore_pair
        combo = core_a+'-'+core_b
        hit_pair_individual[combo]= get_pair_distances(hit_df[hit_df['mol_id']==i], core_a, core_b, frag=False, active=None)
    hit_pairs[j] = hit_pair_individual

 76%|███████▌  | 1062/1396 [00:54<00:22, 14.59it/s]RDKit ERROR: [09:46:59] UFFTYPER: Unrecognized atom type: S_6+6 (10)
 99%|█████████▊| 1377/1396 [01:10<00:00, 29.83it/s]RDKit ERROR: [09:47:15] UFFTYPER: Unrecognized atom type: Au6 (13)
 99%|█████████▉| 1381/1396 [01:10<00:00, 29.72it/s]RDKit ERROR: [09:47:15] UFFTYPER: Unrecognized atom type: Au6 (7)
RDKit ERROR: [09:47:15] UFFTYPER: Unrecognized atom type: Au6 (6)
100%|██████████| 1396/1396 [01:11<00:00, 19.60it/s]
100%|██████████| 1396/1396 [00:54<00:00, 25.51it/s]
100%|██████████| 1396/1396 [00:50<00:00, 27.43it/s]


In [3]:
# Pharmacophore pairs whose scores are averaged into the final per-molecule score.
important = ['Donor-Aromatic',
             'Donor-Acceptor',
            'Aromatic-Aromatic']
# Pairs that are scored as well but kept out of the final score.
unimportant = ['Donor-Donor',
               'Aromatic-Acceptor',
               'Acceptor-Acceptor']

# Alternative split kept for reference:
#   important   = ['Donor-Aromatic', 'Aromatic-Aromatic']
#   unimportant = ['Donor-Donor', 'Donor-Acceptor', 'Aromatic-Acceptor', 'Acceptor-Acceptor']

pairs = important+unimportant

# One row per molecule, one column per pair. NaN marks "no score available".
hit_imp = np.full((len(hit_pairs), len(important)), np.nan)
hit_nonimp = np.full((len(hit_pairs), len(unimportant)), np.nan)

for combo in tqdm(pairs):
    kde = kde_dict[combo]

    # Pick the destination matrix and column from the pair's position in its list.
    if combo in important:
        scores, col = hit_imp, important.index(combo)
    else:
        scores, col = hit_nonimp, unimportant.index(combo)

    for i, mol_pairs in enumerate(hit_pairs):
        try:
            distances = mol_pairs[combo][0].reshape(-1,1)
            # absolute log-prob (smaller = higher prob), averaged over the molecule's distances
            scores[i, col] = np.mean(np.abs(kde.score_samples(distances)))
        except Exception:
            # Deliberate best-effort: any failure while scoring this molecule/pair
            # leaves the entry as NaN; rows with NaN are filtered out downstream.
            scores[i, col] = np.nan

# NaN in any important pair propagates to the molecule's final score.
final_hit_imp = np.mean(hit_imp, axis=1)

# Copy the column subset so that adding 'score' does not write into a slice of the original frame.
df = df[['SMILES', 'CID','f_avg_IC50', 'f_avg_pIC50']].copy()

df['score'] = -final_hit_imp
print(df)

100%|██████████| 6/6 [00:03<00:00,  1.65it/s]

                                                 SMILES                  CID  \
874   CC(C)C[C@H](NC(=O)OCc1ccccc1)C(=O)N[C@@H](CC1C...   JOH-MSK-46727e7b-1   
421     C=CC(=O)N(C(=O)C1COc2ccc(Cl)cc21)c1cncc2ccccc12   MAT-POS-e69ad64a-2   
1171                 O=C(Oc1cncc(Cl)c1)c1cccc2[nH]ccc12   ALP-POS-c59291d4-5   
216   C#CCCCC(=O)N[C@H](C(=O)N[C@H](CC(C)C)C(=O)NN(C...   STE-KUL-d79e3d6a-2   
217   C#CCCCC(=O)N[C@H](C(=O)N[C@H](CC(C)C)C(=O)NN(C...   STE-KUL-d79e3d6a-3   
...                                                 ...                  ...   
1359    Cc1ccn2c(-c3ccc4[nH]ncc4c3)c(-c3ccc(F)cc3)nc2c1  ALV-UNI-7ff1a6f9-14   
1363                              CC1C(O)CCCN1Cc1ccccc1   MAK-UNK-6435e6c2-9   
1372      O=C(CCl)N1CCN(Cc2cccc(Cl)c2)C[C@@H]1Cc1ccccc1  DAN-LON-a5fc619e-10   
1379         CS(=O)(=O)N(CCc1ccccc1)CC1CCN(C(=O)CCl)CC1  DUN-NEW-f8ce3686-25   
1381            O=C(Nc1cccnc1)N(CCN1CCOCC1)c1cccc(Cl)c1   JOR-UNI-2fc98d0b-6   

      f_avg_IC50     score  
874     0.




In [86]:
# Rows with f_avg_IC50 strictly below this cutoff are labelled 'active'.
ACTIVE_IC50_CUTOFF = 10

df = pd.read_csv('data/activity_data-3.csv').sort_values(by='f_avg_IC50')
# .copy() so that new columns are written to an independent frame, not a slice.
df = df[['SMILES', 'CID','f_avg_IC50', 'f_avg_pIC50']].copy()
# Positional assignment: final_hit_imp must be in the same (IC50-sorted) row order.
df['score'] = -final_hit_imp
df = df[df['score'].notnull()].copy()
print(len(df))

# Vectorised labelling; a missing IC50 compares False and is therefore 'inactive'.
df['label'] = np.where(df['f_avg_IC50'] < ACTIVE_IC50_CUTOFF, 'active', 'inactive')
print(df['label'].value_counts())
print(df[df['label']=='inactive'])

1286
inactive    815
active      471
Name: label, dtype: int64
                                                 SMILES                  CID  \
316   O=C(Cc1ccc(-n2cnnn2)cc1)NC(C(=O)Nc1cncc2ccccc1...  EDJ-MED-ee07cf00-15   
436   Cn1nc(C(=O)NC(C)(C(=O)Nc2cncc3ccccc23)c2ccc(Cl...   MAT-POS-e9e99895-4   
746   N#Cc1ccc(N(C(=O)c2ccco2)C(C(=O)NCCc2cccc(F)c2)...  ALP-POS-b0bc6a46-24   
514   CC(C)(C)c1ccc(N(C(=O)c2cn(CCN3CCOCC3)nn2)C(C(=...  ALP-POS-305f6ec3-11   
107         CNCC1(C(=O)Nc2cncc3ccccc23)CCOc2ccc(Cl)cc21   MAT-POS-45b13633-2   
...                                                 ...                  ...   
1359    Cc1ccn2c(-c3ccc4[nH]ncc4c3)c(-c3ccc(F)cc3)nc2c1  ALV-UNI-7ff1a6f9-14   
1363                              CC1C(O)CCCN1Cc1ccccc1   MAK-UNK-6435e6c2-9   
1372      O=C(CCl)N1CCN(Cc2cccc(Cl)c2)C[C@@H]1Cc1ccccc1  DAN-LON-a5fc619e-10   
1379         CS(=O)(=O)N(CCc1ccccc1)CC1CCN(C(=O)CCl)CC1  DUN-NEW-f8ce3686-25   
1381            O=C(Nc1cccnc1)N(CCN1CCOCC1)c1cccc(Cl)c1  

In [None]:
# Score against measured IC50 on a log x-axis.
# x and y are passed by keyword: seaborn deprecated positional x/y in 0.11 and rejects them from 0.12.
ax = sns.scatterplot(x="f_avg_IC50", y="score", data=df)
ax.set(xscale="log")
ax.set(ylim=[-10,0]);

In [77]:

def f(x):
    """Return the argument unchanged (identity function)."""
    value = x
    return value



interactive(children=(IntSlider(value=10, description='x', max=30, min=-10), Output()), _dom_classes=('widget-…

In [88]:
# Reload the activity table in the same IC50-sorted order used when scoring,
# because final_hit_imp is attached by position.
df = pd.read_csv('data/activity_data-3.csv').sort_values(by='f_avg_IC50')
# .copy() at each subsetting step: plot_hist writes a 'label' column into this frame,
# which must not be a slice of another frame.
df = df[['SMILES', 'CID','f_avg_IC50', 'f_avg_pIC50']].copy()
df['score'] = -final_hit_imp
df = df[df['score'].notnull()].copy()

def plot_hist(threshold):
    """Overlay density histograms of `score` for inactive and active compounds.

    Rows of the module-level `df` with f_avg_IC50 strictly below `threshold` are
    labelled 'active', all others 'inactive'; the labels are written to df['label'].

    Parameters
    ----------
    threshold : float
        IC50 cutoff separating active from inactive compounds.
    """
    # Vectorised labelling; a missing IC50 compares False and is therefore 'inactive'.
    df['label'] = np.where(df['f_avg_IC50'] < threshold, 'active', 'inactive')

    plt.style.use('default')
    x = df[df['label']=='inactive']['score'].values
    y = df[df['label']=='active']['score'].values

    # Shared bin edges for both groups, computed here so the function does not
    # depend on a `bins` variable left over from an earlier kernel session.
    bins = np.histogram_bin_edges(df['score'].values, bins='auto')

    # Skip an empty group: a density histogram of no data is undefined.
    if len(x) > 0:
        plt.hist(x, density=True, bins=bins, alpha=0.7, histtype='stepfilled',
                 label='IC50>{}uM / inactive'.format(threshold))
    if len(y) > 0:
        plt.hist(y, density=True, bins=bins, alpha=0.7, histtype='stepfilled',
                 label='IC50<{}uM'.format(threshold))
    plt.xlabel('2body_score')
    plt.ylabel('density')
    plt.legend(loc='upper left')
    plt.xlim(left=-8, right=-3)
    plt.show()

# Slider over the IC50 cutoff: 0 to 100 in steps of 0.5.
interact(plot_hist, threshold=(0.0,100.0, 0.5));

interactive(children=(FloatSlider(value=50.0, description='threshold', step=0.5), Output()), _dom_classes=('wi…

In [95]:
# Reload the activity table in the same IC50-sorted order used when scoring,
# because final_hit_imp is attached by position.
df = pd.read_csv('data/activity_data-3.csv').sort_values(by='f_avg_IC50')
# .copy() at each subsetting step: plot_EF writes an 'active' column into this frame,
# which must not be a slice of another frame.
df = df[['SMILES', 'CID','f_avg_IC50', 'f_avg_pIC50']].copy()
# Unlike the earlier cells the score is NOT negated here: smaller means better,
# which matches enrichment()'s default ascending sort.
df['score'] = final_hit_imp
df = df[df['score'].notnull()].copy()

def enrichment(score_df, n=10, score='score', index='active', log=False, ascending=True):
    """Enrichment factor of the `n` best-ranked rows relative to the whole table.

    EF = (fraction of hits among the top `n` rows) / (fraction of hits overall),
    computed over the rows whose `score` is not NaN.

    Parameters
    ----------
    score_df : pandas.DataFrame
        Table holding the `score` column and the boolean `index` column.
    n : int
        Number of top-ranked rows to inspect.
    score : str
        Name of the column to rank by.
    index : str
        Name of the boolean column that marks hits.
    log : bool
        If True, print the base rate and the top-`n` rate.
    ascending : bool
        Sort direction; True ranks the smallest scores first.

    Returns
    -------
    float
        The enrichment factor, or NaN when it is undefined (no scored rows,
        no hits at all, or an empty top-`n` selection).
    """
    undefined = float('nan')

    df = score_df[score_df[score].notna()]
    if len(df) == 0:
        return undefined
    orig_prop = len(df[df[index]])/len(df)

    if log:
        print('orig proportion of {}: {:.3f}%'.format(index, orig_prop*100))

    sorted_df = df.sort_values(by=score, ascending=ascending).iloc[:n]
    # Without any hit in the table (or with an empty selection) the ratio is
    # undefined; report NaN instead of raising ZeroDivisionError.
    if orig_prop == 0 or len(sorted_df) == 0:
        return undefined

    n_hits = len(sorted_df[sorted_df[index]])
    new_prop = n_hits/len(sorted_df)
    if log:
        print('N = {}, n_hits = {}, new proportion of {}: {:.3f}%'.format(n, n_hits, index, new_prop*100))
    EF = new_prop/orig_prop
    return EF

def plot_EF(threshold):
    """Plot the enrichment factor EF(n) for the top-n ranked compounds.

    Rows of the module-level `df` with f_avg_IC50 strictly below `threshold` are
    flagged in df['active']; `enrichment` is then evaluated at a set of
    log-spaced cut-offs between 5 and 100.

    Parameters
    ----------
    threshold : float
        IC50 cutoff below which a compound counts as active.
    """
    # Vectorised flagging; a missing IC50 compares False and is therefore not active.
    df['active'] = df['f_avg_IC50'] < threshold

    # Log-spaced cut-offs truncated to integers and de-duplicated, so the x
    # positions plotted are exactly the n values that were evaluated.
    n_list = np.unique(np.logspace(start=np.log10(5), stop=2, num=20).astype(int)) # only check top 100
    EF = [enrichment(df, n=int(n), index='active') for n in n_list]

    orig_prop = len(df[df['active']])/len(df)

    plt.figure(dpi=300)
    plt.plot(n_list, EF, 'b-', label='All')
    plt.title(r'2-Body $EF(n)$ - Moonshot, RDKit confs')
    plt.legend(prop={'size': 8})
    # Reference line at EF = 1.
    plt.plot(n_list, np.ones_like(n_list), 'k:')
    extraticks = [1]
    plt.yticks(list(plt.yticks()[0]) + extraticks)
    plt.ylim(bottom=0)
    plt.xlabel('n')
    plt.ylabel('EF \n(base rate = {:.1f}%)'.format(orig_prop*100))
    plt.show()

# Slider over the IC50 cutoff: 0 to 100 in steps of 0.5.
interact(plot_EF, threshold=(0.0,100.0, 0.5));

interactive(children=(FloatSlider(value=50.0, description='threshold', step=0.5), Output()), _dom_classes=('wi…