In [1]:
from os import listdir
import pickle
import math
import random

import pandas as pd
import numpy as np
from tqdm import tqdm
from itertools import product

from rdkit import Chem
from rdkit.Chem import ChemicalFeatures, MolFromSmiles, MolToSmiles, AllChem
from rdkit import Geometry
from rdkit.Chem.Pharm3D import Pharmacophore, EmbedLib
from rdkit.RDPaths import RDDataDir
import os.path

from sklearn.neighbors import KernelDensity
from sklearn.model_selection import GridSearchCV
from scipy.stats import entropy
from scipy.spatial.distance import jensenshannon as jsd
from scipy.stats import ks_2samp

import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rcParams['figure.dpi'] = 100
%matplotlib inline

# Feature definitions come from the BaseFeatures.fdef file inside RDKit's data directory.
fdefFile = os.path.join(RDDataDir, 'BaseFeatures.fdef')
featFactory = ChemicalFeatures.BuildFeatureFactory(fdefFile)

# Restore the two pickled inputs used by the rest of the notebook.
# NOTE(review): pickle.load can execute arbitrary code; only open files from a trusted source.
with open('all_ligands.list', mode='rb') as ligand_file:
    frags = pickle.load(ligand_file)

with open('hits_new.list', mode='rb') as hit_file:
    hits = pickle.load(hit_file)
    
import mols2grid

# Build one RDKit molecule per fragment structure file and display them in a grid.
mols = []
smiles_list = []
name_list = []

# sorted(): os.listdir returns entries in arbitrary order, which made the grid
# order differ between machines/runs.
filenames = sorted(f for f in listdir('data/frags/') if f.endswith('0.pdb'))
df = pd.read_csv('data/frags/Mpro_hits_summary.csv')
df = df.set_index('Dataset')
for f in filenames:
    # The file name minus its last six characters is the key into the 'Dataset' index.
    dataname = f[:-6]

    # Single .loc[row, column] lookup instead of chained indexing.
    smiles = df.loc[dataname, 'Compound SMILES']
    mol = MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles returns None for unparsable SMILES; report and skip the
        # entry rather than failing later inside MolToSmiles.
        print('Skipping {}: could not parse SMILES {!r}'.format(dataname, smiles))
        continue

    mols.append(mol)
    smiles_list.append(MolToSmiles(mol))
    name_list.append(dataname)


mol_df = pd.DataFrame({"mol": mols,
                       "name": name_list,
                       "smiles": smiles_list})
mols2grid.display(mol_df,
                  mol_col='mol',
                  subset=['img', 'name'],
                  tooltip=["smiles"],
                  selection=True,
                  n_cols=5,
                  template='table')

In [2]:
from frag_funcs import return_random_dataframe, return_pcore_dataframe, get_pair_distances, get_trip_distances

In [3]:
# Number of repeats; sizes the per-repeat dictionary lists below.
n_rand = 10

# combo string ('A-B') -> stacked array returned by get_pair_distances
frag_pair_distance_dict = dict()
real_pair_dicts = [dict() for _ in range(n_rand)]
rand_pair_dicts = [dict() for _ in range(n_rand)]

interesting_pcores = ['Donor', 'Acceptor', 'Aromatic']

fragpcore_df = return_pcore_dataframe(frags, interesting_pcores)
hit_df = return_pcore_dataframe(hits, interesting_pcores)
# fragpcore_dfs = [return_pcore_dataframe(frags, interesting_pcores, jiggle=True) for i in range(n_rand)]
# rand_dfs = [return_random_dataframe(frags, interesting_pcores) for i in range(n_rand)]

# Every ordered pair of pharmacophore types, including A-A and both A-B / B-A.
for pcore_pair in tqdm(product(interesting_pcores, repeat=2)):
    core_a, core_b = pcore_pair
    combo = '-'.join(pcore_pair)

    pair_distances = get_pair_distances(fragpcore_df, core_a, core_b, frag=True)
    frag_pair_distance_dict[combo] = np.hstack(pair_distances)
#     for i,fragpcore_df in enumerate(fragpcore_dfs):
#         real_pair_dicts[i][combo] = np.hstack(get_pair_distances(fragpcore_df, core_a, core_b, frag=True))
        
#     for i,rand_df in enumerate(rand_dfs):
#         rand_pair_dicts[i][combo] = np.hstack(get_pair_distances(rand_df, core_a, core_b, frag=True))

# frag_trip_distance_dict = {}
# real_trip_dicts = [{} for i in range(n_rand)]  
# rand_trip_dicts = [{} for i in range(n_rand)]  

# for pcore_trip in tqdm(product(interesting_pcores,repeat=3)):
#     core_a, core_b, core_c = pcore_trip
#     combo = core_a+'-'+core_b+'-'+core_c
    
#     frag_trip_distance_dict[combo] = np.hstack(get_trip_distances(fragpcore_df, core_a, core_b, core_c, frag=True, active=False))
#     for i,fragpcore_df in enumerate(fragpcore_dfs):
#         real_trip_dicts[i][combo] = np.hstack(get_trip_distances(fragpcore_df, core_a, core_b, core_c, frag=True))
        
#     for i,rand_df in enumerate(rand_dfs):
#         rand_trip_dicts[i][combo] = np.hstack(get_trip_distances(rand_df, core_a, core_b, core_c, frag=True))
# #     rand_trip_distance_dict[combo] = np.hstack(get_trip_distances(rand_df, core_a, core_b, core_c, frag=True, active=False))

100%|██████████| 23/23 [00:00<00:00, 36.70it/s]
100%|██████████| 213/213 [00:06<00:00, 30.96it/s]
9it [00:00, 21.15it/s]


In [4]:
# Persist each distance collection to '<name>.pickle' in the working directory.
# The collections are looked up by name so that one which has not been built in
# this session (e.g. the triplet dictionaries, whose construction is commented
# out) is reported and skipped instead of aborting the cell with a NameError.
# NOTE(review): real_pair_dicts / rand_pair_dicts may still be lists of empty
# dicts if the loops that fill them were not run; writing them overwrites any
# earlier pickle of the same name — confirm that is intended.
distance_collections = (
    'frag_pair_distance_dict',
    'real_pair_dicts',
    'rand_pair_dicts',
    'frag_trip_distance_dict',
    'real_trip_dicts',
    'rand_trip_dicts',
)
for collection_name in distance_collections:
    if collection_name not in globals():
        print('Skipping {0}.pickle: {0} is not defined'.format(collection_name))
        continue
    with open(collection_name + '.pickle', 'wb') as handle:
        pickle.dump(globals()[collection_name], handle)
    


Check area under histograms

In [None]:
# for pcore_pair in tqdm(product(interesting_pcores,repeat=2), total=9):
#     core_a, core_b = pcore_pair
#     combo = core_a+'-'+core_b
# #     print(combo, np.sum(frag_pair_distance_dict[combo]))
#     print(combo, entropy(np.sort(frag_pair_distance_dict[combo]), np.sort(frag_pair_distance_dict[core_b+'-'+core_a])) )

# Print scipy's `entropy` of the stored array for every ordered triplet of types.
print('combo\tentropy')
triplets = product(interesting_pcores, repeat=3)
for pcore_trip in tqdm(triplets, total=27):
    combo = '-'.join(pcore_trip)
#     print(combo, '\t', np.sum(frag_trip_distance_dict[combo]))
    trip_entropy = entropy(frag_trip_distance_dict[combo])
    print(combo, '\t', trip_entropy)
# print(np.sum(frag_trip_distance_dict['Aromatic-Acceptor-Aromatic']))
# print(np.sum(frag_trip_distance_dict['Acceptor-Aromatic-Aromatic']))

# print(np.sum(frag_pair_distance_dict['Acceptor-Aromatic']))
# print(np.sum(frag_pair_distance_dict['Aromatic-Acceptor']))


### Generating KernelDensity Estimators

In [5]:
# Restore `kde_dict` and `rand_kde_dicts` from their pickle files.
# NOTE(review): pickle.load can execute arbitrary code; only open files from a trusted source.
with open('kde_dict.pickle', mode='rb') as kde_file:
    kde_dict = pickle.load(kde_file)

with open('rand_kde_dicts.pickle', mode='rb') as rand_kde_file:
    rand_kde_dicts = pickle.load(rand_kde_file)

In [5]:
# Reload the pair-distance dictionary from its pickle file.
with open('frag_pair_distance_dict.pickle', mode='rb') as distance_file:
    frag_pair_distance_dict = pickle.load(distance_file)

def fit_pair_kde(data, bandwidths=None, cv=None):
    """Fit a 1-D Gaussian KDE, picking the bandwidth by cross-validated grid search.

    Parameters
    ----------
    data : array-like
        Sample values; flattened into a single feature column before fitting.
    bandwidths : array-like, optional
        Candidate bandwidths to search. Defaults to ``np.logspace(-3, 3, 500)``,
        the grid that used to be hard-coded; pass a coarser grid for a faster fit.
    cv : optional
        Forwarded to ``GridSearchCV(cv=...)``; ``None`` keeps its default splitting.

    Returns
    -------
    KernelDensity
        ``best_estimator_`` of the grid search.

    Side effects
    ------------
    Prints the parameters of the selected estimator.
    """
    if bandwidths is None:
        bandwidths = np.logspace(-3, 3, 500)
    params = {'bandwidth': bandwidths}

    # np.asarray lets plain lists be passed as well as ndarrays.
    samples = np.asarray(data).reshape(-1, 1)

    grid = GridSearchCV(KernelDensity(kernel='gaussian'), params, cv=cv)
    grid.fit(samples)

    kde = grid.best_estimator_
    print(kde.get_params())
    return kde

# def fit_trip_kde(data):
# #     params = {'bandwidth': np.logspace(-3, 2, 20)}
    
# #     grid = GridSearchCV(KernelDensity(kernel='gaussian', rtol=1e-4), params)
# #     grid.fit(data.T)
    
# #     kde = grid.best_estimator_
#     kde = KernelDensity(kernel='gaussian', rtol=1e-4)
#     kde.fit(data.T)
# #     print(kde.get_params())
#     return kde

combo_list = []

# kde_dict = {}
# real_kde_dicts = [{} for i in range(n_rand)]
# rand_kde_dicts = [{} for i in range(n_rand)]

# Pair types to fit, split into two hand-picked groups.
important = ['Donor-Aromatic', 'Aromatic-Acceptor', 'Aromatic-Aromatic']
unimportant = ['Donor-Donor', 'Donor-Acceptor', 'Acceptor-Acceptor']

pairs = [*important, *unimportant]

# One grid-searched KDE per pair type, keyed by the combo string.
kde_dict_opt = dict()
for combo in tqdm(pairs):
    combo_distances = frag_pair_distance_dict[combo]
    kde_dict_opt[combo] = fit_pair_kde(combo_distances)
    
# with open('kde_dict_opt.pickle', 'wb') as handle:
#     pickle.dump(kde_dict_opt, handle)    
# for pcore_trip in tqdm(product(interesting_pcores,repeat=3), total=27):
#     core_a, core_b, core_c = pcore_trip
#     combo = core_a+'-'+core_b+'-'+core_c
#     combo_list.append(combo)
    
#     pair1 = core_a+'-'+core_b
#     pair2 = core_b+'-'+core_c
#     pair3 = core_c+'-'+core_a
    
#     if pair1 not in kde_dict:
#         kde_dict[pair1] = fit_pair_kde(frag_pair_distance_dict[pair1])
#         for i in range(n_rand):
# #             real_kde_dicts[i][pair1] = fit_pair_kde(real_pair_dicts[i][pair1])
#             rand_kde_dicts[i][pair1] = fit_pair_kde(rand_pair_dicts[i][pair1])
#     if pair2 not in kde_dict:
#         kde_dict[pair2] = fit_pair_kde(frag_pair_distance_dict[pair2])
#         for i in range(n_rand):
# #             real_kde_dicts[i][pair2] = fit_pair_kde(real_pair_dicts[i][pair2])
#             rand_kde_dicts[i][pair2] = fit_pair_kde(rand_pair_dicts[i][pair2])
#     if pair3 not in kde_dict:
#         kde_dict[pair3] = fit_pair_kde(frag_pair_distance_dict[pair3])
#         for i in range(n_rand):
# #             real_kde_dicts[i][pair3] = fit_pair_kde(real_pair_dicts[i][pair3])
#             rand_kde_dicts[i][pair3] = fit_pair_kde(rand_pair_dicts[i][pair3])
#     if combo not in kde_dict:
# #         print(frag_trip_distance_dict[combo].T.shape)
#         kde_dict[combo] = fit_trip_kde(frag_trip_distance_dict[combo])
#         for i in range(n_rand):
#             rand_kde_dicts[i][combo] = fit_trip_kde(rand_trip_dicts[i][combo])
            
# with open('kde_dict.pickle', 'wb') as handle:
#     pickle.dump(kde_dict, handle)        
# with open('rand_kde_dicts.pickle', 'wb') as handle:
#     pickle.dump(rand_kde_dicts, handle)        



 17%|█▋        | 1/6 [00:19<01:36, 19.32s/it]

{'algorithm': 'auto', 'atol': 0, 'bandwidth': 0.8353023195026779, 'breadth_first': True, 'kernel': 'gaussian', 'leaf_size': 40, 'metric': 'euclidean', 'metric_params': None, 'rtol': 0}


 33%|███▎      | 2/6 [00:49<01:29, 22.43s/it]

{'algorithm': 'auto', 'atol': 0, 'bandwidth': 0.42980012064907985, 'breadth_first': True, 'kernel': 'gaussian', 'leaf_size': 40, 'metric': 'euclidean', 'metric_params': None, 'rtol': 0}


 50%|█████     | 3/6 [00:59<00:56, 18.73s/it]

{'algorithm': 'auto', 'atol': 0, 'bandwidth': 0.050981760644204246, 'breadth_first': True, 'kernel': 'gaussian', 'leaf_size': 40, 'metric': 'euclidean', 'metric_params': None, 'rtol': 0}


 67%|██████▋   | 4/6 [01:28<00:44, 22.05s/it]

{'algorithm': 'auto', 'atol': 0, 'bandwidth': 0.006945177773823696, 'breadth_first': True, 'kernel': 'gaussian', 'leaf_size': 40, 'metric': 'euclidean', 'metric_params': None, 'rtol': 0}


 83%|████████▎ | 5/6 [02:17<00:30, 30.05s/it]

{'algorithm': 'auto', 'atol': 0, 'bandwidth': 0.5991769669310619, 'breadth_first': True, 'kernel': 'gaussian', 'leaf_size': 40, 'metric': 'euclidean', 'metric_params': None, 'rtol': 0}


100%|██████████| 6/6 [03:47<00:00, 37.89s/it]

{'algorithm': 'auto', 'atol': 0, 'bandwidth': 0.24705040554568256, 'breadth_first': True, 'kernel': 'gaussian', 'leaf_size': 40, 'metric': 'euclidean', 'metric_params': None, 'rtol': 0}





### TIMING SCORING

In [None]:
### Generate random ZINC conformer 26/02/2021
import random 

# Sample random ZINC SMILES and embed a single 3-D conformer for each of them.
# NOTE(review): absolute, machine-specific path — point this at your own copy of the data.
zinc_dir = '/rds-d2/user/wjm41/hpc-work/datasets/ZINC/purchasable'
# sep=r'\s+' is the documented replacement for the deprecated delim_whitespace=True.
zinc_smi = pd.read_csv(zinc_dir+'/zinc_subset.smi', sep=r'\s+')['smiles'].values


# NOTE(review): no random seed is set, so the sample differs between runs.
rand_smi = random.sample(list(zinc_smi), k=100)

# One slot per sampled SMILES; a slot stays None when the molecule fails below.
zinc_mols = [None]*len(rand_smi)
for i,smi in tqdm(enumerate(rand_smi), total=len(rand_smi)):
    try:
        mol = MolFromSmiles(smi)

        # conformer generation (plain EmbedMolecule, no constraints are passed)
        mol = Chem.AddHs(mol)
        AllChem.EmbedMolecule(mol)

        mol = Chem.RemoveHs(mol)

        # presumably raises when no conformer was embedded; caught by the except below
        conf = mol.GetConformer()
        # fetch the coordinate array once instead of once per atom
        positions = conf.GetPositions()

        # entry layout: [mol, [symbol, position], [symbol, position], ...]
        mol_data = [mol]
        for atom, position in zip(mol.GetAtoms(), positions):
            mol_data.append([atom.GetSymbol(), position])
        zinc_mols[i] = mol_data
    except Exception as ex:
        print(ex)
        print(str(i)+' failed')

# Counts every slot, including molecules that failed and were left as None.
print('Number of sampled molecules :{}'.format(len(zinc_mols)))

zinccore_df = return_pcore_dataframe(zinc_mols, interesting_pcores)

# Materialise the unique molecule ids once, so that zinc_pair[j] and zinc_trip[j]
# are guaranteed to describe the same molecule and both progress bars get the
# true number of iterations (which can be smaller than len(zinc_mols)).
zinc_mol_ids = list(set(zinccore_df['mol_id']))

# zinc_pair[j]: combo string -> get_pair_distances result for molecule zinc_mol_ids[j]
zinc_pair = [None]*len(zinc_mol_ids)
for j,i in tqdm(enumerate(zinc_mol_ids), total=len(zinc_mol_ids)):
    # rows of this molecule only; selected once instead of once per combo
    mol_pcores = zinccore_df[zinccore_df['mol_id']==i]
    zinc_pair_individual = {}

    for pcore_pair in product(interesting_pcores,repeat=2):
        core_a,core_b = pcore_pair
        combo = core_a+'-'+core_b
        zinc_pair_individual[combo]= get_pair_distances(mol_pcores, core_a, core_b, frag=False, active=None)
    zinc_pair[j] = zinc_pair_individual

# zinc_trip[j]: combo string -> get_trip_distances result for molecule zinc_mol_ids[j]
zinc_trip = [None]*len(zinc_mol_ids)
for j,i in tqdm(enumerate(zinc_mol_ids), total=len(zinc_mol_ids)):
    mol_pcores = zinccore_df[zinccore_df['mol_id']==i]
    zinc_trip_individual = {}

    for pcore_trip in product(interesting_pcores,repeat=3):
        core_a,core_b,core_c = pcore_trip
        combo = core_a+'-'+core_b+'-'+core_c
        zinc_trip_individual[combo]= get_trip_distances(mol_pcores, core_a, core_b, core_c, frag=False, active=False)
    zinc_trip[j] = zinc_trip_individual

In [None]:
import timeit

# (unit name, length of the unit in seconds), largest unit first.
intervals = (
    ('weeks', 604800),  # 60 * 60 * 24 * 7
    ('days', 86400),    # 60 * 60 * 24
    ('hours', 3600),    # 60 * 60
    ('minutes', 60),
    ('seconds', 1),
    )

def display_time(seconds, granularity=2):
    """Format a duration in seconds as a short human-readable string.

    Parameters
    ----------
    seconds : int or float
        Duration; any fractional part is dropped so that units are printed as
        whole numbers ('3 weeks' rather than '3.0 weeks').
    granularity : int, optional
        Maximum number of non-zero units to include, largest first.

    Returns
    -------
    str
        Comma-separated units from the module-level `intervals` table, e.g.
        '1 day, 1 hour'. Durations shorter than one second give '0 seconds'.
    """
    remaining = int(seconds)
    parts = []

    for name, count in intervals:
        value, remaining = divmod(remaining, count)
        if value:
            # singular unit name for exactly one
            unit = name.rstrip('s') if value == 1 else name
            parts.append("{} {}".format(value, unit))

    if not parts:
        return '0 seconds'
    return ', '.join(parts[:granularity])

def score_mol(kde_dict):
    """Score one randomly chosen ZINC molecule against the pair-distance KDEs.

    Reads the module-level `zinc_pair` and `interesting_pcores`.

    Parameters
    ----------
    kde_dict : mapping
        Maps each 'A-B' combo string to a fitted estimator exposing
        ``score_samples``. A missing combo raises KeyError.

    Returns
    -------
    float or int
        Sum over all ordered pairs of types of the mean density (exp of
        ``score_samples``) at the molecule's distances for that pair. Combos
        that cannot be scored contribute nothing; 0 if none could be scored.
    """
    # Draw the index from zinc_pair itself: it holds one entry per unique mol_id,
    # which can be fewer than len(zinc_mols), so indexing it with a draw over
    # zinc_mols could run past the end.
    choice = random.randrange(len(zinc_pair))

    pair_score = 0

    pair = zinc_pair[choice]
    for pcore_pair in product(interesting_pcores,repeat=2):
        core_a, core_b = pcore_pair
        combo = core_a+'-'+core_b
        kde_pair = kde_dict[combo]
        try:
            pair_score += np.mean(np.exp(kde_pair.score_samples(pair[combo][0].reshape(-1,1))))
        except Exception:
            # Best-effort scoring: skip combos that cannot be scored. Catching
            # Exception rather than using a bare except lets KeyboardInterrupt
            # stop a long timing run.
            continue
    return pair_score
#     for pcore_trip in product(interesting_pcores,repeat=3):
#         core_a, core_b, core_c = pcore_trip
#         combo = core_a+'-'+core_b+'-'+core_c
#         kde_trip = kde_dict[combo]
#         trip_score += np.mean(np.exp(kde_trip.score_samples(trip[combo][0].T)))
#     return pair_score, trip_score

# Time score_mol: 10 repeats of n calls each; each entry of timetaken covers n calls.
n = 100
timetaken = timeit.repeat(lambda: score_mol(kde_dict), number=n, repeat=10)
mean_batch_time = np.mean(timetaken)
print('Average time taken per mol: {:.3f}s'.format(mean_batch_time/n))
print('Estimate for 1B molecules: {}'.format(display_time(1e9*mean_batch_time/n)))

### Shannon Entropies

In [None]:
# Per pair type: entropy of the "real" and "random" KDE densities on a shared
# grid, plus the Jensen-Shannon distance between them, averaged over n_rand repeats.
# NOTE(review): real_kde_dicts is not created by any visible cell (its
# initialisation is commented out elsewhere) — confirm where it comes from.
combo_list = []

real_entropy = []
real_std = []

rand_entropy = []
rand_std = []

jsds = []

nx = 500

for pcore_pair in tqdm(product(interesting_pcores, repeat=2), total=9):
    combo = '-'.join(pcore_pair)
    combo_list.append(combo)

    # nx evenly spaced points from 0 to the largest stored distance, as a column
    x = np.linspace(0, np.amax(frag_pair_distance_dict[combo]), nx)
    grid = x[:, np.newaxis]

    entropy_list, rand_list, jsd_list = [], [], []

    for rep in range(n_rand):
        # Densities are normalised to sum to 1 over the grid; zero values are
        # not clipped beforehand.
        pair_dist = np.exp(real_kde_dicts[rep][combo].score_samples(grid)).ravel()
        pair_dist = pair_dist / pair_dist.sum()
        entropy_list.append(entropy(pair_dist))

        rand_dist = np.exp(rand_kde_dicts[rep][combo].score_samples(grid)).ravel()
        rand_dist = rand_dist / rand_dist.sum()
        rand_list.append(entropy(rand_dist))

        jsd_list.append(jsd(pair_dist, rand_dist))

    real_entropy.append(np.mean(entropy_list))
    real_std.append(np.std(entropy_list))

    rand_entropy.append(np.mean(rand_list))
    rand_std.append(np.std(rand_list))

    jsds.append(np.mean(jsd_list))
# excess_df.to_csv('entropy_df.csv', index=False)

In [None]:
# Collect the per-combo statistics into one table.
entropy_columns = ['combo','real_mean', 'real_std', 'rand_mean', 'rand_std', 'jsd']
entropy_rows = list(zip(combo_list, real_entropy, real_std, rand_entropy, rand_std, jsds))
entropy_df = pd.DataFrame(entropy_rows, columns=entropy_columns)

# 'sig' is True where |real_mean - rand_mean| is at least rand_std.
mean_gap = (entropy_df['real_mean'] - entropy_df['rand_mean']).abs()
entropy_df['sig'] = mean_gap >= entropy_df['rand_std']

print('n_rand = {}, nx = {}'.format(n_rand, nx))
rounding = {'real_mean':2, 'real_std':2, 'rand_mean':2, 'rand_std':2, 'jsd':3}
print(entropy_df.round(rounding))


In [None]:
from matplotlib.patches import Rectangle

# One figure per ordered pair of types: overlaid density histograms of the
# measured (orange) and random (grey) distances across all repeats.
for i, pair in enumerate(product(interesting_pcores,repeat=2)):
    combo = pair[0]+'-'+pair[1]
    fig, ax = plt.subplots(figsize=(8,8))
    # row i of entropy_df is assumed to match the i-th combo — TODO confirm the
    # table was built in the same product() order
    if entropy_df.iloc[i]['sig']:
        ax.set_title(combo+' distance histogram - SIGNIFICANT')
    else:
        ax.set_title(combo+' distance histogram')
    ax.set_xlabel('Distance (angstrom)')
    ax.set_ylabel('Relative Frequency')
#     ax.hist(frag_pair_distance_dict[combo], bins=30, alpha = 0.5, density=True, color='orange', label = 'Real')
    # separate index name so the outer `i` (row of entropy_df) is not overwritten
    for rep in range(n_rand):
        ax.hist(real_pair_dicts[rep][combo], bins=30, alpha = 0.1, density=True, color='orange')
        ax.hist(rand_pair_dicts[rep][combo], bins=30, alpha=0.1, density=True, color='grey')

    # proxy patches: the histograms themselves carry no labels
    legend_elements = [Rectangle((0,0), 1,1 , color='orange', label='Measured'),
                   Rectangle((0,0), 1,1,  color='grey', label='Random')]
    ax.legend(handles=legend_elements, loc='upper right')
    # plt.show() instead of fig.show(): Figure.show() targets GUI backends and
    # warns under the inline backend this notebook selects.
    plt.show()
#     except:
#         pass

### KL Divergences

In [None]:
import sys
# Compare each triplet KDE with the product of its three pair KDEs on a shared
# nx * ny * nz grid, via KL divergence and Jensen-Shannon distance.
np.set_printoptions(threshold=sys.maxsize)  # print arrays in full (global numpy setting)
combo_list = []
entropy_list = []
jsd_list = []

nx, ny, nz = 2, 2, 2

for pcore_trip in tqdm(product(interesting_pcores, repeat=3), total=27):
    core_a, core_b, core_c = pcore_trip
    combo = '-'.join(pcore_trip)
    combo_list.append(combo)

    pair1, pair2, pair3 = core_a+'-'+core_b, core_b+'-'+core_c, core_c+'-'+core_a
    kde_pair1, kde_pair2, kde_pair3 = (kde_dict[name] for name in (pair1, pair2, pair3))

    # each axis runs from 0 to the largest stored distance of its pair type
    x = np.linspace(0, np.amax(frag_pair_distance_dict[pair1]), nx)
    y = np.linspace(0, np.amax(frag_pair_distance_dict[pair2]), ny)
    z = np.linspace(0, np.amax(frag_pair_distance_dict[pair3]), nz)
    xv, yv, zv = np.meshgrid(x, y, z)

    # pair KDEs evaluated at every grid point's own coordinate, then multiplied
    x_score, y_score, z_score = (
        np.exp(kde.score_samples(mesh.reshape(-1, 1)))
        for kde, mesh in ((kde_pair1, xv), (kde_pair2, yv), (kde_pair3, zv))
    )
    pair_dist = x_score * y_score * z_score
    pair_dist = pair_dist / pair_dist.sum()

    # triplet KDE evaluated at the same grid points, one (x, y, z) row per point
    kde_trip = kde_dict[combo]
    xyz = np.vstack([xv.ravel(), yv.ravel(), zv.ravel()])
    trip_dist = np.exp(kde_trip.score_samples(xyz.T)).ravel()
    trip_dist = trip_dist / trip_dist.sum()

    # entropy(p, q) with two arguments is the KL divergence of p from q
    entropy_list.append(entropy(trip_dist, pair_dist))
    jsd_list.append(jsd(trip_dist, pair_dist))


excess_rows = list(zip(combo_list, entropy_list, jsd_list))
excess_df = pd.DataFrame(excess_rows, columns=['combo','KL Divergence', 'JS Distance'])
print('nx, ny, nz = {},{},{}'.format(nx, ny, nz))
print(excess_df.round({'KL Divergence': 2, 'JS Distance':3}))
# excess_df.to_csv('entropy_df.csv', index=False)

# excess_df = pd.DataFrame(list(zip(combo_list, entropy_list)), 
#                          columns = ['combo','Shannon Entropy'])
# print('nx, ny, nz = {},{},{}'.format(nx, ny, nz))
# print(excess_df)
# print(excess_df.round({'Shannon Entropy': 2}))



### 2-body vs random KS

In [6]:
from scipy.stats import ks_2samp

# Two-sample KS p-value between the measured pair KDE and each random pair KDE.
# NOTE(review): ks_2samp is given the normalised density values on the grid,
# not distance samples — confirm this is the intended comparison.
combo_list = []

pval_mean = []
pval_std = []

nx = 500

for pcore_pair in tqdm(product(interesting_pcores, repeat=2), total=9):
    combo = '-'.join(pcore_pair)
    combo_list.append(combo)

    kde_pair = kde_dict[combo]

    # nx evenly spaced points from 0 to the largest stored distance, as a column
    x = np.linspace(0, np.amax(frag_pair_distance_dict[combo]), nx)
    grid = x[:, np.newaxis]

    pair_dist = np.exp(kde_pair.score_samples(grid)).ravel()
    pair_dist = pair_dist / pair_dist.sum()

    pval_list = []
    for rep in range(n_rand):
        rand_dist = np.exp(rand_kde_dicts[rep][combo].score_samples(grid)).ravel()
        rand_dist = rand_dist / rand_dist.sum()

        pval_list.append(ks_2samp(pair_dist, rand_dist).pvalue)

    pval_mean.append(np.mean(pval_list))
    pval_std.append(np.std(pval_list))

pval_rows = list(zip(combo_list, pval_mean, pval_std))
pval_df = pd.DataFrame(pval_rows, columns=['combo','p-value mean', 'p-value std'])
print(pval_df.round({'p-value mean': 4, 'p-value std': 4}))

100%|██████████| 9/9 [00:02<00:00,  4.44it/s]



               combo  p-value mean  p-value std
0        Donor-Donor        0.0978       0.2300
1     Donor-Acceptor        0.0230       0.0318
2     Donor-Aromatic        0.0091       0.0183
3     Acceptor-Donor        0.0331       0.0762
4  Acceptor-Acceptor        0.0401       0.0391
5  Acceptor-Aromatic        0.0806       0.1063
6     Aromatic-Donor        0.0092       0.0138
7  Aromatic-Acceptor        0.0034       0.0049
8  Aromatic-Aromatic        0.0003       0.0004
               combo  p-value mean  p-value std
0        Donor-Donor        0.0978       0.2300
1     Donor-Acceptor        0.0230       0.0318
2     Donor-Aromatic        0.0091       0.0183
3     Acceptor-Donor        0.0331       0.0762
4  Acceptor-Acceptor        0.0401       0.0391
5  Acceptor-Aromatic        0.0806       0.1063
6     Aromatic-Donor        0.0092       0.0138
7  Aromatic-Acceptor        0.0034       0.0049
8  Aromatic-Aromatic        0.0003       0.0004


### 3-body vs random KS

In [14]:
# Two-sample KS p-value between the measured triplet KDE and each random
# triplet KDE, both evaluated on the same nx^3 grid.
# NOTE(review): ks_2samp is given the normalised density values on the grid,
# not distance samples — confirm this is the intended comparison.
combo_list = []

pval_mean = []
pval_std = []

nx = 4

for pcore_trip in tqdm(product(interesting_pcores, repeat=3), total=27):
    core_a, core_b, core_c = pcore_trip
    combo = '-'.join(pcore_trip)
    combo_list.append(combo)

    pair1, pair2, pair3 = core_a+'-'+core_b, core_b+'-'+core_c, core_c+'-'+core_a

    kde_trip = kde_dict[combo]

    # each axis runs from 0 to the largest stored distance of its pair type
    x = np.linspace(0, np.amax(frag_pair_distance_dict[pair1]), nx)
    y = np.linspace(0, np.amax(frag_pair_distance_dict[pair2]), nx)
    z = np.linspace(0, np.amax(frag_pair_distance_dict[pair3]), nx)

    xv, yv, zv = np.meshgrid(x, y, z)

    # one (x, y, z) row per grid point after transposing
    xyz = np.vstack([xv.ravel(), yv.ravel(), zv.ravel()])

    trip_dist = np.exp(kde_trip.score_samples(xyz.T)).ravel()
    trip_dist = trip_dist / trip_dist.sum()

    pval_list = []
    for rep in range(n_rand):
        rand_dist = np.exp(rand_kde_dicts[rep][combo].score_samples(xyz.T)).ravel()
        rand_dist = rand_dist / rand_dist.sum()

        pval_list.append(ks_2samp(trip_dist, rand_dist).pvalue)

    pval_mean.append(np.mean(pval_list))
    pval_std.append(np.std(pval_list))

pval_rows = list(zip(combo_list, pval_mean, pval_std))
pval_df = pd.DataFrame(pval_rows, columns=['combo','p-value mean', 'p-value std'])

pval_df.round({'p-value mean': 4, 'p-value std': 4})

100%|██████████| 27/27 [04:14<00:00,  9.42s/it]

                         combo  p-value mean  p-value std
0            Donor-Donor-Donor        0.5940       0.2705
1         Donor-Donor-Acceptor        0.7735       0.2116
2         Donor-Donor-Aromatic        0.5927       0.3584
3         Donor-Acceptor-Donor        0.7735       0.2116
4      Donor-Acceptor-Acceptor        0.6480       0.2932
5      Donor-Acceptor-Aromatic        0.8601       0.1510
6         Donor-Aromatic-Donor        0.5927       0.3584
7      Donor-Aromatic-Acceptor        0.8601       0.1510
8      Donor-Aromatic-Aromatic        0.4974       0.3415
9         Acceptor-Donor-Donor        0.7735       0.2116
10     Acceptor-Donor-Acceptor        0.6480       0.2932
11     Acceptor-Donor-Aromatic        0.8601       0.1510
12     Acceptor-Acceptor-Donor        0.6480       0.2932
13  Acceptor-Acceptor-Acceptor        0.2592       0.3109
14  Acceptor-Acceptor-Aromatic        0.6091       0.3555
15     Acceptor-Aromatic-Donor        0.8601       0.1510
16  Acceptor-A


100%|██████████| 27/27 [04:14<00:00,  9.42s/it]

                         combo  p-value mean  p-value std
0            Donor-Donor-Donor        0.5940       0.2705
1         Donor-Donor-Acceptor        0.7735       0.2116
2         Donor-Donor-Aromatic        0.5927       0.3584
3         Donor-Acceptor-Donor        0.7735       0.2116
4      Donor-Acceptor-Acceptor        0.6480       0.2932
5      Donor-Acceptor-Aromatic        0.8601       0.1510
6         Donor-Aromatic-Donor        0.5927       0.3584
7      Donor-Aromatic-Acceptor        0.8601       0.1510
8      Donor-Aromatic-Aromatic        0.4974       0.3415
9         Acceptor-Donor-Donor        0.7735       0.2116
10     Acceptor-Donor-Acceptor        0.6480       0.2932
11     Acceptor-Donor-Aromatic        0.8601       0.1510
12     Acceptor-Acceptor-Donor        0.6480       0.2932
13  Acceptor-Acceptor-Acceptor        0.2592       0.3109
14  Acceptor-Acceptor-Aromatic        0.6091       0.3555
15     Acceptor-Aromatic-Donor        0.8601       0.1510
16  Acceptor-A




### 2-body random vs 3-body random

In [15]:
# For every random repeat: KS p-value between the random triplet KDE and the
# product of the three random pair KDEs, on a shared nx^3 grid.
# NOTE(review): ks_2samp is given the normalised density values on the grid,
# not distance samples — confirm this is the intended comparison.
combo_list = []

pval_mean = []
pval_std = []

nx = 4

for pcore_trip in tqdm(product(interesting_pcores, repeat=3), total=27):
    core_a, core_b, core_c = pcore_trip
    combo = '-'.join(pcore_trip)
    combo_list.append(combo)

    pair1, pair2, pair3 = core_a+'-'+core_b, core_b+'-'+core_c, core_c+'-'+core_a

    # each axis runs from 0 to the largest stored distance of its pair type
    x = np.linspace(0, np.amax(frag_pair_distance_dict[pair1]), nx)
    y = np.linspace(0, np.amax(frag_pair_distance_dict[pair2]), nx)
    z = np.linspace(0, np.amax(frag_pair_distance_dict[pair3]), nx)

    xv, yv, zv = np.meshgrid(x, y, z)

    xyz = np.vstack([xv.ravel(), yv.ravel(), zv.ravel()])

    pval_list = []
    for rep in range(n_rand):
        rand_kdes = rand_kde_dicts[rep]
        kde_pair1, kde_pair2, kde_pair3 = (rand_kdes[name] for name in (pair1, pair2, pair3))

        # pair KDEs evaluated at every grid point's own coordinate, then multiplied
        x_score = np.exp(kde_pair1.score_samples(xv.reshape(-1, 1)))
        y_score = np.exp(kde_pair2.score_samples(yv.reshape(-1, 1)))
        z_score = np.exp(kde_pair3.score_samples(zv.reshape(-1, 1)))

        pair_dist = x_score * y_score * z_score
        pair_dist = pair_dist / pair_dist.sum()

        kde_rand = rand_kdes[combo]
        trip_dist = np.exp(kde_rand.score_samples(xyz.T)).ravel()
        trip_dist = trip_dist / trip_dist.sum()

        pval_list.append(ks_2samp(trip_dist, pair_dist).pvalue)

    pval_mean.append(np.mean(pval_list))
    pval_std.append(np.std(pval_list))

pval_rows = list(zip(combo_list, pval_mean, pval_std))
pval_df = pd.DataFrame(pval_rows, columns=['combo','p-value mean', 'p-value std'])

pval_df.round({'p-value mean': 4, 'p-value std': 4})

100%|██████████| 27/27 [03:49<00:00,  8.51s/it]



Unnamed: 0,combo,p-value mean,p-value std
0,Donor-Donor-Donor,0.05,0.0779
1,Donor-Donor-Acceptor,0.3571,0.302
2,Donor-Donor-Aromatic,0.3036,0.3325
3,Donor-Acceptor-Donor,0.3571,0.302
4,Donor-Acceptor-Acceptor,0.1792,0.2652
5,Donor-Acceptor-Aromatic,0.0766,0.0585
6,Donor-Aromatic-Donor,0.3036,0.3325
7,Donor-Aromatic-Acceptor,0.0994,0.0576
8,Donor-Aromatic-Aromatic,0.3772,0.2317
9,Acceptor-Donor-Donor,0.3571,0.302


Unnamed: 0,combo,p-value mean,p-value std
0,Donor-Donor-Donor,0.05,0.0779
1,Donor-Donor-Acceptor,0.3571,0.302
2,Donor-Donor-Aromatic,0.3036,0.3325
3,Donor-Acceptor-Donor,0.3571,0.302
4,Donor-Acceptor-Acceptor,0.1792,0.2652
5,Donor-Acceptor-Aromatic,0.0766,0.0585
6,Donor-Aromatic-Donor,0.3036,0.3325
7,Donor-Aromatic-Acceptor,0.0994,0.0576
8,Donor-Aromatic-Aromatic,0.3772,0.2317
9,Acceptor-Donor-Donor,0.3571,0.302


### 3-body vs 2-body

In [18]:
# KS p-value between the measured triplet KDE and the product of the three
# measured pair KDEs, on a shared nx^3 grid.
# NOTE(review): ks_2samp is given the normalised density values on the grid,
# not distance samples — confirm this is the intended comparison.
combo_list = []

pval_list = []

nx = 4

for pcore_trip in tqdm(product(interesting_pcores, repeat=3), total=27):
    core_a, core_b, core_c = pcore_trip
    combo = '-'.join(pcore_trip)
    combo_list.append(combo)

    pair1, pair2, pair3 = core_a+'-'+core_b, core_b+'-'+core_c, core_c+'-'+core_a

    # each axis runs from 0 to the largest stored distance of its pair type
    x = np.linspace(0, np.amax(frag_pair_distance_dict[pair1]), nx)
    y = np.linspace(0, np.amax(frag_pair_distance_dict[pair2]), nx)
    z = np.linspace(0, np.amax(frag_pair_distance_dict[pair3]), nx)

    xv, yv, zv = np.meshgrid(x, y, z)

    xyz = np.vstack([xv.ravel(), yv.ravel(), zv.ravel()])

    kde_pair1, kde_pair2, kde_pair3 = (kde_dict[name] for name in (pair1, pair2, pair3))

    # pair KDEs evaluated at every grid point's own coordinate, then multiplied
    x_score = np.exp(kde_pair1.score_samples(xv.reshape(-1, 1)))
    y_score = np.exp(kde_pair2.score_samples(yv.reshape(-1, 1)))
    z_score = np.exp(kde_pair3.score_samples(zv.reshape(-1, 1)))

    pair_dist = x_score * y_score * z_score
    pair_dist = pair_dist / pair_dist.sum()

    kde_trip = kde_dict[combo]
    trip_dist = np.exp(kde_trip.score_samples(xyz.T)).ravel()
    trip_dist = trip_dist / trip_dist.sum()

    pval_list.append(ks_2samp(trip_dist, pair_dist).pvalue)

pval_rows = list(zip(combo_list, pval_list))
pval_df = pd.DataFrame(pval_rows, columns=['combo','p-value'])

pval_df.round({'p-value': 4})

100%|██████████| 27/27 [00:36<00:00,  1.34s/it]
100%|██████████| 27/27 [00:36<00:00,  1.34s/it]


Unnamed: 0,combo,p-value
0,Donor-Donor-Donor,0.0005
1,Donor-Donor-Acceptor,0.418
2,Donor-Donor-Aromatic,0.3027
3,Donor-Acceptor-Donor,0.418
4,Donor-Acceptor-Acceptor,0.0362
5,Donor-Acceptor-Aromatic,0.2115
6,Donor-Aromatic-Donor,0.3027
7,Donor-Aromatic-Acceptor,0.2115
8,Donor-Aromatic-Aromatic,0.418
9,Acceptor-Donor-Donor,0.418


Unnamed: 0,combo,p-value
0,Donor-Donor-Donor,0.0005
1,Donor-Donor-Acceptor,0.418
2,Donor-Donor-Aromatic,0.3027
3,Donor-Acceptor-Donor,0.418
4,Donor-Acceptor-Acceptor,0.0362
5,Donor-Acceptor-Aromatic,0.2115
6,Donor-Aromatic-Donor,0.3027
7,Donor-Aromatic-Acceptor,0.2115
8,Donor-Aromatic-Aromatic,0.418
9,Acceptor-Donor-Donor,0.418


In [17]:
# Dump the raw p-values collected in pval_list.
# NOTE(review): the saved output holds a single value and this cell's execution
# count (17) is lower than that of the "3-body vs 2-body" cell (18), so the
# output looks stale — re-run after that cell to confirm.
print(pval_list)

[9.562807928334129e-05]
[9.562807928334129e-05]
