# Enrichment from FRESCO
7th June 2022 

This notebook is for comparing the enrichment between FRESCO models trained on X-ray fragment-protein complexes vs docked ones

### Load docking conformations

In [1]:
from rdkit.Chem import PandasTools

sdfFile = '/home/wjm41/ml_physics/frag-pcore-screen/data/COVID_Moonshot_activity data_2021-03-22_noncovalent_docked.sdf'
df_docking = PandasTools.LoadSDF(
    sdfFile, idName='canonical_CID', smilesName='SMILES', molColName='mol')


INFO:rdkit:Enabling RDKit 2022.03.1 jupyter extensions


In [2]:
from distutils.util import strtobool

df_docking = df_docking.dropna(subset=['acrylamide'])
df_docking['acrylamide'] = df_docking['acrylamide'].apply(
    strtobool).astype(bool)
df_docking['chloroacetamide'] = df_docking['chloroacetamide'].apply(
    strtobool).astype(bool)

df_docking = df_docking.query('~chloroacetamide & ~acrylamide')


Generate pharmacophore dataframe from conformers
parse conformers

In [3]:
from fresco.featurise import return_pcore_dataframe_for_list_of_mols

moonshot_pcore_dataframe = return_pcore_dataframe_for_list_of_mols(df_docking['mol'].values)
moonshot_pcore_dataframe


100%|██████████| 2538/2538 [01:02<00:00, 40.92it/s]


Unnamed: 0,pcore,smiles,mol_id,coord_x,coord_y,coord_z
0,Donor,C[C@@H]1CN(C)CCN1C(=O)C[C@@]1(C(=O)Nc2cncc3ccc...,0,8.706100,-3.282500,24.817800
0,Donor,C[C@@H]1CN(C)CCN1C(=O)C[C@@]1(C(=O)Nc2cncc3ccc...,0,6.785400,-4.799400,26.467600
0,Donor,C[C@@H]1CN(C)CCN1C(=O)C[C@@]1(C(=O)Nc2cncc3ccc...,0,7.780500,-0.126000,21.171800
0,Acceptor,C[C@@H]1CN(C)CCN1C(=O)C[C@@]1(C(=O)Nc2cncc3ccc...,0,6.594600,0.190600,17.701700
0,Acceptor,C[C@@H]1CN(C)CCN1C(=O)C[C@@]1(C(=O)Nc2cncc3ccc...,0,9.923300,0.106100,20.227800
...,...,...,...,...,...,...
0,Acceptor,O=C(NCCc1cccc(F)c1)[C@H](c1cccnc1)N(C(=O)c1coc...,2537,5.232900,-7.223200,27.364600
0,Aromatic,O=C(NCCc1cccc(F)c1)[C@H](c1cccnc1)N(C(=O)c1coc...,2537,11.660640,0.511120,24.741700
0,Aromatic,O=C(NCCc1cccc(F)c1)[C@H](c1cccnc1)N(C(=O)c1coc...,2537,6.833800,-5.037117,27.731300
0,Aromatic,O=C(NCCc1cccc(F)c1)[C@H](c1cccnc1)N(C(=O)c1coc...,2537,5.819533,-0.802667,24.718817


Generate pharmacophore 2-body distribution

In [4]:
from fresco.featurise import return_default_pharmacophore_pairs, calculate_pairwise_distances_between_pharmacophores_for_a_single_ligand

pcore_pairs = return_default_pharmacophore_pairs()

moonshot_pair_distributions = []
for index in tqdm(moonshot_pcore_dataframe.mol_id.unique()):
    pair_distribution_for_this_ligand = {}
    
    for pcore_pair in pcore_pairs:
        core_a,core_b = pcore_pair.split('-')
        pair_distribution_for_this_ligand[pcore_pair] = calculate_pairwise_distances_between_pharmacophores_for_a_single_ligand(
            moonshot_pcore_dataframe.query('mol_id == @index'), core_a, core_b)

    moonshot_pair_distributions.append(pair_distribution_for_this_ligand)


100%|██████████| 2538/2538 [01:13<00:00, 34.36it/s]


Load fragment KDEs

In [15]:
import dill as pickle
from fresco.model import load_kde_model

pickle_dir = '/home/wjm41/ml_physics/frag-pcore-screen/data/EnamineREAL/pickles/'
xray_kde_mpro = 'kde_dict_spl_mpro.pickle'
xray_kde_dict = pickle.load(open(pickle_dir+xray_kde_mpro, 'rb'))

docked_kde_mpro = '/home/wjm41/ml_physics/frag-pcore-screen/data/kde_on_docked_mpro_frags.pkl'
docked_kde_dict = load_kde_model(docked_kde_mpro)
docked_5_kde_mpro = '/home/wjm41/ml_physics/frag-pcore-screen/data/kde_on_5_docked_mpro_frags.pkl'
docked_5_kde_dict = load_kde_model(docked_5_kde_mpro)


Score moonshot mols with KDEs

In [16]:
import numpy as np
import pandas as pd
from fresco.model import score_dist

def score_with_kde(kde_model, col_name):
    
    df_docking[col_name] = -100.0
    for i, pair_dist in enumerate(moonshot_pair_distributions):
        score_df_for_this_molecule = pd.DataFrame(columns=pcore_pairs)
        
        for pcore_combination in pcore_pairs:
            kde_for_this_combination = kde_model[pcore_combination]
            # print(pair_dist[pcore_combination])
            pcore_dist = pair_dist[pcore_combination].reshape(
                -1, 1)
            pcore_score = score_dist(kde_for_this_combination, pcore_dist)
            score_df_for_this_molecule.at[0, pcore_combination] = pcore_score

        scores = score_df_for_this_molecule[pcore_pairs].to_numpy().astype(
            float)
        scores[np.isnan(scores)] = -100
        processed_score_for_this_molecule = np.nanmean(scores)
        df_docking.at[i, col_name] = processed_score_for_this_molecule
        
score_with_kde(xray_kde_dict, 'xray_fresco_score')
score_with_kde(docked_kde_dict, 'docked_fresco_score')
score_with_kde(docked_5_kde_dict, 'docked_5_fresco_score')
df_docking


Unnamed: 0,canonical_CID,Chemgauss4 Score,f_avg_IC50,xray_fresco_score,docked_fresco_score,docked_5_fresco_score
1,EDG-MED-ba1ac7b9-15,-12.6986,6.177160,-2.899725,-2.914773,-3.126699
2,MAT-POS-9ff17035-2,-12.4300,1.975630,-2.803965,-2.569468,-2.801919
3,EDJ-MED-8c98ee63-2,-12.3919,0.802811,-3.373098,-3.000755,-3.495939
4,ALP-POS-64a710fa-1,-12.3789,9.861130,-3.023161,-2.900700,-3.012714
5,BAR-COM-4e090d3a-3,-12.3516,,-2.959190,-6.074448,-3.451235
...,...,...,...,...,...,...
2307,,,,-3.253183,-2.838171,-3.291154
2329,,,,-3.079938,-2.833808,-3.208806
2340,,,,-2.864958,-2.520912,-2.787972
2354,,,,-3.064349,-2.826723,-3.200574


Average fragment scores over conformers

In [17]:
df_docking.f_avg_IC50 = df_docking.f_avg_IC50.astype(float)
df_docking['Chemgauss4 Score'] = df_docking['Chemgauss4 Score'].astype(float)

fresco_scores = ['xray_fresco_score', 'docked_fresco_score', 'docked_5_fresco_score']

columns_to_keep = ['canonical_CID',
                   'Chemgauss4 Score', 'f_avg_IC50'] + fresco_scores
df_docking = df_docking[columns_to_keep]
df_docking_grouped = df_docking.groupby(by=df_docking.canonical_CID).mean()
df_docking_grouped['canonical_CID'] = df_docking_grouped.index

df_docking_grouped['hit'] = df_docking_grouped['f_avg_IC50'] < 5
print(df_docking_grouped[['hit', 'f_avg_IC50',
      'Chemgauss4 Score'] + fresco_scores])


                       hit  f_avg_IC50  Chemgauss4 Score  xray_fresco_score  \
canonical_CID                                                                 
AAR-POS-0daf6b7e-23  False         NaN          -5.71945          -4.103888   
AAR-POS-5507155c-1   False     99.5000          -5.24347          -3.116271   
AAR-POS-8a4e0f60-1   False         NaN          -9.23545          -3.203560   
AAR-POS-8a4e0f60-10  False         NaN          -6.75070          -2.887218   
AAR-POS-8a4e0f60-2   False         NaN          -9.57557          -2.885707   
...                    ...         ...               ...                ...   
WIL-MOD-03b86a88-2   False     19.0909          -8.93996          -3.490884   
WIL-MOD-03b86a88-4   False     15.9534          -9.98616          -2.909079   
WIL-MOD-03b86a88-5   False     21.7972          -9.32327          -3.048949   
WIL-MOD-03b86a88-6   False     11.2566          -9.12435          -3.073355   
WIL-UNI-2e73223c-4   False     99.5000          -9.2

Getting submission dates

In [21]:
dateFile = '/home/wjm41/ml_physics/frag-pcore-screen/data/20220122_moonshot_submissions_data_for_alpha.csv'
df_date = pd.read_csv(dateFile)

def remove_suffix(id):
    id_separated = id.split('-')[:-1]
    new_id = '-'.join(id_separated)
    return new_id


df_docking_grouped['submission_id'] = df_docking_grouped['canonical_CID'].apply(remove_suffix)
df_merged = df_docking_grouped.merge(df_date, on='submission_id')
df_merged['Neg Dock Score'] = df_merged['Chemgauss4 Score']*-1


Enrichment against time

In [27]:
from ipywidgets import interact
from scipy.stats import spearmanr as spear
import plotly.express as px

from fresco.frag_funcs import calculate_enrichment_for_df

def return_enrichment_df(df_to_calculate, col_name, method_name, n_list):
    enrichment_list = [calculate_enrichment_for_df(
        df_to_calculate, n=int(n), index='hit', score=col_name, ascending=False) for n in n_list]
    
    df_ef = pd.DataFrame(list(zip(n_list, enrichment_list)), columns=[
        'N', 'EF'])
    df_ef['Method'] = method_name
    return df_ef


def plotly_enrichment_docking_vs_fresco(df_to_plot, title='Not Specified!'):
    n_list = np.logspace(start=np.log10(5), stop=2, num=20)
    
    orig_prop = 100*len(
        df_to_plot[df_to_plot['hit']])/len(df_to_plot)
    col_and_method_names = {'Chemgauss4 Score': 'Docking',
                            'xray_fresco_score': 'Fresco - Xray',
                            'docked_fresco_score': 'Fresco - Docked',
                            'docked_5_fresco_score': 'Fresco - Docked 5'}
    
    df_enrichment = []
    for col, method in col_and_method_names.items():
        df_ef = return_enrichment_df(df_to_plot, col, method, n_list)
        df_enrichment.append(df_ef)
    df_enrichment = pd.concat(df_enrichment)
    
    df_enrichment['hit rate'] = df_enrichment['EF']*orig_prop
    fig_hit = px.line(df_enrichment, x='N', y='hit rate',
                     color='Method')

    fig_hit.add_shape(
        type="line", line=dict(dash='dash'),
        x0=min(n_list), y0=orig_prop,
        x1=max(n_list), y1=orig_prop)
    
    fig_hit.update_layout(
        title_text=f"Enrichment - {title}",
        xaxis_title='N',
        yaxis_title=f'Hit rate <br>n = {len(df_to_plot)} <br> base rate = {orig_prop:.1f}%',
        width=800,
    )

    fig_hit.show()


def plot_enrichment_against_time(year, month, day, ic50_threshold):
    date_to_filter = int(f'{year}{month}{day}')
    df_filtered_by_date = df_merged.query('date < @date_to_filter').copy()
    df_filtered_by_date['hit'] = df_filtered_by_date['f_avg_IC50'] < ic50_threshold
    
    plotly_enrichment_docking_vs_fresco(df_filtered_by_date,
                                        title=f'IC50 < {ic50_threshold}uM<br>COVID Moonshot submissions before {day}-{month}-{year}')
    return


years_to_choose_from = ['2021', '2020']
months_to_choose_from = [f'0{x}' for x in range(
    1, 10)] + [f'{x}' for x in range(10, 13)]
days_to_choose_from = [f'0{x}' for x in range(
    1, 10)] + [f'{x}' for x in range(10, 31)]
interact(plot_enrichment_against_time,
         year=years_to_choose_from, month=months_to_choose_from, day=days_to_choose_from, ic50_threshold=(0.0, 100.0))


interactive(children=(Dropdown(description='year', options=('2021', '2020'), value='2021'), Dropdown(descripti…

<function __main__.plot_enrichment_against_time(year, month, day, ic50_threshold)>