# Enrichment from FRESCO
5th Apr 2022 

This notebook is for calculating the enrichment from FRESCO by parsing the same `.sdf` file as used for docking.

### Load dockign conformations

In [1]:
from rdkit.Chem import PandasTools

sdfFile = '/home/wjm41/ml_physics/frag-pcore-screen/data/COVID_Moonshot_activity data_2021-03-22_noncovalent_docked.sdf'
df_docking = PandasTools.LoadSDF(
    sdfFile, idName='canonical_CID', smilesName='SMILES', molColName='mol')
print(df_docking)


INFO:rdkit:Enabling RDKit 2020.09.1 jupyter extensions


                                          canonical_CID  \
0     Docking receptor of MPRO-X2908_0A_BOUND(A) > L...   
1                                   EDG-MED-ba1ac7b9-15   
2                                    MAT-POS-9ff17035-2   
3                                    EDJ-MED-8c98ee63-2   
4                                    ALP-POS-64a710fa-1   
...                                                 ...   
2654                                ALP-POS-305f6ec3-52   
2655                                ALP-POS-88a7a97e-23   
2656                                ALP-POS-b0bc6a46-27   
2657                                BRU-THA-92256091-77   
2658                                ALP-POS-ced8ea4d-30   

                                                 SMILES  \
0     CCC(C)C(NC(=O)CNC(=O)C(NC(=O)C(CCC(N)=O)NC(=O)...   
1     C[C@@H]1CN(C)CCN1C(=O)C[C@@]1(C(=O)Nc2cncc3ccc...   
2     O=C(Cc1cc(Cl)cc(Oc2cccc(=O)[nH]2)c1)Nc1cncc2cc...   
3     Cn1ccc(CNC[C@@]2(C(=O)Nc3cncc4ccccc34)CCOc3ccc...

In [2]:
from distutils.util import strtobool

df_docking = df_docking.dropna(subset=['acrylamide'])
df_docking['acrylamide'] = df_docking['acrylamide'].apply(
    strtobool).astype(bool)
df_docking['chloroacetamide'] = df_docking['chloroacetamide'].apply(
    strtobool).astype(bool)

df_docking = df_docking.query('~chloroacetamide & ~acrylamide')


parse conformers

In [3]:
moonshot_docking_confomers = []
for mol in df_docking['mol'].values:
    conformer = mol.GetConformer()
    
    data_for_one_molecule = [mol]
    for j, atom in enumerate(mol.GetAtoms()):
        data_for_one_molecule.append([atom.GetSymbol(),
                         conformer.GetPositions()[j]
                         ])
    moonshot_docking_confomers.append(data_for_one_molecule)


Generate pharmacophore dataframe from conformers

In [4]:
from fresco.frag_funcs import return_pcore_dataframe

interesting_pcores = ['Donor', 'Acceptor', 'Aromatic']

moonshot_pcore_dataframe = return_pcore_dataframe(
    moonshot_docking_confomers, interesting_pcores, hit=False)

100%|██████████| 2538/2538 [00:37<00:00, 68.33it/s]


In [5]:
print(moonshot_pcore_dataframe)


          pcore                                             smiles  mol_id  \
0         Donor  C[C@@H]1CN(C)CCN1C(=O)C[C@@]1(C(=O)Nc2cncc3ccc...       0   
1         Donor  C[C@@H]1CN(C)CCN1C(=O)C[C@@]1(C(=O)Nc2cncc3ccc...       0   
2         Donor  C[C@@H]1CN(C)CCN1C(=O)C[C@@]1(C(=O)Nc2cncc3ccc...       0   
3      Acceptor  C[C@@H]1CN(C)CCN1C(=O)C[C@@]1(C(=O)Nc2cncc3ccc...       0   
4      Acceptor  C[C@@H]1CN(C)CCN1C(=O)C[C@@]1(C(=O)Nc2cncc3ccc...       0   
...         ...                                                ...     ...   
21710  Acceptor  O=C(NCCc1cccc(F)c1)[C@H](c1cccnc1)N(C(=O)c1coc...    2537   
21711  Aromatic  O=C(NCCc1cccc(F)c1)[C@H](c1cccnc1)N(C(=O)c1coc...    2537   
21712  Aromatic  O=C(NCCc1cccc(F)c1)[C@H](c1cccnc1)N(C(=O)c1coc...    2537   
21713  Aromatic  O=C(NCCc1cccc(F)c1)[C@H](c1cccnc1)N(C(=O)c1coc...    2537   
21714  Aromatic  O=C(NCCc1cccc(F)c1)[C@H](c1cccnc1)N(C(=O)c1coc...    2537   

         coord_x   coord_y    coord_z  frag  active  IC50  
0  

Generate pharmacophore 2-body distribution

In [6]:
from fresco.frag_funcs import get_pair_distances
from itertools import product

moonshot_pair_distributions = [None]*len(set(moonshot_pcore_dataframe['mol_id']))
for j, i in tqdm(enumerate(set(moonshot_pcore_dataframe['mol_id'])), total=len(moonshot_pair_distributions)):
    pcore_pair_distribution = {}

    for pcore_pair in product(interesting_pcores, repeat=2):
        core_a, core_b = pcore_pair
        combo = core_a+'-'+core_b
        pcore_pair_distribution[combo] = get_pair_distances(
            moonshot_pcore_dataframe[moonshot_pcore_dataframe['mol_id'] == i], core_a, core_b, frag=False, active=None)
    moonshot_pair_distributions[j] = pcore_pair_distribution


100%|██████████| 2538/2538 [00:48<00:00, 52.33it/s]


Load fragment KDEs

In [7]:
import pickle

pickle_dir = '/home/wjm41/ml_physics/frag-pcore-screen/data/EnamineREAL/pickles/'
kde_mpro = 'kde_dict_spl_mpro.pickle'
kde_dict = pickle.load(open(pickle_dir+kde_mpro, 'rb'))


Score moonshot mols with KDEs

In [8]:
import numpy as np
from fresco.frag_funcs import score_dist

pcore_combinations = ['Donor-Aromatic',
                      'Donor-Acceptor',
                      'Aromatic-Aromatic',
                      'Donor-Donor',
                      'Aromatic-Acceptor',
                      'Acceptor-Acceptor']

moonshot_scores = {}
df_docking['mean_frag_score'] = -100.0
for i, pair_dist in enumerate(moonshot_pair_distributions):
    score_df_for_this_molecule = pd.DataFrame(columns=pcore_combinations)
    for pcore_combination in pcore_combinations:
        kde_for_this_combination = kde_dict[pcore_combination]
        pcore_dist = pair_dist[pcore_combination][0].reshape(
            -1, 1)
        pcore_score = score_dist(kde_for_this_combination, pcore_dist)
        score_df_for_this_molecule.at[0, pcore_combination] = pcore_score

    scores = score_df_for_this_molecule[pcore_combinations].to_numpy().astype(
        float)
    scores[np.isnan(scores)] = -100
    processed_score_for_this_molecule = np.nanmean(scores)
    df_docking.at[i, 'mean_frag_score'] = processed_score_for_this_molecule


Average fragment scores over conformers

In [9]:
df_docking.f_avg_IC50 = df_docking.f_avg_IC50.astype(float)
df_docking['Chemgauss4 Score'] = df_docking['Chemgauss4 Score'].astype(float)

columns_to_keep = ['canonical_CID', 'Chemgauss4 Score', 'f_avg_IC50', 'mean_frag_score']
df_docking = df_docking[columns_to_keep]
df_docking_grouped = df_docking.groupby(by=df_docking.canonical_CID).mean()
df_docking_grouped['canonical_CID'] = df_docking_grouped.index

df_docking_grouped['hit'] = df_docking_grouped['f_avg_IC50'] < 5
print(df_docking_grouped[['hit', 'f_avg_IC50', 'Chemgauss4 Score', 'mean_frag_score']])


                       hit  f_avg_IC50  Chemgauss4 Score  mean_frag_score
canonical_CID                                                            
AAR-POS-0daf6b7e-23  False         NaN          -5.71945        -4.103888
AAR-POS-5507155c-1   False     99.5000          -5.24347        -3.116271
AAR-POS-8a4e0f60-1   False         NaN          -9.23545        -3.203560
AAR-POS-8a4e0f60-10  False         NaN          -6.75070        -2.887218
AAR-POS-8a4e0f60-2   False         NaN          -9.57557        -2.885707
...                    ...         ...               ...              ...
WIL-MOD-03b86a88-2   False     19.0909          -8.93996        -3.490884
WIL-MOD-03b86a88-4   False     15.9534          -9.98616        -2.909079
WIL-MOD-03b86a88-5   False     21.7972          -9.32327        -3.048949
WIL-MOD-03b86a88-6   False     11.2566          -9.12435        -3.073355
WIL-UNI-2e73223c-4   False     99.5000          -9.22982        -3.024804

[1585 rows x 4 columns]


In [30]:
import plotly.express as px
from plotly.subplots import make_subplots
import numpy as np
from fresco.frag_funcs import calculate_enrichment_for_df
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(rc={"figure.dpi": 150})



def plotly_enrichment_docking_vs_fresco(df_to_plot, title='Not Specified!'):
    n_list = np.logspace(start=np.log10(5), stop=2, num=20)
    EF_docking = [calculate_enrichment_for_df(
        df_to_plot, n=int(n), index='hit', score='Chemgauss4 Score', ascending=True) for n in n_list]
    EF_fresco = [calculate_enrichment_for_df(
        df_to_plot, n=int(n), index='hit', score='mean_frag_score', ascending=False) for n in n_list]
    orig_prop = len(
        df_to_plot[df_to_plot['hit']])/len(df_to_plot)

    df_dock_ef = pd.DataFrame(list(zip(n_list, EF_docking)), columns=['N', 'EF'])
    df_dock_ef['Method'] = 'Docking'
    df_fresco_ef = pd.DataFrame(list(zip(n_list, EF_fresco)), columns=[
                                 'N', 'EF'])
    df_fresco_ef['Method'] = 'Fresco'
    df_enrichment = pd.concat([df_dock_ef, df_fresco_ef])
    
    fig_subplot = make_subplots(rows=1, cols=2, subplot_titles=['Enrichment', 'IC50 Distribution'])
    fig_EF = px.line(df_enrichment, x='N', y='EF',
                     color='Method')

    fig_strip = px.strip(df_to_plot,  # sorting so that the colorbar is sorted!
                         x='hit',
                         y='f_avg_IC50',
                         color='hit',
                         hover_name='canonical_CID')
    for data in fig_EF.data:
        fig_subplot.add_trace(data, row=1, col=1)
    fig_subplot.add_trace(data, row=1, col=1)
    fig_subplot.add_shape( row=1, col=1,
        type="line", line=dict(dash='dash'),
        x0=min(n_list), y0=1,
        x1=max(n_list), y1=1)
    for data in fig_strip.data:
        fig_subplot.add_trace(data, row=1, col=2)
    fig_subplot.update_layout(
        title_text=f"Docking vs Fresco Enrichment - {title}",
        xaxis1_title='N',
        yaxis1_title='EF',
        xaxis2_title='Hit',
        yaxis2_title='IC50',
        width=1200,
    )
    for trace in fig_subplot['data']:

        if(trace['name'] != 'Docking' and trace['name'] != 'Fresco'):
            trace['showlegend'] = False
    fig_subplot.update_yaxes(type="log", row=1, col=2)

    fig_subplot.show()
    
def plot_enrichment_docking_vs_fresco(df_to_plot, title='Not Specified!', try_both=False):
    n_list = np.logspace(start=np.log10(5), stop=2, num=20)
    EF_docking = [calculate_enrichment_for_df(
        df_to_plot, n=int(n), index='hit', score='Chemgauss4 Score', ascending=True) for n in n_list]
    EF_fresco = [calculate_enrichment_for_df(
        df_to_plot, n=int(n), index='hit', score='mean_frag_score', ascending=False) for n in n_list]
    
    if try_both:
        df_to_plot['docking_rank'] = df_to_plot['Chemgauss4 Score'].rank(ascending=True) 
        df_to_plot['fresco_rank'] = df_to_plot['mean_frag_score'].rank(ascending=False)
        df_to_plot['both_rank'] = df_to_plot[[
            'docking_rank', 'fresco_rank']].mean(axis=1)
        EF_both = [calculate_enrichment_for_df(
            df_to_plot, n=int(n), index='hit', score='both_rank', ascending=True) for n in n_list]
    orig_prop = len(
        df_to_plot[df_to_plot['hit']])/len(df_to_plot)
    
    plt.plot(n_list, EF_docking, label='dock scores')
    plt.plot(n_list, EF_fresco, label='fresco score')
    plt.plot(n_list, EF_both, label='both')

    plt.plot(n_list, np.ones_like(n_list), 'k:')
    plt.legend()
    plt.title(f'Docking vs Fresco Enrichment\n{title}')
    plt.ylabel('EF \n(base rate = {:.1f}%)'.format(orig_prop*100))
    extraticks = [1]
    plt.yticks(list(plt.yticks()[0]) + extraticks)
    plt.ylim(bottom=0)


# plot_enrichment_docking_vs_fresco(df_docking_grouped,
#                 title='all moonshot molecules')
plotly_enrichment_docking_vs_fresco(df_docking_grouped,
                                  title='all moonshot molecules')


Getting submission dates

In [11]:
dateFile = '/home/wjm41/ml_physics/frag-pcore-screen/data/20220122_moonshot_submissions_data_for_alpha.csv'
df_date = pd.read_csv(dateFile)

def remove_suffix(id):
    id_separated = id.split('-')[:-1]
    new_id = '-'.join(id_separated)
    return new_id


df_docking_grouped['submission_id'] = df_docking_grouped['canonical_CID'].apply(remove_suffix)
df_merged = df_docking_grouped.merge(df_date, on='submission_id')



Enrichment against time

In [32]:
from ipywidgets import interact
from scipy.stats import spearmanr as spear


def plot_enrichment_against_time(year, month, ic50_threshold):
    day = '01'
    date_to_filter = int(f'{year}{month}{day}')
    df_filtered_by_date = df_merged.query('date < @date_to_filter').copy()
    df_filtered_by_date['hit'] = df_filtered_by_date['f_avg_IC50'] < ic50_threshold
    rho_dock_vs_fresco = spear(df_filtered_by_date['Chemgauss4 Score'].values, df_filtered_by_date['mean_frag_score'].values)[0]
    rho_ic50_vs_fresco = spear(df_filtered_by_date.dropna()['f_avg_IC50'].values,
                     df_filtered_by_date.dropna()['mean_frag_score'].values)[0]
    rho_ic50_vs_dock = spear(df_filtered_by_date.dropna()['Chemgauss4 Score'].values,
                               df_filtered_by_date.dropna()['mean_frag_score'].values)[0]
    print(f'rho dock vs fresco {rho_dock_vs_fresco:.2f}')
    print(f'rho dock vs IC50 {rho_ic50_vs_dock:.2f}')
    print(f'rho fresco vs IC50 {rho_ic50_vs_fresco:.2f}')

    # print(df_filtered_by_date['hit'].value_counts())
    # plot_enrichment_docking_vs_fresco(df_filtered_by_date,
    #                                   title=f'IC50 < {ic50_threshold}uM, rho = {rho_dock_vs_fresco:.2f}\n Moonshot submissions before {day}-{month}-{year} (n={len(df_filtered_by_date)})')
    plotly_enrichment_docking_vs_fresco(df_filtered_by_date,
                                        title=f'IC50 < {ic50_threshold}uM, Moonshot submissions before {day}-{month}-{year} (n={len(df_filtered_by_date)})')
    return


years_to_choose_from = ['2021', '2020']
months_to_choose_from = [f'0{x}' for x in range(
    1, 10)] + [f'{x}' for x in range(10, 13)]

interact(plot_enrichment_against_time,
         year=years_to_choose_from, month=months_to_choose_from, ic50_threshold=(0.0, 100.0))


interactive(children=(Dropdown(description='year', options=('2021', '2020'), value='2021'), Dropdown(descripti…

<function __main__.plot_enrichment_against_time(year, month, ic50_threshold)>