# Enrichment from FRESCO
5th Apr 2022 

This notebook is for calculating the enrichment from FRESCO by parsing the same `.sdf` file as used for docking.

### Load docking conformations

In [36]:
from rdkit.Chem import PandasTools

# Docked poses of the Moonshot compounds; per the notebook introduction this
# is the same .sdf file that was used for docking.
# NOTE(review): absolute, user-specific path - the notebook only runs on the
# original author's machine unless this is edited.
sdfFile = '/home/wjm41/ml_physics/frag-pcore-screen/data/COVID_Moonshot_activity data_2021-03-22_noncovalent_docked.sdf'
# Load the SDF into a DataFrame with the identifier in 'canonical_CID', a
# SMILES string in 'SMILES' and the molecule object in 'mol'.
df_docking = PandasTools.LoadSDF(
    sdfFile, idName='canonical_CID', smilesName='SMILES', molColName='mol')
print(df_docking)


                                          canonical_CID  \
0     Docking receptor of MPRO-X2908_0A_BOUND(A) > L...   
1                                   EDG-MED-ba1ac7b9-15   
2                                    MAT-POS-9ff17035-2   
3                                    EDJ-MED-8c98ee63-2   
4                                    ALP-POS-64a710fa-1   
...                                                 ...   
2654                                ALP-POS-305f6ec3-52   
2655                                ALP-POS-88a7a97e-23   
2656                                ALP-POS-b0bc6a46-27   
2657                                BRU-THA-92256091-77   
2658                                ALP-POS-ced8ea4d-30   

                                                 SMILES  \
0     CCC(C)C(NC(=O)CNC(=O)C(NC(=O)C(CCC(N)=O)NC(=O)...   
1     C[C@@H]1CN(C)CCN1C(=O)C[C@@]1(C(=O)Nc2cncc3ccc...   
2     O=C(Cc1cc(Cl)cc(Oc2cccc(=O)[nH]2)c1)Nc1cncc2cc...   
3     Cn1ccc(CNC[C@@]2(C(=O)Nc3cncc4ccccc34)CCOc3ccc...

In [37]:
try:
    from distutils.util import strtobool
except ImportError:
    # distutils was removed from the standard library in Python 3.12
    # (PEP 632); fall back to an equivalent local implementation.
    def strtobool(val):
        """Convert a truth-value string to 1 or 0 (same contract as distutils).

        Raises ValueError for anything that is not a recognised truth value.
        """
        val = val.lower()
        if val in ('y', 'yes', 't', 'true', 'on', '1'):
            return 1
        if val in ('n', 'no', 'f', 'false', 'off', '0'):
            return 0
        raise ValueError(f'invalid truth value {val!r}')

# The warhead flags are parsed from their string form with strtobool, which
# cannot handle missing values - so rows missing EITHER flag are dropped
# first (previously only 'acrylamide' was checked and a NaN in
# 'chloroacetamide' would have crashed the conversion).
flag_columns = ['acrylamide', 'chloroacetamide']
df_docking = df_docking.dropna(subset=flag_columns)
for flag_column in flag_columns:
    df_docking[flag_column] = df_docking[flag_column].apply(
        strtobool).astype(bool)

# Keep only compounds carrying neither warhead flag. reset_index() keeps the
# original row label in an 'index' column and renumbers the rows 0..n-1; the
# later scoring cells write results back by that positional label.
df_docking = df_docking.query('~chloroacetamide & ~acrylamide').reset_index()
df_docking


Unnamed: 0,index,canonical_CID,SMILES,mol,series,Chemgauss4 Score,f_avg_pIC50,f_avg_IC50,chloroacetamide,acrylamide,Number of Confs,r_inhibition_at_50_uM,r_avg_IC50,relative_solubility_at_100_uM,relative_solubility_at_20_uM,f_inhibition_at_50_uM,f_inhibition_at_20_uM,r_inhibition_at_20_uM,trypsin_IC50,frag_id
0,1,EDG-MED-ba1ac7b9-15,C[C@@H]1CN(C)CCN1C(=O)C[C@@]1(C(=O)Nc2cncc3ccc...,,3-aminopyridine-like,-12.6986,5.20921,6.17716,False,False,38,,,,,,,,,
1,2,MAT-POS-9ff17035-2,O=C(Cc1cc(Cl)cc(Oc2cccc(=O)[nH]2)c1)Nc1cncc2cc...,,3-aminopyridine-like,-12.43,5.70429,1.97563,False,False,500,41.805,,,,,,,,
2,3,EDJ-MED-8c98ee63-2,Cn1ccc(CNC[C@@]2(C(=O)Nc3cncc4ccccc34)CCOc3ccc...,,3-aminopyridine-like,-12.3919,6.09539,0.802811,False,False,500,,,,,,,,,
3,4,ALP-POS-64a710fa-1,O=C(Cc1cncc2ccccc12)N(CCC1CCCCC1)Cc1cccs1,,,-12.3789,5.00607,9.86113,False,False,500,,3.02151,,,,,,,
4,5,BAR-COM-4e090d3a-3,Cc1ccccc1CNc1ccccc1NC(=O)[C@@H](O)c1cccnc1,,3-aminopyridine-like,-12.3516,,,False,False,500,35.735,,0.94,0.95,7.28047,-0.0560025,4.5875,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2533,2654,ALP-POS-305f6ec3-52,CC(C)(C)c1ccc(N(C(=O)c2c[nH]c(=O)[nH]2)[C@@H](...,,Ugi,3.0553,4.79812,15.9178,False,False,74,,,,,,,,,
2534,2655,ALP-POS-88a7a97e-23,COC(=O)[C@@H]1C[C@@H](NC(=O)[C@H](c2cccnc2)N(C...,,,3.38598,5.16595,6.82414,False,False,106,,,,,,,,,
2535,2656,ALP-POS-b0bc6a46-27,N#Cc1cc(N(C(=O)c2ccco2)[C@@H](C(=O)NCCc2cccc(F...,,,3.53119,,99.5,False,False,500,,,,,,,,,
2536,2657,BRU-THA-92256091-77,O=C(NCCc1cccc(F)c1)[C@H](c1cccnc1)N(C(=O)c1c[n...,,,4.22059,4.32089,47.765,False,False,44,,,,,,,,,


parse conformers

In [60]:
import os

from rdkit import Chem
from rdkit.Chem.rdmolops import RemoveStereochemistry
from rdkit.Chem import AllChem
# tqdm was previously only imported by a later cell, so this cell failed on a
# fresh kernel.
from tqdm import tqdm

# Docking poses of molecules whose first embedding attempt fails are written
# here so they can be inspected afterwards.
failed_embedding_dir = 'nonsense_conformers'
os.makedirs(failed_embedding_dir, exist_ok=True)

# One entry per molecule: [mol_with_h, [symbol, xyz], [symbol, xyz], ...]
moonshot_fresh_confomers = []

for i, row in tqdm(df_docking.iterrows(), total=len(df_docking)):
    mol = row['mol']

    # Generate a fresh 3D conformer (with explicit hydrogens) that is
    # independent of the docked pose.
    mol_with_h = Chem.AddHs(mol)
    successful_embedding = AllChem.EmbedMolecule(mol_with_h)
    if successful_embedding != 0:
        failed_sdf = os.path.join(failed_embedding_dir, f'mol_{i}.sdf')
        print(f'Naive Embedding failed for {row["canonical_CID"], row["SMILES"]}, '
              f'writing docking conformer to {failed_sdf} and retrying conformer generation without stereochemistry.')
        writer = Chem.SDWriter(failed_sdf)
        try:
            writer.write(mol, confId=0)
        finally:
            # Close explicitly so the record is flushed to disk.
            writer.close()
        # Retry once with the stereo labels stripped.
        RemoveStereochemistry(mol_with_h)
        successful_embedding = AllChem.EmbedMolecule(mol_with_h)
    assert successful_embedding == 0, f'Retried Embedding failed for {row["canonical_CID"], row["SMILES"]}'

    # Fetch the coordinate array once instead of once per atom.
    positions = mol_with_h.GetConformer().GetPositions()

    data_for_one_molecule = [mol_with_h]
    data_for_one_molecule.extend(
        [atom.GetSymbol(), positions[j]]
        for j, atom in enumerate(mol_with_h.GetAtoms()))
    moonshot_fresh_confomers.append(data_for_one_molecule)


 18%|█▊        | 460/2538 [00:24<19:30,  1.77it/s]

Naive Embedding failed for ('MAT-POS-590ac91e-23', 'Cc1ccncc1NC(=O)CC1C[C@@H]2C[C@H]2C1'),               writing docking conformer to bullshit_457.sdf and retrying conformer generation without stereochemistry.


 20%|█▉        | 501/2538 [00:30<10:48,  3.14it/s]

Naive Embedding failed for ('MAT-POS-590ac91e-20', 'Cc1ccnc(C)c1NC(=O)C1C[C@@H]2C[C@H]2C1'),               writing docking conformer to bullshit_497.sdf and retrying conformer generation without stereochemistry.


 21%|██        | 534/2538 [00:36<09:33,  3.49it/s]

Naive Embedding failed for ('MAT-POS-590ac91e-28', 'Cc1ccncc1NC(=O)[C@@H]1[C@H]2CCOC[C@@H]21'),               writing docking conformer to bullshit_528.sdf and retrying conformer generation without stereochemistry.


 23%|██▎       | 579/2538 [00:42<13:28,  2.42it/s]

Naive Embedding failed for ('MAT-POS-590ac91e-20', 'Cc1ccnc(C)c1NC(=O)C1C[C@H]2C[C@@H]2C1'),               writing docking conformer to bullshit_575.sdf and retrying conformer generation without stereochemistry.


 23%|██▎       | 595/2538 [00:48<13:08,  2.46it/s]

Naive Embedding failed for ('MAT-POS-590ac91e-28', 'Cc1ccncc1NC(=O)[C@H]1[C@H]2CCOC[C@@H]21'),               writing docking conformer to bullshit_589.sdf and retrying conformer generation without stereochemistry.


 27%|██▋       | 675/2538 [00:55<08:49,  3.52it/s]

Naive Embedding failed for ('MAT-POS-590ac91e-28', 'Cc1ccncc1NC(=O)[C@@H]1[C@@H]2CCOC[C@H]21'),               writing docking conformer to bullshit_670.sdf and retrying conformer generation without stereochemistry.


 31%|███       | 792/2538 [01:05<13:52,  2.10it/s]

Naive Embedding failed for ('MAT-POS-590ac91e-28', 'Cc1ccncc1NC(=O)[C@H]1[C@@H]2CCOC[C@@H]12'),               writing docking conformer to bullshit_789.sdf and retrying conformer generation without stereochemistry.


100%|██████████| 2538/2538 [03:19<00:00, 12.72it/s]


In [4]:
import py3Dmol

view = py3Dmol.view(width=800, height=800)

# Number in the file name of the dumped conformer to inspect. Named
# `mol_index` rather than `id` so the builtin `id()` is not shadowed for the
# rest of the notebook.
mol_index = 533
# NOTE(review): absolute, user-specific path - adjust when running elsewhere.
nonsense_sdf = f'/home/wjm41/ml_physics/frag-pcore-screen/figs/plotting_notebooks/nonsense_conformers/mol_{mol_index}.sdf'
# Read through a context manager so the file handle is closed.
with open(nonsense_sdf, 'r') as sdf_handle:
    nonsense_conformer = sdf_handle.read()
view.addModel(nonsense_conformer, 'sdf')
view.setStyle({'stick': {}})

view.zoomTo()


<py3Dmol.view at 0x2ac851228e10>

Generate pharmacophore dataframe from conformers

In [61]:
from fresco.frag_funcs import return_pcore_dataframe

# Pharmacophore types of interest; the same list is reused below to build
# the pharmacophore pairs.
interesting_pcores = ['Donor', 'Acceptor', 'Aromatic']

# Build the pharmacophore table from the freshly generated conformers.
# NOTE(review): hit=False presumably marks these as library molecules rather
# than fragment hits - confirm against fresco.frag_funcs.
moonshot_pcore_dataframe = return_pcore_dataframe(
    moonshot_fresh_confomers, interesting_pcores, hit=False)

100%|██████████| 2538/2538 [01:08<00:00, 37.31it/s]


In [40]:
# Quick look at the pharmacophore table (pandas truncates the printout).
print(moonshot_pcore_dataframe)


          pcore                                             smiles  mol_id  \
0         Donor  [H]c1nc([H])c2c([H])c([H])c([H])c([H])c2c1N([H...       0   
1         Donor  [H]c1nc([H])c2c([H])c([H])c([H])c([H])c2c1N([H...       0   
2         Donor  [H]c1nc([H])c2c([H])c([H])c([H])c([H])c2c1N([H...       0   
3      Acceptor  [H]c1nc([H])c2c([H])c([H])c([H])c([H])c2c1N([H...       0   
4      Acceptor  [H]c1nc([H])c2c([H])c([H])c([H])c([H])c2c1N([H...       0   
...         ...                                                ...     ...   
21710  Acceptor  [H]c1nc(C(=O)N(c2c([H])c([H])c3c(c2[H])C([H])(...    2537   
21711  Aromatic  [H]c1nc(C(=O)N(c2c([H])c([H])c3c(c2[H])C([H])(...    2537   
21712  Aromatic  [H]c1nc(C(=O)N(c2c([H])c([H])c3c(c2[H])C([H])(...    2537   
21713  Aromatic  [H]c1nc(C(=O)N(c2c([H])c([H])c3c(c2[H])C([H])(...    2537   
21714  Aromatic  [H]c1nc(C(=O)N(c2c([H])c([H])c3c(c2[H])C([H])(...    2537   

        coord_x   coord_y   coord_z  frag  active  IC50  
0    

Generate pharmacophore 2-body distribution

In [62]:
from fresco.frag_funcs import get_pair_distances
from itertools import product
from tqdm import tqdm

# Group the pharmacophore table once instead of boolean-filtering the whole
# frame for every molecule, and walk the groups in sorted mol_id order: the
# position in `moonshot_pair_distributions` is used later as the df_docking
# row label, so the order must be deterministic (iterating a `set` gave no
# such guarantee).
# NOTE(review): a molecule with no detected pharmacophores has no group and
# would shift every later entry by one - the scoring step should check that
# the list length matches len(df_docking).
pcores_by_molecule = moonshot_pcore_dataframe.groupby('mol_id', sort=True)

moonshot_pair_distributions = []
for mol_id, pcores_for_this_molecule in tqdm(pcores_by_molecule,
                                              total=pcores_by_molecule.ngroups):
    pcore_pair_distribution = {}

    # Every ordered pair of pharmacophore types, keyed as 'TypeA-TypeB'.
    for core_a, core_b in product(interesting_pcores, repeat=2):
        combo = core_a+'-'+core_b
        pcore_pair_distribution[combo] = get_pair_distances(
            pcores_for_this_molecule, core_a, core_b, frag=False, active=None)
    moonshot_pair_distributions.append(pcore_pair_distribution)


100%|██████████| 2538/2538 [00:47<00:00, 53.46it/s]


Load fragment KDEs

In [63]:
import dill as pickle

# NOTE(review): absolute, user-specific path - adjust when running elsewhere.
pickle_dir = '/home/wjm41/ml_physics/frag-pcore-screen/data/EnamineREAL/pickles/'
kde_mpro = 'kde_dict_spl_mpro.pickle'
# SECURITY: unpickling executes arbitrary code embedded in the file; only
# load pickles that come from a trusted source.
# Open via a context manager so the file handle is closed after loading.
with open(pickle_dir+kde_mpro, 'rb') as kde_file:
    kde_dict = pickle.load(kde_file)


Score moonshot mols with KDEs

In [64]:
import numpy as np
import pandas as pd
from fresco.frag_funcs import score_dist


def score_dist_new(kde, dist):
    """Return the best (maximum) KDE score over a set of distances.

    Args:
        kde: callable that accepts a column array of shape (n, 1) and returns
            one score per row. Per the original author's note these are
            log-probabilities, i.e. larger means more probable.
        dist: numpy array of distances; reshaped to a single column before
            being passed to ``kde``.

    Returns:
        The maximum score returned by ``kde``, or ``np.nan`` when ``dist``
        has zero length.
    """
    # Nothing to score: signal "missing" rather than raising.
    if not len(dist):
        return np.nan

    return np.max(kde(dist.reshape(-1, 1)))


pcore_combinations = ['Donor-Aromatic',
                      'Donor-Acceptor',
                      'Aromatic-Aromatic',
                      'Donor-Donor',
                      'Aromatic-Acceptor',
                      'Acceptor-Acceptor']

# Score given to a pharmacophore pair that a molecule does not contain, and
# the initial value of the score column.
MISSING_PAIR_SCORE = -100.0

# Entry i of moonshot_pair_distributions is written to df_docking row i, so
# the two must line up one-to-one; fail loudly instead of silently assigning
# scores to the wrong compounds.
assert len(moonshot_pair_distributions) == len(df_docking), (
    f'{len(moonshot_pair_distributions)} pair distributions for '
    f'{len(df_docking)} docked molecules - scores would be misaligned')

moonshot_scores = {}
df_docking['mean_frag_score'] = MISSING_PAIR_SCORE
for i, pair_dist in enumerate(moonshot_pair_distributions):
    # One score per pharmacophore pair, held in a plain array (no need for a
    # throwaway DataFrame per molecule). Element [0] of each entry is the
    # distance array that gets scored.
    scores = np.array([
        score_dist_new(kde_dict[pcore_combination],
                       pair_dist[pcore_combination][0].reshape(-1, 1))
        for pcore_combination in pcore_combinations], dtype=float)
    # Missing pairs count as very unlikely rather than being ignored.
    scores[np.isnan(scores)] = MISSING_PAIR_SCORE
    df_docking.at[i, 'mean_frag_score'] = scores.mean()


Score using docked poses too

In [57]:
# Display the current pharmacophore table.
# NOTE(review): this cell's execution count (57) is lower than that of the
# cells around it, so the output shown may come from an earlier kernel state.
moonshot_pcore_dataframe


Unnamed: 0,pcore,smiles,mol_id,coord_x,coord_y,coord_z
0,Donor,C[C@@H]1CN(C)CCN1C(=O)C[C@@]1(C(=O)Nc2cncc3ccc...,0,8.706100,-3.282500,24.817800
0,Donor,C[C@@H]1CN(C)CCN1C(=O)C[C@@]1(C(=O)Nc2cncc3ccc...,0,6.785400,-4.799400,26.467600
0,Donor,C[C@@H]1CN(C)CCN1C(=O)C[C@@]1(C(=O)Nc2cncc3ccc...,0,7.780500,-0.126000,21.171800
0,Acceptor,C[C@@H]1CN(C)CCN1C(=O)C[C@@]1(C(=O)Nc2cncc3ccc...,0,6.594600,0.190600,17.701700
0,Acceptor,C[C@@H]1CN(C)CCN1C(=O)C[C@@]1(C(=O)Nc2cncc3ccc...,0,9.923300,0.106100,20.227800
...,...,...,...,...,...,...
0,Acceptor,O=C(NCCc1cccc(F)c1)[C@H](c1cccnc1)N(C(=O)c1coc...,2537,5.232900,-7.223200,27.364600
0,Aromatic,O=C(NCCc1cccc(F)c1)[C@H](c1cccnc1)N(C(=O)c1coc...,2537,11.660640,0.511120,24.741700
0,Aromatic,O=C(NCCc1cccc(F)c1)[C@H](c1cccnc1)N(C(=O)c1coc...,2537,6.833800,-5.037117,27.731300
0,Aromatic,O=C(NCCc1cccc(F)c1)[C@H](c1cccnc1)N(C(=O)c1coc...,2537,5.819533,-0.802667,24.718817


In [66]:
from fresco.featurise import return_default_pharmacophore_pairs, calculate_pairwise_distances_between_pharmacophores_for_a_single_ligand
from fresco.featurise import return_pcore_dataframe_for_list_of_mols

# Pharmacophores taken directly from the docked poses stored in df_docking.
# NOTE: this rebinds `moonshot_pcore_dataframe` and
# `moonshot_pair_distributions`, replacing the fresh-conformer versions.
mols = df_docking['mol'].values
moonshot_pcore_dataframe = return_pcore_dataframe_for_list_of_mols(
    mols=mols)

pcore_pairs = return_default_pharmacophore_pairs()

# Group once rather than running .query over the whole table per ligand.
# sort=False keeps the ligands in order of first appearance, i.e. the same
# order that .unique() produced.
pcores_by_ligand = moonshot_pcore_dataframe.groupby('mol_id', sort=False)

moonshot_pair_distributions = []
for index, pcores_for_this_ligand in tqdm(pcores_by_ligand,
                                           total=pcores_by_ligand.ngroups):
    pair_distribution_for_this_ligand = {}

    # Pair names have the form 'TypeA-TypeB'.
    for pcore_pair in pcore_pairs:
        core_a, core_b = pcore_pair.split('-')
        pair_distribution_for_this_ligand[pcore_pair] = calculate_pairwise_distances_between_pharmacophores_for_a_single_ligand(
            pcores_for_this_ligand, core_a, core_b)

    moonshot_pair_distributions.append(pair_distribution_for_this_ligand)

def score_with_kde(kde_model, col_name):
    """Score every molecule with per-pair KDEs and store the mean in df_docking.

    Reads the notebook globals ``moonshot_pair_distributions`` (one dict of
    pair -> distance array per molecule) and ``pcore_pairs``, and writes the
    result into the global ``df_docking`` in place.

    Args:
        kde_model: mapping from pharmacophore-pair name to a KDE callable
            accepted by ``score_dist_new``.
        col_name: name of the ``df_docking`` column to create/overwrite.

    Raises:
        AssertionError: if the number of pair distributions differs from the
            number of rows in ``df_docking`` (scores would be misaligned).
    """
    # Entry i is written to row i, so the two must line up one-to-one.
    assert len(moonshot_pair_distributions) == len(df_docking), (
        f'{len(moonshot_pair_distributions)} pair distributions for '
        f'{len(df_docking)} docked molecules - scores would be misaligned')

    df_docking[col_name] = -100.0
    for i, pair_dist in enumerate(moonshot_pair_distributions):
        # One score per pair in a plain array; a pair the molecule lacks
        # yields NaN from score_dist_new.
        scores = np.array([
            score_dist_new(kde_model[pcore_combination],
                           pair_dist[pcore_combination].reshape(-1, 1))
            for pcore_combination in pcore_pairs], dtype=float)
        # Missing pairs are ignored by nanmean (unlike the fresh-conformer
        # score, which penalises them). If every pair is missing the result
        # is NaN.
        df_docking.at[i, col_name] = np.nanmean(scores)


score_with_kde(kde_dict, 'docked_frag_score')


100%|██████████| 2538/2538 [00:35<00:00, 71.12it/s]
100%|██████████| 2538/2538 [01:02<00:00, 40.58it/s]



Average fragment scores over conformers

In [67]:
# These columns are cast to float so they can be averaged.
df_docking.f_avg_IC50 = df_docking.f_avg_IC50.astype(float)
df_docking['Chemgauss4 Score'] = df_docking['Chemgauss4 Score'].astype(float)

score_columns = ['Chemgauss4 Score', 'f_avg_IC50',
                 'mean_frag_score', 'docked_frag_score']
columns_to_keep = ['canonical_CID'] + score_columns
df_docking_copy = df_docking[columns_to_keep]
# Average over all rows that share a compound ID. Only the numeric
# score columns are aggregated: calling .mean() on a frame that still holds
# the string ID column raises TypeError on pandas >= 2.0.
df_docking_grouped = df_docking_copy.groupby(
    'canonical_CID')[score_columns].mean()
df_docking_grouped['canonical_CID'] = df_docking_grouped.index

# A compound is a hit when its mean f_avg_IC50 is below 5; compounds without
# a measurement (NaN) compare False and therefore count as non-hits.
df_docking_grouped['hit'] = df_docking_grouped['f_avg_IC50'] < 5
print(df_docking_grouped[['hit', 'f_avg_IC50', 'Chemgauss4 Score', 'mean_frag_score', 'docked_frag_score']])


                       hit  f_avg_IC50  Chemgauss4 Score  mean_frag_score  \
canonical_CID                                                               
AAR-POS-0daf6b7e-23  False         NaN          -5.71945        -3.204496   
AAR-POS-5507155c-1   False     99.5000          -5.24347        -3.295316   
AAR-POS-8a4e0f60-1   False         NaN          -9.23545        -2.528411   
AAR-POS-8a4e0f60-10  False         NaN          -6.75070        -2.493290   
AAR-POS-8a4e0f60-2   False         NaN          -9.57557        -2.514308   
...                    ...         ...               ...              ...   
WIL-MOD-03b86a88-2   False     19.0909          -8.93996        -2.515766   
WIL-MOD-03b86a88-4   False     15.9534          -9.98616        -2.550120   
WIL-MOD-03b86a88-5   False     21.7972          -9.32327        -1.982709   
WIL-MOD-03b86a88-6   False     11.2566          -9.12435        -1.976290   
WIL-UNI-2e73223c-4   False     99.5000          -9.22982        -2.722867   

Strip plot

In [26]:
import plotly.express as px
from plotly.subplots import make_subplots
import numpy as np
from fresco.frag_funcs import calculate_enrichment_for_df
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(rc={"figure.dpi": 150})


def plotly_enrichment_and_strip_docking_vs_fresco(df_to_plot, title='Not Specified!'):
    """Show enrichment curves next to a strip plot of IC50 by hit status.

    Left panel: enrichment factor against N (20 log-spaced values from 5 to
    100) for the docking score and both Fresco scores. Right panel:
    ``f_avg_IC50`` of hits vs non-hits on a log axis.

    Args:
        df_to_plot: DataFrame with columns 'hit' (bool), 'f_avg_IC50',
            'canonical_CID', 'Chemgauss4 Score', 'mean_frag_score' and
            'docked_frag_score'.
        title: text appended to the figure title.

    Returns:
        None; the figure is displayed with ``fig.show()``.
    """
    n_list = np.logspace(start=np.log10(5), stop=2, num=20)

    # (legend label, score column, ascending). Docking is ranked ascending
    # and the Fresco scores descending.
    methods = [
        ('Docking', 'Chemgauss4 Score', True),
        ('Fresco (Fresh Conformers)', 'mean_frag_score', False),
        ('Fresco (Docked Conformers)', 'docked_frag_score', False),
    ]
    enrichment_frames = []
    for method, score_column, ascending in methods:
        enrichment = [calculate_enrichment_for_df(
            df_to_plot, n=int(n), index='hit', score=score_column, ascending=ascending) for n in n_list]
        enrichment_frames.append(pd.DataFrame(
            {'N': n_list, 'EF': enrichment, 'Method': method}))
    df_enrichment = pd.concat(enrichment_frames)

    # Fraction of hits in the whole set (shown as the base rate).
    orig_prop = len(
        df_to_plot[df_to_plot['hit']])/len(df_to_plot)

    fig_subplot = make_subplots(rows=1, cols=2, subplot_titles=['Enrichment', 'IC50 Distribution'])
    fig_EF = px.line(df_enrichment, x='N', y='EF',
                     color='Method')

    fig_strip = px.strip(df_to_plot,
                         x='hit',
                         y='f_avg_IC50',
                         color='hit',
                         hover_name='canonical_CID')
    # Each enrichment trace is added exactly once (the last one used to be
    # added a second time after the loop).
    for data in fig_EF.data:
        fig_subplot.add_trace(data, row=1, col=1)
    # Dashed reference line at EF = 1.
    fig_subplot.add_shape( row=1, col=1,
        type="line", line=dict(dash='dash'),
        x0=min(n_list), y0=1,
        x1=max(n_list), y1=1)
    for data in fig_strip.data:
        fig_subplot.add_trace(data, row=1, col=2)
    fig_subplot.update_layout(
        title_text=f"Docking vs Fresco Enrichment - {title}",
        xaxis1_title='N',
        # plotly needs <br> for a line break; '\n' is not rendered as one.
        yaxis1_title=f'EF <br>(base rate = {orig_prop*100:.1f}%)',
        xaxis2_title='Hit',
        yaxis2_title='IC50',
        width=1200,
    )
    # Keep legend entries for the enrichment methods only. The old check
    # compared against the literal 'Fresco', which matches no trace name and
    # so hid both Fresco curves from the legend.
    method_names = {method for method, _, _ in methods}
    for trace in fig_subplot['data']:
        if trace['name'] not in method_names:
            trace['showlegend'] = False
    fig_subplot.update_yaxes(type="log", row=1, col=2)

    fig_subplot.show()
    
def plot_enrichment_docking_vs_fresco(df_to_plot, title='Not Specified!', try_both=False):
    """Plot matplotlib enrichment curves for docking and Fresco scores.

    Args:
        df_to_plot: DataFrame with columns 'hit' (bool), 'Chemgauss4 Score'
            and 'mean_frag_score'. It is not modified.
        title: second line of the plot title.
        try_both: when True, also plot the enrichment obtained by ranking on
            the mean of the docking rank and the Fresco rank.

    Returns:
        None; the curves are drawn on the current matplotlib axes.
    """
    n_list = np.logspace(start=np.log10(5), stop=2, num=20)
    EF_docking = [calculate_enrichment_for_df(
        df_to_plot, n=int(n), index='hit', score='Chemgauss4 Score', ascending=True) for n in n_list]
    EF_fresco = [calculate_enrichment_for_df(
        df_to_plot, n=int(n), index='hit', score='mean_frag_score', ascending=False) for n in n_list]

    # Fraction of hits in the whole set (shown as the base rate).
    orig_prop = len(
        df_to_plot[df_to_plot['hit']])/len(df_to_plot)

    plt.plot(n_list, EF_docking, label='dock scores')
    plt.plot(n_list, EF_fresco, label='fresco score')

    if try_both:
        # Consensus ranking, computed on a copy so the caller's frame does
        # not silently gain rank columns.
        df_ranked = df_to_plot.assign(
            docking_rank=df_to_plot['Chemgauss4 Score'].rank(ascending=True),
            fresco_rank=df_to_plot['mean_frag_score'].rank(ascending=False))
        df_ranked['both_rank'] = df_ranked[[
            'docking_rank', 'fresco_rank']].mean(axis=1)
        EF_both = [calculate_enrichment_for_df(
            df_ranked, n=int(n), index='hit', score='both_rank', ascending=True) for n in n_list]
        # Only plotted when it was computed; the unconditional plot raised
        # NameError with the default try_both=False.
        plt.plot(n_list, EF_both, label='both')

    # Dotted reference line at EF = 1.
    plt.plot(n_list, np.ones_like(n_list), 'k:')
    plt.legend()
    plt.title(f'Docking vs Fresco Enrichment\n{title}')
    plt.ylabel('EF \n(base rate = {:.1f}%)'.format(orig_prop*100))
    # Make sure the EF = 1 level has its own tick.
    extraticks = [1]
    plt.yticks(list(plt.yticks()[0]) + extraticks)
    plt.ylim(bottom=0)


# Static matplotlib alternative, currently disabled:
# plot_enrichment_docking_vs_fresco(df_docking_grouped,
#                 title='all moonshot molecules')
# Interactive enrichment + IC50 strip plot for the per-compound averages.
plotly_enrichment_and_strip_docking_vs_fresco(df_docking_grouped,
                                  title='all moonshot molecules')


Getting submission dates

In [68]:
# Moonshot submissions table; merged below on its 'submission_id' column.
# NOTE(review): absolute, user-specific path - adjust when running elsewhere.
dateFile = '/home/wjm41/ml_physics/frag-pcore-screen/data/20220122_moonshot_submissions_data_for_alpha.csv'
df_date = pd.read_csv(dateFile)

def remove_suffix(id):
    """Strip the final '-'-separated field from an identifier.

    Example: 'EDG-MED-ba1ac7b9-15' -> 'EDG-MED-ba1ac7b9'. An identifier that
    contains no '-' yields the empty string.
    """
    # rpartition splits on the LAST hyphen; the head is everything before it
    # (and is '' when there is no hyphen at all).
    head, _, _ = id.rpartition('-')
    return head


# The submission ID is the compound ID without its final '-<suffix>' field.
df_docking_grouped['submission_id'] = df_docking_grouped['canonical_CID'].apply(remove_suffix)
# Inner join (the pandas default): compounds whose submission_id is absent
# from df_date are dropped from df_merged.
df_merged = df_docking_grouped.merge(df_date, on='submission_id')



Enrichment against time

In [70]:
from ipywidgets import interact
from scipy.stats import spearmanr as spear

def plotly_enrichment_docking_vs_fresco(df_to_plot, title='Not Specified!'):
    """Plot the hit rate among the top-N compounds for each scoring method.

    N runs over the integers 10..99. The enrichment factor of each method
    (docking score, Fresco on fresh conformers, Fresco on docked conformers)
    is converted to a percentage hit rate, and the overall base rate is drawn
    as a dashed horizontal line.

    Args:
        df_to_plot: DataFrame with columns 'hit' (bool), 'Chemgauss4 Score',
            'mean_frag_score' and 'docked_frag_score'.
        title: text appended to the figure title.

    Returns:
        None; the figure is displayed with ``fig.show()``.
    """
    starting_number, stop_number = 10, 100
    n_list = np.linspace(start=starting_number, stop=stop_number,
                         num=stop_number-starting_number, endpoint=False)

    def enrichment_frame(method_label, score_column, rank_ascending):
        # Enrichment factor at every N for one scoring method.
        ef_values = [
            calculate_enrichment_for_df(
                df_to_plot, n=int(n), index='hit', score=score_column, ascending=rank_ascending)
            for n in n_list
        ]
        frame = pd.DataFrame(list(zip(n_list, ef_values)), columns=['N', 'EF'])
        frame['Method'] = method_label
        return frame

    per_method_frames = [
        enrichment_frame('Docking', 'Chemgauss4 Score', True),
        enrichment_frame('Fresco (Fresh Conformers)', 'mean_frag_score', False),
        enrichment_frame('Fresco (Docked Conformers)', 'docked_frag_score', False),
    ]

    # Fraction of hits in the whole set, i.e. the base rate.
    n_hits = len(df_to_plot[df_to_plot['hit']])
    orig_prop = n_hits/len(df_to_plot)

    df_enrichment = pd.concat(per_method_frames)
    # EF x base rate = fraction of hits among the top N, as a percentage.
    df_enrichment['hit rate'] = df_enrichment['EF']*orig_prop*100

    fig_EF = px.line(df_enrichment, x='N', y='hit rate', color='Method')

    # Dashed reference line at the base rate.
    base_rate_percent = orig_prop*100
    fig_EF.add_shape(
        type="line", line=dict(dash='dash'),
        x0=min(n_list), y0=base_rate_percent,
        x1=max(n_list), y1=base_rate_percent)

    fig_EF.update_layout(
        title_text=f"Enrichment (ab initio conformers) - {title}",
        xaxis_title='N',
        yaxis_title=f'Hit rate <br>n = {len(df_to_plot)} <br> base rate = {orig_prop*100:.1f}%',
        width=800,
    )

    fig_EF.show()


def plot_enrichment_against_time(year='2020', month='09', day='01', ic50_threshold='5.0'):
    """Plot enrichment for the compounds dated before a cut-off date.

    Filters the global ``df_merged`` to rows whose 'date' is earlier than the
    chosen date, re-labels hits with the given IC50 threshold, prints the
    Spearman correlations between docking score, Fresco score and IC50, and
    draws the hit-rate plot.

    Args:
        year, month, day: zero-padded strings that are concatenated into an
            integer YYYYMMDD cut-off.
            NOTE(review): assumes df_merged['date'] holds integers in the
            same YYYYMMDD form - confirm against the CSV.
        ic50_threshold: compounds with f_avg_IC50 below this value are hits.
            Accepts a number or a numeric string (the default is a string).

    Returns:
        None.
    """
    # The default is a string while the slider supplies floats; normalise so
    # the comparison with the float column always works.
    ic50_threshold = float(ic50_threshold)

    date_to_filter = int(f'{year}{month}{day}')
    df_filtered_by_date = df_merged.query('date < @date_to_filter').copy()
    df_filtered_by_date['hit'] = df_filtered_by_date['f_avg_IC50'] < ic50_threshold

    def spearman_rho(column_a, column_b):
        # Drop rows missing either of the two columns being compared only;
        # NaNs in unrelated columns must not discard data.
        paired = df_filtered_by_date[[column_a, column_b]].dropna()
        return spear(paired[column_a].values, paired[column_b].values)[0]

    rho_dock_vs_fresco = spearman_rho('Chemgauss4 Score', 'mean_frag_score')
    rho_ic50_vs_fresco = spearman_rho('f_avg_IC50', 'mean_frag_score')
    # Previously this correlated the docking score with mean_frag_score
    # (copy-paste slip), so the printed "dock vs IC50" value was wrong.
    rho_ic50_vs_dock = spearman_rho('Chemgauss4 Score', 'f_avg_IC50')
    print(f'rho dock vs fresco {rho_dock_vs_fresco:.2f}')
    print(f'rho dock vs IC50 {rho_ic50_vs_dock:.2f}')
    print(f'rho fresco vs IC50 {rho_ic50_vs_fresco:.2f}')

    plotly_enrichment_docking_vs_fresco(df_filtered_by_date,
                                        title=f'IC50 < {ic50_threshold}uM<br>COVID Moonshot assays before {day}-{month}-{year}')
    return


# Dropdown options are zero-padded strings because the callback concatenates
# them into a YYYYMMDD integer.
years_to_choose_from = ['2021', '2020']
months_to_choose_from = [f'{month:02d}' for month in range(1, 13)]
# Days 01..31 inclusive (the previous range stopped at 30, so the 31st could
# never be selected).
days_to_choose_from = [f'{day:02d}' for day in range(1, 32)]
# ic50_threshold=(0.0, 100.0) is interact's shorthand for a float slider.
interact(plot_enrichment_against_time,
         year=years_to_choose_from, month=months_to_choose_from, day=days_to_choose_from, ic50_threshold=(0.0, 100.0))


interactive(children=(Dropdown(description='year', index=1, options=('2021', '2020'), value='2020'), Dropdown(…

<function __main__.plot_enrichment_against_time(year='2020', month='09', day='01', ic50_threshold='5.0')>