# Notebook for early moonshot regression

22nd Apr
Take only activities from the early days of Moonshot to show how difficult it is to regress IC50.

Load docked dataset

In [1]:
from rdkit.Chem import PandasTools

# Location of the docked COVID Moonshot SDF export (absolute path on the
# original author's machine).
sdfFile = '/home/wjm41/ml_physics/frag-pcore-screen/data/COVID_Moonshot_activity data_2021-03-22_noncovalent_docked.sdf'

# Read every SDF record into a DataFrame: the record title goes to
# 'canonical_CID', the generated SMILES to 'SMILES', and the RDKit molecule
# object to 'mol'.
df_docking = PandasTools.LoadSDF(
    sdfFile,
    molColName='mol',
    smilesName='SMILES',
    idName='canonical_CID',
)
print(df_docking)


INFO:rdkit:Enabling RDKit 2020.09.1 jupyter extensions


                                          canonical_CID  \
0     Docking receptor of MPRO-X2908_0A_BOUND(A) > L...   
1                                   EDG-MED-ba1ac7b9-15   
2                                    MAT-POS-9ff17035-2   
3                                    EDJ-MED-8c98ee63-2   
4                                    ALP-POS-64a710fa-1   
...                                                 ...   
2654                                ALP-POS-305f6ec3-52   
2655                                ALP-POS-88a7a97e-23   
2656                                ALP-POS-b0bc6a46-27   
2657                                BRU-THA-92256091-77   
2658                                ALP-POS-ced8ea4d-30   

                                                 SMILES  \
0     CCC(C)C(NC(=O)CNC(=O)C(NC(=O)C(CCC(N)=O)NC(=O)...   
1     C[C@@H]1CN(C)CCN1C(=O)C[C@@]1(C(=O)Nc2cncc3ccc...   
2     O=C(Cc1cc(Cl)cc(Oc2cccc(=O)[nH]2)c1)Nc1cncc2cc...   
3     Cn1ccc(CNC[C@@]2(C(=O)Nc3cncc4ccccc34)CCOc3ccc...

Remove covalent compounds

In [2]:
from distutils.util import strtobool

# Both warhead flags are parsed with strtobool, which only accepts strings and
# raises on NaN. Drop rows that are missing *either* flag (the original only
# guarded 'acrylamide') so the parse below cannot crash on a half-annotated row.
# NOTE(review): distutils is deprecated (PEP 632) and removed in Python 3.12;
# strtobool will need a replacement when the environment is upgraded.
covalent_flags = ['acrylamide', 'chloroacetamide']
df_docking = df_docking.dropna(subset=covalent_flags)
for flag in covalent_flags:
    # strtobool returns 0/1, hence the explicit cast to a real boolean column.
    df_docking[flag] = df_docking[flag].apply(strtobool).astype(bool)

# Keep only compounds that carry neither covalent warhead. The .copy() makes
# the result an independent frame, so the column assignments in the following
# cell do not trigger pandas' SettingWithCopyWarning.
df_docking = df_docking.query('~chloroacetamide & ~acrylamide').copy()


Group conformers and take mean score

In [3]:
# Cast the two numeric fields to float so they can be averaged.
for numeric_column in ('f_avg_IC50', 'Chemgauss4 Score'):
    df_docking[numeric_column] = df_docking[numeric_column].astype(float)

# Only the compound id and the two numeric fields are needed from here on.
columns_to_keep = ['canonical_CID', 'Chemgauss4 Score',
                   'f_avg_IC50']
df_docking = df_docking[columns_to_keep]

# Collapse the rows that share a compound id into a single row holding the
# mean of each numeric column; the id ends up as the index, so it is copied
# back into a regular column as well.
df_docking_grouped = df_docking.groupby(by=df_docking['canonical_CID']).mean()
df_docking_grouped['canonical_CID'] = df_docking_grouped.index

# Flag compounds whose mean IC50 is below 5. A NaN IC50 compares as False, so
# rows without a measurement are reported as hit=False.
df_docking_grouped['hit'] = df_docking_grouped['f_avg_IC50'].lt(5)
print(df_docking_grouped[['hit', 'f_avg_IC50', 'Chemgauss4 Score']])


                       hit  f_avg_IC50  Chemgauss4 Score
canonical_CID                                           
AAR-POS-0daf6b7e-23  False         NaN          -5.71945
AAR-POS-5507155c-1   False     99.5000          -5.24347
AAR-POS-8a4e0f60-1   False         NaN          -9.23545
AAR-POS-8a4e0f60-10  False         NaN          -6.75070
AAR-POS-8a4e0f60-2   False         NaN          -9.57557
...                    ...         ...               ...
WIL-MOD-03b86a88-2   False     19.0909          -8.93996
WIL-MOD-03b86a88-4   False     15.9534          -9.98616
WIL-MOD-03b86a88-5   False     21.7972          -9.32327
WIL-MOD-03b86a88-6   False     11.2566          -9.12435
WIL-UNI-2e73223c-4   False     99.5000          -9.22982

[1585 rows x 3 columns]


Merge with date dataframe

In [4]:
# pandas is not imported by any earlier cell, so import it here; without this
# the cell raises NameError on a fresh kernel (Restart & Run All).
import pandas as pd

# CSV of Moonshot submission data (absolute path on the original author's
# machine); it is merged onto the docking results via 'submission_id' below.
dateFile = '/home/wjm41/ml_physics/frag-pcore-screen/data/20220122_moonshot_submissions_data_for_alpha.csv'
df_date = pd.read_csv(dateFile)


def remove_suffix(id):
    """Return ``id`` with its last '-'-separated field removed.

    For example ``'AAR-POS-0daf6b7e-23'`` becomes ``'AAR-POS-0daf6b7e'``.
    An ``id`` that contains no hyphen yields an empty string.
    """
    # rpartition splits on the final hyphen; when there is none, the leading
    # part is '' — the same result as joining an empty list of fields.
    prefix, _, _ = id.rpartition('-')
    return prefix


# Derive each compound's submission id by stripping the trailing field of its
# canonical id, then attach the submission data on that key. The join is an
# inner join (pandas' default), so compounds without a matching submission row
# are dropped.
df_docking_grouped['submission_id'] = df_docking_grouped['canonical_CID'].map(
    remove_suffix)
df_merged = df_docking_grouped.merge(
    right=df_date,
    how='inner',
    on='submission_id',
)


In [5]:
# Plain-text dump of the merged frame, to eyeball that the submission columns
# were attached to the docking results.
print(df_merged)

      Chemgauss4 Score  f_avg_IC50        canonical_CID    hit  \
0             -5.71945         NaN  AAR-POS-0daf6b7e-23  False   
1             -5.24347     99.5000   AAR-POS-5507155c-1  False   
2             -9.23545         NaN   AAR-POS-8a4e0f60-1  False   
3             -6.75070         NaN  AAR-POS-8a4e0f60-10  False   
4             -9.57557         NaN   AAR-POS-8a4e0f60-2  False   
...                ...         ...                  ...    ...   
1580          -8.93996     19.0909   WIL-MOD-03b86a88-2  False   
1581          -9.98616     15.9534   WIL-MOD-03b86a88-4  False   
1582          -9.32327     21.7972   WIL-MOD-03b86a88-5  False   
1583          -9.12435     11.2566   WIL-MOD-03b86a88-6  False   
1584          -9.22982     99.5000   WIL-UNI-2e73223c-4  False   

         submission_id  num_mols  \
0     AAR-POS-0daf6b7e        47   
1     AAR-POS-5507155c         5   
2     AAR-POS-8a4e0f60        12   
3     AAR-POS-8a4e0f60        12   
4     AAR-POS-8a4e0f60     

Lazy hack for a figure - just use the rescaled docking score as a proxy for "predicted IC50"

In [14]:
import plotly.express as px
from ipywidgets import interact

def plot_regression_against_time(year, month, day, ic50_threshold):
    """Scatter a docking-derived "predicted IC50" against the measured IC50.

    Rows of the module-level ``df_merged`` are kept when their ``date`` is
    strictly below the cutoff built from ``year``/``month``/``day`` and their
    ``f_avg_IC50`` is strictly below ``ic50_threshold``. The Chemgauss4 score
    of the kept rows is standardised and then rescaled to the mean and
    standard deviation of their measured IC50, and the result is plotted as
    the "prediction".

    Args:
        year: Year part of the cutoff (e.g. ``'2020'``).
        month: Zero-padded month part of the cutoff (e.g. ``'03'``).
        day: Zero-padded day part of the cutoff (e.g. ``'18'``).
        ic50_threshold: Exclusive upper bound on ``f_avg_IC50``.

    Returns:
        None. The figure is displayed as a side effect.
    """
    # The three parts are concatenated and read as one integer.
    # NOTE(review): this assumes df_merged['date'] holds YYYYMMDD-style
    # integers — confirm against the CSV.
    date_to_filter = int(str(year) + str(month) + str(day))

    # The names date_to_filter and ic50_threshold are resolved by query()
    # through the '@' prefix, so they must stay local to this function.
    selected = df_merged.query('date < @date_to_filter & f_avg_IC50 < @ic50_threshold').copy()

    docking_score = selected['Chemgauss4 Score']
    measured_ic50 = selected['f_avg_IC50']

    # Standardise the docking score, then map it onto the IC50 scale.
    standardised_score = (docking_score - docking_score.mean()) / docking_score.std()
    selected['Chemgauss4 Score'] = standardised_score * measured_ic50.std() + measured_ic50.mean()

    n_points = len(selected)
    axis_labels = {
        "f_avg_IC50": "Actual IC50 (uM)",
        "Chemgauss4 Score": "Predicted IC50 (uM)",
    }
    fig_scatter = px.scatter(
        selected,
        x='f_avg_IC50',
        y='Chemgauss4 Score',
        title=f'Predicted IC50 vs Actual IC50 (n = {n_points})',
        labels=axis_labels,
        width=800,
    )
    fig_scatter.show()

# Dropdown options for the cutoff date. Month and day are zero-padded strings
# because the plotting function concatenates year+month+day into one integer.
years_to_choose_from = ['2021', '2020']
months_to_choose_from = [f'{month:02d}' for month in range(1, 13)]
# Days run 01..31 inclusive; the previous range(10, 31) stopped at 30, so the
# 31st of a month could never be chosen as the cutoff.
days_to_choose_from = [f'{day:02d}' for day in range(1, 32)]

# One dropdown per date part plus a float slider (0-100) for the IC50 ceiling.
interact(plot_regression_against_time,
         year=years_to_choose_from, month=months_to_choose_from, day=days_to_choose_from, ic50_threshold=(0.0, 100.0))


interactive(children=(Dropdown(description='year', options=('2021', '2020'), value='2021'), Dropdown(descripti…

<function __main__.plot_regression_against_time(year, month, day, ic50_threshold)>