# Examine CDD data

20-08-2022 Check and label activity measurements from Nir to model predictions

In [1]:
import pandas as pd

df_cdd = pd.read_excel(
    '../data/measurements/CDD Excel Export - 2022-08-18 12h31m51s.xlsx')
df_cdd['SMILES'] = df_cdd["CXSMILES (CDD Compatible)"].str.replace(' .*', '')
df_cdd['IC50'] = df_cdd['Protease_SARS_Cov2_Mpro_fluorescence_dose_response: IC50 (µM)'].astype(float)
df_cdd.loc[0, 'SMILES']


  df_cdd['SMILES'] = df_cdd["CXSMILES (CDD Compatible)"].str.replace(' .*', '')


'CC(=O)NC1=CC=C(NC(=O)CN2C[C@H](C(=O)NC3=CN=CC4=CC=CC=C34)C3=CC(Cl)=CC=C3C2)C=C1'

Load our predictions

In [2]:
df_top20 = pd.read_csv('../data/predictions/correct_chlorine/top_20.csv')
df_false_negatives = pd.read_csv('../data/predictions/correct_chlorine/false_negatives.csv')
df_false_negatives['Average Prediction (%)'] = df_false_negatives[['RF Predicted Inhibition (%)', 'GP Predicted Inhibition (%)']].mean(axis=1)
df_best_amides = pd.read_csv('../data/predictions/correct_chlorine/best_amides.csv')
df_best_amides['Average Prediction (%)'] = df_best_amides[[
    'Random Forest', 'Gaussian Process']].mean(axis=1)


In [3]:
df_false_negatives


Unnamed: 0,Molecule Name,SMILES,Measured Inhibition (%),RF Predicted Inhibition (%),GP Predicted Inhibition (%),Average Prediction (%)
0,PCM-0223655,O=C(Nc1cncc2ccccc12)[C@@H]1CN(CC(=O)N(C2CCCCC2...,20.7,47.8474,59.783384,53.815392
1,PCM-0223597,CN1CCN(c2ccccc2NC(=O)CN2Cc3ccc(Cl)cc3[C@H](C(=...,24.5,62.032,56.818089,59.425045
2,PCM-0223562,CC(C)(C)N(Cc1ccccc1)C(=O)CN1Cc2ccc(Cl)cc2[C@H]...,15.5,44.2642,61.708616,52.986408
3,PCM-0223543,O=C(CN1Cc2ccc(Cl)cc2[C@H](C(=O)Nc2cncc3ccccc23...,33.8,79.4698,71.954646,75.712223
4,PCM-0223419,O=C(CN1Cc2ccc(Cl)cc2[C@H](C(=O)Nc2cncc3ccccc23...,16.5,54.2816,39.831777,47.056688


In [4]:
df_best_amides['SMILES'] = df_best_amides['amide']

In [5]:
import useful_rdkit_utils

def gen_fps(df_in):
    useful_rdkit_utils.add_molecule_and_errors(
        df_in, smiles_col='SMILES', mol_col_name='mol', error_col_name='Error')

    df_in['fps'] = [useful_rdkit_utils.mol2morgan_fp(mol) for mol in df_in['mol'].values]
    return df_in

for df in [df_top20, df_false_negatives, df_best_amides, df_cdd]:
    gen_fps(df)

INFO:rdkit:Enabling RDKit 2021.09.4 jupyter extensions


In [6]:
from rdkit import DataStructs
from rdkit.Chem import MolFromSmiles, AllChem
from tqdm import tqdm

import numpy as np

df_top20['Data'] = 'Top 20'
df_false_negatives['Data'] = 'False Negative'
df_best_amides['Data'] = 'Enamine Amide'
df_submitted = pd.concat([df_top20, df_false_negatives, df_best_amides]).reset_index(drop=True)
sim_mat = np.empty((len(df_cdd), len(df_submitted)))
for i in tqdm(range(len(df_cdd['fps']))):
    sim_mat[i] = np.array(
        DataStructs.BulkTanimotoSimilarity(df_cdd.loc[i, 'fps'], df_submitted['fps'].values))

    similarity = sim_mat[i]
    # print(similarity)
    if np.max(similarity) > 0.8:
        j = np.argmax(similarity)
        # print(f"{df1.loc[i, 'SMILES']} is the same as {df2.loc[np.argmax(similarity), 'SMILES']} with similarity {np.max(similarity)}")
        data_source = df_submitted.loc[j, 'Data']
        if data_source == 'Enamine Amide':
            print(
                f"ID={df_cdd.loc[i, 'Molecule Name']},IC50={df_cdd.loc[i, 'IC50']:.3f}uM from {data_source}, Predicted inhibition = (GP = {df_submitted.loc[j, 'Gaussian Process']:.1f}% , RF = {df_submitted.loc[j, 'Random Forest']:.1f}%)")
        else:
            print(f"ID={df_cdd.loc[i, 'Molecule Name']},IC50={df_cdd.loc[i, 'IC50']:.3f}uM from {data_source}, Predicted inhibition = (Average = {df_submitted.loc[j, 'Average Prediction (%)']:.1f}%)")

        df_cdd.loc[i, 'Data'] = df_submitted.loc[j, 'Data']


ID=ASAP-0000164,IC50=0.034uM from Top 20, Predicted inhibition = (Average = 52.6%)
ID=ASAP-0000169,IC50=0.043uM from Enamine Amide, Predicted inhibition = (GP = 82.0% , RF = 87.3%)
ID=ASAP-0000155,IC50=0.046uM from Top 20, Predicted inhibition = (Average = 77.3%)
ID=ASAP-0000157,IC50=0.062uM from Top 20, Predicted inhibition = (Average = 83.4%)
ID=ASAP-0000156,IC50=0.068uM from Top 20, Predicted inhibition = (Average = 51.8%)
ID=ASAP-0000162,IC50=0.070uM from Enamine Amide, Predicted inhibition = (GP = 84.6% , RF = 88.4%)
ID=ASAP-0000168,IC50=0.071uM from Enamine Amide, Predicted inhibition = (GP = 82.7% , RF = 84.2%)
ID=ASAP-0000167,IC50=0.073uM from Enamine Amide, Predicted inhibition = (GP = 81.8% , RF = 85.0%)
ID=ASAP-0000161,IC50=0.077uM from Enamine Amide, Predicted inhibition = (GP = 86.6% , RF = 82.6%)
ID=ASAP-0000159,IC50=0.086uM from Top 20, Predicted inhibition = (Average = 74.7%)
ID=ASAP-0000158,IC50=0.087uM from Top 20, Predicted inhibition = (Average = 85.6%)
ID=ASAP-0000

In [7]:
df_submitted

Unnamed: 0,Molecule Name,SMILES,Measured Inhibition (%),Average Prediction (%),FP,mol,Error,fps,Data,RF Predicted Inhibition (%),GP Predicted Inhibition (%),ID,amide,Random Forest,Gaussian Process
0,PCM-0223366,CCN1CCN(c2ccc(NC(=O)CN3Cc4ccc(Cl)cc4[C@H](C(=O...,97.4,56.9,True,<rdkit.Chem.rdchem.Mol object at 0x7fa83ff45de0>,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Top 20,,,,,,
1,PCM-0223638,CC(=O)Nc1ccc(NC(=O)CN2Cc3ccc(Cl)cc3[C@H](C(=O)...,95.2,52.6,True,<rdkit.Chem.rdchem.Mol object at 0x7fa83ff45f00>,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Top 20,,,,,,
2,PCM-0223648,O=C(CN1Cc2ccc(Cl)cc2[C@H](C(=O)Nc2cncc3ccccc23...,92.0,77.3,False,<rdkit.Chem.rdchem.Mol object at 0x7fa83ff45f60>,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Top 20,,,,,,
3,PCM-0223612,CN1CCN(c2ccc(NC(=O)CN3Cc4ccc(Cl)cc4[C@H](C(=O)...,91.6,35.8,True,<rdkit.Chem.rdchem.Mol object at 0x7fa83ff45fc0>,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...",Top 20,,,,,,
4,PCM-0223642,O=C(CN1Cc2ccc(Cl)cc2[C@H](C(=O)Nc2cncc3ccccc23...,91.4,51.8,True,<rdkit.Chem.rdchem.Mol object at 0x7fa83ff46020>,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Top 20,,,,,,
5,PCM-0223523,COc1cc(CNC(=O)CN2Cc3ccc(Cl)cc3[C@H](C(=O)Nc3cn...,90.5,76.7,False,<rdkit.Chem.rdchem.Mol object at 0x7fa83ff46080>,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Top 20,,,,,,
6,PCM-0223525,O=C(CN1Cc2ccc(Cl)cc2[C@H](C(=O)Nc2cncc3ccccc23...,90.5,83.4,False,<rdkit.Chem.rdchem.Mol object at 0x7fa83ff460e0>,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Top 20,,,,,,
7,PCM-0223397,Cc1ccc2[nH]cc(CCNC(=O)CN3Cc4ccc(Cl)cc4[C@H](C(...,90.1,53.9,True,<rdkit.Chem.rdchem.Mol object at 0x7fa83ff46140>,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Top 20,,,,,,
8,PCM-0223544,O=C(CN1Cc2ccc(Cl)cc2[C@H](C(=O)Nc2cncc3ccccc23...,90.1,36.7,True,<rdkit.Chem.rdchem.Mol object at 0x7fa83ff461a0>,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Top 20,,,,,,
9,PCM-0223452,O=C(CN1Cc2ccc(Cl)cc2[C@H](C(=O)Nc2cncc3ccccc23...,88.3,85.6,False,<rdkit.Chem.rdchem.Mol object at 0x7fa83ff46200>,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Top 20,,,,,,


04-09-2022 2nd batch from Nir

In [8]:
df_cdd_2 = pd.read_excel(
    '../data/measurements/CDD Excel Export - 2022-09-07 03h27m51s.xlsx')
df_cdd_2['IC50'] = df_cdd_2['Protease_SARS_Cov2_Mpro_fluorescence_dose_response: IC50 (µM)'].astype(str).str.replace(
    '< ', '')
df_cdd_2['IC50'] = df_cdd_2['IC50'].astype(float)
df_cdd_2['SMILES'] = df_cdd_2["CXSMILES (CDD Compatible)"].str.replace(
    ' .*', '')
df_cdd_2.loc[0, 'SMILES']


100%|██████████| 17/17 [00:00<00:00, 1226.93it/s]
  df_cdd_2['SMILES'] = df_cdd_2["CXSMILES (CDD Compatible)"].str.replace(
100%|██████████| 25/25 [00:00<00:00, 1508.55it/s]
  df_cdd_3['SMILES'] = df_cdd_3["CXSMILES (CDD Compatible)"].str.replace(
100%|██████████| 101/101 [00:00<00:00, 1603.71it/s]

Dropping invalid columns in DataFrameGroupBy.max is deprecated. In a future version, a TypeError will be raised. Before calling .max, select only columns which should be valid for the function.


Dropping invalid columns in DataFrameGroupBy.min is deprecated. In a future version, a TypeError will be raised. Before calling .min, select only columns which should be valid for the function.


Dropping invalid columns in DataFrameGroupBy.max is deprecated. In a future version, a TypeError will be raised. Before calling .max, select only columns which should be valid for the function.


Dropping invalid columns in DataFrameGroupBy.min is deprecated. In a future version, a TypeError will be raised

'CC1=CC=C(CCNC(=O)CN2C[C@H](C(=O)NC3=CN=CC4=CC=CC=C34)C3=CC(Cl)=CC=C3C2)C=C1'

In [9]:
df_cdd_2 = df_cdd_2.sort_values(by='IC50', ascending=True).reset_index(drop=True)
gen_fps(df_cdd_2)
sim_mat = np.empty((len(df_cdd_2), len(df_submitted)))
for i in tqdm(range(len(df_cdd_2['fps']))):
    sim_mat[i] = np.array(
        DataStructs.BulkTanimotoSimilarity(df_cdd_2.loc[i, 'fps'], df_submitted['fps'].values))

    similarity = sim_mat[i]
    # print(similarity)
    if np.max(similarity) > 0.8:
        j = np.argmax(similarity)
        # print(
        #     f"{df_cdd_2.loc[i, 'SMILES']} is the same as {df_submitted.loc[np.argmax(similarity), 'SMILES']} with similarity {np.max(similarity)}")
        print(
            f"ID={df_cdd_2.loc[i, 'Molecule Name']},IC50={df_cdd_2.loc[i, 'IC50']:.3f}uM from {df_submitted.loc[j, 'Data']}, Predicted inhibition = (GP = {df_submitted.loc[j, 'Gaussian Process']:.1f}% , RF = {df_submitted.loc[j, 'Random Forest']:.1f}%)")
        df_cdd_2.loc[i, 'Data'] = df_submitted.loc[j, 'Data']


ID=ASAP-0000219,IC50=0.050uM from Enamine Amide, Predicted inhibition = (GP = 87.9% , RF = 80.6%)
ID=ASAP-0000221,IC50=0.050uM from Top 20, Predicted inhibition = (GP = nan% , RF = nan%)
ID=ASAP-0000211,IC50=0.050uM from Enamine Amide, Predicted inhibition = (GP = 84.3% , RF = 83.5%)
ID=ASAP-0000226,IC50=0.057uM from Enamine Amide, Predicted inhibition = (GP = 85.7% , RF = 57.5%)
ID=ASAP-0000214,IC50=0.057uM from Top 20, Predicted inhibition = (GP = nan% , RF = nan%)
ID=ASAP-0000225,IC50=0.057uM from Top 20, Predicted inhibition = (GP = nan% , RF = nan%)
ID=ASAP-0000209,IC50=0.064uM from Top 20, Predicted inhibition = (GP = nan% , RF = nan%)
ID=ASAP-0000205,IC50=0.069uM from Top 20, Predicted inhibition = (GP = nan% , RF = nan%)
ID=ASAP-0000213,IC50=0.071uM from Top 20, Predicted inhibition = (GP = nan% , RF = nan%)
ID=ASAP-0000218,IC50=0.075uM from Top 20, Predicted inhibition = (GP = nan% , RF = nan%)
ID=ASAP-0000216,IC50=0.077uM from Enamine Amide, Predicted inhibition = (GP = 84.3%

In [10]:
print(f'Number of molecules: {len(df_cdd)+len(df_cdd_2)}')

Number of molecules: 42


2022-11-21 Full dose-response

In [11]:
df_cdd_3 = pd.read_excel(
    '../data/measurements/CDD Excel Export - 2022-11-15 21h48m35s.xlsx')
df_cdd_3['IC50'] = df_cdd_3['Protease_SARS_Cov2_Mpro_fluorescence_dose_response: IC50 (µM)'].astype(str).str.replace(
    '< ', '')
df_cdd_3['IC50'] = df_cdd_3['IC50'].astype(str).str.replace(
    '> ', '')
df_cdd_3['IC50'] = df_cdd_3['IC50'].astype(float)
df_cdd_3['SMILES'] = df_cdd_3["CXSMILES (CDD Compatible)"].str.replace(
    ' .*', '')
df_cdd_3.loc[0, 'SMILES']


'FC(F)(F)C1=CN=C(NC(=O)CN2C[C@H](C(=O)NC3=C4C=CC=CC4=CN=C3)C3=C(C2)C=CC(Cl)=C3)C=C1'

In [12]:
df_cdd_3 = df_cdd_3.sort_values(
    by='IC50', ascending=True).reset_index(drop=True)
gen_fps(df_cdd_3)
sim_mat = np.empty((len(df_cdd_3), len(df_submitted)))
for i in tqdm(range(len(df_cdd_3['fps']))):
    sim_mat[i] = np.array(
        DataStructs.BulkTanimotoSimilarity(df_cdd_3.loc[i, 'fps'], df_submitted['fps'].values))

    similarity = sim_mat[i]
    # print(similarity)
    if np.max(similarity) > 0.8:
        j = np.argmax(similarity)
        # print(
        #     f"{df_cdd_2.loc[i, 'SMILES']} is the same as {df_submitted.loc[np.argmax(similarity), 'SMILES']} with similarity {np.max(similarity)}")
        print(
            f"ID={df_cdd_3.loc[i, 'Molecule Name']},IC50={df_cdd_3.loc[i, 'IC50']:.3f}uM from {df_submitted.loc[j, 'Data']}, Predicted inhibition = (GP = {df_submitted.loc[j, 'Gaussian Process']:.1f}% , RF = {df_submitted.loc[j, 'Random Forest']:.1f}%)")
        df_cdd_3.loc[i, 'Data'] = df_submitted.loc[j, 'Data']
        df_cdd_3.loc[i, 'FP'] = df_submitted.loc[j, 'FP']


ID=ASAP-0000221,IC50=0.028uM from Top 20, Predicted inhibition = (GP = nan% , RF = nan%)
ID=ASAP-0000219,IC50=0.034uM from Enamine Amide, Predicted inhibition = (GP = 87.9% , RF = 80.6%)
ID=ASAP-0000164,IC50=0.034uM from Top 20, Predicted inhibition = (GP = nan% , RF = nan%)
ID=ASAP-0000164,IC50=0.034uM from Top 20, Predicted inhibition = (GP = nan% , RF = nan%)
ID=ASAP-0000226,IC50=0.037uM from Enamine Amide, Predicted inhibition = (GP = 85.7% , RF = 57.5%)
ID=ASAP-0000169,IC50=0.043uM from Enamine Amide, Predicted inhibition = (GP = 82.0% , RF = 87.3%)
ID=ASAP-0000155,IC50=0.045uM from Top 20, Predicted inhibition = (GP = nan% , RF = nan%)
ID=ASAP-0000219,IC50=0.046uM from Enamine Amide, Predicted inhibition = (GP = 87.9% , RF = 80.6%)
ID=ASAP-0000155,IC50=0.046uM from Top 20, Predicted inhibition = (GP = nan% , RF = nan%)
ID=ASAP-0000226,IC50=0.046uM from Enamine Amide, Predicted inhibition = (GP = 85.7% , RF = 57.5%)
ID=ASAP-0000219,IC50=0.050uM from Enamine Amide, Predicted inhibi

In [13]:
df_cdd_all = pd.concat([df_cdd, df_cdd_2, df_cdd_3])
df_cdd_all.query('`Molecule Name` == "ASAP-0000226"')

Unnamed: 0,Molecule Name,Structure,CXSMILES (CDD Compatible),SMILES,Protease_SARS_Cov2_Mpro_fluorescence_dose_response: IC50 (µM),Protease_SARS_Cov2_Mpro_fluorescence_dose_response: Dose-response Plot,IC50,mol,Error,fps,Data,CDD Number,FP
3,ASAP-0000226,,CCN1CCN(CC1)C1=CC=C(NC(=O)CN2C[C@H](C(=O)NC3=C...,CCN1CCN(CC1)C1=CC=C(NC(=O)CN2C[C@H](C(=O)NC3=C...,0.057,,0.057,<rdkit.Chem.rdchem.Mol object at 0x7fa83ffbd120>,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Enamine Amide,,
4,ASAP-0000226,,CCN1CCN(CC1)C1=CC=C(NC(=O)CN2C[C@H](C(=O)NC3=C...,CCN1CCN(CC1)C1=CC=C(NC(=O)CN2C[C@H](C(=O)NC3=C...,0.037,,0.037,<rdkit.Chem.rdchem.Mol object at 0x7fa8401584c0>,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Enamine Amide,,
9,ASAP-0000226,,CCN1CCN(CC1)C1=CC=C(NC(=O)CN2C[C@H](C(=O)NC3=C...,CCN1CCN(CC1)C1=CC=C(NC(=O)CN2C[C@H](C(=O)NC3=C...,< 0.046,,0.046,<rdkit.Chem.rdchem.Mol object at 0x7fa8401590c0>,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Enamine Amide,,
14,ASAP-0000226,,CCN1CCN(CC1)C1=CC=C(NC(=O)CN2C[C@H](C(=O)NC3=C...,CCN1CCN(CC1)C1=CC=C(NC(=O)CN2C[C@H](C(=O)NC3=C...,0.056,,0.056,<rdkit.Chem.rdchem.Mol object at 0x7fa8401589a0>,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Enamine Amide,,
17,ASAP-0000226,,CCN1CCN(CC1)C1=CC=C(NC(=O)CN2C[C@H](C(=O)NC3=C...,CCN1CCN(CC1)C1=CC=C(NC(=O)CN2C[C@H](C(=O)NC3=C...,0.057,,0.057,<rdkit.Chem.rdchem.Mol object at 0x7fa840158ac0>,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Enamine Amide,,
91,ASAP-0000226,,CCN1CCN(CC1)C1=CC=C(NC(=O)CN2C[C@H](C(=O)NC3=C...,CCN1CCN(CC1)C1=CC=C(NC(=O)CN2C[C@H](C(=O)NC3=C...,1.168,,1.168,<rdkit.Chem.rdchem.Mol object at 0x7fa84015aa40>,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Enamine Amide,,


In [14]:
df_cdd_all = pd.concat([df_cdd, df_cdd_2, df_cdd_3])
# df_cdd_all = df_cdd_all[['Molecule Name',
#                          'CXSMILES (CDD Compatible)', 'SMILES', 'IC50', 'Data']]


In [15]:
import plotly.express as px
import molplotly

fig = px.strip(df_cdd_all, x="IC50", y='Data', color="Data",
               log_x=True, hover_name='Molecule Name')
strip_app = molplotly.add_molecules(fig,
                                    df_cdd_all,
                                    smiles_col='SMILES',
                                    color_col='Data',
                                    title_col='Molecule Name',
                                    svg_size=400,
                                    width=300)
# fig.show()
strip_app.run_server(port=8011,mode='inline')

In [16]:
df_cdd_3.groupby('Data').median().sort_values('IC50')*1000


Unnamed: 0_level_0,CDD Number,IC50
Data,Unnamed: 1_level_1,Unnamed: 2_level_1
Top 20,,82.0
Enamine Amide,,83.0
False Negative,,1279.5


In [50]:
df_cdd_3.query('Data != "False Negative"').groupby(
    'Molecule Name').mean()


Unnamed: 0_level_0,CDD Number,IC50
Molecule Name,Unnamed: 1_level_1,Unnamed: 2_level_1
ASAP-0000155,,0.0455
ASAP-0000156,,0.068
ASAP-0000157,,0.061
ASAP-0000158,,0.112
ASAP-0000159,,0.0775
ASAP-0000160,,0.104
ASAP-0000161,,0.077
ASAP-0000162,,0.079
ASAP-0000163,,0.193
ASAP-0000164,,0.034


In [61]:
import time

df_ic50 = df_cdd_3.query('Data != "False Negative"').groupby('Molecule Name').mean()*1000
df_ic50['SMILES'] = df_cdd_3.groupby('Molecule Name')['SMILES'].first()
df_ic50['Data'] = df_cdd_3.groupby('Molecule Name')['Data'].first()

# set index as column
df_ic50 = df_ic50.reset_index(level=0)
df_ic50 = df_ic50.round(1)

df_max = df_cdd_3.query('Data != "False Negative"').groupby(
    'Molecule Name').max()*1000
df_max.rename(columns={'IC50': 'IC50_max'}, inplace=True)
df_max = df_max.reset_index(level=0)

df_min = df_cdd_3.query('Data != "False Negative"').groupby(
    'Molecule Name').min()*1000
df_min.rename(columns={'IC50': 'IC50_min'}, inplace=True)
df_min = df_min.reset_index(level=0)

# set index as column
# df_std = df_std.round(1)

df_ic50['max_error'] = df_max['IC50_max'] - df_ic50['IC50']
df_ic50['min_error'] = df_ic50['IC50'] - df_min['IC50_min']
newnames = {'Top 20': 'Top 20 Crude Compounds',
            'Enamine Amide': 'Top 20 ML Compounds',}
df_ic50
df_ic50.Data = df_ic50.Data.map(newnames)
fig = px.scatter(df_ic50, 
               x="IC50",
            #    error_x="max_error",
            #    error_x_minus="min_error",
               y='Data',
               color="Data",
               log_x=True, 
               labels = {'IC50':' Mean IC50 (nM)',
                         'Data': ''},
               title='Distribution of Bioactivity for Purified Compounds',
            #    category_orders={"Data": [
            #        "Top 20", "Enamine Amide", "False Negative"]},
               hover_name='Molecule Name',
               template='simple_white',
               width=1000)
fig.update_xaxes(showgrid=True)
fig.update_layout(showlegend=False)
fig.write_image('strip_plot.pdf')
time.sleep(2)
fig.write_image('strip_plot.pdf')
fig.show()


In [18]:
strip_app = molplotly.add_molecules(fig,
                                    df_ic50,
                                    smiles_col='SMILES',
                                    color_col='Data',
                                    title_col='Molecule Name',
                                    svg_size=400,
                                    width=300)
strip_app.run_server(port=8011,mode='inline', width=1000)

In [19]:
len(df_cdd_3.query('Data == "Top 20"')['Molecule Name'].unique())

21

In [20]:
len(df_cdd_3.query('Data == "Enamine Amide"')['Molecule Name'].unique())

19

In [21]:
len(df_cdd_all.query('Data == "False Negative"')['Molecule Name'].unique())


4