Imports

In [1]:
import pandas as pd
from tqdm import tqdm

from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import AllChem
import mols2grid

import useful_rdkit_utils

from IPython.display import display

import plotly.express as px
import molplotly


INFO:rdkit:Enabling RDKit 2021.09.4 jupyter extensions


Load spreadsheets

In [2]:
activity_df = pd.read_csv(
    '../data/amine_coupling_yield_updated/activity.csv', usecols=['SMILES y', 'mean_corr_activity']).rename(columns={'SMILES y': 'smiles', 'mean_corr_activity': 'activity'})

yield_df = pd.read_csv('../data/amine_coupling_yield_updated/yield.csv').rename(columns={'SMILES': 'smiles'})
# added column titles by hand - comment, yield and Plate Well instead of Well

display(activity_df)
display(yield_df)

Unnamed: 0,smiles,activity
0,O=C(CN(CC1C(Nc2cncc3ccccc23)=O)Cc(cc2)c1cc2Cl)...,51.8
1,O=C(CN(CC1C(Nc2cncc3ccccc23)=O)Cc(cc2)c1cc2Cl)...,9.9
2,O=C(CN(CC1C(Nc2cncc3ccccc23)=O)Cc(cc2)c1cc2Cl)...,9.5
3,[O-][N+](c(cc1)ccc1NC(CN(CC1C(Nc2cncc3ccccc23)...,83.6
4,CNC(c(cccc1)c1NC(CN(CC1C(Nc2cncc3ccccc23)=O)Cc...,71.1
...,...,...
295,N#Cc(cc1)cc(I)c1NC(CN(CC1C(Nc2cncc3ccccc23)=O)...,84.5
296,O=C(CN(CC1C(Nc2cncc3ccccc23)=O)Cc(cc2)c1cc2Cl)...,76.5
297,O=C(CN(CC1C(Nc2cncc3ccccc23)=O)Cc(cc2)c1cc2Cl)...,24.6
298,Cc1cccc(C)c1NC(CN(CC1)CCN1C(CN(CC1C(Nc2cncc3cc...,65.5


Unnamed: 0,Well,HOAt,Acid,P,random stuff,comment,yield,Plate Well,smiles
0,A1,36,35.0,0.0,29,,0%,A1,O=C(CN(CC1C(Nc2cncc3ccccc23)=O)Cc(cc2)c1cc2Cl)...
1,A2,45,17.0,20.0,18,,36%,A10,O=C(CN(CC1C(Nc2cncc3ccccc23)=O)Cc(cc2)c1cc2Cl)...
2,A3,45,42.0,0.0,13,,0%,A11,N#Cc(ccc(NC(CN(CC1C(Nc2cncc3ccccc23)=O)Cc(cc2)...
3,A4,35,29.0,32.0,4,,49%,A12,CCN(CC1)CCN1c(cc1)ccc1NC(CN(CC1C(Nc2cncc3ccccc...
4,A5,46,0.0,49.0,5,,91%,A13,CCN(CC)S(c(cc1)ccc1NC(CN(CC1C(Nc2cncc3ccccc23)...
...,...,...,...,...,...,...,...,...,...
295,P14,40,36.0,0.0,24,,0%,P5,COc1cccc(CCNC(CN(CC2C(Nc3cncc4ccccc34)=O)Cc(cc...
296,P15,42,38.0,1.0,19,amine,2%,P6,O=C(CN(CC1C(Nc2cncc3ccccc23)=O)Cc(cc2)c1cc2Cl)...
297,P16,27,2.0,0.0,71,amine,0%,P7,O=C(CN(CC1C(Nc2cncc3ccccc23)=O)Cc(cc2)c1cc2Cl)...
298,P17,56,0.0,41.0,3,,93%,P8,O=C(CN(CC1C(Nc2cncc3ccccc23)=O)Cc(cc2)c1cc2Cl)...


Check SMILES merge

In [3]:
merged_df = activity_df.merge(yield_df, on='smiles')
display(merged_df)

# Incorrect smiles in activity_df
# O=C(CN(CC1C(Nc2cncc3ccccc23)=O)Cc(cc2)c1cc2Cl)Nc1ccc(cc(cc2)Br)c2c0 was fixed to 
# O=C(CN(CC1C(Nc2cncc3ccccc23)=O)Cc(cc2)c1cc2Cl)Nc1ccc(cc(cc2)Br)c2c1


Unnamed: 0,smiles,activity,Well,HOAt,Acid,P,random stuff,comment,yield,Plate Well
0,O=C(CN(CC1C(Nc2cncc3ccccc23)=O)Cc(cc2)c1cc2Cl)...,51.8,K11,39,38.0,0.0,23,,0%,K19
1,O=C(CN(CC1C(Nc2cncc3ccccc23)=O)Cc(cc2)c1cc2Cl)...,9.9,J11,17,20.0,0.0,63,amine 55%,0%,J19
2,O=C(CN(CC1C(Nc2cncc3ccccc23)=O)Cc(cc2)c1cc2Cl)...,9.5,I11,40,4.0,45.0,11,,75%,I19
3,[O-][N+](c(cc1)ccc1NC(CN(CC1C(Nc2cncc3ccccc23)...,83.6,L11,25,25.0,0.0,50,amine,0%,L19
4,CNC(c(cccc1)c1NC(CN(CC1C(Nc2cncc3ccccc23)=O)Cc...,71.1,P3,45,41.0,0.0,14,,0%,P11
...,...,...,...,...,...,...,...,...,...,...
295,N#Cc(cc1)cc(I)c1NC(CN(CC1C(Nc2cncc3ccccc23)=O)...,84.5,L3,32,33.0,0.0,35,amine,0%,L11
296,O=C(CN(CC1C(Nc2cncc3ccccc23)=O)Cc(cc2)c1cc2Cl)...,76.5,P16,27,2.0,0.0,71,amine,0%,P7
297,O=C(CN(CC1C(Nc2cncc3ccccc23)=O)Cc(cc2)c1cc2Cl)...,24.6,H2,38,21.0,23.0,18,,37%,H10
298,Cc1cccc(C)c1NC(CN(CC1)CCN1C(CN(CC1C(Nc2cncc3cc...,65.5,H16,48,0.0,48.0,4,,92%,H6


Sort out plate wells

In [4]:
df_with_yield = merged_df[['smiles', 'Plate Well', 'activity']]
for i, row in df_with_yield.iterrows():
    yield_slice = merged_df.query("Well == @row['Plate Well']").copy()
    df_with_yield.loc[i, 'yield (%)'] = yield_slice['yield'].str.rstrip('%').astype(float).values[0]
display(df_with_yield)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_with_yield.loc[i, 'yield (%)'] = yield_slice['yield'].str.rstrip('%').astype(float).values[0]


Unnamed: 0,smiles,Plate Well,activity,yield (%)
0,O=C(CN(CC1C(Nc2cncc3ccccc23)=O)Cc(cc2)c1cc2Cl)...,K19,51.8,96.0
1,O=C(CN(CC1C(Nc2cncc3ccccc23)=O)Cc(cc2)c1cc2Cl)...,J19,9.9,100.0
2,O=C(CN(CC1C(Nc2cncc3ccccc23)=O)Cc(cc2)c1cc2Cl)...,I19,9.5,93.0
3,[O-][N+](c(cc1)ccc1NC(CN(CC1C(Nc2cncc3ccccc23)...,L19,83.6,0.0
4,CNC(c(cccc1)c1NC(CN(CC1C(Nc2cncc3ccccc23)=O)Cc...,P11,71.1,0.0
...,...,...,...,...
295,N#Cc(cc1)cc(I)c1NC(CN(CC1C(Nc2cncc3ccccc23)=O)...,L11,84.5,0.0
296,O=C(CN(CC1C(Nc2cncc3ccccc23)=O)Cc(cc2)c1cc2Cl)...,P7,76.5,100.0
297,O=C(CN(CC1C(Nc2cncc3ccccc23)=O)Cc(cc2)c1cc2Cl)...,H10,24.6,98.0
298,Cc1cccc(C)c1NC(CN(CC1)CCN1C(CN(CC1C(Nc2cncc3cc...,H6,65.5,100.0


Visualise Yields

In [5]:
fig = px.scatter(df_with_yield, x='activity', y='yield (%)', width=800, height=600,
                 title='Activity against yield')

app = molplotly.add_molecules(
    fig=fig, df=df_with_yield, smiles_col='smiles', wrap=True, wraplen=25, width=150)

app.run_server(mode='inline', port=8012, height=650)


Load model predictions

In [6]:
from rdkit.Chem import rdChemReactions

replace_this = 'O=C(CN(CC1C(Nc2cncc3ccccc23)=O)Cc(cc(Cl)2)c1cc2)-[*:1]'
with_this = 'O=C(CN(CC1C(Nc2cncc3ccccc23)=O)Cc(cc2)c1cc2Cl)-[*:1]'
my_rxn = rdChemReactions.ReactionFromSmarts(f"{replace_this}>>{with_this}")


def replace_substruct(smi):
    mol = Chem.MolFromSmiles(smi)

    products = my_rxn.RunReactants((mol,))
    new_smiles = Chem.MolToSmiles(products[0][0])
    return new_smiles


test_smi = 'Clc1ccc2[C@@H](CN(CC(=O)Nc3ccc(Br)c(Cl)c3Cl)Cc2c1)C(=O)Nc4cncc5ccccc45'
print(test_smi)
print(replace_substruct(test_smi))


Clc1ccc2[C@@H](CN(CC(=O)Nc3ccc(Br)c(Cl)c3Cl)Cc2c1)C(=O)Nc4cncc5ccccc45
O=C(CN1Cc2ccc(Cl)cc2C(C(=O)Nc2cncc3ccccc23)C1)Nc1ccc(Br)c(Cl)c1Cl


In [7]:
def canonicalise_and_remove_stereo(smi):
    try:
        mol = Chem.MolFromSmiles(smi)
    except TypeError:
        print(f'{smi} is invalid, returning Nan')
        return None
    Chem.rdmolops.RemoveStereochemistry(mol)
    return Chem.MolToSmiles(mol)

# My models
df = pd.read_csv('../data/predictions/loo_predictions.csv')
df['inhibition'] = df['Measured Inhibition (%)']
df_quantile = pd.read_csv(
    '../data/predictions/loo_quantile_preds.csv')[['Molecule Name', '50th quantile']]
df = df.merge(df_quantile, on='Molecule Name')

# Emmas models
emma_inliers = pd.read_csv('../data/predictions/intersected_inliers.csv')
emma_outliers = pd.read_csv('../data/predictions/intersected_outliers.csv')
emma_inliers['GP Predicted Inhibition (%)'] = emma_inliers[[
    'amine_rbf_pred', 'amine_matern_pred']].mean(axis=1)
emma_outliers['GP Predicted Inhibition (%)'] = emma_outliers[[
    'amine_rbf_pred', 'amine_matern_pred']].mean(axis=1)
emma_inliers = emma_inliers.drop(
    ['amine_rbf_pred', 'amine_matern_pred'], axis=1)
emma_outliers = emma_outliers.drop(
    ['amine_rbf_pred', 'amine_matern_pred'], axis=1)
emma_mols = pd.concat([emma_inliers, emma_outliers]).drop('SMILES', axis=1)

# # Average our models
# df_tot = df.merge(emma_mols, on='Molecule Name')
# df_tot['pred'] = df_tot[['RF Predicted Inhibition (%)',
#                          'GP Predicted Inhibition (%)']].mean(axis=1)

# display(df_tot)


def highlight_diff(s, threshold, columns):
    is_max = pd.Series(data=False, index=s.index)
    is_max[columns[0]] = abs(s.loc[columns[0]]-s.loc[columns[1]] )>= threshold
    return ['background-color: salmon; color: black' if is_max.any() else '' for v in is_max]


tqdm.pandas()
df['smiles'] = df['SMILES'].progress_apply(
    canonicalise_and_remove_stereo)

df['smiles'] = df['smiles'].progress_apply(replace_substruct)
df_with_yield['smiles'] = df_with_yield['smiles'].progress_apply(
    canonicalise_and_remove_stereo)

merged_df = df_with_yield.merge(df, on='smiles')
# display(df_with_yield)
# display(df['smiles'].values)
wrong_rows = merged_df.query('abs(100 - (inhibition + activity)) > 0.3')[['Molecule Name', 'smiles', 'activity', 'inhibition']]
wrong_rows['old_activity'] = 100 - wrong_rows['inhibition']
wrong_rows = wrong_rows.rename(columns={
                               'activity': 'Mean Corrected Activity (%): 24th Feb', 'old_activity': 'Mean Corrected Activity (%): 2nd Feb'}).drop('inhibition', axis=1)
rounder = {'Mean Corrected Activity (%): 24th Feb': '{:.1f}', 
           'Mean Corrected Activity (%): 2nd Feb': '{:.1f}'}
display(wrong_rows.style.format(rounder).apply(highlight_diff, threshold=5, columns=['Mean Corrected Activity (%): 24th Feb', 'Mean Corrected Activity (%): 2nd Feb'], axis=1)
        )


100%|██████████| 300/300 [00:00<00:00, 946.28it/s]
100%|██████████| 300/300 [00:00<00:00, 664.45it/s]
100%|██████████| 300/300 [00:00<00:00, 1134.16it/s]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,Molecule Name,smiles,Mean Corrected Activity (%): 24th Feb,Mean Corrected Activity (%): 2nd Feb
8,PCM-0223409,O=C(CN1Cc2ccc(Cl)cc2C(C(=O)Nc2cncc3ccccc23)C1)NCc1ccc2c(c1)OCO2,25.6,14.6
9,PCM-0223446,O=C(CN1Cc2ccc(Cl)cc2C(C(=O)Nc2cncc3ccccc23)C1)Nc1ccc2c(c1)OCCO2,14.6,25.6
11,PCM-0223458,CCOc1ccccc1CNC(=O)CN1Cc2ccc(Cl)cc2C(C(=O)Nc2cncc3ccccc23)C1,26.2,13.8
12,PCM-0223487,COc1ccc(C(C)NC(=O)CN2Cc3ccc(Cl)cc3C(C(=O)Nc3cncc4ccccc34)C2)cc1,15.8,23.6
13,PCM-0223524,CCOc1ccc(CNC(=O)CN2Cc3ccc(Cl)cc3C(C(=O)Nc3cncc4ccccc34)C2)cc1,23.6,15.8
14,PCM-0223579,O=C(CN1Cc2ccc(Cl)cc2C(C(=O)Nc2cncc3ccccc23)C1)NC(CO)Cc1ccccc1,13.8,26.2
16,PCM-0223457,O=C(CN1Cc2ccc(Cl)cc2C(C(=O)Nc2cncc3ccccc23)C1)NCc1ccc([N+](=O)[O-])cc1,84.6,12.7
17,PCM-0223492,Cc1ccc([N+](=O)[O-])c(NC(=O)CN2Cc3ccc(Cl)cc3C(C(=O)Nc3cncc4ccccc34)C2)c1,12.7,84.6
19,PCM-0223462,N#Cc1ccc(Cl)c(NC(=O)CN2Cc3ccc(Cl)cc3C(C(=O)Nc3cncc4ccccc34)C2)c1,79.5,79.1
21,PCM-0223641,N#Cc1c(Cl)cccc1NC(=O)CN1Cc2ccc(Cl)cc2C(C(=O)Nc2cncc3ccccc23)C1,79.1,79.5


Tanimoto index search

In [8]:
import numpy as np


useful_rdkit_utils.add_molecule_and_errors(df_with_yield, 'smiles', 'mol')
useful_rdkit_utils.add_molecule_and_errors(df, 'smiles', 'mol')

df_with_yield['fps'] = [AllChem.GetMorganFingerprintAsBitVect(
    mol, radius=3, nBits=1024) for mol in df_with_yield['mol']]
df['fps'] = [AllChem.GetMorganFingerprintAsBitVect(
    mol, radius=3, nBits=1024) for mol in df['mol']]

for i in tqdm(range(len(df_with_yield['fps']))):
    similarity = np.array(DataStructs.BulkTanimotoSimilarity(
        df_with_yield['fps'].values[i], df['fps'].values))
    # print(similarity)
    if np.max(similarity) > 0.8:
        print(f"{df_with_yield.loc[i, 'smiles']} is the same as {df.loc[np.argmax(similarity), 'smiles']} with similarity {np.max(similarity)}")


O=C(Nc1cncc2ccccc12)C1CN(CC(=O)N2CCOCC2)Cc2ccc(Cl)cc21 is the same as O=C(Nc1cncc2ccccc12)C1CN(CC(=O)N2CCOCC2)Cc2ccc(Cl)cc21 with similarity 1.0
O=C(CN1Cc2ccc(Cl)cc2C(C(=O)Nc2cncc3ccccc23)C1)Nc1ccccc1 is the same as O=C(CN1Cc2ccc(Cl)cc2C(C(=O)Nc2cncc3ccccc23)C1)Nc1ccccc1 with similarity 1.0
O=C(CN1Cc2ccc(Cl)cc2C(C(=O)Nc2cncc3ccccc23)C1)NCc1ccccc1 is the same as O=C(CN1Cc2ccc(Cl)cc2C(C(=O)Nc2cncc3ccccc23)C1)NCc1ccccc1 with similarity 1.0
O=C(CN1Cc2ccc(Cl)cc2C(C(=O)Nc2cncc3ccccc23)C1)Nc1ccc([N+](=O)[O-])cc1 is the same as O=C(CN1Cc2ccc(Cl)cc2C(C(=O)Nc2cncc3ccccc23)C1)Nc1ccc([N+](=O)[O-])cc1 with similarity 1.0
CNC(=O)c1ccccc1NC(=O)CN1Cc2ccc(Cl)cc2C(C(=O)Nc2cncc3ccccc23)C1 is the same as CNC(=O)c1ccccc1NC(=O)CN1Cc2ccc(Cl)cc2C(C(=O)Nc2cncc3ccccc23)C1 with similarity 1.0
CC(=O)Nc1ccc(NC(=O)CN2Cc3ccc(Cl)cc3C(C(=O)Nc3cncc4ccccc34)C2)cc1 is the same as CC(=O)Nc1ccc(NC(=O)CN2Cc3ccc(Cl)cc3C(C(=O)Nc3cncc4ccccc34)C2)cc1 with similarity 1.0
O=C(CN1Cc2ccc(Cl)cc2C(C(=O)Nc2cncc3ccccc23)C1)Nc1ccc2scnc2

In [67]:
import time

merged_df['Inhibition (%)'] = 100 - merged_df['activity']
display(merged_df)
display(merged_df.query('inhibition == 86.7'))

df_without_errors = merged_df.query('`Inhibition (%)` == `Measured Inhibition (%)`')

fig = px.scatter(df_without_errors, 
                 x='Inhibition (%)',
                 y='RF Predicted Inhibition (%)',
                 color='yield (%)',
                 hover_data=['Measured Inhibition (%)'],
                 width=1000,
                 height=800, 
                 title='Leave-One-Out Validation',
                 labels={
                     "RF Predicted Inhibition (%)": "ML Predicted Inhibition (%)",
                     "Inhibition (%)": "Measured Inihibition (%)",
                 },
                 template='simple_white',)

fig.add_shape(type='line',
              x0=0,
              y0=0,
              x1=100,
              y1=100,
              line=dict(color='black', dash='dash'),
              xref='x',
              yref='y',
              opacity=1,
              line_width=2
              )
# fig.update_xaxes(showgrid=False)
# fig.update_yaxes(showgrid=False)
# fig.update_layout(
#     title='Leave-One-Out Regression',
#     # title=r'$Leave-One-Out Regression (N=300, R^2: , \rho:)$',
#     width=1000,  # size of figure
#     height=800,
#     xaxis=dict(title="Measured Inhibition (%)"),
#     yaxis=dict(title="ML Predicted Inhibition (%)"),
#     template='simple_white'
# )

fig.update_traces(marker={'size': 15})

fig.write_image('rf_loo.pdf')
time.sleep(2)
fig.write_image('rf_loo.pdf')
fig.show()
# app = molplotly.add_molecules(
#     fig=fig, df=df_without_errors, title_col='Molecule Name', smiles_col='smiles', caption_cols=['yield (%)', 'Measured Inhibition (%)'], wrap=True, wraplen=25, width=150)

# app.run_server(mode='inline', port=8013, height=650)


Unnamed: 0,smiles,Plate Well,activity,yield (%),Molecule Name,SMILES,Measured Inhibition (%),RF Predicted Inhibition (%),inhibition,50th quantile,Inhibition (%)
0,O=C(Nc1cncc2ccccc12)C1CN(CC(=O)N2CCOCC2)Cc2ccc...,K19,51.8,96.0,PCM-0223563,Clc1ccc2[C@@H](CN(CC(=O)N3CCOCC3)Cc2c1)C(=O)Nc...,48.2,51.8978,48.2,45.9,48.2
1,O=C(CN1Cc2ccc(Cl)cc2C(C(=O)Nc2cncc3ccccc23)C1)...,J19,9.9,100.0,PCM-0223544,Clc1ccc2[C@@H](CN(CC(=O)Nc3ccccc3)Cc2c1)C(=O)N...,90.1,28.6458,90.1,45.0,90.1
2,O=C(CN1Cc2ccc(Cl)cc2C(C(=O)Nc2cncc3ccccc23)C1)...,I19,9.5,93.0,PCM-0223525,Clc1ccc2[C@@H](CN(CC(=O)NCc3ccccc3)Cc2c1)C(=O)...,90.5,68.7526,90.5,71.0,90.5
3,O=C(CN1Cc2ccc(Cl)cc2C(C(=O)Nc2cncc3ccccc23)C1)...,L19,83.6,0.0,PCM-0223582,[O-][N+](=O)c1ccc(NC(=O)CN2C[C@@H](C(=O)Nc3cnc...,16.4,39.3330,16.4,41.9,16.4
4,CNC(=O)c1ccccc1NC(=O)CN1Cc2ccc(Cl)cc2C(C(=O)Nc...,P11,71.1,0.0,PCM-0223647,CNC(=O)c1ccccc1NC(=O)CN2C[C@@H](C(=O)Nc3cncc4c...,28.9,20.6552,28.9,24.2,28.9
...,...,...,...,...,...,...,...,...,...,...,...
295,N#Cc1ccc(NC(=O)CN2Cc3ccc(Cl)cc3C(C(=O)Nc3cncc4...,L11,84.5,0.0,PCM-0223574,Clc1ccc2[C@@H](CN(CC(=O)Nc3ccc(cc3I)C#N)Cc2c1)...,15.5,21.6562,15.5,20.5,15.5
296,O=C(Nc1cncc2ccccc12)C1CN(CC(=O)N2CCN(Cc3ccc(C(...,P7,76.5,100.0,PCM-0223660,FC(F)(F)c1ccc(CN2CCN(CC2)C(=O)CN3C[C@@H](C(=O)...,23.5,37.0372,23.5,44.8,23.5
297,O=C(CN1Cc2ccc(Cl)cc2C(C(=O)Nc2cncc3ccccc23)C1)...,H10,24.6,98.0,PCM-0223497,Clc1ccc2[C@@H](CN(CC(=O)Nc3nc(cs3)c4ccc(Cl)c(C...,75.4,41.1426,75.4,39.4,75.4
298,Cc1cccc(C)c1NC(=O)CN1CCN(C(=O)CN2Cc3ccc(Cl)cc3...,H6,65.5,100.0,PCM-0223511,Cc1cccc(C)c1NC(=O)CN2CCN(CC2)C(=O)CN3C[C@@H](C...,34.5,28.2744,34.5,25.4,34.5


Unnamed: 0,smiles,Plate Well,activity,yield (%),Molecule Name,SMILES,Measured Inhibition (%),RF Predicted Inhibition (%),inhibition,50th quantile,Inhibition (%)
55,O=C(CN1Cc2ccc(Cl)cc2C(C(=O)Nc2cncc3ccccc23)C1)...,A10,83.8,100.0,PCM-0223364,FC(F)(F)c1ccc(NC(=O)CN2C[C@@H](C(=O)Nc3cncc4cc...,86.7,19.4728,86.7,19.5,16.2


In [66]:
import time

fig = px.scatter(merged_df,
                 x='Inhibition (%)',
                 y='yield (%)',
                 width=1000,
                 height=800,
                 title='Measured Yield vs Measured Inhibition',
                labels={
                    "yield (%)": "Measured Yield (%)",
                    "Inhibition (%)": "Measured Inihibition (%)",
                },
                 marginal_x='histogram',
                 marginal_y='histogram',
                 template='simple_white',)
# fig.update_layout(
#     plot_bgcolor="white",
#     xaxis=dict(linecolor="black"),
#     yaxis=dict(linecolor="black")
#     template='simple_white'
#     )
# fig.update_xaxes(ticks="outside")
# fig.update_trace(marker={'size': 15})
# fig.update_yaxes(ticks="outside")
fig.write_image('yield_vs_activity.pdf')
time.sleep(2)
fig.write_image('yield_vs_activity.pdf')
fig.show()
# app = molplotly.add_molecules(
#     fig=fig, df=merged_df, title_col='Molecule Name', smiles_col='smiles', wrap=True, wraplen=25, width=150)

# app.run_server(mode='inline', port=8043, height=650)


In [11]:
from scipy.stats import spearmanr
spearmanr(merged_df['Inhibition (%)'], merged_df['yield (%)'])


SpearmanrResult(correlation=0.5719636192796742, pvalue=1.839600982965949e-27)

Check TPs/FPs/FNs

In [12]:
df_tps = pd.read_csv('../data/predictions/true_positives_avg.csv')
df_tps['Type'] = 'True Positive'
df_fps = pd.read_csv('../data/predictions/false_positives_avg.csv')
df_fps['Type'] = 'False Positive'
df_fns = pd.read_csv('../data/predictions/false_negatives_avg.csv')
df_fns['Type'] = 'False Negative'

df_picks = pd.concat([df_tps, df_fps, df_fns]).reset_index()
df_picks['smiles'] = df_picks['SMILES'].progress_apply(
    canonicalise_and_remove_stereo)
df_picks['smiles'] = df_picks['smiles'].progress_apply(replace_substruct)
df_picks_with_yields = merged_df.merge(df_picks, on='smiles').reset_index()
# display(df_picks_with_yields)

fig_picks = px.scatter(df_picks_with_yields, x='Inhibition (%)', y='RF Predicted Inhibition (%)_x', color='yield (%)', width=1000, height=800,
                 title='Activity against yield', )

app_picks = molplotly.add_molecules(
    fig=fig_picks, df=df_picks_with_yields, title_col='Molecule Name_x', smiles_col='smiles', caption_cols=['yield (%)'], wrap=True, wraplen=25, width=150)

app_picks.run_server(mode='inline', port=8020, height=850)
