In [3]:
import sys
sys.path.append('../')
import pandas as pd
import numpy as np
import seaborn as sns
from rdkit import Chem
import dft_descriptors.featurisation as ft
import dft_descriptors.prepocessing as pp
import matplotlib.pyplot as plt

In [7]:
# Input files. The two HTE spreadsheets are read from the notebook's working
# directory; they were downloaded from
# https://rxn4chemistry.github.io/rxn_yields/data/ and must be present locally
# (the recorded run of this cell stopped with FileNotFoundError on the first one).
#
# Buchwald-Hartwig HTE dataset, originally published in:
#   Ahneman et al. "Predicting reaction performance in C–N cross-coupling using
#   machine learning." Science 360.6385 (2018): 186-190.
BH_HTE_XLSX = "Dreher_and_Doyle_input_data.xlsx"
# Suzuki HTE dataset, cited here as:
#   Sandfort et al. "A structure-based platform for predicting chemical
#   reactivity." Chem (2020).
# NOTE(review): the file name "aap9112" looks like the Science article id of
# Perera et al. (2018) — confirm which paper is the original data source.
SUZUKI_HTE_XLSX = "aap9112_Data_File_S1.xlsx"
# NiCOlit dataset shipped with the repository.
NICOLIT_CSV = "../data_csv/Data_test11262021.csv"

df_hte_bh = pd.read_excel(BH_HTE_XLSX)
df_hte_suz = pd.read_excel(SUZUKI_HTE_XLSX)

# Load the NiCOlit dataset and run the project preprocessing on it
df_dataset = pd.read_csv(NICOLIT_CSV)
df2 = pp.preprocess(df_dataset)

def AL_preprocess(df):
    """Normalise the "Lewis Acid" column and drop rows using excluded Lewis acids.

    Missing values and the literal string 'nan' in the "Lewis Acid" column are
    replaced by 'NoLewisAcid'. Rows whose Lewis acid is one of the SMILES
    strings listed below are then removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a "Lewis Acid" column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the cleaned column, the excluded rows removed and the
        index reset to 0..n-1.
    """
    Lewis_Acids_to_drop = ['O=C(O[Cs])O[Cs]', 'Cl[Cs]',
                           'O=S(=O)(O[Sc](OS(=O)(=O)C(F)(F)F)OS(=O)(=O)C(F)(F)F)C(F)(F)F',
                           'F[Cs]', 'O=P(O[Na])(O[Na])O[Na]', '[Rb+]',
                           'CC(C)(C)C(=O)O[Cs]', '[Cs+]', 'CC(=O)O[Cu]OC(C)=O', 'F[Sr]F']

    # Build the cleaned column on a new frame instead of assigning into `df`,
    # so the caller's DataFrame is left untouched.
    lewis_acid = df["Lewis Acid"].fillna('NoLewisAcid').replace('nan', 'NoLewisAcid')
    cleaned = df.assign(**{"Lewis Acid": lewis_acid})

    # A single boolean mask replaces the one-filter-per-SMILES loop.
    keep = ~cleaned["Lewis Acid"].isin(Lewis_Acids_to_drop)
    return cleaned[keep].reset_index(drop=True)

# Apply the Lewis-acid clean-up to the preprocessed NiCOlit frame; the filtered
# frame (index reset) replaces df2 for the rest of the notebook.
df2 = AL_preprocess(df2)

FileNotFoundError: [Errno 2] No such file or directory: 'Dreher_and_Doyle_input_data.xlsx'

In [None]:
# Featurization of the NiCOlit dataset
# Of the five values returned, `y` is used below as the NiCOlit yields, and
# `origin` and `coupling_partner` are used as per-reaction labels in the figures.
# NOTE(review): `origin=False` is passed here although the returned `origin` is
# plotted later — confirm the meaning of that flag in dft_descriptors.featurisation.
X, y, DOIs, coupling_partner, origin = ft.process_dataframe_dft(df2, data_path="../data_csv/", origin=False)

In [None]:
# Gather yields for all datasets :
# BH-HTE dataset: yields are read from the "Output" column of the spreadsheet
y_bh = df_hte_bh["Output"]

# Suzuki-HTE dataset: yields are read from the "Product_Yield_PCT_Area_UV" column
y_suz = df_hte_suz["Product_Yield_PCT_Area_UV"]

# NiCOlit dataset: yields returned by the featurization step
y_dataset = y

# Results of the SciFinder query: number of reactions reported per yield range.
yields = ["<10%", "10-29%", "30-49%", "50-69%", "70-79%", "80-89%", "90-100%"]
# Bin edges as originally recorded; kept so the name stays defined. They are
# not used for sampling because they mix an exclusive first edge (10) with
# inclusive later ones.
yields_lim = [0, 10, 29, 49, 69, 79, 89, 100]
# Inclusive (low, high) integer bounds of each range, matching the labels in `yields`.
yields_bounds = [(0, 9), (10, 29), (30, 49), (50, 69), (70, 79), (80, 89), (90, 100)]
yields_count = [25, 78, 243, 569, 419, 436, 433]

# Simulation of a yield distribution for the SciFinder query:
# draw `count` integer yields uniformly inside each range. The generator is
# seeded so that the simulated distribution (and the figure built from it) is
# the same on every run.
scifinder_rng = np.random.default_rng(0)
Y_sciF = [
    int(value)
    for (low, high), count in zip(yields_bounds, yields_count)
    # endpoint=True makes `high` reachable: np.random.randint(low, high) excluded
    # it, so e.g. 29 was drawn in the "30-49%" range and 100 was never produced.
    for value in scifinder_rng.integers(low, high, size=count, endpoint=True)
]

# Stack all yields into one array, in the order NiCOlit, HTE Suzuki, HTE Buchwald,
# SciFinder. The label arrays built for the figures are concatenated in this same
# order, so the two must be kept in sync.
Y = np.concatenate((np.array(y_dataset), np.array(y_suz), np.array(y_bh), np.array(Y_sciF)))

In [None]:
# Build the label arrays and the dataframes consumed by the figures.
# One constant label per row of each dataset, in the same order as `Y`
# (NiCOlit, HTE Suzuki, HTE Buchwald, SciFinder query).
nicolit_labels = np.array(["NiCO-lit"] * len(y_dataset))
suzuki_labels = np.array(["HTE Suzuki"] * len(y_suz))
buchwald_labels = np.array(["HTE Buchwald"] * len(y_bh))
scifinder_labels = np.array(["SciFinder query"] * len(Y_sciF))

# Origin keeps the per-reaction origin labels of the NiCOlit rows ...
Origin = np.concatenate(
    (np.array(origin), suzuki_labels, buchwald_labels, scifinder_labels)
)
# ... whereas Origin2 tags every NiCOlit row with the same dataset name.
Origin2 = np.concatenate(
    (nicolit_labels, suzuki_labels, buchwald_labels, scifinder_labels)
)

# NiCOlit only: yield, origin label and coupling-partner class of each reaction
display_df = pd.DataFrame(
    zip(y_dataset, origin, coupling_partner),
    columns=['Yields', 'Origin', 'Coupling Partner'],
)
# All datasets, labelled with Origin / Origin2 respectively
yield_origin_columns = ['Yields', 'Origin']
display_df1 = pd.DataFrame(zip(Y, Origin), columns=yield_origin_columns)
display_df2 = pd.DataFrame(zip(Y, Origin2), columns=yield_origin_columns)

In [None]:
# Three-panel figure: one panel spanning the top row (all datasets) and two
# panels on the bottom row (NiCOlit broken down by origin and by coupling partner).
fig, ax = plt.subplots(2, 2, figsize=(15, 10))

# Merge the top row into a single axes. The two top axes of the grid are removed
# explicitly before adding the spanning one: laying plt.subplot(211) over them
# leaves them visible underneath, since Matplotlib no longer removes overlapped
# axes automatically.
grid = ax[0, 0].get_gridspec()
for unused_ax in ax[0, :]:
    unused_ax.remove()
ax[0, 1] = fig.add_subplot(grid[0, :])
ax[0, 0] = ax[0, 1]  # keep the array free of references to removed axes

# Comparison of the Yield distributions of the datasets
# (the stray kind="swarm" argument was dropped: violinplot has no such parameter)
sns.violinplot(y="Yields", data=display_df2, x='Origin', cut=0, ax=ax[0, 1],
               linewidth=2)
ax[0, 1].tick_params(axis='x', labelrotation=0)
ax[0, 1].set_title("Datasets comparison")
ax[0, 1].set_xlabel("")

# Comparison of the Yield distributions of Scope and Optimization in the NiCOlit dataset
sns.swarmplot(ax=ax[1, 0], y="Yields", data=display_df, x='Origin', color='white', s=3,
              linewidth=0.1, dodge=False, edgecolor='black')
sns.violinplot(y="Yields", data=display_df, x='Origin', cut=0,
               ax=ax[1, 0], palette='Blues', scale='count')
ax[1, 0].set_title("Data origin in the NiCO-lit dataset")
ax[1, 0].set_xlabel("")
ax[1, 0].set_ylabel("")

# Yield distributions of Different Class of Coupling Partner in the NiCOlit dataset
sns.swarmplot(ax=ax[1, 1], y="Yields", data=display_df, x='Coupling Partner', palette='Accent', s=3,
              linewidth=0.1, dodge=False)
sns.violinplot(ax=ax[1, 1], y="Yields", data=display_df, x='Coupling Partner',
               inner=None, color=".9", cut=0, scale='count', linewidth=0.2)
ax[1, 1].set_title("Coupling Partner Class in the NiCO-lit dataset")
# Class names are long: print them vertically
ax[1, 1].tick_params(axis='x', labelrotation=90)
ax[1, 1].set_xlabel("")

fig.savefig('yields.svg', dpi=300, format='svg',
            bbox_inches='tight')