In [2]:
import pandas as pd 
import numpy as np
import json
import dft_descriptors.prepocessing as pp
import dft_descriptors.featurisation as ft
from featurisation import process_dataframe
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score
from analysis import analysis_stratification_influence, analysis_train_set_size, analysis_stratification_influence_substrates, get_raw_results, analysis_stratification_influence_substrates_raw

# data sources: data_csv/rxn_dataset, Data_test11222021


In [3]:
estimators = [('predictor', RandomForestRegressor())]
pipe = Pipeline(estimators)
metric = r2_score

# Getting all featurization

In [36]:
df_dataset = pd.read_csv('data_csv/rxn_dataset.csv')
X_rxnfp = np.array([json.loads(x) for x in df_dataset.rxnfp])
substrate_rxnfp = np.array(df_dataset.Substrate) 
DOI_rxnfp = np.array(df_dataset.DOI) 
mechanisms_rxnfp = np.array(df_dataset.Mechanism) 
origins_rxnfp = np.array(df_dataset.Origin) 
y_rxnfp = np.array(df_dataset.Yields)

In [31]:
df_dft = pd.read_csv("data_csv/Data_test11222021.csv", sep = ',')
# Removing 
vc = df_dft.DOI.value_counts()
doi_above_10 = np.array(vc[vc > 20].index)

indexes = []

for i, row in df_dft.iterrows():
    if row["DOI"] not in doi_above_10:
        indexes.append(i)
        
df_dft = df_dft.drop(indexes)
df_dft = df_dft.reset_index(drop=True)

In [32]:
df_dft = pp.preprocess(df_dft)
df_dft["Lewis Acid"] = df_dft["Lewis Acid"].fillna('NoLewisAcid')
df_dft["Lewis Acid"] = df_dft["Lewis Acid"].replace('nan', 'NoLewisAcid')

Lewis_Acids_to_drop = ['O=C(O[Cs])O[Cs]', 'Cl[Cs]', 
                       'O=S(=O)(O[Sc](OS(=O)(=O)C(F)(F)F)OS(=O)(=O)C(F)(F)F)C(F)(F)F', 
                       'F[Cs]', 'O=P(O[Na])(O[Na])O[Na]', '[Rb+]',
                       'CC(C)(C)C(=O)O[Cs]', '[Cs+]', 'CC(=O)O[Cu]OC(C)=O', 'F[Sr]F']
for al in Lewis_Acids_to_drop:
    df_dft = df_dft[df_dft["Lewis Acid"] != al]
    
df_dft = df_dft.reset_index(drop=True)


In [28]:
indexes_kept_dft = np.array(df_dft.index)

In [33]:
X_dft, y_dft, DOI_dft, mechanisms_dft, origins_dft = ft.process_dataframe_dft(df_dft, data_path="data_csv/", origin=False)

NoLigand


In [8]:
X_hybrid, y_hybrid, DOI_hybrid, mechanisms_hybrid, origins_hybrid = ft.process_dataframe_dft(df_dft, data_path="data_csv/", origin=False, hybrid=True)

NoLigand


In [34]:
df_fp = pd.read_csv('data_csv/Data_test11222021.csv')
df_fp = pp.preprocess(df_fp)
vc = df_fp.DOI.value_counts()
doi_above_10 = np.array(vc[vc > 20].index)

indexes = []

for i, row in df_fp.iterrows():
    if row["DOI"] not in doi_above_10:
        indexes.append(i)
        
df_fp = df_fp.drop(indexes)
df_fp = df_fp.reset_index(drop=True)

In [35]:
X_fp, y_fp, DOI_fp, mechanisms_fp, origins_fp = process_dataframe(df_fp)

# Random split

In [24]:
values, baseline_values, model_values, stratification_values, additional_stratification_values = get_raw_results(X_fp, y_fp, origins_fp, mechanisms_fp, n_iterations=10)
display_df =  pd.DataFrame(zip(values, baseline_values, model_values, stratification_values, additional_stratification_values), columns = ['Yields', 'Baseline', 'Global model', 'Origin', 'Mechanism'])
display_df.to_csv("results/random_split_fp_descriptors_test_size_0.2")

In [37]:
# Training set size influence 
metric_values, baseline_values, sizes = analysis_train_set_size(X_fp[indexes_kept_dft, :], y_fp[indexes_kept_dft], DOI_fp[indexes_kept_dft], metric=metric, predictor=pipe, n_iterations_external=5, n_iterations_internal=5)
metric_mean = np.mean(metric_values, axis=1)
metric_lower = np.percentile(metric_values, 5, axis=1)
metric_upper = np.percentile(metric_values, 95, axis=1)

baseline_mean = np.mean(baseline_values, axis=1)
baseline_lower = np.percentile(baseline_values, 5, axis=1)
baseline_upper = np.percentile(baseline_values, 95, axis=1)

display_df =  pd.DataFrame(zip(metric_mean, metric_lower, metric_upper, baseline_mean, baseline_lower, baseline_upper, sizes), columns = ['Metric mean', 'Metric lower','Metric upper','Baseline mean', 'Baseline lower','Baseline upper', 'Sizes'])
display_df.to_csv("results/training_size_influence_fp_descriptors")

In [25]:
values, baseline_values, model_values, stratification_values, additional_stratification_values = get_raw_results(X_dft, y_dft, origins_dft, mechanisms_dft, n_iterations=10)
display_df =  pd.DataFrame(zip(values, baseline_values, model_values, stratification_values, additional_stratification_values), columns = ['Yields', 'Baseline', 'Global model', 'Origin', 'Mechanism'])
display_df.to_csv("results/random_split_dft_descriptors_test_size_0.2")

In [None]:
# Training set size influence 
metric_values, baseline_values, sizes = analysis_train_set_size(X_dft, y_dft, DOI_dft, metric=metric, predictor=pipe, n_iterations_external=5, n_iterations_internal=5)
metric_mean = np.mean(metric_values, axis=1)
metric_lower = np.percentile(metric_values, 5, axis=1)
metric_upper = np.percentile(metric_values, 95, axis=1)

baseline_mean = np.mean(baseline_values, axis=1)
baseline_lower = np.percentile(baseline_values, 5, axis=1)
baseline_upper = np.percentile(baseline_values, 95, axis=1)

display_df =  pd.DataFrame(zip(metric_mean, metric_lower, metric_upper, baseline_mean, baseline_lower, baseline_upper, sizes), columns = ['Metric mean', 'Metric lower','Metric upper','Baseline mean', 'Baseline lower','Baseline upper', 'Sizes'])
display_df.to_csv("results/training_size_influence_dft_descriptors")

In [26]:
values, baseline_values, model_values, stratification_values, additional_stratification_values = get_raw_results(X_rxnfp, y_rxnfp, origins_rxnfp, mechanisms_rxnfp, n_iterations=10)
display_df =  pd.DataFrame(zip(values, baseline_values, model_values, stratification_values, additional_stratification_values), columns = ['Yields', 'Baseline', 'Global model', 'Origin', 'Mechanism'])
display_df.to_csv("results/random_split_rxnfp_descriptors_test_size_0.2")

In [38]:
# Training set size influence 
metric_values, baseline_values, sizes = analysis_train_set_size(X_rxnfp[indexes_kept_dft, :], y_rxnfp[indexes_kept_dft], DOI_rxnfp[indexes_kept_dft], metric=metric, predictor=pipe, n_iterations_external=5, n_iterations_internal=5)
metric_mean = np.mean(metric_values, axis=1)
metric_lower = np.percentile(metric_values, 5, axis=1)
metric_upper = np.percentile(metric_values, 95, axis=1)

baseline_mean = np.mean(baseline_values, axis=1)
baseline_lower = np.percentile(baseline_values, 5, axis=1)
baseline_upper = np.percentile(baseline_values, 95, axis=1)

display_df =  pd.DataFrame(zip(metric_mean, metric_lower, metric_upper, baseline_mean, baseline_lower, baseline_upper, sizes), columns = ['Metric mean', 'Metric lower','Metric upper','Baseline mean', 'Baseline lower','Baseline upper', 'Sizes'])
display_df.to_csv("results/training_size_influence_rxnfp_descriptors")

# Substrate split

In [31]:
stratification_results, additional_stratification_results, global_results, global_baseline_results, values = analysis_stratification_influence_substrates_raw(X_fp, y_fp, list(df_fp["Reactant Smile (C-O)"]), origins_fp , metric=metric, predictor=RandomForestRegressor(), test_size=0.2, n_iterations=10)
display_df =  pd.DataFrame(zip(stratification_results, additional_stratification_results, global_results, global_baseline_results, values), columns =['Substrate', 'Origin', 'Global model', 'Global baseline', 'Yields'])
display_df.to_csv("results/substrate_split_fp_descriptors")

In [8]:
stratification_results, additional_stratification_results, global_results, global_baseline_results, values = analysis_stratification_influence_substrates_raw(X_dft, 1 * y_dft>50, list(df_dft["Reactant Smile (C-O)"]), origins_dft , metric=metric, predictor=RandomForestClassifier(), test_size=0.2, n_iterations=10)
display_df =  pd.DataFrame(zip(stratification_results, additional_stratification_results, global_results, global_baseline_results, values), columns =['Substrate', 'Origin', 'Global model', 'Global baseline', 'Yields'])
display_df.to_csv("results/substrate_split_dft_descriptors_classification")

In [82]:
stratification_results, additional_stratification_results, global_results, global_baseline_results, values = analysis_stratification_influence_substrates_raw(X_fp, y_fp, list(df_fp["Reactant Smile (C-O)"]), origins_fp, metric=metric, predictor=KNeighborsRegressor(n_neighbors=1), test_size=0.2, n_iterations=10)
display_df =  pd.DataFrame(zip(stratification_results, additional_stratification_results, global_results, global_baseline_results, values), columns =['Substrate', 'Origin', 'Global model', 'Global baseline', 'Yields'])
display_df.to_csv("results/substrate_split_fp_descriptors_KNN")

In [33]:
stratification_results, additional_stratification_results, global_results, global_baseline_results, values = analysis_stratification_influence_substrates_raw(X_rxnfp, y_rxnfp, substrate_rxnfp, origins_rxnfp , metric=metric, predictor=RandomForestRegressor(), test_size=0.2, n_iterations=10)
display_df =  pd.DataFrame(zip(stratification_results, additional_stratification_results, global_results, global_baseline_results, values), columns =['Substrate', 'Origin', 'Global model', 'Global baseline', 'Yields'])
display_df.to_csv("results/substrate_split_rxnfp_descriptors")

# DOI split

In [34]:
stratification_results, additional_stratification_results, global_results, global_baseline_results, values = analysis_stratification_influence_substrates_raw(X_fp, y_fp, DOI_fp, origins_fp , metric=metric, predictor=RandomForestRegressor(), test_size=0.2, n_iterations=10)
display_df =  pd.DataFrame(zip(stratification_results, additional_stratification_results, global_results, global_baseline_results, values), columns =['Substrate', 'Origin', 'Global model', 'Global baseline', 'Yields'])
display_df.to_csv("results/doi_split_fp_descriptors")

In [35]:
stratification_results, additional_stratification_results, global_results, global_baseline_results, values = analysis_stratification_influence_substrates_raw(X_dft, y_dft, DOI_dft, origins_dft , metric=metric, predictor=RandomForestRegressor(), test_size=0.2, n_iterations=10)
display_df =  pd.DataFrame(zip(stratification_results, additional_stratification_results, global_results, global_baseline_results, values), columns =['Substrate', 'Origin', 'Global model', 'Global baseline', 'Yields'])
display_df.to_csv("results/doi_split_dft_descriptors")

In [36]:
stratification_results, additional_stratification_results, global_results, global_baseline_results, values = analysis_stratification_influence_substrates_raw(X_rxnfp, y_rxnfp, DOI_rxnfp, origins_rxnfp , metric=metric, predictor=RandomForestRegressor(), test_size=0.2, n_iterations=10)
display_df =  pd.DataFrame(zip(stratification_results, additional_stratification_results, global_results, global_baseline_results, values), columns =['Substrate', 'Origin', 'Global model', 'Global baseline', 'Yields'])
display_df.to_csv("results/doi_split_rxnfp_descriptors")

# Mechanism split

In [37]:
stratification_results, additional_stratification_results, global_results, global_baseline_results, values = analysis_stratification_influence_substrates_raw(X_fp, y_fp, mechanisms_fp, origins_fp , metric=metric, predictor=RandomForestRegressor(), test_size=0.2, n_iterations=10)
display_df =  pd.DataFrame(zip(stratification_results, additional_stratification_results, global_results, global_baseline_results, values), columns =['Substrate', 'Origin', 'Global model', 'Global baseline', 'Yields'])
display_df.to_csv("results/mechanisms_split_fp_descriptors")

In [38]:
stratification_results, additional_stratification_results, global_results, global_baseline_results, values = analysis_stratification_influence_substrates_raw(X_dft, y_dft, mechanisms_dft, origins_dft , metric=metric, predictor=RandomForestRegressor(), test_size=0.2, n_iterations=10)
display_df =  pd.DataFrame(zip(stratification_results, additional_stratification_results, global_results, global_baseline_results, values), columns =['Substrate', 'Origin', 'Global model', 'Global baseline', 'Yields'])
display_df.to_csv("results/mechanisms_split_dft_descriptors")

In [39]:
stratification_results, additional_stratification_results, global_results, global_baseline_results, values = analysis_stratification_influence_substrates_raw(X_rxnfp, y_rxnfp, mechanisms_rxnfp, origins_rxnfp , metric=metric, predictor=RandomForestRegressor(), test_size=0.2, n_iterations=10)
display_df =  pd.DataFrame(zip(stratification_results, additional_stratification_results, global_results, global_baseline_results, values), columns =['Substrate', 'Origin', 'Global model', 'Global baseline', 'Yields'])
display_df.to_csv("results/mechanisms_split_rxnfp_descriptors")

# Restricted chemical space: Suzuki

In [40]:
indexes = np.where(mechanisms_fp=='Suzuki')[0]

In [52]:
values, baseline_values, model_values, stratification_values, additional_stratification_values = get_raw_results(X_fp[indexes, :], y_fp[indexes], origins_fp[indexes], mechanisms_fp[indexes], n_iterations=10)
display_df =  pd.DataFrame(zip(values, baseline_values, model_values, stratification_values, additional_stratification_values), columns = ['Yields', 'Baseline', 'Global model', 'Origin', 'Mechanism'])
display_df.to_csv("results/random_split_fp_descriptors_test_size_0.2_mechanism_suzuki")

In [53]:
indexes = np.where(mechanisms_dft=='Suzuki')[0]

In [54]:
values, baseline_values, model_values, stratification_values, additional_stratification_values = get_raw_results(X_dft[indexes, :], y_dft[indexes], origins_dft[indexes], mechanisms_dft[indexes], n_iterations=10)
display_df =  pd.DataFrame(zip(values, baseline_values, model_values, stratification_values, additional_stratification_values), columns = ['Yields', 'Baseline', 'Global model', 'Origin', 'Mechanism'])
display_df.to_csv("results/random_split_dft_descriptors_test_size_0.2_mechanism_suzuki")

In [55]:
indexes = np.where(mechanisms_rxnfp=='Suzuki')[0]

In [56]:
values, baseline_values, model_values, stratification_values, additional_stratification_values = get_raw_results(X_rxnfp[indexes, :], y_rxnfp[indexes], origins_rxnfp[indexes], mechanisms_rxnfp[indexes], n_iterations=10)
display_df =  pd.DataFrame(zip(values, baseline_values, model_values, stratification_values, additional_stratification_values), columns = ['Yields', 'Baseline', 'Global model', 'Origin', 'Mechanism'])
display_df.to_csv("results/random_split_rxnfp_descriptors_test_size_0.2_mechanism_suzuki")

In [75]:
# TODO: clean 
r2 = []
length = []
for mecha in np.unique(mechanisms_dft):
    indexes = np.where(mechanisms_dft==mecha)[0]
    values, baseline_values, model_values, stratification_values, additional_stratification_values = get_raw_results(X_dft[indexes, :], y_dft[indexes], origins_dft[indexes], mechanisms_dft[indexes], n_iterations=10)
    print(mecha)
    print(round(r2_score(values, model_values), 2))
    r2.append(round(r2_score(values, model_values), 2))
    length.append(len(indexes))

Al _coupling
0.16
Buchwald
-0.17
C-H activation
0.49
CO2 Insertion
0.49
Isocyanates
0.25
Kumada
0.51
Murahashi
-0.11
Negishi
0.64
Ni/Cu cooperation
0.71
P_coupling
-0.12
Suzuki
0.38


In [71]:
values, baseline_values, model_values, stratification_values, additional_stratification_values = get_raw_results(X_dft, y_dft, origins_dft, mechanisms_dft, n_iterations=10)
for mecha in np.unique(mechanisms_dft):
    indexes = np.where(np.array(additional_stratification_values)==mecha)[0]
    print(mecha)
    print(round(r2_score(np.array(values)[indexes], np.array(model_values)[indexes])))

Al _coupling
0.08972712181077747
Buchwald
0.08334623783177508
C-H activation
0.5352837823528319
CO2 Insertion
0.4633179584210968
Isocyanates
0.38676919690169564
Kumada
0.48824921796229437
Murahashi
-0.0450133373194348
Negishi
0.5230062567743744
Ni/Cu cooperation
0.6848397251089264
P_coupling
0.43813325210992704
Suzuki
0.44687822151557066


# Restricted chemical space: publication

In [31]:
indexes = np.where(DOI_fp=='https://doi.org/10.1021/ja8056503')[0]

In [32]:
values, baseline_values, model_values, stratification_values, additional_stratification_values = get_raw_results(X_fp[indexes, :], y_fp[indexes], origins_fp[indexes], mechanisms_fp[indexes], n_iterations=10)
display_df =  pd.DataFrame(zip(values, baseline_values, model_values, stratification_values, additional_stratification_values), columns = ['Yields', 'Baseline', 'Global model', 'Origin', 'Mechanism'])
display_df.to_csv("results/random_split_fp_descriptors_test_size_0.2_publication")

In [33]:
indexes = np.where(DOI_dft=='https://doi.org/10.1021/ja8056503')[0]

In [34]:
values, baseline_values, model_values, stratification_values, additional_stratification_values = get_raw_results(X_dft[indexes, :], y_dft[indexes], origins_dft[indexes], mechanisms_dft[indexes], n_iterations=10)
display_df =  pd.DataFrame(zip(values, baseline_values, model_values, stratification_values, additional_stratification_values), columns = ['Yields', 'Baseline', 'Global model', 'Origin', 'Mechanism'])
display_df.to_csv("results/random_split_dft_descriptors_test_size_0.2_publication")

In [35]:
indexes = np.where(DOI_rxnfp=='https://doi.org/10.1021/ja8056503')[0]

In [36]:
values, baseline_values, model_values, stratification_values, additional_stratification_values = get_raw_results(X_rxnfp[indexes, :], y_rxnfp[indexes], origins_rxnfp[indexes], mechanisms_rxnfp[indexes], n_iterations=10)
display_df =  pd.DataFrame(zip(values, baseline_values, model_values, stratification_values, additional_stratification_values), columns = ['Yields', 'Baseline', 'Global model', 'Origin', 'Mechanism'])
display_df.to_csv("results/random_split_rxnfp_descriptors_test_size_0.2_publication")