In [1]:
import pandas as pd 
import numpy as np
import json

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score
import sys
sys.path.append("..") 
import descriptors.preprocessing as pp
import descriptors.dft_featurisation as dft_ft
import descriptors.rdkit_featurisation as rdkit_ft
from analysis import analysis_train_set_size, random_split, stratified_split 

In [2]:
estimators = [('predictor', RandomForestRegressor())]
pipe = Pipeline(estimators)
metric = r2_score

# Getting all featurization

In [3]:
df_dft = pd.read_csv("../data/NiCOlit.csv", sep = ',')
df_dft, indexes = pp.preprocess(df_dft)
indexes_dft = []
for idx in df_dft.index:
    indexes_dft.append(idx)
df_dft = df_dft.reset_index(drop=True)

In [4]:
df_dataset = pd.read_csv('../data/rxnfp_featurization/rxn_dataset_2.csv')
df_dataset = df_dataset.loc[indexes_dft]
df_dataset = df_dataset.reset_index(drop=True)
X_rxnfp = np.array([json.loads(x) for x in df_dataset.rxnfp])
substrate_rxnfp = np.array(df_dataset.Substrate) 
DOI_rxnfp = np.array(df_dataset.DOI) 
mechanisms_rxnfp = np.array(df_dataset["A-X type"]) 
origins_rxnfp = np.array(df_dataset.Origin) 
y_rxnfp = np.array(df_dataset.Yields)

In [5]:
X_dft, y_dft, DOI_dft, mechanisms_dft, origins_dft, substrate_dft, ligand_dft = dft_ft.process_dataframe_dft(df_dft, data_path="../data/utils/", origin=False)

In [6]:
df_fp = pd.read_csv('../data/NiCOlit.csv')
df_fp, indexes = pp.preprocess(df_fp)
df_fp = df_fp.reset_index(drop=True)

In [7]:
X_fp, y_fp, DOI_fp, mechanisms_fp, origins_fp = rdkit_ft.process_dataframe(df_fp)

# Random split

In [8]:
values, baseline_values, model_values, stratification_values, additional_stratification_values = random_split(X_fp, y_fp, origins_fp, mechanisms_fp, n_iterations=10)
display_df =  pd.DataFrame(zip(values, baseline_values, model_values, stratification_values, additional_stratification_values), 
                           columns = ['Yields', 'Baseline', 'Predicted Yields', 'Origin', 'Coupling Partner'])
display_df.to_csv("../results/random_split_fp_descriptors_test_size_0.2")

In [10]:
# Training set size influence 
#metric_values, baseline_values, sizes = analysis_train_set_size(X_fp[indexes_kept_dft, :], y_fp[indexes_kept_dft], DOI_fp[indexes_kept_dft], metric=metric, predictor=pipe, 
#                                                                n_iterations_external=10, n_iterations_internal=1)
metric_values, baseline_values, sizes = analysis_train_set_size(X_fp, y_fp, DOI_fp, metric=metric, predictor=pipe, 
                                                                n_iterations_external=10, n_iterations_internal=1)
metric_mean = np.mean(metric_values, axis=1)
metric_lower = np.percentile(metric_values, 5, axis=1)
metric_upper = np.percentile(metric_values, 95, axis=1)

baseline_mean = np.mean(baseline_values, axis=1)
baseline_lower = np.percentile(baseline_values, 5, axis=1)
baseline_upper = np.percentile(baseline_values, 95, axis=1)

display_df =  pd.DataFrame(zip(metric_mean, metric_lower, metric_upper, baseline_mean, baseline_lower, baseline_upper, sizes), columns = ['Metric mean', 'Metric lower','Metric upper','Baseline mean', 'Baseline lower','Baseline upper', 'Sizes'])
display_df.to_csv("../results/training_size_influence_fp_descriptors")

In [11]:
values, baseline_values, model_values, stratification_values, additional_stratification_values = random_split(X_dft, y_dft, origins_dft, mechanisms_dft, 
                                                                                                              n_iterations=10)
display_df =  pd.DataFrame(zip(values, baseline_values, model_values, stratification_values, additional_stratification_values), columns = ['Yields', 'Baseline', 'Predicted Yields', 'Origin', 'Coupling Partner'])
display_df.to_csv("../results/random_split_dft_descriptors_test_size_0.2")

In [12]:
# Training set size influence 
metric_values, baseline_values, sizes = analysis_train_set_size(X_dft, y_dft, DOI_dft, metric=metric, predictor=pipe,
                                                                n_iterations_external=10, n_iterations_internal=1)
metric_mean = np.mean(metric_values, axis=1)
metric_lower = np.percentile(metric_values, 5, axis=1)
metric_upper = np.percentile(metric_values, 95, axis=1)

baseline_mean = np.mean(baseline_values, axis=1)
baseline_lower = np.percentile(baseline_values, 5, axis=1)
baseline_upper = np.percentile(baseline_values, 95, axis=1)

display_df =  pd.DataFrame(zip(metric_mean, metric_lower, metric_upper, baseline_mean, baseline_lower, baseline_upper, sizes), columns = ['Metric mean', 'Metric lower','Metric upper','Baseline mean', 'Baseline lower','Baseline upper', 'Sizes'])
display_df.to_csv("../results/training_size_influence_dft_descriptors")

In [13]:
indices = np.where(origins_dft == "Scope")[0]
values, baseline_values, model_values, stratification_values, additional_stratification_values = random_split(X_dft[indices, :], y_dft[indices], origins_dft[indices], mechanisms_dft[indices], n_iterations=10)
display_df =  pd.DataFrame(zip(values, baseline_values, model_values, stratification_values, additional_stratification_values), columns = ['Yields', 'Baseline', 'Predicted Yields', 'Origin', 'Coupling Partner'])
display_df.to_csv("../results/random_split_dft_descriptors_scope_test_size_0.2")

indices = np.where(origins_dft == "Optimisation")[0]
values, baseline_values, model_values, stratification_values, additional_stratification_values = random_split(X_dft[indices, :], y_dft[indices], origins_dft[indices], mechanisms_dft[indices], n_iterations=10)
display_df =  pd.DataFrame(zip(values, baseline_values, model_values, stratification_values, additional_stratification_values), columns = ['Yields', 'Baseline', 'Predicted Yields', 'Origin', 'Coupling Partner'])
display_df.to_csv("../results/random_split_dft_descriptors_optimisation_test_size_0.2")

In [14]:
values, baseline_values, model_values, stratification_values, additional_stratification_values = random_split(X_rxnfp, y_rxnfp, origins_rxnfp, mechanisms_rxnfp, n_iterations=10)
display_df =  pd.DataFrame(zip(values, baseline_values, model_values, stratification_values, additional_stratification_values), columns = ['Yields', 'Baseline', 'Predicted Yields', 'Origin', 'Coupling Partner'])
display_df.to_csv("../results/random_split_rxnfp_descriptors_test_size_0.2")

In [15]:
# Training set size influence 
metric_values, baseline_values, sizes = analysis_train_set_size(X_rxnfp, y_rxnfp, DOI_rxnfp, metric=metric, predictor=pipe,
                                                                n_iterations_external=10, n_iterations_internal=1)
metric_mean = np.mean(metric_values, axis=1)
metric_lower = np.percentile(metric_values, 5, axis=1)
metric_upper = np.percentile(metric_values, 95, axis=1)

baseline_mean = np.mean(baseline_values, axis=1)
baseline_lower = np.percentile(baseline_values, 5, axis=1)
baseline_upper = np.percentile(baseline_values, 95, axis=1)

display_df =  pd.DataFrame(zip(metric_mean, metric_lower, metric_upper, baseline_mean, baseline_lower, baseline_upper, sizes), columns = ['Metric mean', 'Metric lower','Metric upper','Baseline mean', 'Baseline lower','Baseline upper', 'Sizes'])
display_df.to_csv("../results/training_size_influence_rxnfp_descriptors")

# Substrate split

In [16]:
values, global_baseline_results, global_results, stratification_results, additional_stratification_results = stratified_split(X_fp, y_fp, list(df_fp["substrate"]), origins_fp , metric=metric, predictor=RandomForestRegressor(), test_size=0.2, 
                                                                                                                              n_iterations=10)
display_df =  pd.DataFrame(zip(stratification_results, additional_stratification_results, global_results, global_baseline_results, values), columns =['Substrate', 'Origin', 'Predicted Yields', 'Global baseline', 'Yields'])
display_df.to_csv("../results/substrate_split_fp_descriptors")

In [17]:
values, global_baseline_results, global_results, stratification_results, additional_stratification_results = stratified_split(X_dft, 1 * y_dft>50, list(df_dft["substrate"]), origins_dft , metric=metric, predictor=RandomForestClassifier(), test_size=0.2, 
                                                                                                                              n_iterations=10)
display_df =  pd.DataFrame(zip(stratification_results, additional_stratification_results, global_results, global_baseline_results, values), columns =['Substrate', 'Origin', 'Predicted Yields', 'Global baseline', 'Yields'])
display_df.to_csv("../results/substrate_split_dft_descriptors_classification")

In [18]:
values, global_baseline_results, global_results, stratification_results, additional_stratification_results = stratified_split(X_fp, y_fp, list(df_fp["substrate"]), origins_fp, metric=metric, predictor=KNeighborsRegressor(n_neighbors=1), test_size=0.2, 
                                                                                                                              n_iterations=2)
display_df =  pd.DataFrame(zip(stratification_results, additional_stratification_results, global_results, global_baseline_results, values), columns =['Substrate', 'Origin', 'Predicted Yields', 'Global baseline', 'Yields'])
display_df.to_csv("../results/substrate_split_fp_descriptors_KNN")

In [19]:
values, global_baseline_results, global_results, stratification_results, additional_stratification_results = stratified_split(X_rxnfp, y_rxnfp, substrate_rxnfp, origins_rxnfp , metric=metric, predictor=RandomForestRegressor(), test_size=0.2,
                                                                                                                              n_iterations=2)
display_df =  pd.DataFrame(zip(stratification_results, additional_stratification_results, global_results, global_baseline_results, values), columns =['Substrate', 'Origin', 'Predicted Yields', 'Global baseline', 'Yields'])
display_df.to_csv("../results/substrate_split_rxnfp_descriptors")

# DOI split

In [20]:
values, global_baseline_results, global_results, stratification_results, additional_stratification_results = stratified_split(X_fp, y_fp, DOI_fp, origins_fp , metric=metric, predictor=RandomForestRegressor(), test_size=0.2, 
                                                                                                                              n_iterations=1)
display_df =  pd.DataFrame(zip(stratification_results, additional_stratification_results, global_results, global_baseline_results, values), columns =['Substrate', 'Origin', 'Predicted Yields', 'Global baseline', 'Yields'])
display_df.to_csv("../results/doi_split_fp_descriptors")

In [21]:
values, global_baseline_results, global_results, stratification_results, additional_stratification_results = stratified_split(X_dft, y_dft, DOI_dft, origins_dft , metric=metric, predictor=RandomForestRegressor(), test_size=0.2,
                                                                                                                              n_iterations=1)
display_df =  pd.DataFrame(zip(stratification_results, additional_stratification_results, global_results, global_baseline_results, values), columns =['Substrate', 'Origin', 'Predicted Yields', 'Global baseline', 'Yields'])
display_df.to_csv("../results/doi_split_dft_descriptors")

In [22]:
values, global_baseline_results, global_results, stratification_results, additional_stratification_results = stratified_split(X_rxnfp, y_rxnfp, DOI_rxnfp, origins_rxnfp , metric=metric, predictor=RandomForestRegressor(), test_size=0.2, 
                                                                                                                              n_iterations=1)
display_df =  pd.DataFrame(zip(stratification_results, additional_stratification_results, global_results, global_baseline_results, values), columns =['Substrate', 'Origin', 'Predicted Yields', 'Global baseline', 'Yields'])
display_df.to_csv("../results/doi_split_rxnfp_descriptors")

# Coupling partner split

In [23]:
values, global_baseline_results, global_results, stratification_results, additional_stratification_results = stratified_split(X_fp, y_fp, mechanisms_fp, origins_fp , metric=metric, predictor=RandomForestRegressor(), test_size=0.2, 
                                                                                                                              n_iterations=1)
display_df =  pd.DataFrame(zip(stratification_results, additional_stratification_results, global_results, global_baseline_results, values), columns =['Substrate', 'Origin', 'Predicted Yields', 'Global baseline', 'Yields'])
display_df.to_csv("../results/mechanisms_split_fp_descriptors")

In [24]:
values, global_baseline_results, global_results, stratification_results, additional_stratification_results = stratified_split(X_dft, y_dft, mechanisms_dft, origins_dft , metric=metric, predictor=RandomForestRegressor(), test_size=0.2, 
                                                                                                                              n_iterations=10)
display_df =  pd.DataFrame(zip(stratification_results, additional_stratification_results, global_results, global_baseline_results, values), columns =['Substrate', 'Origin', 'Predicted Yields', 'Global baseline', 'Yields'])
display_df.to_csv("../results/mechanisms_split_dft_descriptors")

In [25]:
values, global_baseline_results, global_results, stratification_results, additional_stratification_results = stratified_split(X_rxnfp, y_rxnfp, mechanisms_rxnfp, origins_rxnfp , metric=metric, predictor=RandomForestRegressor(), test_size=0.2, 
                                                                                                                              n_iterations=1)
display_df =  pd.DataFrame(zip(stratification_results, additional_stratification_results, global_results, global_baseline_results, values), columns =['Substrate', 'Origin', 'Predicted Yields', 'Global baseline', 'Yields'])
display_df.to_csv("../results/mechanisms_split_rxnfp_descriptors")

# Restricted chemical space: Suzuki

In [26]:
indexes = np.where(mechanisms_fp=='B')[0]

In [27]:
values, baseline_values, model_values, stratification_values, additional_stratification_values = random_split(X_fp[indexes, :], y_fp[indexes], origins_fp[indexes], mechanisms_fp[indexes], 
                                                                                                              n_iterations=5)
display_df =  pd.DataFrame(zip(values, baseline_values, model_values, stratification_values, additional_stratification_values), columns = ['Yields', 'Baseline', 'Predicted Yields', 'Origin', 'Coupling Partner'])
display_df.to_csv("../results/random_split_fp_descriptors_test_size_0.2_mechanism_suzuki")

In [28]:
indexes = np.where(mechanisms_dft=='B')[0]

In [29]:
values, baseline_values, model_values, stratification_values, additional_stratification_values = random_split(X_dft[indexes, :], y_dft[indexes], origins_dft[indexes], mechanisms_dft[indexes], 
                                                                                                              n_iterations=1)
display_df = pd.DataFrame(zip(values, baseline_values, model_values, stratification_values, additional_stratification_values), columns = ['Yields', 'Baseline', 'Predicted Yields', 'Origin', 'Coupling Partner'])
display_df.to_csv("../results/random_split_dft_descriptors_test_size_0.2_mechanism_suzuki")

In [30]:
values, baseline_values, model_values, stratification_values, additional_stratification_values = random_split(X_rxnfp[indexes, :], y_rxnfp[indexes], origins_rxnfp[indexes], mechanisms_rxnfp[indexes], 
                                                                                                              n_iterations=1)
display_df =  pd.DataFrame(zip(values, baseline_values, model_values, stratification_values, additional_stratification_values), columns = ['Yields', 'Baseline', 'Predicted Yields', 'Origin', 'Coupling Partner'])
display_df.to_csv("../results/random_split_rxnfp_descriptors_test_size_0.2_mechanism_suzuki")

In [31]:
# TODO: clean 
r2 = []
length = []
for mecha in np.unique(mechanisms_dft):
    indexes = np.where(mechanisms_dft==mecha)[0]
    values, baseline_values, model_values, stratification_values, additional_stratification_values = random_split(X_dft[indexes, :], y_dft[indexes], origins_dft[indexes], mechanisms_dft[indexes], n_iterations=10)
    print(mecha)
    print(len(indexes))
    print(round(r2_score(values, model_values), 3))
    r2.append(round(r2_score(values, model_values), 3))
    length.append(len(indexes))

Al
53
0.158
B
472
0.433
C-H
271
0.584
CO2
87
0.482
Li
52
-0.133
NCO
57
0.265
NH
27
-0.165
RMgX
266
0.507
Si
53
0.437
Zn
68
0.645


In [32]:
values, baseline_values, model_values, stratification_values, additional_stratification_values = random_split(X_dft, y_dft, origins_dft, mechanisms_dft, n_iterations=50)
for mecha in np.unique(mechanisms_dft):
    indexes = np.where(np.array(additional_stratification_values)==mecha)[0]
    print(mecha)
    print(round(r2_score(np.array(values)[indexes], np.array(model_values)[indexes]),3))

Al
0.2
B
0.467
C-H
0.587
CO2
0.518
Li
-0.105
NCO
0.402
NH
0.171
RMgX
0.503
Si
0.636
Zn
0.553


In [33]:
for ax_t in df_dft["A-X type"].unique():
    print(ax_t)
    print(len(df_dft[df_dft["A-X type"]==ax_t]["DOI"].unique()))

KeyError: 'A-X type'