Warning: this notebook requires the *DRFP* module to be installed:

-> it can be installed with: pip install drfp

-> alternative installation options are described on the Git page: https://github.com/reymond-group/drfp

-> more details: https://pubs.rsc.org/en/content/articlehtml/2022/dd/d1dd00006c

In [1]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import rdChemReactions
from drfp import DrfpEncoder
from descriptors.preprocessing import preprocess, dict_ligand

In [2]:
# Generate reaction SMILES from NiCOlit: read the raw CSV and pass it straight
# to `preprocess`, keeping both values it returns.
nicolit, indexes = preprocess(pd.read_csv("data/NiCOlit.csv"))

In [4]:
# Featurize the reactions with the DRFP method.
import descriptors.drfp_featurization as drfp_ft

# `process_dataframe` hands back five objects; unpack them one per line so
# each name is easy to spot.
(
    X_fp,
    y_fp,
    DOI_fp,
    mechanisms_fp,
    origins_fp,
) = drfp_ft.process_dataframe(nicolit)

In [None]:
# Test the performances of the DRFP features.
from analysis import analysis_train_set_size, random_split, stratified_split

# Random split: 20 iterations, using the origins and mechanisms arrays as the
# two extra arguments of `random_split`.
(
    values,
    baseline_values,
    model_values,
    stratification_values,
    additional_stratification_values,
) = random_split(X_fp, y_fp, origins_fp, mechanisms_fp, n_iterations=20)

# Zip the five outputs row-wise into a table and write it to disk.
display_df = pd.DataFrame(
    zip(
        values,
        baseline_values,
        model_values,
        stratification_values,
        additional_stratification_values,
    ),
    columns=['Yields', 'Baseline', 'Predicted Yields', 'Origin', 'Coupling Partner'],
)
display_df.to_csv("results/random_split_drfp_descriptors_test_size_0.2")

In [None]:
# Substrate split (be careful with the number of iterations).
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score

# Scoring function handed to `stratified_split`; reused by the next split cell.
metric = r2_score

# Stratify on the "substrate" column of the preprocessed dataframe.
(
    values,
    global_baseline_results,
    global_results,
    stratification_results,
    additional_stratification_results,
) = stratified_split(
    X_fp,
    y_fp,
    list(nicolit["substrate"]),
    origins_fp,
    metric=metric,
    predictor=RandomForestRegressor(),
    test_size=0.2,
    n_iterations=1,
)

# Zip the outputs row-wise into a table and write it to disk.
display_df = pd.DataFrame(
    zip(
        stratification_results,
        additional_stratification_results,
        global_results,
        global_baseline_results,
        values,
    ),
    columns=['Substrate', 'Origin', 'Predicted Yields', 'Global baseline', 'Yields'],
)
display_df.to_csv("results/substrate_split_drfp_descriptors")

In [None]:
# Coupling partner split: same procedure as the substrate split, but
# stratified on `mechanisms_fp` and repeated 10 times.
(
    values,
    global_baseline_results,
    global_results,
    stratification_results,
    additional_stratification_results,
) = stratified_split(
    X_fp,
    y_fp,
    mechanisms_fp,
    origins_fp,
    metric=metric,
    predictor=RandomForestRegressor(),
    test_size=0.2,
    n_iterations=10,
)

# NOTE(review): the split is stratified on `mechanisms_fp`, yet the first
# column is still labelled 'Substrate' - presumably carried over from the
# substrate-split cell; confirm before relying on that header.
display_df = pd.DataFrame(
    zip(
        stratification_results,
        additional_stratification_results,
        global_results,
        global_baseline_results,
        values,
    ),
    columns=['Substrate', 'Origin', 'Predicted Yields', 'Global baseline', 'Yields'],
)
display_df.to_csv("results/mechanisms_split_drfp_descriptors")

### Visualize the results 

In [None]:
# Reload the random-split results from disk so the following cells can plot
# and score them without re-running the split.
df = pd.read_csv('results/random_split_drfp_descriptors_test_size_0.2')

In [None]:
import seaborn as sns 
import matplotlib.pyplot as plt 
import matplotlib as mpl

from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [None]:
# Random split: joint KDE plot of experimental vs. predicted yields, saved as
# a PNG, followed by the error metrics computed on all rows of `df`.
# x / y / data are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally, so the former three-positional-argument call raises a
# TypeError there. The keyword form also works on older releases.
h = sns.jointplot(data=df, x="Yields", y="Predicted Yields", kind='kde', fill=True)
h.set_axis_labels('Experimental yields', 'Predicted yields')
h.ax_joint.set_xticks([0, 20, 40, 60, 80, 100])
h.ax_joint.set_yticks([0, 20, 40, 60, 80, 100])

fig_path = 'images/random_split_drfp_descriptors_test_size_0.2_full.png'
plt.savefig(fig_path, dpi=300, bbox_inches='tight')

# RMSE is the square root of the mean squared error.
print('RMSE = ',mean_squared_error(df["Yields"], df["Predicted Yields"])**0.5)
print('MAE  = ',mean_absolute_error(df["Yields"], df["Predicted Yields"]))
print('R$^2$  = ',r2_score(df["Yields"], df["Predicted Yields"]))

In [None]:
# R^2 on ten consecutive 282-row slices of `df`, then min / mean / std / max
# of those ten scores.
# NOTE(review): 10 and 282 are hard-coded - presumably the number of split
# iterations and the test-set size per iteration. The random split in this
# notebook is run with n_iterations=20, so confirm these constants still
# match the file on disk.
R2 = [
    r2_score(
        df["Yields"][start:start + 282],
        df["Predicted Yields"][start:start + 282],
    )
    for start in range(0, 10 * 282, 282)
]
print(min(R2), np.mean(R2), np.std(R2), max(R2))

In [None]:
# Comparison with the DFT results: same slice-wise R^2 summary, computed on
# the DFT-descriptor random-split file.
df = pd.read_csv('results/random_split_dft_descriptors_test_size_0.2')

# NOTE(review): 10 slices of 282 rows are hard-coded - presumably the number
# of split iterations and the test-set size; confirm against how this file
# was generated.
R2 = [
    r2_score(
        df["Yields"][start:start + 282],
        df["Predicted Yields"][start:start + 282],
    )
    for start in range(0, 10 * 282, 282)
]
print(min(R2), np.mean(R2), np.std(R2), max(R2))

In [None]:
# Substrate split: reload the stored results, draw the joint KDE plot of
# experimental vs. predicted yields, save it, and print the error metrics.
df = pd.read_csv('results/substrate_split_drfp_descriptors')

# x / y / data are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally, so the former three-positional-argument call raises a
# TypeError there. The keyword form also works on older releases.
h = sns.jointplot(data=df, x="Yields", y="Predicted Yields", kind='kde', fill=True)
h.set_axis_labels('Experimental yields', 'Predicted yields')
h.ax_joint.set_xticks([0, 20, 40, 60, 80, 100])
h.ax_joint.set_yticks([0, 20, 40, 60, 80, 100])
# White background on both marginal axes.
h.ax_marg_x.set_facecolor("white")
h.ax_marg_y.set_facecolor("white")
plt.savefig('images/substrate_split_drfp_descriptors_full.png', dpi=300, bbox_inches='tight')

# RMSE is the square root of the mean squared error.
print('RMSE = ',mean_squared_error(df["Yields"], df["Predicted Yields"])**0.5)
print('MAE  = ',mean_absolute_error(df["Yields"], df["Predicted Yields"]))
print('R^2  = ',r2_score(df["Yields"], df["Predicted Yields"]))

In [None]:
# Coupling partner split: reload the stored results, draw the joint KDE plot
# of experimental vs. predicted yields, save it, and print the error metrics.
df = pd.read_csv('results/mechanisms_split_drfp_descriptors')

# x / y / data are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally, so the former three-positional-argument call raises a
# TypeError there. The keyword form also works on older releases.
h = sns.jointplot(data=df, x="Yields", y="Predicted Yields", kind='kde', fill=True)
h.set_axis_labels('Experimental yields', 'Predicted yields')
h.ax_joint.set_xticks([0, 20, 40, 60, 80, 100])
h.ax_joint.set_yticks([0, 20, 40, 60, 80, 100])
# White background on both marginal axes.
h.ax_marg_x.set_facecolor("white")
h.ax_marg_y.set_facecolor("white")
plt.savefig('images/mechanism_split_drfp_descriptors_full.png', dpi=300, bbox_inches='tight')

# RMSE is the square root of the mean squared error.
print('RMSE = ',mean_squared_error(df["Yields"], df["Predicted Yields"])**0.5)
print('MAE  = ',mean_absolute_error(df["Yields"], df["Predicted Yields"]))
print('R$^2$  = ',r2_score(df["Yields"], df["Predicted Yields"]))

In [None]:
# Analysis of the DRFP performances on restricted datasets: build the
# DFT-side arrays that the following cells use to select subsets of reactions.
import descriptors.dft_featurisation as dft_ft

nicolit_dft = pd.read_csv("data/NiCOlit.csv")
# `preprocess` is unpacked into two values where this notebook first calls it
# (`nicolit, indexes = preprocess(nicolit)`). Unpack it the same way here so
# that a dataframe, not a 2-tuple, is handed to the featurizer.
nicolit_dft, indexes_dft = preprocess(nicolit_dft)
(
    X_dft,
    y_dft,
    DOI_dft,
    mechanisms_dft,
    origins_dft,
    sub_dft,
    lig_dft,
) = dft_ft.process_dataframe_dft(nicolit_dft, data_path="data/utils/", origin=False)

In [None]:
# Substrate-wise analysis: for every distinct value in `sub_dft`, run 100
# random splits restricted to the matching rows and record the rounded R^2
# together with the subset size.
# NOTE(review): rows of `X_fp` (DRFP features) are selected with positions
# computed from the DFT-side arrays - this assumes both featurizations keep
# the reactions in the same order; confirm.
r2 = []
length = []
for sub in np.unique(sub_dft):
    indexes = np.where(sub_dft == sub)[0]
    (
        values,
        baseline_values,
        model_values,
        stratification_values,
        additional_stratification_values,
    ) = random_split(
        X_fp[indexes, :],
        y_dft[indexes],
        origins_dft[indexes],
        sub_dft[indexes],
        n_iterations=100,
    )
    # Score once, then reuse the value for both the printout and the record.
    score = round(r2_score(values, model_values), 3)
    n_samples = len(indexes)
    print(sub, n_samples, score)
    r2.append(score)
    length.append(n_samples)

In [None]:
# Coupling-partner-wise analysis: for every distinct value in
# `mechanisms_dft`, run 100 random splits restricted to the matching rows and
# record the rounded R^2 together with the subset size.
# NOTE(review): rows of `X_fp` (DRFP features) are selected with positions
# computed from the DFT-side arrays - this assumes both featurizations keep
# the reactions in the same order; confirm.
r2 = []
length = []
for sub in np.unique(mechanisms_dft):
    indexes = np.where(mechanisms_dft == sub)[0]
    (
        values,
        baseline_values,
        model_values,
        stratification_values,
        additional_stratification_values,
    ) = random_split(
        X_fp[indexes, :],
        y_dft[indexes],
        origins_dft[indexes],
        mechanisms_dft[indexes],
        n_iterations=100,
    )
    # Score once, then reuse the value for both the printout and the record.
    score = round(r2_score(values, model_values), 3)
    n_samples = len(indexes)
    print(sub, n_samples, score)
    r2.append(score)
    length.append(n_samples)