In [1]:
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import cross_validate, LeaveOneGroupOut, GridSearchCV
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor

### Data Load and Prep

In [4]:
# Read the amination reaction table and, in the same step, discard every
# reaction that has no recorded yield (yield is the regression target below).
df = (
    pd.read_csv("../../../Projects/Account ML/notebooks/amination.csv")
      .dropna(subset=['yield'])
)
df.shape

(3955, 126)

In [5]:
# Drop the object-dtype columns; everything that remains (including the
# 'yield' target) is kept for modelling.
df_numeric = df.select_dtypes(exclude='object')

# Features = every remaining column except the target.
# Index.difference returns the names sorted, so X's columns are in sorted order.
feature_names = df_numeric.columns.difference(['yield'])
X = df_numeric.loc[:, feature_names]
y = df_numeric.loc[:, 'yield']

X.shape, y.shape

((3955, 120), (3955,))

### Single CV evaluation

Notes:
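 - LeaveOneGroupOut holds out one group per fold and iterates over the groups in sorted order of their labels, i.e., fold *i* is the *i*-th additive (molecule) name in alphabetical order. This is why the per-fold results below can be labelled with the sorted unique additive names.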

In [6]:
# Candidate model estimators: linear (Lasso) or rf (RandomForestRegressor).
# Select one by key rather than assigning `est` twice, where the first
# assignment was silently discarded.
estimators = {
    'linear': linear_model.Lasso(alpha=5),
    'rf': RandomForestRegressor(n_estimators=30,
                                min_samples_split=10,
                                min_samples_leaf=5,
                                max_depth=8,
                                random_state=0),  # fixed seed so scores are repeatable
}
est = estimators['rf']

# leave one molecule out cross-validator
lomo = LeaveOneGroupOut()

# run cross validation
# - the splitter and its groups are passed separately (instead of a one-shot
#   generator from lomo.split), so the group column used for splitting is explicit
# - n_jobs=-2 means "all CPUs but one"; unlike os.cpu_count() - 1 it cannot
#   evaluate to 0 (rejected by joblib) on a single-core machine, and it does
#   not fail when os.cpu_count() returns None
scores = cross_validate(est, X, y,
                        groups=df.additive,  # here we select the group by which we split
                        cv=lomo,
                        scoring=('r2', 'neg_mean_squared_error'),
                        return_train_score=True,
                        n_jobs=-2)  # this adds parallelism to the CV loop

In [7]:
# Turn the negated-MSE scores into per-fold RMSE values (sqrt of the sign-flipped
# score) and store them next to the other metrics in `scores`.
for split in ('test', 'train'):
    neg_mse = scores[split + '_neg_mean_squared_error']
    scores[split + '_rmse'] = list(np.sqrt(-neg_mse))

# One row per held-out additive, one column per metric.
metric_names = ['train_rmse', 'test_rmse', 'train_r2', 'test_r2']
results = pd.DataFrame({name: scores[name] for name in metric_names},
                       index=sorted(df.additive.unique()))

# First few folds, then the average of every metric across all folds.
per_fold_preview = results.head().round(2)
fold_average = results.mean().to_frame('metric').round(2)
display(per_fold_preview)
display(fold_average)

Unnamed: 0,train_rmse,test_rmse,train_r2,test_r2
"3,5-dimethylisoxazole",8.27,9.96,0.91,0.86
3-methyl-5-phenylisoxazole,8.23,9.12,0.91,0.88
3-methylisoxazole,8.06,14.24,0.91,0.8
3-phenylisoxazole,8.06,15.83,0.91,0.72
4-phenylisoxazole,7.95,24.7,0.91,0.2


Unnamed: 0,metric
train_rmse,8.2
test_rmse,15.31
train_r2,0.91
test_r2,0.39


### Hyperparameter tuning

Using brute-force grid search (no Bayesian Optimization here, yet)

In [8]:
# estimator choice
# Each entry pairs an estimator with the parameter grid to search for it.
# Selecting by key replaces the earlier pattern of assigning `est`/`params`
# for Lasso and then immediately overwriting both.
search_spaces = {
    'linear': (linear_model.Lasso(),
               {'alpha': [0.1, 1, 10, 100]}),
    'rf': (RandomForestRegressor(n_estimators=30,  # overridden by the grid below
                                 min_samples_split=10,
                                 min_samples_leaf=5,
                                 random_state=0),  # fixed seed so the ranking is repeatable
           {'max_depth': [1, 2, 4, 8],
            'n_estimators': [10, 30, 50]}),
}
est, params = search_spaces['rf']

# leave one molecule out cross-validator
lomo = LeaveOneGroupOut()

In [9]:
# run the search (this may take some time)
# - scoring is a single metric name; the former ('r2') was just a
#   parenthesised string, not a tuple
# - the splitter is passed as-is and the grouping column goes to fit(), so
#   the folds are leave-one-additive-out
# - n_jobs=-2 means "all CPUs but one"; os.cpu_count() - 1 evaluates to 0
#   (rejected by joblib) on a single-core machine and raises if
#   os.cpu_count() returns None
reg = GridSearchCV(est,
                   param_grid=params,
                   scoring='r2',
                   cv=lomo,
                   n_jobs=-2).fit(X, y, groups=df.additive)

In [10]:
# Load the full grid-search report into a table.
scores = pd.DataFrame(reg.cv_results_)

# Keep the searched parameters (columns prefixed 'param_') plus the mean
# test score and its rank.
par_cols = [col for col in scores.columns if col.startswith('param_')]
kept_cols = par_cols + ['mean_test_score', 'rank_test_score']

scores = scores.loc[:, kept_cols]
scores.sort_values(by='rank_test_score')

Unnamed: 0,param_max_depth,param_n_estimators,mean_test_score,rank_test_score
11,8,50,0.388067,1
10,8,30,0.378718,2
9,8,10,0.375531,3
7,4,30,0.272488,4
8,4,50,0.27046,5
6,4,10,0.262233,6
4,2,30,0.008624,7
5,2,50,0.007162,8
3,2,10,0.002478,9
0,1,10,-0.130662,10
