In [None]:
import os
from collections import OrderedDict

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
#mpl.rc('figure', max_open_warning = 0)
#%matplotlib inline
#%config InlineBackend.figure_format='retina'

from pandas_profiling import ProfileReport

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [None]:
class style:
    """ANSI escape sequences used to emphasise text in printed cell output."""
    BOLD = '\033[1m'  # start bold text
    END = '\033[0m'   # reset all text attributes

In [None]:
# Read the descriptor table; the path is resolved against the current working directory
PATH = os.getcwd() # Getting current directory
descriptor_in_path = os.path.join(PATH, '../input/descriptor.csv')
df_descriptor = pd.read_csv(descriptor_in_path)

# Show shape, column names and first rows, each followed by a separator rule
for report in (
    f'Descriptor input DataFrame shape:\n\n {df_descriptor.shape}\n',
    f'\nDescriptor input data columns:\n\n {df_descriptor.columns}\n',
    f'\nDescriptor input dataframe head:\n\n {df_descriptor.head()}\n',
):
    print(report)
    print('------------------------------------------------------------')

del descriptor_in_path, report

## Renaming descriptor columns

In [None]:
# Raw descriptor headers -> short names used in the rest of the notebook
descriptor_renames = {
    'name': 'mof',
    'Di': 'LCD',
    'Df': 'PLD',
    'ASA(m2/gram)_1.9': 'GSA',
    'AV_Volume_fraction_1.9': 'AVF',
    'AV(cm3/gram)_1.9': 'GPV',
    'density(gram_cm3)': 'Density',
}
df_descriptor = df_descriptor.rename(columns=descriptor_renames)
del descriptor_renames

print(f'\nCurated descriptor columns:\n\n {df_descriptor.columns}\n')
print('------------------------------------------------------------')

# Datatype of every column after renaming
print(df_descriptor.dtypes)

## Curating descriptor data

In [None]:
# Curate the descriptor table: keep only MOFs with
#   * PLD > 3.8 A, and
#   * a non-zero void fraction (AVF > 0),
# then export the surviving MOF names and their numeric descriptors.
keep_rows = (df_descriptor['PLD'] > 3.8) & (df_descriptor['AVF'] > 0.0)
df_descriptor_gross1_atomic = df_descriptor[keep_rows]

descriptor_mof_name = df_descriptor_gross1_atomic['mof'].astype(str)

PATH = os.getcwd() # Getting current directory
# to_csv does not create missing directories, so make sure ../output exists
os.makedirs(os.path.join(PATH, '../output'), exist_ok=True)

curated_mof_name = os.path.join(PATH, '../output/curated-mof.csv')
descriptor_mof_name.to_csv(curated_mof_name, index=False)

columns = ['PLD', 'LCD', 'GSA', 'AVF', 'GPV', 'Density', 'total_degree_unsaturation', 'degree_unsaturation',
           'metallic_percentage', 'O_to_Metal_ration', 'N_to_O_ratio', 'H' ,'Ni', 'Co', 'Cu', 'Zn', 'Pb', 'Mn',
           'Cd', 'C', 'O', 'N', 'S', 'Cl', 'Br', 'F', 'I']

# Feature names reused later for the SHAP plots (overwritten once a feature subset is chosen)
shap_columns = columns

# Restrict to the model features and force a float dtype
df_descriptor_gross1_atomic = df_descriptor_gross1_atomic[columns].astype(float)
curated_mof_prop = os.path.join(PATH, '../output/curated-mof-prop.csv')
df_descriptor_gross1_atomic.to_csv(curated_mof_prop, index=False)

print(f'\nCurated gross1_atomic descriptor data:\n\n {df_descriptor_gross1_atomic}\n')
print('\n------------------------------------------------------------\n')

print(f'\nData type of each column. Note that it should be float\n\n {df_descriptor_gross1_atomic.dtypes}\n')
print('\n------------------------------------------------------------\n')

# Drop intermediates that are not needed downstream
del df_descriptor
del columns
del keep_rows
del descriptor_mof_name
del curated_mof_name
del curated_mof_prop

## Taking a look at the target data

In [None]:
# Read the target-property table (propane/propylene by default)
target_in_path = os.path.join(PATH, '../input/C3H8-C3H6.csv')
#target_in_path = os.path.join(PATH, '../input/C2H6-C2H4.csv')
df_target = pd.read_csv(target_in_path)

# Show shape, column names and first rows, each followed by a separator rule
for report in (
    f'Target property input DataFrame shape:\n\n {df_target.shape}\n',
    f'\nTarget property input data columns:\n\n {df_target.columns}\n',
    f'\nTarget property input dataframe head:\n\n {df_target.head()}\n',
):
    print(report)
    print('------------------------------------------------------------')

del target_in_path, report

## Renaming Target property columns

In [None]:
# Raw target headers -> names used in the rest of the notebook (propane/propylene data set)
rename_dict = {'MOF_no': 'mof', 'propane_avg(mol/kg)': 'propane_uptake(mol/kg)',
              'propylene_avg(mol/kg)': 'propylene_uptake(mol/kg)',
              'C3H8/C3H6 Selectivity (1Bar)': 'propane_propylene_selectivity', 'Df': 'PLD',
              'AV_Volume_fraction_1.9': 'AVF'}

# Alternative mapping for the ethane/ethylene data set; kept as comments (like the
# other ethane alternates in this notebook) instead of an inert string literal.
#rename_dict = {'MOF_no': 'mof', 'ethane_avg(mol/kg)': 'ethane_uptake(mol/kg)',
#              'ethylene_avg(mol/kg)': 'ethylene_uptake(mol/kg)',
#              'C2H6/C2H4 Selectivity (1Bar)': 'ethane_ethylene_selectivity', 'Df': 'PLD',
#              'AV_Volume_fraction_1.9': 'AVF'}

df_target = df_target.rename(columns=rename_dict)

print(f'\nCurated target columns:\n\n {df_target.columns}\n')
print('------------------------------------------------------------')

del rename_dict

## Curating Target dataset

In [None]:
# Curate the target table with the same row filters as the descriptor table
# (PLD > 3.8 A and AVF > 0), then export the MOF names and target properties.
# NOTE(review): descriptors and targets are filtered independently and later paired
# by row position; this presumes both CSVs list the same MOFs in the same order - confirm.
keep_rows = (df_target['PLD'] > 3.8) & (df_target['AVF'] > 0.0)
df_target_gross1_atomic = df_target[keep_rows]

# to_csv does not create missing directories, so make sure ../output exists
os.makedirs(os.path.join(PATH, '../output'), exist_ok=True)

target_mof_name = df_target_gross1_atomic['mof'].astype(str)
target_mof_name_path = os.path.join(PATH, '../output/target-mof-name.csv')
target_mof_name.to_csv(target_mof_name_path, index=False)

columns = ['propane_uptake(mol/kg)', 'propane_propylene_selectivity', 'TSN', 'propylene_uptake(mol/kg)']

#columns = ['ethane_uptake(mol/kg)', 'ethane_ethylene_selectivity', 'TSN', 'ethylene_uptake(mol/kg)']

df_target_gross1_atomic = df_target_gross1_atomic[columns].astype(float)

# The path was previously built but never used; write the curated targets,
# mirroring what the descriptor cell does for its table.
target_mof_prop_path = os.path.join(PATH, '../output/target-mof-prop.csv')
df_target_gross1_atomic.to_csv(target_mof_prop_path, index=False)

print(f'\nCurated target data:\n\n {df_target_gross1_atomic}\n')
print('\n------------------------------------------------------------\n')

print(f'\nData type of each column. Note that it should be float\n\n {df_target_gross1_atomic.dtypes}\n')
print('\n------------------------------------------------------------\n')

# Drop intermediates that are not needed downstream
del df_target
del columns
del keep_rows
del target_mof_name
del target_mof_name_path
del target_mof_prop_path

In [None]:
# Optional pandas-profiling report (disabled).
# It used to live in a string literal closed with four quotes, which left an
# unterminated string and made the cell a SyntaxError.
# NOTE(review): `df_join` is not defined in the visible notebook - build it before enabling.
#
#profile = ProfileReport(df_join.copy(),title='C3H8-C3H6', html={'style':{'full_width':True}})
## profile.to_widgets()
##profile.to_notebook_iframe()
#C3H8_report = os.path.join(PATH, '../output/C3H8-C3H6-report.html')
#
## Write next to the other outputs instead of a machine-specific absolute path
#profile.to_file(C3H8_report)

In [None]:
# Feature matrix (descriptors) and target table before any splitting or scaling
X_crude, Y_crude = df_descriptor_gross1_atomic, df_target_gross1_atomic

print(f'\nShape of X_crude: {X_crude.shape}')
print(f'\nShape of Y_crude: {Y_crude.shape}')

# The frames stay reachable through X_crude / Y_crude; only the old names go away
del df_descriptor_gross1_atomic, df_target_gross1_atomic

# Creating validation set and using the same validation set for all the random seeds

In [None]:
# Seed used for the train/test split below and by every estimator in this notebook.
# NOTE(review): RNG_SEED was referenced but never defined in the notebook; 42 is a
# placeholder - set it to the seed you want to reproduce.
RNG_SEED = 42

# Hold out the validation set once with a fixed seed, so it is the same for every RNG_SEED
X, X_val_crude, Y, Y_val_crude = train_test_split(X_crude, Y_crude, test_size=0.32, random_state=42)

# Split the remainder into train and test sets. Later cells use X_train_crude,
# X_test_crude, Y_train_crude and Y_test_crude, but no cell created them.
# NOTE(review): test_size=0.2 is an assumption - restore the fraction originally used.
X_train_crude, X_test_crude, Y_train_crude, Y_test_crude = train_test_split(
    X, Y, test_size=0.2, random_state=RNG_SEED)

In [None]:
# Every transformer is fitted on the training split only and then applied to
# the train, validation and test splits.

# Standardised features
scaler = StandardScaler()
X_train_scaled = scaler.fit(X_train_crude).transform(X_train_crude)
X_val_scaled, X_test_scaled = (scaler.transform(part) for part in (X_val_crude, X_test_crude))

# Min-max normalised raw features
norm = MinMaxScaler().fit(X_train_crude)
X_train_norm, X_val_norm, X_test_norm = (
    norm.transform(part) for part in (X_train_crude, X_val_crude, X_test_crude)
)

# Min-max normalisation applied on top of the standardised features
norm_scaled = MinMaxScaler().fit(X_train_scaled)
X_train_scaled_norm, X_val_scaled_norm, X_test_scaled_norm = (
    norm_scaled.transform(part) for part in (X_train_scaled, X_val_scaled, X_test_scaled)
)

In [None]:
# Pick the preprocessing variant fed to the models: enable exactly one group below.

## Crude (untransformed) data
#X_train, X_val, X_test = X_train_crude, X_val_crude, X_test_crude

## Scaled data
#X_train, X_val, X_test = X_train_scaled, X_val_scaled, X_test_scaled

## Normalised data
#X_train, X_val, X_test = X_train_norm, X_val_norm, X_test_norm

## Scaled + normalised data (active)
X_train, X_val, X_test = X_train_scaled_norm, X_val_scaled_norm, X_test_scaled_norm

In [None]:
# Target Y is neither scaled nor normalized
#
# Column positions in the target frames:
#   0 -> propane / ethane uptake (mol/kg)
#   1 -> selectivity
#   2 -> TSN
#   3 -> propylene / ethylene uptake (mol/kg)

print('------------------------------------------------------------')
print(style.BOLD + 'Define property here :' + style.END)
print('------------------------------------------------------------')

# Column 1: selectivity
Y_target_train, Y_target_test, Y_target_val = (
    frame.iloc[:, 1] for frame in (Y_train_crude, Y_test_crude, Y_val_crude)
)

In [None]:
# (Propane + Selectivity + atomic + excluding + scaled + normalized) + GBR
# (the note above says GBR; the estimator fitted in the cells below is a RandomForestRegressor)
#
# Column positions (in X_crude order) kept for the selectivity model.
# This cell is not re-runnable on its own: after one run the matrices have only
# 10 columns, so index 21 is out of range until the variant-selection cell is re-run.
selected_features = [3,2,0,21,1,11,7,13,12,17]

X_train, X_test, X_val = (matrix[:, selected_features] for matrix in (X_train, X_test, X_val))

# Matching feature names for the SHAP plots
shap_columns = X_crude.columns[selected_features]

## Grid search for Propane selectivity

In [None]:
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [None]:
# Base random-forest regressor whose hyperparameters are tuned by the grid search below
rfr = RandomForestRegressor(random_state=RNG_SEED)

In [None]:
# Hyperparameter grid for the random-forest regressor.
# * max_features: 1.0 (use every feature) replaces the legacy 'auto' alias, which
#   recent scikit-learn releases reject; for regressors 'auto' meant all features.
# * n_estimators: the second 150 in the original list duplicated an entry (every
#   duplicate is fitted again); 450 completes the 50-step sequence.
#   NOTE(review): 450 is inferred from the pattern - confirm.
param_grid = {
    'bootstrap': [True, False],
    'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
    'max_features': [1.0, 'sqrt', 'log2'],
    'n_estimators': [50, 100, 150, 200, 250, 300, 350, 400, 450, 500],
    'min_samples_split': [2, 4, 6, 8, 10],
    'min_samples_leaf': [1, 2, 3, 4, 5],
}

In [None]:
# 10-fold cross-validated grid search over param_grid, scored by (negated) MAE
grid_search = GridSearchCV(
    estimator=rfr,
    param_grid=param_grid,
    cv=10,
    scoring='neg_mean_absolute_error',
    return_train_score=True,
    n_jobs=-1,
    verbose=1000,
)

grid_search.fit(X_train, Y_target_train)

In [None]:
# best_score_ is a negated MAE (scoring='neg_mean_absolute_error'), so flip the sign for display
print(f"The best combinations of parameters are {grid_search.best_params_} "
      f"with a score of {-grid_search.best_score_:0.3f} on the validation set.")

In [None]:
# Parity plot of the grid-searched model on the train and validation sets
# (propane/propylene selectivity).
#
# Fixes relative to the earlier version of this cell:
#   * the "RMSE" row showed the MSE; it now shows sqrt(MSE)
#   * ticks/text go on the plotted Axes (plt.axes() can create a new, empty Axes)
#   * validation points are labelled as such and drawn once instead of twice
#   * only `fontsize` is passed to text (`size` is an alias of the same property)

# GridSearchCV.predict uses the best estimator refitted on the data passed to fit()
Y_pred_val = grid_search.predict(X_val)
Y_pred_train = grid_search.predict(X_train)

#plt.style.use('seaborn')
plt.rcParams['font.family'] = 'serif'
fig, ax = plt.subplots(figsize=(6, 6))

fontdict_t = {'fontsize': 14, 'weight': 'bold', 'ha': 'center'}
fontdict_x = {'fontsize': 12, 'weight': 'bold', 'ha': 'center'}
fontdict_y = {'fontsize': 12, 'weight': 'bold', 'va': 'baseline', 'ha': 'center'}

# Train first, validation second, so the validation points stay on top
ax.scatter(Y_target_train, Y_pred_train, s=30, c='red', edgecolor='black', linewidth=1, alpha=0.75,
           label='Train set')
ax.scatter(Y_target_val, Y_pred_val, s=30, c='green', edgecolor='black', linewidth=1, alpha=0.75,
           label='Validation set')

# Identity line spanning the range of the validation targets
diag = [np.min(Y_target_val), np.max(Y_target_val)]
ax.plot(diag, diag, color='black', linestyle='--')

ax.set_title('Performance of ML model for \ntrain and validation set', fontdict=fontdict_t, color='black')
ax.set_xlabel('GCMC simulated $S_{C_{3}H_{8}/C_{3}H_{6}}$', fontdict=fontdict_x)
ax.set_ylabel('ML Predicted $S_{C_{3}H_{8}/C_{3}H_{6}}$', fontdict=fontdict_y)
ax.legend(loc='upper left')

ticks = [1.2,1.3,1.4,1.5,1.6,1.7,1.8,1.9,2.0,2.1]
ax.set_xticks(ticks)
ax.set_yticks(ticks)

# Metric table in data coordinates: (x, y, label, metric function)
#[1.75, 1.32, 1.7, 1.27, 1.665, 1.24, 1.65, 1.21]
ax.text(1.75, 1.32, 'Train     Val', weight='bold', ha='left', color='black', fontsize=10)
metric_rows = [
    (1.7,   1.27, r'$\mathregular{R^2:}$',   r2_score),
    (1.665, 1.23, r'$\mathregular{MAE: }$',  mean_absolute_error),
    (1.645, 1.19, r'$\mathregular{RMSE: }$', lambda t, p: np.sqrt(mean_squared_error(t, p))),
]
for x_pos, y_pos, tag, metric in metric_rows:
    train_score = metric(Y_target_train, Y_pred_train)
    val_score = metric(Y_target_val, Y_pred_val)
    ax.text(x_pos, y_pos, f'{tag}{train_score:.3f}   {val_score:.3f}',
            weight='bold', ha='left', color='black', fontsize=10)

fig.tight_layout()

In [None]:
# Display the training targets currently selected
Y_target_train

In [None]:
# Display the shape of the training-set predictions
Y_pred_train.shape

## Shap analysis on train set

In [None]:
# Discard the untuned estimator; it is rebuilt with fixed hyperparameters in the next cell
del rfr

In [None]:
# Random forest with hand-picked hyperparameters, used for the SHAP analysis.
# bootstrap must be the boolean False: the string 'False' is truthy (it would
# switch bootstrapping ON) and is rejected by scikit-learn's parameter validation.
rfr = RandomForestRegressor(random_state=RNG_SEED, bootstrap=False, max_depth=10,
                            max_features='sqrt', n_estimators=50, min_samples_split=2,
                            min_samples_leaf=1)

In [None]:
# Fit the fixed-hyperparameter forest on the training split
rfr.fit(X_train, Y_target_train)

In [None]:
# Parity plot of the fixed-hyperparameter random forest (`rfr`) on the train and
# validation sets (propane/propylene selectivity).
#
# Fixes relative to the earlier version of this cell:
#   * the "RMSE" row showed the MSE; it now shows sqrt(MSE)
#   * ticks/text go on the plotted Axes (plt.axes() can create a new, empty Axes)
#   * validation points are labelled as validation, not test
#   * only `fontsize` is passed to text (`size` is an alias of the same property)

# Predictions of the estimator fitted above (not of a grid search)
Y_pred_val = rfr.predict(X_val)
Y_pred_train = rfr.predict(X_train)

#plt.style.use('seaborn')
plt.rcParams['font.family'] = 'serif'
fig, ax = plt.subplots(figsize=(6, 6))

fontdict_t = {'fontsize': 14, 'weight': 'bold', 'ha': 'center'}
fontdict_x = {'fontsize': 12, 'weight': 'bold', 'ha': 'center'}
fontdict_y = {'fontsize': 12, 'weight': 'bold', 'va': 'baseline', 'ha': 'center'}

ax.scatter(Y_target_val, Y_pred_val, s=30, c='green', edgecolor='black', linewidth=1, alpha=0.75,
           label='Validation set')
ax.scatter(Y_target_train, Y_pred_train, s=30, c='red', edgecolor='black', linewidth=1, alpha=0.75,
           label='Train set')

# Identity line spanning the range of the validation targets
diag = [np.min(Y_target_val), np.max(Y_target_val)]
ax.plot(diag, diag, color='black', linestyle='--')

ax.set_title('Performance of ML model for \ntrain and validation set', fontdict=fontdict_t, color='black')
ax.set_xlabel('GCMC simulated $S_{C_{3}H_{8}/C_{3}H_{6}}$', fontdict=fontdict_x)
ax.set_ylabel('ML Predicted $S_{C_{3}H_{8}/C_{3}H_{6}}$', fontdict=fontdict_y)
ax.legend(loc='upper left')

ticks = [1.2,1.3,1.4,1.5,1.6,1.7,1.8,1.9,2.0,2.1]
ax.set_xticks(ticks)
ax.set_yticks(ticks)

# Metric table in data coordinates: (x, y, label, metric function)
ax.text(1.75, 1.32, 'Train     Val', weight='bold', ha='left', color='black', fontsize=10)
metric_rows = [
    (1.7,   1.27, r'$\mathregular{R^2:}$',   r2_score),
    (1.665, 1.23, r'$\mathregular{MAE: }$',  mean_absolute_error),
    (1.645, 1.19, r'$\mathregular{RMSE: }$', lambda t, p: np.sqrt(mean_squared_error(t, p))),
]
for x_pos, y_pos, tag, metric in metric_rows:
    train_score = metric(Y_target_train, Y_pred_train)
    val_score = metric(Y_target_val, Y_pred_val)
    ax.text(x_pos, y_pos, f'{tag}{train_score:.3f}   {val_score:.3f}',
            weight='bold', ha='left', color='black', fontsize=10)

fig.tight_layout()

In [None]:
# Impurity-based feature importances of the grid search's best estimator
# (this is the grid-search model, not the hand-configured `rfr` explained with SHAP below)
grid_search.best_estimator_.feature_importances_

In [None]:
import shap
# Tree explainer for the fixed-hyperparameter forest; SHAP values computed on the training features
explainer = shap.TreeExplainer(rfr)
shap_values = explainer.shap_values(X_train)

In [None]:
# SHAP summary plot; show=False keeps the figure open so the title can be added
shap.summary_plot(
    shap_values,
    X_train,
    feature_names=shap_columns,
    max_display=15,
    show=False,
)
plt.title("Feature importance calculated using SHAP for propane-propylene selectivity for train set\n",
          fontweight="bold")
plt.tight_layout()

In [None]:
# Rebind shap_values to the result of calling the explainer directly; it is passed to shap.plots.bar next
shap_values = explainer(X_train)

In [None]:
# Bar chart of the SHAP values, limited to 15 features
shap.plots.bar(shap_values, max_display = 15)

In [None]:
# Free the estimator and the search before re-tuning on train + validation data
del rfr, grid_search

## Combining training and validation set

In [None]:
# Concatenate the train and validation datasets together
X_train_new = np.concatenate((X_train, X_val), axis=0)
Y_train_new = pd.concat((Y_target_train, Y_target_val), axis=0)

print(X_train_new.shape)
print(Y_train_new.shape)

In [None]:
# Shapes of the held-out test features and targets
print(X_test.shape)
print(Y_target_test.shape)

## Retraining model on combined train and validation data

In [None]:
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [None]:
# Fresh base random-forest regressor for the search on the combined train + validation data
rfr = RandomForestRegressor(random_state=RNG_SEED)

In [None]:
# Hyperparameter grid for the random-forest regressor (combined train + validation data).
# * max_features: 1.0 (use every feature) replaces the legacy 'auto' alias, which
#   recent scikit-learn releases reject; for regressors 'auto' meant all features.
# * n_estimators: the second 150 in the original list duplicated an entry (every
#   duplicate is fitted again); 450 completes the 50-step sequence.
#   NOTE(review): 450 is inferred from the pattern - confirm.
param_grid = {
    'bootstrap': [True, False],
    'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
    'max_features': [1.0, 'sqrt', 'log2'],
    'n_estimators': [50, 100, 150, 200, 250, 300, 350, 400, 450, 500],
    'min_samples_split': [2, 4, 6, 8, 10],
    'min_samples_leaf': [1, 2, 3, 4, 5],
}

In [None]:
# 10-fold cross-validated grid search on the combined train + validation data, scored by (negated) MAE
grid_search = GridSearchCV(
    estimator=rfr,
    param_grid=param_grid,
    cv=10,
    scoring='neg_mean_absolute_error',
    return_train_score=True,
    n_jobs=-1,
    verbose=1000,
)

grid_search.fit(X_train_new, Y_train_new)

## Grid search results

Print out the average validation errors and corresponding hyperparameter combinations

In [None]:
# Tabulate the grid search's cv_results_ as a DataFrame
df_cv_result_selectivity = pd.DataFrame(grid_search.cv_results_)

In [None]:
#df_cv_result_selectivity

In [None]:
# Saving the results of all the hyperparameters searched

#df_cv_result_selectivity = df_cv_result_selectivity.astype(float)
#df_cv_result_selectivity.to_csv('/home/varad/varad/literature/24_sauradeep/workspace/ML/output/2_Gross1_Atomic/grid_search_results/selectivity/1_selectivity_propane_propylene_grid_search_results_excluding_oms.csv', index=False)

In [None]:
# This bare expression is not the cell's last statement, so the notebook does not display it
df_cv_result_selectivity.columns

# Print the whole cv_results_ table
print(f'\nGrid search results:\n\n {df_cv_result_selectivity}\n')
print('\n------------------------------------------------------------\n')

## Printing out the average validation errors and corresponding hyperparameter combinations

In [None]:
# Per-candidate mean/std of the cross-validation scores; with this scoring they are negated MAEs
mean_test, std_test = (grid_search.cv_results_['mean_test_score'],
                       grid_search.cv_results_['std_test_score'])

mean_train, std_train = (grid_search.cv_results_['mean_train_score'],
                         grid_search.cv_results_['std_train_score'])

In [None]:
# best_score_ is a negated MAE (scoring='neg_mean_absolute_error'), so flip the sign for display
print(f"The best combinations of parameters are {grid_search.best_params_} "
      f"with a score of {-grid_search.best_score_:0.3f} on the validation set.")

In [None]:
# Parity plot of the retrained grid-search model on the combined training data
# and the held-out test set (propane/propylene selectivity).
#
# Fixes relative to the earlier version of this cell:
#   * the "RMSE" row showed the MSE; it now shows sqrt(MSE)
#   * ticks/text go on the plotted Axes (plt.axes() can create a new, empty Axes)
#   * only `fontsize` is passed to text (`size` is an alias of the same property)
#   * the unused aliases `a` and `b` are gone

# GridSearchCV.predict uses the best estimator refitted on the data passed to fit()
Y_pred_test = grid_search.predict(X_test)
Y_pred_train = grid_search.predict(X_train_new)

#plt.style.use('seaborn')
plt.rcParams['font.family'] = 'serif'
fig, ax = plt.subplots(figsize=(6, 6))

fontdict_t = {'fontsize': 14, 'weight': 'bold', 'ha': 'center'}
fontdict_x = {'fontsize': 12, 'weight': 'bold', 'ha': 'center'}
fontdict_y = {'fontsize': 12, 'weight': 'bold', 'va': 'baseline', 'ha': 'center'}

ax.scatter(Y_target_test, Y_pred_test, s=30, c='green', edgecolor='black', linewidth=1, alpha=0.75,
           label='Test set')
ax.scatter(Y_train_new, Y_pred_train, s=30, c='red', edgecolor='black', linewidth=1, alpha=0.75,
           label='Train set')

# Identity line spanning the range of the test targets
diag = [np.min(Y_target_test), np.max(Y_target_test)]
ax.plot(diag, diag, color='black', linestyle='--')

ax.set_title('Performance of ML model for \ntrain and test set', fontdict=fontdict_t, color='black')
ax.set_xlabel('GCMC simulated $S_{C_{3}H_{8}/C_{3}H_{6}}$', fontdict=fontdict_x)
ax.set_ylabel('ML Predicted $S_{C_{3}H_{8}/C_{3}H_{6}}$', fontdict=fontdict_y)
ax.legend(loc='upper left')

ticks = [1.2,1.3,1.4,1.5,1.6,1.7,1.8,1.9,2.0,2.1]
ax.set_xticks(ticks)
ax.set_yticks(ticks)

# Metric table in data coordinates: (x, y, label, metric function)
#[1.75, 1.32, 1.7, 1.27, 1.665, 1.24, 1.65, 1.21]
ax.text(1.75, 1.32, 'Train     Test', weight='bold', ha='left', color='black', fontsize=10)
metric_rows = [
    (1.7,   1.27, r'$\mathregular{R^2:}$',   r2_score),
    (1.665, 1.23, r'$\mathregular{MAE: }$',  mean_absolute_error),
    (1.645, 1.19, r'$\mathregular{RMSE: }$', lambda t, p: np.sqrt(mean_squared_error(t, p))),
]
for x_pos, y_pos, tag, metric in metric_rows:
    train_score = metric(Y_train_new, Y_pred_train)
    test_score = metric(Y_target_test, Y_pred_test)
    ax.text(x_pos, y_pos, f'{tag}{train_score:.3f}   {test_score:.3f}',
            weight='bold', ha='left', color='black', fontsize=10)

fig.tight_layout()


In [None]:
# Keep a reference to the fitted selectivity search; the name `grid_search` is deleted in the next cell
grid_search_selectivity = grid_search

In [None]:
# Remove the selectivity-stage names from the notebook namespace before the uptake stage.
# A plain `del` raises NameError for names that were never created
# (`localimportance`, `index` and `feature_importance` are not assigned in the visible
# cells), so each name is popped only if present.
#del mean_val
#del std_val
#del std_train
for _stale_name in (
    'rfr', 'RandomForestRegressor', 'param_grid', 'grid_search', 'GridSearchCV', 'ticks',
    'mean_train', 'Y_pred_val', 'Y_pred_train', 'localimportance', 'explainer', 'index',
    'feature_importance', 'df_cv_result_selectivity', 'shap_values',
):
    globals().pop(_stale_name, None)
del _stale_name

## Grid search for Propane uptake

In [None]:
# Reset the feature matrices to the full column set for the uptake models:
# enable exactly one group below.

## Crude (untransformed) data
#X_train, X_val, X_test = X_train_crude, X_val_crude, X_test_crude

## Scaled data
#X_train, X_val, X_test = X_train_scaled, X_val_scaled, X_test_scaled

## Normalised data
#X_train, X_val, X_test = X_train_norm, X_val_norm, X_test_norm

## Scaled + normalised data (active)
X_train, X_val, X_test = X_train_scaled_norm, X_val_scaled_norm, X_test_scaled_norm

In [None]:
# Target Y is neither scaled nor normalized
#
# Column positions in the target frames:
#   0 -> propane / ethane uptake (mol/kg)
#   1 -> selectivity
#   2 -> TSN
#   3 -> propylene / ethylene uptake (mol/kg)

# Column 0: propane uptake
Y_target_train, Y_target_test, Y_target_val = (
    frame.iloc[:, 0] for frame in (Y_train_crude, Y_test_crude, Y_val_crude)
)

In [None]:
#(Propane + propane_uptake + atomic + excluding + scaled + normalized) + GBR
# (the note above says GBR; the estimator fitted in the cells below is an ExtraTreesRegressor)
#
# Column positions (in X_crude order) kept for the propane-uptake model.
# This cell is not re-runnable on its own: after one run the matrices have only
# 11 columns, so index 25 is out of range until the variant-selection cell is re-run.
selected_features = [0,2,7,11,15,1,6,25,3,4,12]

X_train, X_test, X_val = (matrix[:, selected_features] for matrix in (X_train, X_test, X_val))

# Matching feature names for the SHAP plots
shap_columns = X_crude.columns[selected_features]


In [None]:
from sklearn.ensemble import ExtraTreesRegressor
#from sklearn.svm import SVR

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [None]:
# Base extra-trees regressor whose hyperparameters are tuned by the grid search below
etr = ExtraTreesRegressor(random_state=RNG_SEED)
#svr = SVR() # random_state=RNG_SEED cannot be used

In [None]:
# etr

# Hyperparameter grid for the extra-trees regressor.
# * max_features: 1.0 (use every feature) replaces the legacy 'auto' alias, which
#   recent scikit-learn releases reject; for regressors 'auto' meant all features.
# * n_estimators: the second 150 in the original list duplicated an entry (every
#   duplicate is fitted again); 450 completes the 50-step sequence.
#   NOTE(review): 450 is inferred from the pattern - confirm.
param_grid = {
    'bootstrap': [True, False],
    'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
    'max_features': [1.0, 'sqrt', 'log2'],
    'n_estimators': [50, 100, 150, 200, 250, 300, 350, 400, 450, 500],
    'min_samples_split': [2, 4, 6, 8, 10],
    'min_samples_leaf': [1, 2, 3, 4, 5],
}

# Alternative grid for an SVR model; kept as comments so the cell no longer ends
# in a string literal that the notebook echoes as output.
# svr
#param_grid = {'kernel':['rbf', 'sigmoid', 'poly', 'linear'], 'C': [0.1, 1.0, 10.0, 100.0, 1000.0, 10000.0],
#              'gamma': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1.0], 'epsilon': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1.0],
#              'degree': [1,2,3,4,5,6]}

In [None]:
# etr
# 10-fold cross-validated grid search over param_grid for the extra-trees regressor,
# scored by (negated) MAE.

grid_search = GridSearchCV(estimator = etr, param_grid = param_grid, cv = 10,
                           scoring = 'neg_mean_absolute_error', return_train_score = True, n_jobs = -1,
                          verbose=1000)

# The string literal below is inert: it only preserves the SVR variant of the search.
'''
# svr
grid_search = GridSearchCV(estimator = svr, param_grid = param_grid, cv = 10,
                           scoring = 'neg_mean_absolute_error', return_train_score = True, n_jobs = -1,
                          verbose=1000)

'''
# Run the search on the training split
grid_search.fit(X_train, Y_target_train)

In [None]:
# best_score_ is a negated MAE (scoring='neg_mean_absolute_error'), so flip the sign for display
print(f"The best combinations of parameters are {grid_search.best_params_} "
      f"with a score of {-grid_search.best_score_:0.3f} on the validation set.")

In [None]:
# Parity plot of the grid-searched extra-trees model on the train and validation
# sets. The target here is column 0 of the target table, propane uptake (mol/kg).
#
# Fixes relative to the earlier version of this cell:
#   * axis labels named the selectivity although the target is propane uptake
#   * the "RMSE" row showed the MSE; it now shows sqrt(MSE)
#   * validation points are labelled as validation, not test
#   * only `fontsize` is passed to text (`size` is an alias of the same property)

# GridSearchCV.predict uses the best estimator refitted on the data passed to fit()
Y_pred_val = grid_search.predict(X_val)
Y_pred_train = grid_search.predict(X_train)

#plt.style.use('seaborn')
plt.rcParams['font.family'] = 'serif'
fig, ax = plt.subplots(figsize=(6, 6))

fontdict_t = {'fontsize': 14, 'weight': 'bold', 'ha': 'center'}
fontdict_x = {'fontsize': 12, 'weight': 'bold', 'ha': 'center'}
fontdict_y = {'fontsize': 12, 'weight': 'bold', 'va': 'baseline', 'ha': 'center'}

ax.scatter(Y_target_val, Y_pred_val, s=30, c='green', edgecolor='black', linewidth=1, alpha=0.75,
           label='Validation set')
ax.scatter(Y_target_train, Y_pred_train, s=30, c='red', edgecolor='black', linewidth=1, alpha=0.75,
           label='Train set')

# Identity line spanning the range of the validation targets
diag = [np.min(Y_target_val), np.max(Y_target_val)]
ax.plot(diag, diag, color='black', linestyle='--')

ax.set_title('Performance of ML model for \ntrain and validation set', fontdict=fontdict_t, color='black')
ax.set_xlabel('GCMC simulated propane uptake (mol/kg)', fontdict=fontdict_x)
ax.set_ylabel('ML Predicted propane uptake (mol/kg)', fontdict=fontdict_y)
ax.legend(loc='upper left')

# Metric table in data coordinates: (x, y, label, metric function)
#0.8, 0.3, 0.75, 0.25, 0.7, 0.20, 0.68, 0.15
ax.text(0.8, 0.3, 'Train     Val', weight='bold', ha='left', color='black', fontsize=10)
metric_rows = [
    (0.74, 0.25, r'$\mathregular{R^2:}$',   r2_score),
    (0.7,  0.20, r'$\mathregular{MAE: }$',  mean_absolute_error),
    (0.68, 0.15, r'$\mathregular{RMSE: }$', lambda t, p: np.sqrt(mean_squared_error(t, p))),
]
for x_pos, y_pos, tag, metric in metric_rows:
    train_score = metric(Y_target_train, Y_pred_train)
    val_score = metric(Y_target_val, Y_pred_val)
    ax.text(x_pos, y_pos, f'{tag}{train_score:.3f}   {val_score:.3f}',
            weight='bold', ha='left', color='black', fontsize=10)

fig.tight_layout()

## Shap analysis on train set

In [None]:
# Discard the untuned estimator; it is rebuilt with fixed hyperparameters in the next cell
del etr
#del svr

In [None]:

# Extra-trees regressor with hand-picked hyperparameters, used for the SHAP analysis.
# * bootstrap must be the boolean False: the string 'False' is truthy (it would
#   switch bootstrapping ON) and is rejected by scikit-learn's parameter validation.
# * max_features=1.0 (all features) replaces the legacy 'auto' alias.
etr = ExtraTreesRegressor(random_state=RNG_SEED, bootstrap=False, max_depth=20,
                          max_features=1.0, n_estimators=50, min_samples_split=2,
                          min_samples_leaf=1)


#svr = SVR(C = 10.0, degree = 1, epsilon = 0.01, gamma = 1.0, kernel = 'rbf')

In [None]:
# Fit the fixed-hyperparameter extra-trees model on the training split and bind the result to `model`
model = etr.fit(X_train, Y_target_train)
#model = svr.fit(X_train, Y_target_train)

In [None]:
# Parity plot of the fixed-hyperparameter extra-trees model (`etr`) on the train and
# validation sets. The target here is column 0 of the target table, propane uptake (mol/kg).
#
# Fixes relative to the earlier version of this cell:
#   * axis labels named the selectivity although the target is propane uptake
#   * the "RMSE" row showed the MSE; it now shows sqrt(MSE)
#   * validation points are labelled as validation, not test
#   * only `fontsize` is passed to text (`size` is an alias of the same property)
#   * the SVR alternative is kept as comments instead of an inert string literal

# Predictions of the estimator fitted above (not of a grid search)
Y_pred_val = etr.predict(X_val)
Y_pred_train = etr.predict(X_train)

# SVR alternative:
#Y_pred_val = svr.predict(X_val)
#Y_pred_train = svr.predict(X_train)

#plt.style.use('seaborn')
plt.rcParams['font.family'] = 'serif'
fig, ax = plt.subplots(figsize=(6, 6))

fontdict_t = {'fontsize': 14, 'weight': 'bold', 'ha': 'center'}
fontdict_x = {'fontsize': 12, 'weight': 'bold', 'ha': 'center'}
fontdict_y = {'fontsize': 12, 'weight': 'bold', 'va': 'baseline', 'ha': 'center'}

ax.scatter(Y_target_val, Y_pred_val, s=30, c='green', edgecolor='black', linewidth=1, alpha=0.75,
           label='Validation set')
ax.scatter(Y_target_train, Y_pred_train, s=30, c='red', edgecolor='black', linewidth=1, alpha=0.75,
           label='Train set')

# Identity line spanning the range of the validation targets
diag = [np.min(Y_target_val), np.max(Y_target_val)]
ax.plot(diag, diag, color='black', linestyle='--')

ax.set_title('Performance of ML model for \ntrain and validation set', fontdict=fontdict_t, color='black')
ax.set_xlabel('GCMC simulated propane uptake (mol/kg)', fontdict=fontdict_x)
ax.set_ylabel('ML Predicted propane uptake (mol/kg)', fontdict=fontdict_y)
ax.legend(loc='upper left')

# Metric table in data coordinates: (x, y, label, metric function)
#0.8, 0.3, 0.75, 0.25, 0.7, 0.20, 0.68, 0.15
ax.text(0.8, 0.3, 'Train     Val', weight='bold', ha='left', color='black', fontsize=10)
metric_rows = [
    (0.74, 0.25, r'$\mathregular{R^2:}$',   r2_score),
    (0.7,  0.20, r'$\mathregular{MAE: }$',  mean_absolute_error),
    (0.68, 0.15, r'$\mathregular{RMSE: }$', lambda t, p: np.sqrt(mean_squared_error(t, p))),
]
for x_pos, y_pos, tag, metric in metric_rows:
    train_score = metric(Y_target_train, Y_pred_train)
    val_score = metric(Y_target_val, Y_pred_val)
    ax.text(x_pos, y_pos, f'{tag}{train_score:.3f}   {val_score:.3f}',
            weight='bold', ha='left', color='black', fontsize=10)

fig.tight_layout()

In [None]:
# Impurity-based feature importances of the grid search's best estimator
# (this is the grid-search model, not the hand-configured `etr` explained with SHAP below)
grid_search.best_estimator_.feature_importances_

In [None]:
# Importing is enough to have `shap` available; the former `del shap` in front of the
# import did nothing useful and raised NameError when the earlier SHAP cell had not run.
import shap

# Tree explainer for the fixed-hyperparameter extra-trees model; SHAP values on the training features
explainer = shap.TreeExplainer(etr)
shap_values = explainer.shap_values(X_train)

In [None]:
# SHAP summary plot; show=False keeps the figure open so the title can be added
shap.summary_plot(
    shap_values,
    X_train,
    feature_names=shap_columns,
    max_display=15,
    show=False,
)
plt.title("Feature importance calculated using SHAP for Propane-Uptake (mol/kg) for train set\n",
          fontweight="bold")
plt.tight_layout()

In [None]:
# Rebind shap_values to the result of calling the explainer directly; it is passed to shap.plots.bar next
shap_values = explainer(X_train)

In [None]:
# Bar chart of the SHAP values, limited to 15 features
shap.plots.bar(shap_values, max_display = 15)

## Combining training and validation set

In [None]:
# Concatenate the train and validation datasets together
X_train_new = np.concatenate((X_train, X_val), axis=0)
Y_train_new = pd.concat((Y_target_train, Y_target_val), axis=0)

print(X_train_new.shape)
print(Y_train_new.shape)

In [None]:
# Free the estimator and the search before re-tuning on train + validation data
#del svr
del etr, grid_search

## Retraining model on combined train and validation data

In [None]:
from sklearn.ensemble import ExtraTreesRegressor
#from sklearn.svm import SVR

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [None]:
# Fresh base extra-trees regressor for the search on the combined train + validation data
etr = ExtraTreesRegressor(random_state=RNG_SEED)
#svr = SVR() # random_state=RNG_SEED cannot be used

In [None]:
# etr

# Hyperparameter grid for the extra-trees regressor (combined train + validation data).
# * max_features: 1.0 (use every feature) replaces the legacy 'auto' alias, which
#   recent scikit-learn releases reject; for regressors 'auto' meant all features.
# * n_estimators: the second 150 in the original list duplicated an entry (every
#   duplicate is fitted again); 450 completes the 50-step sequence.
#   NOTE(review): 450 is inferred from the pattern - confirm.
param_grid = {
    'bootstrap': [True, False],
    'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
    'max_features': [1.0, 'sqrt', 'log2'],
    'n_estimators': [50, 100, 150, 200, 250, 300, 350, 400, 450, 500],
    'min_samples_split': [2, 4, 6, 8, 10],
    'min_samples_leaf': [1, 2, 3, 4, 5],
}

# Alternative grid for an SVR model; kept as comments so the cell no longer ends
# in a string literal that the notebook echoes as output.
# svr
#param_grid = {'kernel':['rbf', 'sigmoid', 'poly', 'linear'], 'C': [0.1, 1.0, 10.0, 100.0, 1000.0, 10000.0],
#              'gamma': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1.0], 'epsilon': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1.0],
#              'degree': [1,2,3,4,5,6]}

In [None]:

# 10-fold cross-validated grid search for the extra-trees regressor, scored by (negated) MAE
grid_search = GridSearchCV(estimator = etr, param_grid = param_grid, cv = 10,
                           scoring = 'neg_mean_absolute_error', return_train_score = True, n_jobs = -1,
                          verbose=1000)
# The string literal below is inert: it only preserves the SVR variant of the search.
'''
grid_search = GridSearchCV(estimator = svr, param_grid = param_grid, cv = 10,
                           scoring = 'neg_mean_absolute_error', return_train_score = True, n_jobs = -1,
                          verbose=1000)
'''
# Run the search on the combined train + validation data
grid_search.fit(X_train_new, Y_train_new)

## Grid search results

Print out the average validation errors and corresponding hyperparameter combinations

In [None]:
# Tabulate the grid search's cv_results_ as a DataFrame
df_cv_result_propane_uptake = pd.DataFrame(grid_search.cv_results_)

In [None]:
#df_cv_result_propane_uptake

In [None]:
# Saving the results of all the hyperparameters searched

#df_cv_result_propane_uptake = df_cv_result_propane_uptake.astype(float)
#df_cv_result_propane_uptake.to_csv('/home/varad/varad/literature/24_sauradeep/workspace/ML/output/2_Gross1_Atomic/grid_search_results/selectivity/1_selectivity_propane_propylene_grid_search_results_excluding_oms.csv', index=False)

In [None]:
# Show which result columns the grid search recorded
df_cv_result_propane_uptake.columns

In [None]:
# Disabled debug print, kept as a bare string literal so it is never executed.
# It refers to df_cv_result_selectivity, which is not created in this part of the notebook.
'''
print(f'\nGrid search results:\n\n {df_cv_result_selectivity}\n')
print('\n------------------------------------------------------------\n')
'''

## Printing out the average validation errors and corresponding hyperparameter combinations

In [None]:
# Per-candidate mean and spread of the cross-validation scores on the held-out folds
mean_test, std_test = (
    grid_search.cv_results_['mean_test_score'],
    grid_search.cv_results_['std_test_score'],
)

# Same statistics measured on the training folds
mean_train, std_train = (
    grid_search.cv_results_['mean_train_score'],
    grid_search.cv_results_['std_train_score'],
)

# Disabled per-candidate listing (string literal, not executed)
'''
for mean, std, params in zip(-means, stds, grid_search.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
'''

In [None]:
# best_score_ is the mean score over the cross-validation folds (negated MAE,
# because scoring='neg_mean_absolute_error'), not a score on a separate
# validation set, so the message says so and the sign is flipped back to a plain MAE.
print("The best combination of parameters is %s with a mean cross-validated MAE of %0.3f."
      % (grid_search.best_params_, -grid_search.best_score_))

In [None]:
# Predicted propane uptake for the held-out test set and for the combined
# train+validation set, shown as a parity plot with summary metrics.

Y_pred_test = grid_search.predict(X_test) # GridSearchCV predicts with the estimator refit on
                                          # the best hyperparameter combination

Y_pred_train = grid_search.predict(X_train_new)

#plt.style.use('seaborn')
fig = plt.figure(figsize = (6,6))
plt.rcParams['font.family'] = 'serif'

fontdict_t = {'fontsize': 14, 'weight': 'bold', 'ha': 'center'}
fontdict_x = {'fontsize': 12, 'weight': 'bold', 'ha': 'center'}
fontdict_y = {'fontsize': 12, 'weight': 'bold', 'va': 'baseline', 'ha': 'center'}

plt.scatter(Y_target_test, Y_pred_test, s=30, c='green', edgecolor='black', linewidth=1, alpha=0.75,
            label='Test set')

plt.scatter(Y_train_new, Y_pred_train, s=30, c='red', edgecolor='black', linewidth=1, alpha=0.75, label='Train set')

# Parity line (prediction == target) across the range of the test targets
plt.plot([np.min(Y_target_test),np.max(Y_target_test)],
         [np.min(Y_target_test),np.max(Y_target_test)], color='black', linestyle='--')

plt.title('Performance of ML model for \ntrain and test set', fontdict=fontdict_t, color='black')

plt.xlabel('GCMC simulated $N_{C_{3}H_{8}}$', fontdict=fontdict_x)
plt.ylabel('ML Predicted $N_{C_{3}H_{8}}$', fontdict=fontdict_y)

plt.legend(loc='upper left')

# mean_squared_error returns the MSE; take the square root so the value
# printed next to the "RMSE" label really is a root-mean-squared error.
rmse_train = np.sqrt(mean_squared_error(Y_train_new, Y_pred_train))
rmse_test = np.sqrt(mean_squared_error(Y_target_test, Y_pred_test))

# Shared styling for the metrics table. Only fontsize is given: size is an alias
# of it, so passing both is redundant.
# NOTE(review): newer Matplotlib releases appear to reject duplicate aliases - confirm.
stats_style = {'weight': 'bold', 'horizontalalignment': 'left', 'color': 'black', 'fontsize': 10}

# Text anchors (data coordinates): 0.8, 0.3, 0.75, 0.25, 0.7, 0.20, 0.68, 0.15
plt.text(0.8, 0.3, 'Train     Test', **stats_style)

# Raw strings keep the mathtext backslash from being parsed as an invalid escape sequence
plt.text(0.74, 0.25, r'$\mathregular{R^2:}$' +
         '{:.3f}   {:.3f}'.format(r2_score(Y_train_new, Y_pred_train), r2_score(Y_target_test, Y_pred_test)),
         **stats_style)

plt.text(0.7, 0.20, r'$\mathregular{MAE: }$' +
         '{:.3f}   {:.3f}'.format(mean_absolute_error(Y_train_new, Y_pred_train),
                                  mean_absolute_error(Y_target_test, Y_pred_test)),
         **stats_style)

plt.text(0.68, 0.15, r'$\mathregular{RMSE: }$' + '{:.3f}   {:.3f}'.format(rmse_train, rmse_test),
         **stats_style)

plt.tight_layout()

In [None]:
# Keep a handle on the fitted search under a target-specific name;
# the generic name grid_search is deleted in the cleanup cell that follows.
grid_search_propane_uptake = grid_search

In [None]:
# Release the working objects of the propane-uptake section in a single statement;
# del removes the listed names from left to right.
# Deliberately not removed (their dels are disabled): svr, ticks, mean_val, std_val, std_train
del (
    etr,
    param_grid,
    grid_search,
    GridSearchCV,
    mean_train,
    Y_pred_val,
    Y_pred_train,
    localimportance,
    explainer,
    index,
    feature_importance,
    df_cv_result_propane_uptake,
    shap_values,
)

## Grid search for Propylene uptake

In [None]:
# Choose which version of the feature matrices feeds the model. The three working
# names can be pointed at any one of these families by editing the line below:
#   X_*_crude        -> crude data
#   X_*_scaled       -> scaled data
#   X_*_norm         -> normalised data
#   X_*_scaled_norm  -> scaled_normalised data   (currently active)
X_train, X_val, X_test = X_train_scaled_norm, X_val_scaled_norm, X_test_scaled_norm

In [None]:
# The target Y is neither scaled nor normalised.
#
# Column index of Y_*_crude -> quantity
#   0 : propane / ethane uptake (mol/kg)
#   1 : selectivity
#   2 : TSN
#   3 : propylene / ethylene uptake (mol/kg)

# Regression target for this section: column 3
Y_target_train, Y_target_test, Y_target_val = (
    split.iloc[:, 3] for split in (Y_train_crude, Y_test_crude, Y_val_crude)
)

# Column 1 is kept alongside under temporary names
temp_train, temp_test, temp_val = (
    split.iloc[:, 1] for split in (Y_train_crude, Y_test_crude, Y_val_crude)
)

In [None]:
#(Propane + propylene_uptake + atomic + excluding + scaled + normalized) + GBR

# Restrict every split to the same 13 feature columns, in this order
X_train, X_test, X_val = (
    split[:, [2,7,11,15,1,5,6,3,22,4,21,25,16]] for split in (X_train, X_test, X_val)
)

# Names of those columns, used later to label the SHAP summary plot
shap_columns = X_crude.columns[[2,7,11,15,1,5,6,3,22,4,21,25,16]]


In [None]:
from sklearn.ensemble import ExtraTreesRegressor
#from sklearn.svm import SVR

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [None]:
# Extra-trees regressor with library defaults; only the random seed is pinned here.
etr = ExtraTreesRegressor(random_state=RNG_SEED)
#svr = SVR() # random_state=RNG_SEED cannot be used

In [None]:
# Hyperparameter grid for the ExtraTreesRegressor search.
#  - max_features: 1.0 (use every feature) stands in for the string 'auto', which
#    meant the same thing for regressors but was deprecated in scikit-learn 1.1
#    and removed in 1.3.
#  - n_estimators: evenly spaced 50..500; the slot before 500 used to repeat 150,
#    which only made the search re-evaluate an already covered setting.
param_grid = {
    'bootstrap': [True, False],
    'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
    'max_features': [1.0, 'sqrt', 'log2'],
    'n_estimators': [50, 100, 150, 200, 250, 300, 350, 400, 450, 500],
    'min_samples_split': [2, 4, 6, 8, 10],
    'min_samples_leaf': [1, 2, 3, 4, 5],
}

# Grid for the SVR alternative (inactive; kept as comments so the cell does not
# echo a stray string as its output):
# param_grid = {'kernel':['rbf', 'sigmoid', 'poly', 'linear'], 'C': [0.1, 1.0, 10.0, 100.0, 1000.0, 10000.0],
#               'gamma': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1.0], 'epsilon': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1.0],
#               'degree': [1,2,3,4,5,6]}

In [None]:
# Extra-trees variant: exhaustive 10-fold search over param_grid, ranked by
# negated mean absolute error; train-fold scores are recorded as well.
grid_search = GridSearchCV(
    etr,
    param_grid,
    scoring='neg_mean_absolute_error',
    cv=10,
    n_jobs=-1,
    verbose=1000,
    return_train_score=True,
)

# SVR variant (inactive): same call with estimator = svr.

# Fit on the training split only; the validation split stays held out
grid_search.fit(X_train, Y_target_train)

In [None]:
# best_score_ is the mean score over the cross-validation folds of the training
# split (negated MAE, because scoring='neg_mean_absolute_error'); it is not
# computed on X_val, so the message names it as a cross-validated MAE.
print("The best combination of parameters is %s with a mean cross-validated MAE of %0.3f."
      % (grid_search.best_params_, -grid_search.best_score_))

In [None]:
# Predicted propylene uptake for the validation and training splits,
# shown as a parity plot with summary metrics.

Y_pred_val = grid_search.predict(X_val) # GridSearchCV predicts with the estimator refit on
                                        # the best hyperparameter combination

Y_pred_train = grid_search.predict(X_train)

#plt.style.use('seaborn')
fig = plt.figure(figsize = (6,6))
plt.rcParams['font.family'] = 'serif'

fontdict_t = {'fontsize': 14, 'weight': 'bold', 'ha': 'center'}
fontdict_x = {'fontsize': 12, 'weight': 'bold', 'ha': 'center'}
fontdict_y = {'fontsize': 12, 'weight': 'bold', 'va': 'baseline', 'ha': 'center'}

# The green points are the validation split, so the legend names them accordingly
plt.scatter(Y_target_val, Y_pred_val, s=30, c='green', edgecolor='black', linewidth=1, alpha=0.75,
            label='Validation set')

plt.scatter(Y_target_train, Y_pred_train, s=30, c='red', edgecolor='black', linewidth=1, alpha=0.75, label='Train set')

# Parity line (prediction == target) across the range of the validation targets
plt.plot([np.min(Y_target_val),np.max(Y_target_val)],
         [np.min(Y_target_val),np.max(Y_target_val)], color='black', linestyle='--')

plt.title('Performance of ML model for \ntrain and validation set', fontdict=fontdict_t, color='black')

plt.xlabel('GCMC simulated $N_{C_{3}H_{6}}$', fontdict=fontdict_x)
plt.ylabel('ML Predicted $N_{C_{3}H_{6}}$', fontdict=fontdict_y)

plt.legend(loc='upper left')

# mean_squared_error returns the MSE; take the square root so the value
# printed next to the "RMSE" label really is a root-mean-squared error.
rmse_train = np.sqrt(mean_squared_error(Y_target_train, Y_pred_train))
rmse_val = np.sqrt(mean_squared_error(Y_target_val, Y_pred_val))

# Shared styling for the metrics table. Only fontsize is given: size is an alias
# of it, so passing both is redundant.
# NOTE(review): newer Matplotlib releases appear to reject duplicate aliases - confirm.
stats_style = {'weight': 'bold', 'horizontalalignment': 'left', 'color': 'black', 'fontsize': 10}

# Text anchors (data coordinates), one pair per row of the table:
#3.0, 1.3, 2.7, 1.1, 2.5, 0.9, 2.41, 0.7
plt.text(3.0, 1.3, 'Train     Val', **stats_style)

# Raw strings keep the mathtext backslash from being parsed as an invalid escape sequence.
# The R^2 row uses the second anchor pair and MAE the third, so the rows no longer
# overlap or land outside the table.
plt.text(2.7, 1.1, r'$\mathregular{R^2:}$' +
         '{:.3f}   {:.3f}'.format(r2_score(Y_target_train, Y_pred_train), r2_score(Y_target_val, Y_pred_val)),
         **stats_style)

plt.text(2.5, 0.9, r'$\mathregular{MAE: }$' +
         '{:.3f}   {:.3f}'.format(mean_absolute_error(Y_target_train, Y_pred_train),
                                  mean_absolute_error(Y_target_val, Y_pred_val)),
         **stats_style)

plt.text(2.41, 0.7, r'$\mathregular{RMSE: }$' + '{:.3f}   {:.3f}'.format(rmse_train, rmse_val),
         **stats_style)

plt.tight_layout()

## SHAP analysis on train set

In [None]:
# Drop the default-configured estimator; it is rebuilt in the next cell with
# explicit hyperparameters.
del etr
#del svr

In [None]:
# Extra-trees model with explicitly chosen hyperparameters.
#  - bootstrap is the boolean False. The string 'False' is truthy, so older
#    scikit-learn releases silently turned bootstrapping ON, and releases with
#    parameter validation reject the string outright.
#  - max_features=1.0 (all features) stands in for 'auto', which meant the same
#    for regressors and was removed in scikit-learn 1.3.
etr = ExtraTreesRegressor(random_state=RNG_SEED, bootstrap=False, max_depth=20,
                          max_features=1.0, n_estimators=250, min_samples_split=4,
                          min_samples_leaf=1)


#svr = SVR(C = 100.0, degree = 1, epsilon = 0.1, gamma = 1.0, kernel = 'rbf')

In [None]:
# Fit the manually configured estimator on the training split; the value returned
# by fit is stored as `model`, while later cells keep calling `etr` directly.
model = etr.fit(X_train, Y_target_train)
#model = svr.fit(X_train, Y_target_train)

In [None]:
# Predicted propylene uptake from the manually configured extra-trees model
# (hyperparameters set explicitly above, not taken from a grid search),
# shown as a parity plot with summary metrics.

Y_pred_val = etr.predict(X_val)

Y_pred_train = etr.predict(X_train)

# SVR variant (inactive):
# Y_pred_val = svr.predict(X_val)
# Y_pred_train = svr.predict(X_train)

#plt.style.use('seaborn')
fig = plt.figure(figsize = (6,6))
plt.rcParams['font.family'] = 'serif'

fontdict_t = {'fontsize': 14, 'weight': 'bold', 'ha': 'center'}
fontdict_x = {'fontsize': 12, 'weight': 'bold', 'ha': 'center'}
fontdict_y = {'fontsize': 12, 'weight': 'bold', 'va': 'baseline', 'ha': 'center'}

# The green points are the validation split, so the legend names them accordingly
plt.scatter(Y_target_val, Y_pred_val, s=30, c='green', edgecolor='black', linewidth=1, alpha=0.75,
            label='Validation set')

plt.scatter(Y_target_train, Y_pred_train, s=30, c='red', edgecolor='black', linewidth=1, alpha=0.75, label='Train set')

# Parity line (prediction == target) across the range of the validation targets
plt.plot([np.min(Y_target_val),np.max(Y_target_val)],
         [np.min(Y_target_val),np.max(Y_target_val)], color='black', linestyle='--')

plt.title('Performance of ML model for \ntrain and validation set', fontdict=fontdict_t, color='black')

plt.xlabel('GCMC simulated $N_{C_{3}H_{6}}$', fontdict=fontdict_x)
plt.ylabel('ML Predicted $N_{C_{3}H_{6}}$', fontdict=fontdict_y)

plt.legend(loc='upper left')

# mean_squared_error returns the MSE; take the square root so the value
# printed next to the "RMSE" label really is a root-mean-squared error.
rmse_train = np.sqrt(mean_squared_error(Y_target_train, Y_pred_train))
rmse_val = np.sqrt(mean_squared_error(Y_target_val, Y_pred_val))

# Shared styling for the metrics table. Only fontsize is given: size is an alias
# of it, so passing both is redundant.
# NOTE(review): newer Matplotlib releases appear to reject duplicate aliases - confirm.
stats_style = {'weight': 'bold', 'horizontalalignment': 'left', 'color': 'black', 'fontsize': 10}

# Text anchors (data coordinates), one pair per row of the table:
#3.0, 1.3, 2.7, 1.1, 2.5, 0.9, 2.41, 0.7
plt.text(3.0, 1.3, 'Train     Val', **stats_style)

# Raw strings keep the mathtext backslash from being parsed as an invalid escape sequence.
# The R^2 row uses the second anchor pair so it sits with the rest of the table.
plt.text(2.7, 1.1, r'$\mathregular{R^2:}$' +
         '{:.3f}   {:.3f}'.format(r2_score(Y_target_train, Y_pred_train), r2_score(Y_target_val, Y_pred_val)),
         **stats_style)

plt.text(2.5, 0.9, r'$\mathregular{MAE: }$' +
         '{:.3f}   {:.3f}'.format(mean_absolute_error(Y_target_train, Y_pred_train),
                                  mean_absolute_error(Y_target_val, Y_pred_val)),
         **stats_style)

plt.text(2.41, 0.7, r'$\mathregular{RMSE: }$' + '{:.3f}   {:.3f}'.format(rmse_train, rmse_val),
         **stats_style)

plt.tight_layout()

In [None]:
# Feature importances of the grid search's refit best estimator.
# NOTE(review): this reads from grid_search, not from the manually configured
# `etr` fitted above - confirm that is the model intended here.
grid_search.best_estimator_.feature_importances_

In [None]:
# No `del shap` beforehand: deleting an unbound name raises NameError (e.g. when
# this section is run on its own), and the import rebinds the name in any case.
import shap

# Tree explainer built on the manually configured extra-trees model
explainer = shap.TreeExplainer(etr)

# SHAP values for the training features
shap_values = explainer.shap_values(X_train)

In [None]:
# SHAP summary plot for the training features; show=False keeps the figure
# open so that a title can be added before it is rendered.
shap.summary_plot(
    shap_values,
    X_train,
    feature_names=shap_columns,
    max_display=15,
    show=False,
)
plt.title(
    "Feature importance calculated using SHAP for propylene-Uptake (mol/kg) for train set\n",
    fontweight="bold",
)
plt.tight_layout()

In [None]:
# Rebinds shap_values: from here on it holds the result of calling the explainer
# directly, replacing the output of explainer.shap_values(X_train) computed earlier.
shap_values = explainer(X_train)

In [None]:
# Bar plot of the SHAP values, limited to at most 15 entries
shap.plots.bar(shap_values, max_display = 15)

## Combining training and validation set

In [None]:
# Concatenate the train and validation datasets together
X_train_new = np.concatenate((X_train, X_val), axis=0)
Y_train_new = pd.concat((Y_target_train, Y_target_val), axis=0)

print(X_train_new.shape)
print(Y_train_new.shape)

In [None]:
# Display the combined train+validation feature matrix
X_train_new

In [None]:
# Display the combined train+validation target values
Y_train_new

In [None]:
# Selectivity (column 1 of the crude targets) for the combined train+validation rows.
# It is built here from temp_train / temp_val, in the same order as Y_train_new,
# so the cell does not depend on a variable left over from an earlier section.
temp_train_val_comb = pd.concat((temp_train, temp_val), axis=0)
temp_train_val_comb # selectivity

In [None]:
# Discard the earlier estimator and search object so they can be rebuilt below
# (the SVR variant would additionally need `del svr`)
del etr, grid_search

## Retraining model on combined train and validation data

In [None]:
from sklearn.ensemble import ExtraTreesRegressor
#from sklearn.svm import SVR

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [None]:
# Extra-trees regressor with library defaults; only the random seed is pinned here.
etr = ExtraTreesRegressor(random_state=RNG_SEED)
#svr = SVR() # random_state=RNG_SEED cannot be used

In [None]:
# Hyperparameter grid for the ExtraTreesRegressor search.
#  - max_features: 1.0 (use every feature) stands in for the string 'auto', which
#    meant the same thing for regressors but was deprecated in scikit-learn 1.1
#    and removed in 1.3.
#  - n_estimators: evenly spaced 50..500; the slot before 500 used to repeat 150,
#    which only made the search re-evaluate an already covered setting.
param_grid = {
    'bootstrap': [True, False],
    'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
    'max_features': [1.0, 'sqrt', 'log2'],
    'n_estimators': [50, 100, 150, 200, 250, 300, 350, 400, 450, 500],
    'min_samples_split': [2, 4, 6, 8, 10],
    'min_samples_leaf': [1, 2, 3, 4, 5],
}

# Grid for the SVR alternative (inactive; kept as comments so the cell does not
# echo a stray string as its output):
# param_grid = {'kernel':['rbf', 'sigmoid', 'poly', 'linear'], 'C': [0.1, 1.0, 10.0, 100.0, 1000.0, 10000.0],
#               'gamma': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1.0], 'epsilon': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1.0],
#               'degree': [1,2,3,4,5,6]}

In [None]:

# Exhaustive 10-fold search over param_grid for the extra-trees model, ranked by
# negated mean absolute error; train-fold scores are recorded as well.
grid_search = GridSearchCV(
    etr,
    param_grid,
    scoring='neg_mean_absolute_error',
    cv=10,
    n_jobs=-1,
    verbose=1000,
    return_train_score=True,
)

# SVR variant (inactive): same call with estimator = svr.

# Fit on the combined train+validation data
grid_search.fit(X_train_new, Y_train_new)

## Grid search results

Print out the average validation errors and corresponding hyperparameter combinations

In [None]:
# Tabulate the grid-search results (cv_results_) as a DataFrame
df_cv_result_propylene_uptake = pd.DataFrame(grid_search.cv_results_)

In [None]:
#df_cv_result_propylene_uptake

In [None]:
# Saving the results of all the hyperparameters searched
# (disabled). NOTE(review): the target is an absolute, machine-specific path whose
# file name mentions selectivity rather than propylene uptake - adjust before re-enabling.

#df_cv_result_propylene_uptake = df_cv_result_propylene_uptake.astype(float)
#df_cv_result_propylene_uptake.to_csv('/home/varad/varad/literature/24_sauradeep/workspace/ML/output/2_Gross1_Atomic/grid_search_results/selectivity/1_selectivity_propane_propylene_grid_search_results_excluding_oms.csv', index=False)

In [None]:
# Show which result columns the grid search recorded
df_cv_result_propylene_uptake.columns

In [None]:
# Disabled debug print, kept as a bare string literal so it is never executed.
# It refers to df_cv_result_selectivity, which is not created in this part of the notebook.
'''
print(f'\nGrid search results:\n\n {df_cv_result_selectivity}\n')
print('\n------------------------------------------------------------\n')
'''

## Printing out the average validation errors and corresponding hyperparameter combinations

In [None]:
# Per-candidate mean and spread of the cross-validation scores on the held-out folds
mean_test, std_test = (
    grid_search.cv_results_['mean_test_score'],
    grid_search.cv_results_['std_test_score'],
)

# Same statistics measured on the training folds
mean_train, std_train = (
    grid_search.cv_results_['mean_train_score'],
    grid_search.cv_results_['std_train_score'],
)

In [None]:
# best_score_ is the mean score over the cross-validation folds (negated MAE,
# because scoring='neg_mean_absolute_error'), not a score on a separate
# validation set, so the message says so and the sign is flipped back to a plain MAE.
print("The best combination of parameters is %s with a mean cross-validated MAE of %0.3f."
      % (grid_search.best_params_, -grid_search.best_score_))

In [None]:
# Predicted propylene uptake for the held-out test set and for the combined
# train+validation set, shown as a parity plot with summary metrics.

Y_pred_test = grid_search.predict(X_test) # GridSearchCV predicts with the estimator refit on
                                          # the best hyperparameter combination

Y_pred_train = grid_search.predict(X_train_new)

#plt.style.use('seaborn')
fig = plt.figure(figsize = (6,6))
plt.rcParams['font.family'] = 'serif'

fontdict_t = {'fontsize': 14, 'weight': 'bold', 'ha': 'center'}
fontdict_x = {'fontsize': 12, 'weight': 'bold', 'ha': 'center'}
fontdict_y = {'fontsize': 12, 'weight': 'bold', 'va': 'baseline', 'ha': 'center'}

plt.scatter(Y_target_test, Y_pred_test, s=30, c='green', edgecolor='black', linewidth=1, alpha=0.75,
            label='Test set')

plt.scatter(Y_train_new, Y_pred_train, s=30, c='red', edgecolor='black', linewidth=1, alpha=0.75, label='Train set')

# Parity line (prediction == target) across the range of the test targets
plt.plot([np.min(Y_target_test),np.max(Y_target_test)],
         [np.min(Y_target_test),np.max(Y_target_test)], color='black', linestyle='--')

plt.title('Performance of ML model for \ntrain and test set', fontdict=fontdict_t, color='black')

plt.xlabel('GCMC simulated $N_{C_{3}H_{6}}$', fontdict=fontdict_x)
plt.ylabel('ML Predicted $N_{C_{3}H_{6}}$', fontdict=fontdict_y)

plt.legend(loc='upper left')

# mean_squared_error returns the MSE; take the square root so the value
# printed next to the "RMSE" label really is a root-mean-squared error.
rmse_train = np.sqrt(mean_squared_error(Y_train_new, Y_pred_train))
rmse_test = np.sqrt(mean_squared_error(Y_target_test, Y_pred_test))

# Shared styling for the metrics table. Only fontsize is given: size is an alias
# of it, so passing both is redundant.
# NOTE(review): newer Matplotlib releases appear to reject duplicate aliases - confirm.
stats_style = {'weight': 'bold', 'horizontalalignment': 'left', 'color': 'black', 'fontsize': 10}

# Text anchors (data coordinates), one pair per row of the table:
#3.0, 1.3, 2.7, 1.1, 2.5, 0.9, 2.41, 0.7

plt.text(3.0, 1.3, 'Train     Test', **stats_style)

# Raw strings keep the mathtext backslash from being parsed as an invalid escape sequence
plt.text(2.7, 1.1, r'$\mathregular{R^2:}$' +
         '{:.3f}   {:.3f}'.format(r2_score(Y_train_new, Y_pred_train), r2_score(Y_target_test, Y_pred_test)),
         **stats_style)

plt.text(2.5, 0.9, r'$\mathregular{MAE: }$' +
         '{:.3f}   {:.3f}'.format(mean_absolute_error(Y_train_new, Y_pred_train),
                                  mean_absolute_error(Y_target_test, Y_pred_test)),
         **stats_style)

plt.text(2.41, 0.7, r'$\mathregular{RMSE: }$' + '{:.3f}   {:.3f}'.format(rmse_train, rmse_test),
         **stats_style)

plt.tight_layout()

In [None]:
# Keep a handle on the fitted search under a target-specific name;
# the generic name grid_search is deleted in the cleanup cell that follows.
grid_search_propylene_uptake = grid_search

In [None]:
# Clear the propylene-uptake working objects.
# localimportance, index and feature_importance are not created anywhere in this
# section (and the previous section's cleanup already deleted them), so a plain
# `del` stops with NameError part-way through and leaves explainer,
# df_cv_result_propylene_uptake and shap_values behind. Popping from the
# namespace removes whatever exists and silently skips the rest.
# Deliberately not removed (their dels were disabled): svr, ticks, mean_val, std_val, std_train
for _name in (
    'etr',
    'param_grid',
    'grid_search',
    'GridSearchCV',
    'mean_train',
    'Y_pred_val',
    'Y_pred_train',
    'localimportance',
    'explainer',
    'index',
    'feature_importance',
    'df_cv_result_propylene_uptake',
    'shap_values',
):
    globals().pop(_name, None)

# Do not leave the loop variable behind either
del _name