In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
#mpl.rc('figure', max_open_warning = 0)
#%matplotlib inline
#%config InlineBackend.figure_format='retina'

from pandas_profiling import ProfileReport

from sklearn.model_selection import train_test_split

from collections import OrderedDict

In [None]:
class style:
    """ANSI escape sequences for emphasising text printed to a terminal."""

    BOLD = '\033[1m'  # switch bold rendering on
    END = '\033[0m'   # reset every text attribute

In [None]:
# Locate the descriptor table relative to the notebook's working directory and load it.
PATH = os.getcwd()
descriptor_in_path = os.path.join(PATH, '../input/descriptor.csv')
df_descriptor = pd.read_csv(descriptor_in_path)

# Quick look at the raw table: shape, column names and first rows,
# each report followed by a horizontal rule.
print(f'Descriptor input DataFrame shape:\n\n {df_descriptor.shape}\n',
      '------------------------------------------------------------',
      sep='\n')
print(f'\nDescriptor input data columns:\n\n {df_descriptor.columns}\n',
      '------------------------------------------------------------',
      sep='\n')
print(f'\nDescriptor input dataframe head:\n\n {df_descriptor.head()}\n',
      '------------------------------------------------------------',
      sep='\n')

# The path is only needed for loading.
del descriptor_in_path

## Renaming descriptor columns

In [None]:
# Map the raw descriptor headers onto the short names used in the rest of the notebook.
rename_dict = {
    'name': 'mof',
    'Di': 'LCD',
    'Df': 'PLD',
    'ASA(m2/gram)_1.9': 'GSA',
    'AV_Volume_fraction_1.9': 'AVF',
    'AV(cm3/gram)_1.9': 'GPV',
    'density(gram_cm3)': 'Density',
}
df_descriptor = df_descriptor.rename(columns=rename_dict)

print(f'\nCurated descriptor columns:\n\n {df_descriptor.columns}\n',
      '------------------------------------------------------------',
      sep='\n')

# Data type of every column in the renamed table.
print(df_descriptor.dtypes)
del rename_dict

## Curating descriptor data

In [None]:
# Keep only materials with PLD > 3.8 A and a non-zero void fraction
# (both screening conditions combined into one boolean mask).
df_descriptor_gross1_atomic = df_descriptor[
    (df_descriptor['PLD'] > 3.8) & (df_descriptor['AVF'] > 0.0)
]

# Save the names of the materials that survived the screening.
descriptor_mof_name = df_descriptor_gross1_atomic['mof'].astype(str)

PATH = os.getcwd()  # current working directory
curated_mof_name = os.path.join(PATH, '../output/curated-mof.csv')
descriptor_mof_name.to_csv(curated_mof_name, index=False)

# Feature columns kept for modelling.
columns = [
    'PLD', 'LCD', 'GSA', 'AVF', 'GPV', 'Density',
    'total_degree_unsaturation', 'degree_unsaturation',
    'metallic_percentage', 'O_to_Metal_ration', 'N_to_O_ratio',
    'H', 'Ni', 'Co', 'Cu', 'Zn', 'Pb', 'Mn', 'Cd',
    'C', 'O', 'N', 'S', 'Cl', 'Br', 'F', 'I',
]

# Second reference to the same list; it stays alive after `columns` is deleted below.
shap_columns = columns

# Restrict to the feature columns, cast everything to float and save the table.
df_descriptor_gross1_atomic = df_descriptor_gross1_atomic[columns].astype(float)
curated_mof_prop = os.path.join(PATH, '../output/curated-mof-prop.csv')
df_descriptor_gross1_atomic.to_csv(curated_mof_prop, index=False)

print(f'\nCurated gross1_atomic descriptor data:\n\n {df_descriptor_gross1_atomic}\n',
      '\n------------------------------------------------------------\n',
      sep='\n')
print(f'\nData type of each column. Note that it should be float\n\n {df_descriptor_gross1_atomic.dtypes}\n',
      '\n------------------------------------------------------------\n',
      sep='\n')

# Drop the intermediates that are not needed any more.
del df_descriptor, columns, descriptor_mof_name, curated_mof_name, curated_mof_prop

## Taking a look at the target data

In [None]:
# Target table for the propane/propylene mixture; the ethane/ethylene file is the alternative.
target_in_path = os.path.join(PATH, '../input/C3H8-C3H6.csv')
#target_in_path = os.path.join(PATH, '../input/C2H6-C2H4.csv')
df_target = pd.read_csv(target_in_path)

# Quick look at the raw table: shape, column names and first rows,
# each report followed by a horizontal rule.
print(f'Target property input DataFrame shape:\n\n {df_target.shape}\n',
      '------------------------------------------------------------',
      sep='\n')
print(f'\nTarget property input data columns:\n\n {df_target.columns}\n',
      '------------------------------------------------------------',
      sep='\n')
print(f'\nTarget property input dataframe head:\n\n {df_target.head()}\n',
      '------------------------------------------------------------',
      sep='\n')

# The path is only needed for loading.
del target_in_path

## Renaming Target property columns

In [None]:
# Map the raw propane/propylene target headers onto the names used in the rest of the notebook.
rename_dict = {
    'MOF_no': 'mof',
    'propane_avg(mol/kg)': 'propane_uptake(mol/kg)',
    'propylene_avg(mol/kg)': 'propylene_uptake(mol/kg)',
    'C3H8/C3H6 Selectivity (1Bar)': 'propane_propylene_selectivity',
    'Df': 'PLD',
    'AV_Volume_fraction_1.9': 'AVF',
}
# The string literal below holds the equivalent mapping for the ethane/ethylene data set;
# it is evaluated and discarded, i.e. it acts as a disabled alternative.
'''

rename_dict = {'MOF_no': 'mof', 'ethane_avg(mol/kg)': 'ethane_uptake(mol/kg)',
              'ethylene_avg(mol/kg)': 'ethylene_uptake(mol/kg)',
              'C2H6/C2H4 Selectivity (1Bar)': 'ethane_ethylene_selectivity', 'Df': 'PLD',
              'AV_Volume_fraction_1.9': 'AVF'}

'''
df_target = df_target.rename(columns=rename_dict)

print(f'\nCurated target columns:\n\n {df_target.columns}\n',
      '------------------------------------------------------------',
      sep='\n')

del rename_dict

## Curating Target dataset

In [None]:
# Screen the target table: keep materials with PLD > 3.8 A and a non-zero void fraction.
df_target_gross1_atomic = df_target
df_target_gross1_atomic = df_target_gross1_atomic[(df_target_gross1_atomic['PLD'] > 3.8)]
df_target_gross1_atomic = df_target_gross1_atomic[(df_target_gross1_atomic['AVF'] > 0.0)]

# Save the names of the materials that survived the screening.
target_mof_name = df_target_gross1_atomic['mof'].astype(str)
target_mof_name_path = os.path.join(PATH, '../output/target-mof-name.csv')
target_mof_name.to_csv(target_mof_name_path, index=False)

# Target columns for the propane/propylene mixture (ethane/ethylene alternative below).
columns = ['propane_uptake(mol/kg)', 'propane_propylene_selectivity', 'TSN', 'propylene_uptake(mol/kg)']

#columns = ['ethane_uptake(mol/kg)', 'ethane_ethylene_selectivity', 'TSN', 'ethylene_uptake(mol/kg)']

# Restrict to the target columns and cast everything to float.
df_target_gross1_atomic = df_target_gross1_atomic[columns].astype(float)
target_mof_prop_path = os.path.join(PATH, '../output/target-mof-prop.csv')

# Write the curated targets to disk. The path was previously built and then deleted
# without ever being used, so the file was never produced.
df_target_gross1_atomic.to_csv(target_mof_prop_path, index=False)

print(f'\nCurated target data:\n\n {df_target_gross1_atomic}\n')
print('\n------------------------------------------------------------\n')

print(f'\nData type of each column. Note that it should be float\n\n {df_target_gross1_atomic.dtypes}\n')
print('\n------------------------------------------------------------\n')

# Drop the intermediates that are not needed any more.
del df_target
del columns
del target_mof_name
del target_mof_name_path
del target_mof_prop_path

In [None]:
# Disabled pandas-profiling report, kept for reference inside a string literal.
# The closing delimiter must be exactly three quotes: a fourth quote opens a new,
# unterminated string and makes the whole cell a SyntaxError.
# NOTE(review): `df_join` is not defined in the visible cells and the output file is an
# absolute, machine-specific path -- both need attention before this is re-enabled.
'''
profile = ProfileReport(df_join.copy(),title='C3H8-C3H6', html={'style':{'full_width':True}})
# profile.to_widgets()
#profile.to_notebook_iframe()
C3H8_report = os.path.join(PATH, '../output/C3H8-C3H6-report.csv')

profile.to_file("/home/varad/Pictures/best_model_selection_updated/1_excluding_oms/1_Propane_RACs_excluding.html")

'''

In [None]:
# Hand the curated tables over under the names used by the modelling cells.
# NOTE(review): descriptors and targets were filtered independently; this presumably relies
# on both input files listing the same materials in the same order -- confirm.
X_crude, Y_crude = df_descriptor_gross1_atomic, df_target_gross1_atomic

print(f'\nShape of X_crude: {X_crude.shape}',
      f'\nShape of Y_crude: {Y_crude.shape}',
      sep='\n')

# Only the new names are kept; the objects themselves stay alive through them.
del df_descriptor_gross1_atomic, df_target_gross1_atomic

Here I implemented some classical ML models from `sklearn`:

* Ridge regression
* Support vector machine
* Linear support vector machine
* Random forest
* Extra trees
* Adaptive boosting
* Gradient boosting
* k-nearest neighbors
* Dummy (a baseline: if a model can't beat this, something is wrong with it)

Note: the Dummy model from `sklearn` acts as a good sanity check for our ML studies. If our models do not perform significantly better than the equivalent Dummy model, something is wrong in our model implementation.

In [None]:
from time import time

from sklearn.dummy import DummyRegressor

from sklearn.linear_model import Ridge

from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.neighbors import KNeighborsRegressor

from sklearn.svm import SVR
from sklearn.svm import LinearSVR

from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In addition, we define some helper functions.

In [None]:
def instantiate_model(model_name):
    """Create an estimator by calling ``model_name`` without arguments (default parameters)."""
    return model_name()

def fit_model(model, X_train, y_train):
    """Instantiate ``model`` and fit it, returning the fitted estimator and the elapsed time.

    ``model`` is passed to ``instantiate_model``, so it is the estimator class (or any
    zero-argument factory), not an instance. The measured time, in seconds, covers both
    the instantiation and the call to ``fit``.
    """
    started = time()
    estimator = instantiate_model(model)
    estimator.fit(X_train, y_train)
    return estimator, time() - started

def evaluate_model(model, X, y_act):
    """Score a fitted model on ``X`` against the reference values ``y_act``.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``
    X : features to predict on
    y_act : reference target values for ``X``

    Returns
    -------
    tuple
        ``(r2, mae, rmse)`` of the predictions.
    """
    y_pred = model.predict(X)
    r2 = r2_score(y_act, y_pred)
    mae = mean_absolute_error(y_act, y_pred)
    # Take the square root explicitly: the `squared=False` flag of `mean_squared_error`
    # was deprecated in scikit-learn 1.4 and removed in 1.6, while this form works everywhere.
    rmse_val = np.sqrt(mean_squared_error(y_act, y_pred))
    return r2, mae, rmse_val

def fit_evaluate_model(model, model_name, split, X_train, y_train, X_val, y_act_val):
    """Fit ``model`` on the training data and score it on the training and held-out data.

    Returns the fitted estimator together with a dictionary holding the split id, the
    model's short and class names, its parameters, the fit time and the R2 / MAE / RMSE
    on both data sets (keys ending in ``_train`` and ``_val``).
    """
    fitted, elapsed = fit_model(model, X_train, y_train)
    train_r2, train_mae, train_rmse = evaluate_model(fitted, X_train, y_train)
    val_r2, val_mae, val_rmse = evaluate_model(fitted, X_val, y_act_val)

    # Key order is kept as-is; it determines the column order of the result tables.
    result_dict = dict(
        split=split,
        model_name=model_name,
        model_name_pretty=type(fitted).__name__,
        model_params=fitted.get_params(),
        fit_time=elapsed,
        r2_train=train_r2,
        mae_train=train_mae,
        rmse_train=train_rmse,
        r2_val=val_r2,
        mae_val=val_mae,
        rmse_val=val_rmse,
    )
    return fitted, result_dict

def append_result_df(df, result_dict):
    """Return a new DataFrame consisting of ``df`` plus one row built from ``result_dict``.

    ``df`` itself is left untouched and the result gets a fresh 0..n-1 index.
    ``DataFrame.append`` was removed in pandas 2.0, so the row is added with ``pd.concat``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table of results collected so far (may be empty).
    result_dict : dict
        Mapping of column name to value for the new row.

    Returns
    -------
    pandas.DataFrame
    """
    new_row = pd.DataFrame([result_dict])
    if len(df) == 0:
        # Nothing to concatenate with: keep the declared column order of the empty frame
        # (followed by any extra keys) and let the dtypes come from the actual values.
        extra = [col for col in new_row.columns if col not in df.columns]
        return new_row.reindex(columns=list(df.columns) + extra)
    return pd.concat([df, new_row], ignore_index=True)

def append_model_dict(dic, model_name, model):
    """Store ``model`` under ``model_name`` in ``dic`` (modified in place) and return ``dic``."""
    dic.update({model_name: model})
    return dic

Build an empty DataFrame to store model results:

In [None]:
# Empty results table: one row per fitted model, with train and validation scores.
df_classics = pd.DataFrame(columns=[
    'split', 'model_name', 'model_name_pretty', 'model_params', 'fit_time',
    'r2_train', 'mae_train', 'rmse_train',
    'r2_val', 'mae_val', 'rmse_val',
])
df_classics

## Define the models

Here, I instantiate several classical machine learning models for use.
I have not tuned the hyperparameters of the models; the default parameters are used here.
Hyperparameter tuning using grid search will be the next step.

In [None]:
# Short name -> estimator class, in the order in which the models are fitted.
classic_model_names = OrderedDict([
    ('dumr', DummyRegressor),
    ('rr', Ridge),
    ('abr', AdaBoostRegressor),
    ('gbr', GradientBoostingRegressor),
    ('rfr', RandomForestRegressor),
    ('etr', ExtraTreesRegressor),
    ('svr', SVR),
    ('lsvr', LinearSVR),
    ('knr', KNeighborsRegressor),
])

## Instantiate and fit the models

Now, we can fit the ML models.

We will loop through each of the models listed above. For each of the models, we will:
* instantiate the model (`with default parameters`)
* fit the model using the training data
* use the fitted model to generate predictions from the validation data
* evaluate the performance of the model using the predictions
* store the results in a DataFrame for analysis

In [None]:
def _estimator_display_name(model):
    """Return the class name of ``model``, accepting an estimator class or an instance."""
    if isinstance(model, type):
        return model.__name__
    return type(model).__name__


def plot_pred_act_mine(Y_act_train, Y_pred_train, Y_act, Y_pred, model, path, scale, prop, cord_list, val):
    """Draw and save a predicted-vs-simulated scatter plot for the train set and one held-out set.

    Parameters
    ----------
    Y_act_train, Y_pred_train : array-like
        Reference and predicted target values of the training set.
    Y_act, Y_pred : array-like
        Reference and predicted target values of the held-out set.
    model : estimator class or instance
        Only its class name is used, for the plot title and the output file name.
    path : str
        Base output directory. The figure is saved as ``<path>/1_train_val/<name>`` when
        ``val`` is true and as ``<path>/2_train_test/<name>`` otherwise; the directory is
        created if it does not exist.
    scale : str
        Description of the feature preprocessing, shown in the title.
    prop : str
        Label of the target property, used in both axis labels.
    cord_list : sequence of 8 numbers
        Data coordinates ``x0, y0, ..., x3, y3`` of the four annotation lines
        (header, R2, MAE, RMSE).
    val : bool
        True if the held-out set is the validation set, False if it is the test set.

    Returns
    -------
    matplotlib.figure.Figure
        The figure, left open; close it with ``plt.close`` when generating many plots.
    """
    # The two modes differ only in their labels and in the output sub-folder.
    if val:
        held_out, sub_dir = 'Validation', '1_train_val'
    else:
        held_out, sub_dir = 'Test', '2_train_test'

    # Callers pass either the estimator class or a fitted instance; `type(cls).__name__`
    # would give the metaclass name for a class, so resolve the name once, explicitly.
    estimator_name = _estimator_display_name(model)

    # Setting plotting attributes
    plt.rcParams['font.family'] = 'serif'

    fontdict_t = {'fontsize': 14, 'weight': 'bold', 'ha': 'center'}
    fontdict_x = {'fontsize': 12, 'weight': 'bold', 'ha': 'center'}
    fontdict_y = {'fontsize': 12, 'weight': 'bold', 'va': 'baseline', 'ha': 'center'}

    plot = plt.figure(figsize=(6, 6))

    # End points of the y = x reference line, taken from the training data.
    xy_max = np.max([np.max(Y_act_train), np.max(Y_pred_train)])
    xy_min = np.min([np.min(Y_act_train), np.min(Y_pred_train)])

    plt.scatter(Y_act, Y_pred, s=30, c='green', edgecolor='black', linewidth=1, alpha=0.75,
                label=f'{held_out} set')
    plt.scatter(Y_act_train, Y_pred_train, s=30, c='red', edgecolor='black', linewidth=1, alpha=0.75,
                label='Train set')
    plt.plot([xy_min, xy_max], [xy_min, xy_max], color='black', linestyle='--')

    plt.title(f'{estimator_name} model for \ntrain and {held_out.lower()} set ({scale})',
              fontdict=fontdict_t, color='black')
    plt.axis('scaled')

    plt.xlabel(f'GCMC simulated {prop}', fontdict=fontdict_x)
    plt.ylabel(f'ML Predicted {prop}', fontdict=fontdict_y)
    plt.legend(loc='upper left')

    # `size` and `fontsize` are aliases of the same Text property and must not be passed
    # together; only the explicit point size is kept.
    text_style = dict(weight='bold', horizontalalignment='left', color='black', fontsize=10)

    plt.text(cord_list[0], cord_list[1], f'Train     {held_out}', **text_style)

    # One annotation line per metric: "<label> <train value>   <held-out value>".
    # RMSE is computed as sqrt(MSE) because `squared=False` is gone from recent scikit-learn.
    metric_rows = [
        (r'$\mathregular{R^2:}$ ', r2_score),
        (r'$\mathregular{MAE:}$ ', mean_absolute_error),
        (r'$\mathregular{RMSE:}$ ', lambda act, pred: np.sqrt(mean_squared_error(act, pred))),
    ]
    for row, (label, metric) in enumerate(metric_rows, start=1):
        line = (label + '{:.3f}'.format(metric(Y_act_train, Y_pred_train))
                + '   ' + '{:.3f}'.format(metric(Y_act, Y_pred)))
        plt.text(cord_list[2 * row], cord_list[2 * row + 1], line, **text_style)

    out_path = path + '/' + sub_dir + '/' + estimator_name

    # savefig does not create missing folders, so make sure the target directory exists.
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    plt.savefig(out_path, dpi=300)

    return plot

# Creating validation set and using the same validation set for all the random seeds

In [None]:
# Set aside 32 % of the curated data once, with a fixed seed, as the validation set.
X, X_val_crude, Y, Y_val_crude = train_test_split(
    X_crude,
    Y_crude,
    test_size=0.32,
    random_state=42,
)

In [None]:
# Container for the fitted model objects
classic_models = OrderedDict()

# Start of the overall timing
ti = time()

# Root folder for all model-selection output
base_path = os.path.join(PATH, '../output/best_model/')

# Which mixture is modelled
mixture_status = "1_Propane/"
#mixture_status = "2_Ethane/"

# Which property is the target variable
property_status = "1_selectivity/"
#property_status = "2_uptake_paraffin/"
#property_status = "3_uptake_olefin/"
#property_status = "4_TSN/"

# Which feature set is used: atomic features or RACs
feature_status = "1_Atomic/"
#feature_status = "2_RACs/"

# Output folder for this feature / property / mixture combination
comb_path = ''.join([base_path, feature_status, property_status, mixture_status])

# Table for the per-model averages over all splits: R2, MAE and RMSE for the
# train, validation, new-train and test sets.
df_average = pd.DataFrame(
    columns=['model_name', 'model_name_pretty']
    + [f'<{metric}_{subset}>'
       for subset in ('train', 'val', 'new_train', 'test')
       for metric in ('r2', 'MAE', 'RMSE')]
)


# Loop through each model type, fit and predict, and evaluate and store results
for model_name_temp, model_temp in classic_model_names.items():
    #print(model_name)
    #print(model.__name__)
    #print(model)
    #splits = range(10)
    
    # Model is selected
    
    splits = [41, 42, 43, 44, 45, 46, 47, 48, 49, 50]
    
    df_classics_val = pd.DataFrame(columns=['split',
                                    'model_name',
                                    'model_name_pretty',
                                    'model_params',
                                    'fit_time',
                                    'r2_train',
                                    'mae_train',
                                    'rmse_train',
                                    'r2_val',
                                    'mae_val',
                                    'rmse_val'])
    
    df_classics_test = df_classics_val
    
    for split in splits:
        
        # Random splits for the model selected
        
        model_name = model_name_temp
        model      = model_temp
        
        #print('----------------')
        #print(model_name)
        #print(model.__name__)
        #print(model)
        #print('----------------')
        
        print(f'Fitting and evaluating model {model_name}: {model.__name__} for random seed of {split}')
        #print(f'Fitting and evaluating model {model_name} for random seed of {split}')
        
        # Creating the test train split
        
        X_train_crude, X_test_crude, Y_train_crude, Y_test_crude = train_test_split(X, Y, test_size=0.294, random_state=split)

        #-----------------------------------------------------------------------------------------------------#
        ## For Learning curve
        
        #print(f'\n X_train is :\n\n {X_train_crude}\n')
        #print('\n----------------------------------------------------------------------------------------------\n')
        
        #print(f'\n X_val is :\n\n {X_val_crude}\n')
        #print('\n----------------------------------------------------------------------------------------------\n')
        
        #print(f'\n X_test is :\n\n {X_test_crude}\n')
        #print('\n----------------------------------------------------------------------------------------------\n')

        #raise ValueError('Testing going on!!')
        
#************************************************************************************************************#        
        # Scaling the data
        scaler = StandardScaler()

        X_train_scaled = scaler.fit_transform(X_train_crude)
        X_val_scaled   = scaler.transform(X_val_crude)
        X_test_scaled  = scaler.transform(X_test_crude)
        
        # Normalizing the unscaled data
        norm = MinMaxScaler().fit(X_train_crude)

        X_train_norm  = norm.transform(X_train_crude)
        X_val_norm    = norm.transform(X_val_crude)
        X_test_norm   = norm.transform(X_test_crude)
        
        # Normalizing the scaled data
        norm_scaled         = MinMaxScaler().fit(X_train_scaled)

        X_train_scaled_norm = norm_scaled.transform(X_train_scaled)
        X_val_scaled_norm   = norm_scaled.transform(X_val_scaled)
        X_test_scaled_norm  = norm_scaled.transform(X_test_scaled)

#***********************************************************************************************************#
        
#***********************************************************************************************************#        
        
        ## Uncomment when model has to be trained on crude data
        #X_train = X_train_crude
        #X_val   = X_val_crude
        #X_test  = X_test_crude

        ## Uncomment when model has to be trained on scaled data

        #X_train = X_train_scaled
        #X_val   = X_val_scaled
        #X_test  = X_test_scaled

        ## Uncomment when model has to be trained on normalised data
        #X_train = X_train_norm
        #X_val   = X_val_norm
        #X_test  = X_test_norm

        ## Uncomment when model has to be trained on scaled_normalised  data
        X_train  = X_train_scaled_norm
        X_val    = X_val_scaled_norm
        X_test   = X_test_scaled_norm
        
#***********************************************************************************************************# 

#***********************************************************************************************************#
        
        # Target Y is neigther scaled nor normalized
    
        # If index is 0 then, propane / ethane uptake (mol/kg)  
        # If index is 1 then, selectivity
        # If index is 2 then, TSN
        # If index is 3 then, propylene / ethylene uptake (mol/kg)

        i = 1
        
        Y_target_train = Y_train_crude.iloc[:,i]
        Y_target_test  = Y_test_crude.iloc[:,i]
        Y_target_val   = Y_val_crude.iloc[:,i]
        
        # Note feature status = atomic or RAC does not matter as cord_list does not change
               
        #--------------------------------------------------------------------------------------------------#
        # Propane + {property} + {atomic / RAC (doesn't matter)} + including + scaled + normalized #
        
        # Select the plot settings (title suffix, axis label, annotation coordinates) for the
        # chosen target column `i`, mixture and property. The chain has to open with `if`:
        # a leading `elif` has no preceding branch to attach to and is a SyntaxError.
        if (i == 1 and mixture_status == "1_Propane/" and property_status == "1_selectivity/"):
            print("This should be running")
            scale  = "Scaled + Normalized"
            prop   = "$S_{C_{3}H_{8}/C_{3}H_{6}}$"
            cord_list = [1.75, 1.20, 1.65, 1.13, 1.59, 1.08, 1.56, 1.03]

        elif (i == 0 and mixture_status == "1_Propane/" and property_status == "2_uptake_paraffin/"):
            scale  = "Scaled + Normalized"
            prop   = "$N_{C_{3}H_{8}}$"
            cord_list = [1.0, 0.3, 0.85, 0.2, 0.78, 0.13, 0.74, 0.06]

        elif (i == 3 and mixture_status == "1_Propane/" and property_status == "3_uptake_olefin/"):
            scale  = "Scaled + Normalized"
            prop   = "$N_{C_{3}H_{6}}$"
            cord_list = [4.0, 1.3, 3.4, 1.0, 3.1, 0.7, 2.9, 0.4]

        elif (i == 2 and mixture_status == "1_Propane/" and property_status == "4_TSN/"):
            scale  = "Scaled + Normalized"
            prop   = "TSN"
            cord_list = [0.13, 0.04, 0.110, 0.025, 0.099, 0.013, 0.094, 0.001]

        #--------------------------------------------------------------------------------------------------#
        # Ethane + {property} + {atomic / RAC (doesn't matter)} + including + scaled + normalized #

        elif (i == 1 and mixture_status == "2_Ethane/" and property_status == "1_selectivity/"):
            scale  = "Scaled + Normalized"
            prop   = "$S_{C_{2}H_{6}/C_{2}H_{4}}$"
            cord_list = [2.25, 1.2, 2.06, 1.05, 1.95, 0.92, 1.89, 0.79]

        elif (i == 0 and mixture_status == "2_Ethane/" and property_status == "2_uptake_paraffin/"):
            scale  = "Scaled + Normalized"
            prop   = "$N_{C_{2}H_{6}}$"
            cord_list = [0.40, 0.1, 0.35, 0.06, 0.32, 0.03, 0.305, 0.00]

        elif (i == 3 and mixture_status == "2_Ethane/" and property_status == "3_uptake_olefin/"):
            scale  = "Scaled + Normalized"
            prop   = "$N_{C_{2}H_{4}}$"
            cord_list = [1.75, 0.5, 1.5, 0.3, 1.38, 0.15, 1.32, 0.0]

        elif (i == 2 and mixture_status == "2_Ethane/" and property_status == "4_TSN/"):
            scale  = "Scaled + Normalized"
            prop   = "TSN"
            cord_list = [0.1, 0.040, 0.085, 0.025, 0.077, 0.013, 0.073, 0.001]

        else :
            # The target column index does not match the selected mixture / property folders.
            raise ValueError('Combinations are wrong')
        
        #--------------------------------------------------------------------------------------------------#
        
#***********************************************************************************************************#

        #print('************')
        #print(model_name)
        #print(model.__name__)
        #print(model)
        #print('************')
        
#***********************************************************************************************************#

# Evaluating model performance on train and same validation set for different models

        model_val, result_dict_val = fit_evaluate_model(model, model_name, split, X_train, Y_target_train, X_val, Y_target_val)
        
        df_classics_val = append_result_df(df_classics_val, result_dict_val)
        
        Y_act_train  = Y_target_train
        Y_pred_train = model_val.predict(X_train)
        
        Y_act_val  = Y_target_val
        Y_pred_val = model_val.predict(X_val)
        
        model_performance_path_val = comb_path  + str(model_name_temp) + '/' + 'split_' + str(split)
        
        plot = plot_pred_act_mine(Y_act_train, Y_pred_train, Y_act_val, Y_pred_val, model, model_performance_path_val, scale, prop, cord_list, val=True)
        #raise ValueError('Testing going on!!')
        
        del model_val
        del Y_act_train, Y_pred_train, Y_act_val, Y_pred_val
        del model_performance_path_val
        
#***********************************************************************************************************#

#***********************************************************************************************************#

# Evaluating model performance on new train set and test set for different models
# Actually this process should be done on the best model selected in previous step
# However, I am doing predictions on test set for all the models. This is done only for analysis purposes.

        X_train_new = np.concatenate((X_train, X_val), axis=0)
        Y_train_new = pd.concat((Y_target_train, Y_target_val), axis=0)
        
        model_test, result_dict_test = fit_evaluate_model(model, model_name, split, X_train_new, Y_train_new, X_test, Y_target_test)
        
        df_classics_test = append_result_df(df_classics_test, result_dict_test)
        
        Y_act_train  = Y_train_new
        Y_pred_train = model_test.predict(X_train_new)
        
        Y_act_test  = Y_target_test
        Y_pred_test = model_test.predict(X_test)
        
        model_performance_path_test = comb_path  + str(model_name_temp) + '/' + 'split_' + str(split)
        
        plot = plot_pred_act_mine(Y_act_train, Y_pred_train, Y_act_test, Y_pred_test, model, model_performance_path_test, scale, prop, cord_list, val=False)
        
        del X_train_new, Y_train_new
        del model_test, Y_act_train, Y_pred_train, Y_act_test, Y_pred_test
        del model_performance_path_test

#***********************************************************************************************************#

#***********************************************************************************************************#
        
        del model_name, model
        #del X, X_test_crude, Y, Y_test_crude
        del X_test_crude, Y_test_crude
        #del X_train_crude, X_val_crude, Y_train_crude, Y_val_crude
        del X_train_crude, Y_train_crude
        del scaler, X_train_scaled, X_val_scaled, X_test_scaled, 
        del norm, X_train_norm, X_val_norm, X_test_norm
        del norm_scaled, X_train_scaled_norm, X_val_scaled_norm, X_test_scaled_norm
        del X_train, X_val, X_test
        del Y_target_train, Y_target_test, Y_target_val
        
#***********************************************************************************************************#

#***********************************************************************************************************#

    #raise ValueError('Testing going on!!')
    
    df_classics_val['split'] = df_classics_val['split'].astype(int)
    
    print(f'\n df_classics_val for model {model_temp.__name__} is :\n\n {df_classics_val}\n')
    print('\n----------------------------------------------------------------------------------------------\n')
    
    split_stat_path_val = comb_path  + str(model_name_temp) + '/' + str(model_name_temp) + '_' +'train_val' + '.csv'
    
    #print(split_stat_path_val)
    
    df_classics_val.to_csv(split_stat_path_val, index=False)
    
    #raise ValueError('Testing going on!!')
    
#***********************************************************************************************************#

#***********************************************************************************************************#
    
    # Mean R2, MAE and RMSE over all splits of the current model, taken from the
    # training columns of the train/validation table.

    avg_r2_train   = df_classics_val['r2_train'].mean()
    avg_mae_train  = df_classics_val['mae_train'].mean()
    avg_rmse_train = df_classics_val['rmse_train'].mean()

    # The padding before ':' keeps the three values in one column.
    print(f'Average train r2 for train-val set for model {model_temp.__name__} is   : {avg_r2_train:0.4f}')
    print(f'Average train MAE for train-val set for model {model_temp.__name__} is  : {avg_mae_train:0.4f}')
    print(f'Average train RMSE for train-val set for model {model_temp.__name__} is : {avg_rmse_train:0.4f}')
    print('\n----------------------------------------------------------------------------------------------\n')

    # Same three averages, taken from the validation columns of the
    # train/validation table.

    avg_r2_val   = df_classics_val['r2_val'].mean()
    avg_mae_val  = df_classics_val['mae_val'].mean()
    avg_rmse_val = df_classics_val['rmse_val'].mean()

    print(f'Average validation r2 for train-val set for model {model_temp.__name__} is   : {avg_r2_val:0.4f}')
    print(f'Average validation MAE for train-val set for model {model_temp.__name__} is  : {avg_mae_val:0.4f}')
    print(f'Average validation RMSE for train-val set for model {model_temp.__name__} is : {avg_rmse_val:0.4f}')
    print('\n----------------------------------------------------------------------------------------------\n')
    

    
#***********************************************************************************************************#

#***********************************************************************************************************#
    
# Note here train = new_train and val = test

    # Column naming in this frame: `*_train` holds new_train scores and `*_val`
    # holds test scores. The split index is converted to an integer column
    # before the table is written out (the table itself is not echoed here).
    df_classics_test['split'] = df_classics_test['split'].astype(int)

    # Destination: <comb_path><model>/<model>_train_test.csv
    split_stat_path_test = f'{comb_path}{model_name_temp!s}/{model_name_temp!s}_train_test.csv'
    df_classics_test.to_csv(split_stat_path_test, index=False)
    
#***********************************************************************************************************#

#***********************************************************************************************************#
    
    # Mean R2, MAE and RMSE over all splits of the current model for the
    # new_train set of the train/test runs.
    # In df_classics_test the `*_train` columns hold new_train scores and the
    # `*_val` columns hold test scores.

    avg_r2_new_train   = df_classics_test['r2_train'].mean()
    avg_mae_new_train  = df_classics_test['mae_train'].mean()
    avg_rmse_new_train = df_classics_test['rmse_train'].mean()

    print(f'Average new_train r2 for train-test set for model {model_temp.__name__} is   : {avg_r2_new_train:0.4f}')
    print(f'Average new_train MAE for train-test set for model {model_temp.__name__} is  : {avg_mae_new_train:0.4f}')
    print(f'Average new_train RMSE for train-test set for model {model_temp.__name__} is : {avg_rmse_new_train:0.4f}')
    print('\n----------------------------------------------------------------------------------------------\n')


    # Mean R2, MAE and RMSE over all splits of the current model for the test
    # set of the train/test runs (read from the `*_val` columns).

    avg_r2_test   = df_classics_test['r2_val'].mean()
    avg_mae_test  = df_classics_test['mae_val'].mean()
    avg_rmse_test = df_classics_test['rmse_val'].mean()

    # These are test-set scores, so the messages say "test" rather than
    # "validation" to avoid confusion with the train-val report.
    print(f'Average test r2 for train-test set for model {model_temp.__name__} is   : {avg_r2_test:0.4f}')
    print(f'Average test MAE for train-test set for model {model_temp.__name__} is  : {avg_mae_test:0.4f}')
    print(f'Average test RMSE for train-test set for model {model_temp.__name__} is : {avg_rmse_test:0.4f}')
    print('\n----------------------------------------------------------------------------------------------\n')
    
#***********************************************************************************************************#

#***********************************************************************************************************#
    # Gather the current model's mean R2, MAE and RMSE (averaged over its
    # splits) into one record and add that record to df_average.

    # Identification of the model.
    average_dict = {
        'model_name': model_temp.__name__,
        'model_name_pretty': model_name_temp,
    }

    # Averages from the train/validation runs.
    average_dict.update({
        '<r2_train>': avg_r2_train, '<MAE_train>': avg_mae_train, '<RMSE_train>': avg_rmse_train,
        '<r2_val>': avg_r2_val, '<MAE_val>': avg_mae_val, '<RMSE_val>': avg_rmse_val,
    })

    # Averages from the train/test runs.
    average_dict.update({
        '<r2_new_train>': avg_r2_new_train, '<MAE_new_train>': avg_mae_new_train, '<RMSE_new_train>': avg_rmse_new_train,
        '<r2_test>': avg_r2_test, '<MAE_test>': avg_mae_test, '<RMSE_test>': avg_rmse_test,
    })

    df_average = append_result_df(df_average, average_dict)

    print(
        f'\n df_average is :\n\n {df_average}\n',
        '\n----------------------------------------------------------------------------------------------\n',
        sep='\n',
    )

    # Only the validation and test averages are unbound here; the train and
    # new_train averages stay defined.
    del (avg_r2_val, avg_mae_val, avg_rmse_val,
         avg_r2_test, avg_mae_test, avg_rmse_test)

    #raise ValueError('Testing going on!!')
    
    #raise ValueError('Testing going on!!')

#***********************************************************************************************************#

#***********************************************************************************************************#
    
    # Per-split bar charts (train vs. validation) of r2, MAE and RMSE for the
    # current model. Each chart is written to
    # <comb_path><model>/<model>_<TAG>_histo.png with TAG in {R2, MAE, RMSE}.
    #
    # Original author's note: the r2_val score shows high variability across
    # splits, whereas the variability of r2_train is comparatively lower.
    #
    # Every figure is closed as soon as it has been saved. This section sits
    # inside an enclosing block that appears to run once per model, so leaving
    # three figures open per pass lets them pile up in memory and eventually
    # triggers matplotlib's "too many open figures" warning. Closing also means
    # the inline backend will not render the figure at the end of the cell; the
    # saved PNG is the output.
    for metric_key, y_label, file_tag in (('r2', '$r^2$', 'R2'),
                                          ('mae', 'MAE', 'MAE'),
                                          ('rmse', 'RMSE', 'RMSE')):
        histo_ax = df_classics_val.plot('split', [metric_key + '_train', metric_key + '_val'], kind='bar')
        histo_ax.set_title(f'Performance of {model_temp.__name__}\nwith {len(splits)} different data splits')
        if metric_key == 'r2':
            # Only the r2 chart uses a fixed 0-1 y-range; MAE and RMSE autoscale.
            histo_ax.set_ylim((0.0, 1.0))
        histo_ax.set_ylabel(y_label)
        histo_ax.set_xlabel('Split #')
        histo_ax.legend(loc='lower right', framealpha=0.9)

        histo_path = comb_path + str(model_name_temp) + '/' + str(model_name_temp) + '_' + file_tag + '_histo.png'
        histo_fig = histo_ax.get_figure()
        histo_fig.savefig(histo_path, dpi=300)
        plt.close(histo_fig)

    # Unbind the plotting temporaries so they do not linger in the namespace.
    del metric_key, y_label, file_tag, histo_ax, histo_fig, histo_path

    # The per-split tables are no longer needed once the charts are written.
    del df_classics_val, df_classics_test

    #raise ValueError('Testing going on!!')
    
#***********************************************************************************************************#
    
#***********************************************************************************************************#

# Rank the models by mean validation r2 (highest first), store the ranking on
# disk, then report the top entry and the options used for this run.

print(f'\n df_average before sorting is :\n\n {df_average}\n')
print('\n--------------------------------------------------------------------------------------------------\n')

df_average = df_average.sort_values(by='<r2_val>', ascending=False, ignore_index=True)

print(f'\n df_average after sorting is :\n\n {df_average}\n')
print('\n--------------------------------------------------------------------------------------------------\n')

# Persist the sorted table as <comb_path>sorted_average_scores.csv
df_average_path = f'{comb_path}sorted_average_scores.csv'
df_average.to_csv(df_average_path, index=False)
del df_average_path

# After the descending sort, the first row is the best-performing model tested.
best_row = df_average.iloc[0].copy()

# Only the model name and its mean validation r2 are extracted. Reading the
# MAE/RMSE of the best row and re-instantiating the model from stored
# 'model_params' are currently disabled.
best_model, best_avg_r2 = best_row['model_name'], best_row['<r2_val>']

print(f'\n The best model is {best_model} with an average r2 of {best_avg_r2}\n')
print('\n--------------------------------------------------------------------------------------------------\n')

# Summary of the options this run was executed with.
print('\n--------------------------------------------------------------------------------------------------\n')
print('------------------------------------------------------------')
print(f'{style.BOLD}\n Options for this run are :{style.END}')
print(f'\nFeature status : {feature_status}')
print(f'\nProperty_status: {property_status}')
print(f'\nMixture_status : {mixture_status}')
print('------------------------------------------------------------')

In [None]:
# NOTE(review): unconditional raise. Presumably a deliberate stop so that
# "Run All" does not continue past this point while testing; confirm before
# removing, since cells after this one will not run while it is in place.
raise ValueError('Testing going on!!')