<center><font size="6"><b>Showcasing new validation strategy with GridSearchCV</b></font></center>

# Unchanged code
The code that did not change is lumped together below.

In [1]:
# preamble: imports plus plot-formatting setup, not related to the validation strategy

import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
%matplotlib inline

from IPython.display import set_matplotlib_formats

# Enable both PDF and PNG as inline figure formats.
set_matplotlib_formats('pdf', 'png')
np.set_printoptions(precision=3, suppress=True)

# Global matplotlib styling shared by every figure in the notebook.
plt.rcParams.update({
    # figure / output
    'savefig.dpi': 75,
    'figure.autolayout': False,
    'figure.figsize': (10, 6),
    # text and line sizes
    'axes.labelsize': 18,
    'axes.titlesize': 20,
    'font.size': 16,
    'lines.linewidth': 2.0,
    'lines.markersize': 8,
    'legend.fontsize': 14,
    # LaTeX text rendering (needs a working LaTeX installation)
    'text.usetex': True,
    'font.family': "serif",
    'font.serif': "cm",
    # The preamble is handed to LaTeX as one string, so the packages are
    # separated by whitespace. A comma here only worked while matplotlib
    # split this setting on commas; passed verbatim it is stray text in the
    # LaTeX preamble.
    'text.latex.preamble': r"\usepackage{subdepth} \usepackage{type1cm}",
})

In [2]:
# Load the 36-descriptor spreadsheet: the header is the row at zero-based
# position 4 and the column at position 1 becomes the index.
df36Descriptor = pd.read_excel('data/ML_data/descriptor_used.xlsx', header=4, index_col=1)

# Keep the column at position 1 plus positions 3 up to (not including) the last 11.
columns = [df36Descriptor.columns[1]] + df36Descriptor.columns[3: -11].tolist()


def _short_name(full_name):
    """Return the text before the first space, else before the first '(', else the name unchanged."""
    for separator in (' ', '('):
        if separator in full_name:
            return full_name.split(separator, 1)[0]
    return full_name


# Map every selected column name to its shortened form.
newColumns = {full_name: _short_name(full_name) for full_name in columns}

dfShortNames = df36Descriptor[columns].rename(columns=newColumns)

# Reduce columns to only contain MOF features.
# NOTE(review): membership is tested against the keys of `newColumns`, i.e. the
# ORIGINAL column names, so a renamed column survives only if its short name
# also occurs as an original name. Confirm this filter is intended.
shared_descriptor = [name for name in dfShortNames.columns if name in newColumns.keys()]
dfMLReduced = dfShortNames[shared_descriptor]

# The MOFs in "dfMLReduced" and in the adsorption data sets differ, so the two
# data sets have to be matched on the MOF name.
def datasetMatch(MOFName):
    """Match a sequence of MOF names against the descriptor table.

    Parameters
    ----------
    MOFName : sequence of str
        MOF names taken from the adsorption data.

    Returns
    -------
    matchedMOFIndex : numpy.ndarray of bool
        Mask over ``MOFName``; True where the name has a row in ``dfMLReduced``.
    dfML : pandas.DataFrame
        Rows of ``dfMLReduced`` whose 'MOF' value occurs in ``MOFName``, with
        fully duplicated rows removed.

    Notes
    -----
    The mask follows the order of ``MOFName`` while the returned rows follow
    the order of ``dfMLReduced``.
    NOTE(review): callers that pair the two row-by-row assume both orders
    agree -- confirm against the data files.
    """
    is_requested = dfMLReduced['MOF'].isin(MOFName)
    matched_rows = dfMLReduced.loc[is_requested].drop_duplicates()
    has_descriptor = np.isin(MOFName, matched_rows['MOF'].to_numpy())
    return has_descriptor, matched_rows

# read flexibility data
adsorptionDir = 'data/flexibility_data/y_data/adsorption_data'

# Keep only the CSV files: a bare os.listdir() also returns entries such as
# ".ipynb_checkpoints" or ".DS_Store", which cannot be parsed below.
# NOTE(review): os.listdir() order is arbitrary; the adsorbate table is later
# re-ordered to follow adsorbateNameList, so the pieces stay consistent, but
# the row order (and hence any seeded split) may differ between machines.
flexibilityList = [
    fileName for fileName in os.listdir(adsorptionDir)
    if fileName.lower().endswith('.csv')
]
if not flexibilityList:
    raise FileNotFoundError("No adsorption CSV files found in '{}'".format(adsorptionDir))

adsorbateNameList = []
flexibilityChunks = []  # one (matched MOFs x 3) string array per adsorbate

for i, name in enumerate(flexibilityList):
    # read the csv file for one adsorption uptake
    df = pd.read_csv(os.path.join(adsorptionDir, name))

    # second column: rigid value
    rigidValue = df.iloc[:, 1].to_numpy(dtype=float)

    # remaining columns: flexible values, averaged per row
    flexValue = df.iloc[:, 2:].to_numpy(dtype=float).mean(axis=1)

    # adsorbate label is the second "_"-separated token of the file name
    adsorbate = name.split("_")[1]
    adsorbateNameList.append(adsorbate)
    label = np.full(len(flexValue), adsorbate)

    # stack rigid value, flexible mean and label (the result is a string array)
    singleSet = np.column_stack([rigidValue, flexValue, label])

    if i == 0:
        # MOF names are the first "_"-separated token of the first column
        MOFNameTemp = np.array(df.iloc[:, 0], dtype=str)
        MOFName = [x.split("_")[0] for x in MOFNameTemp]

        # search the MOF names in "dfMLReduced", generating dfML
        matchedMOFIndex, dfML = datasetMatch(MOFName)
        print("The number of MOFs shared by two datasets are: {:d}.".format(dfML.shape[0]))

    # The mask from the first file is reused for every file.
    # NOTE(review): this assumes all CSV files list the same MOFs in the same
    # order -- confirm against the data.
    flexibilityChunks.append(singleSet[matchedMOFIndex, :])

# build "y" with a single concatenation instead of growing it inside the loop
flexibilityData = np.concatenate(flexibilityChunks)

# manually add adsorbate descriptors
# columns: adsorbate, Mw/gr.mol-1, Tc/K, Pc/bar, ω, Tb/K, Tf/K
# NumPy stores this mixed name/number table as strings; the numeric columns are
# converted to float further down.
# NOTE(review): the values are hard-coded here; verify them against the
# reference they were taken from.
adsorbateData = np.array([
    ['xenon', 131.293, 289.7, 58.4, 0.008, 164.87, 161.2],
    ['butane', 58.1, 449.8, 39.5, 0.3, 280.1, 146.7],
    ['propene', 42.1, 436.9, 51.7, 0.2, 254.8, 150.6],
    ['ethane', 30.1, 381.8, 50.3, 0.2, 184.0, 126.2],
    ['propane', 44.1, 416.5, 44.6, 0.2, 230.1, 136.5],
    ['CO2', 44.0, 295.9, 71.8, 0.2, 317.4, 204.9],
    ['ethene', 28.054, 282.5, 51.2, 0.089, 169.3, 228],
    ['methane', 16.04, 190.4, 46.0, 0.011, 111.5, 91],
    ['krypton', 83.798, 209.4, 55.0, 0.005, 119.6, 115.6]])

adDf = pd.DataFrame(
    data=adsorbateData,
    columns=["adsorbate", "Mw/gr.mol-1", "Tc/K", "Pc/bar", "ω", "Tb/K", "Tf/K"],
)

# Re-order the rows so they follow adsorbateNameList (the order in which the
# adsorption files were read).
sorterIndex = {adsorbate: rank for rank, adsorbate in enumerate(adsorbateNameList)}
adDf['an_Rank'] = adDf['adsorbate'].map(sorterIndex)
# `columns=` replaces the positional axis argument, which pandas >= 2.0 rejects.
adDf = adDf.sort_values('an_Rank').drop(columns='an_Rank')

# Numeric descriptors as floats (builtin float: the np.float alias was removed
# in NumPy 1.24), with the adsorbate name appended as the last column.
adDfFloat = adDf.iloc[:, 1:].astype(float)
adDfFloat["adsorbate"] = adDf["adsorbate"]


# Sizes are taken from the data instead of being hard-coded (9 adsorbates and
# 89 MOFs in the recorded run).
nAdsorbates = adDfFloat.shape[0]
nMOFs = dfML.shape[0]

# replicate dfML once per adsorbate: MOF block, MOF block, ...
dfMLReplicate = pd.concat([dfML] * nAdsorbates).reset_index(drop=True)

# replicate each adsorbate row once per MOF: ads1 x nMOFs, ads2 x nMOFs, ...
adDfReplicate = pd.DataFrame(
    np.repeat(adDfFloat.values, nMOFs, axis=0),
    columns=adDfFloat.columns,
)

# concatenate the two descriptor tables side by side
XAllDescriptor = pd.concat([dfMLReplicate, adDfReplicate], axis=1)

# NOTE(review): rows are paired positionally with flexibilityData; this assumes
# dfML lists the MOFs in the same order as the adsorption files -- confirm.
assert len(XAllDescriptor) == len(flexibilityData), \
    "descriptor rows ({}) and adsorption rows ({}) differ".format(
        len(XAllDescriptor), len(flexibilityData))

# Features: every descriptor except the first column (MOF name) and the last
# column (adsorbate name), plus the rigid value from flexibilityData column 0.
# Builtin float replaces np.float, which was removed in NumPy 1.24.
X = np.concatenate(
    (XAllDescriptor.iloc[:, 1:-1], flexibilityData[:, 0].reshape(-1, 1)),
    axis=1,
).astype(float)
# Target: flexibilityData column 1 (mean of the flexible values) as a column vector.
y = flexibilityData[:, 1].astype('float64').reshape(-1, 1)


# feature scaling
# NOTE(review): mean and std are computed over ALL rows, i.e. before the
# validation split made later in the notebook -- confirm this is acceptable.
X_scaled = (X - X.mean(axis=0)) / X.std(axis=0)

The number of MOFs shared by two datasets are: 89.


# data set split
## <span style="color:red">Validation set split (change)</span>

In [3]:
# train_test_split is called without random_state, so it draws from NumPy's
# global RNG; seeding it here fixes the split.
np.random.seed(5)
from sklearn.model_selection import train_test_split

# combine the unscaled and scaled X, so that they can be split together
X_combined = np.concatenate((X, X_scaled), axis=1)

# The first n_features columns of the combined array are the unscaled features
# and the rest are the scaled ones (35 each in the recorded run).
n_features = X.shape[1]

# ---------------------------- don't touch the validation set ----------------------------
X_train_test_combined, X_validation_combined, y_train_test, y_validation = train_test_split(
    X_combined, y, test_size=0.25
)
X_train_test = X_train_test_combined[:, :n_features]
X_train_test_scaled = X_train_test_combined[:, n_features:]
X_validation = X_validation_combined[:, :n_features]
X_validation_scaled = X_validation_combined[:, n_features:]
# ---------------------------- don't touch the validation set ----------------------------

## 5-fold on train-test set <span style="color:red">(no change, but we are going to use kf defined here)</span>

In [5]:
from sklearn.model_selection import KFold

kf = KFold(n_splits=5, shuffle = True, random_state=20)

# NOTE: kf.split returns a one-shot generator; it is exhausted by the loop below.
cv_splitter = kf.split(X_train_test_combined)

# Collect the per-fold arrays, then stack them once along a new trailing axis.
X_train_folds, X_test_folds = [], []
y_train_folds, y_test_folds = [], []

for i, (train_index, test_index) in enumerate(cv_splitter):
    X_train_folds.append(X_train_test_combined[train_index])
    X_test_folds.append(X_train_test_combined[test_index])
    y_train_folds.append(y_train_test[train_index])
    y_test_folds.append(y_train_test[test_index])

# Resulting shape: (samples in fold, columns, number of folds). np.stack needs
# every fold to have the same size and raises ValueError otherwise; the number
# of folds comes from kf rather than a hard-coded 5.
X_train_combined_5fold = np.stack(X_train_folds, axis=-1)
X_test_combined_5fold = np.stack(X_test_folds, axis=-1)
y_train_5fold = np.stack(y_train_folds, axis=-1)
y_test_5fold = np.stack(y_test_folds, axis=-1)

# Split the combined columns back into unscaled / scaled halves. The boundary
# is the number of unscaled features (35 in the recorded run).
n_raw_features = X_train_test.shape[1]
X_train_5fold = X_train_combined_5fold[:, :n_raw_features]
X_train_scaled_5fold = X_train_combined_5fold[:, n_raw_features:]
X_test_5fold = X_test_combined_5fold[:, :n_raw_features]
X_test_scaled_5fold = X_test_combined_5fold[:, n_raw_features:]

# Show how to use 5-fold sets with GridSearchCV

<span style="color:red">Data used in GridSearchCV: `X_train_test`, `y_train_test`. They are the whole dataset excluding the validation set. </span>
Note their dimensions:

In [7]:
# Show the dimensions of the train/test features and targets, one per line.
print(X_train_test.shape, y_train_test.shape, sep="\n")

(600, 35)
(600, 1)


To use scaled data, simply replace `X_train_test` with `X_train_test_scaled`.

<span style="color:red">cv method for GridSearchCV: `cv=kf`</span>

In [None]:
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV

# Despite its name, this value is passed as the `activation` of the MLP below.
loss_fun = 'logistic'

# Candidate hidden-layer sizes. `layer_list` has to be defined before it is
# printed; previously it was referenced without ever being assigned.
layer_list = [1] + list(range(10, 110, 10))
print("Initial search space: {}".format(layer_list))
layer_dict = {
    'hidden_layer_sizes': layer_list
}

# NOTE(review): the regressor has no random_state, so weight initialisation is
# not pinned; with n_jobs=-1 the fits presumably run in worker processes, so
# the scores may vary between runs -- confirm whether that is acceptable.
model_NN_1 = MLPRegressor(max_iter=200, activation=loss_fun)

# cv=kf reuses the KFold splitter defined earlier in the notebook.
model_NN_1_search = GridSearchCV(model_NN_1, layer_dict, n_jobs=-1, cv=kf)

# y_train_test is a column vector; ravel() passes the 1-D target scikit-learn
# expects and avoids a DataConversionWarning on every fit.
model_NN_1_search.fit(X_train_test, y_train_test.ravel())



model_NN_1_best = model_NN_1_search.best_estimator_
print("Best model score: {}".format(model_NN_1_search.best_score_))