In [None]:
#############################################################################################################
#
#  Welcome to the fifth MAST-ML tutorial notebook, MASTML_Tutorial_5_NestedCV_and_OptimizedModels.ipynb! 
#  In this notebook, we will perform more advanced model fitting routines, including nested cross validation
#  and hyperparameter optimization. In this tutorial, we will learn how to use MAST-ML to:
#       1. Assess performance on manually left-out test data
#       2. Perform nested cross validation to assess model performance on unseen data
#       3. Optimize the hyperparameters of our models to create the best model
#
#############################################################################################################

In [None]:
#####################################
#
# Task 0: Setting up MAST-ML in Colab
#
#####################################

In [None]:
# If you are working on Google Colab and need to install MAST-ML, 
# begin by cloning the relevant branch of MAST-ML to the Colab session
# and install the needed dependencies:

!git clone --single-branch --branch dev_Ryan_2020-12-21 https://github.com/uw-cmg/MAST-ML
!pip install -r MAST-ML/requirements.txt

In [None]:
# Mount Google Drive inside the Colab session so that MAST-ML results can be
# written to Drive. Anything saved only to the Colab session itself is deleted
# when the session ends.
from google.colab import drive

# force_remount=True re-mounts the drive even if it is already mounted
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point, force_remount=True)

In [None]:
# We need to add the MAST-ML folder to our sys path so that python can find the modules.
# The membership check keeps this cell idempotent: re-running it does not pile up
# duplicate 'MAST-ML' entries in sys.path.
import sys

mastml_repo_dir = 'MAST-ML'
if mastml_repo_dir not in sys.path:
    sys.path.append(mastml_repo_dir)

In [1]:
# Standard-library and third-party imports
import os

import numpy as np

# Here we import the MAST-ML modules used in this tutorial
from mastml.mastml import Mastml
from mastml.datasets import LocalDatasets
from mastml.models import SklearnModel
from mastml.preprocessing import SklearnPreprocessor
from mastml.data_splitters import SklearnDataSplitter, NoSplit
from mastml.hyper_opt import GridSearch, RandomizedSearch, BayesianSearch

Using TensorFlow backend.


In [2]:
# Set the name of the savepath to save MAST-ML results to.
# This is a relative path, so it is resolved against the current working directory.
# NOTE(review): for the results to land in the mounted Google Drive, the working
# directory presumably needs to be the parent of the "drive" mount folder -- confirm.
SAVEPATH = 'drive/MyDrive/MASTML_tutorial_5_NestedCV_and_OptimizedModels'

# Initialize the MAST-ML run, write savepath
mastml = Mastml(savepath=SAVEPATH)
# get_savepath is read as an attribute (no call parentheses); the value is kept in
# "savepath" and handed to every evaluate() call later in this notebook.
# NOTE(review): presumably a property returning the (possibly renamed) output folder -- confirm.
savepath = mastml.get_savepath

# When the above command is run, a new folder with the name designated SAVEPATH is created.
# This is where all of the output for the current MAST-ML run will be saved to.
# Note that you can perform multiple runs with the same folder name, and the current datetime
# will be appended to the name so that no data is lost or overwritten (this is what the
# "not empty. Renaming..." message printed by this cell refers to).

drive/MyDrive/MASTML_tutorial_5_NestedCV_and_OptimizedModels not empty. Renaming...


In [None]:
################################################################
#
# Task 1: Assess performance on manually left-out test data
#
################################################################

In [None]:
# In this tutorial, we will see there are two ways to evaluate models in
# a "two-level" fashion, i.e. where we split data into train/leftout data, 
# then build and evaluate models with a cross validation data splitter on
# the training set to build various sub train/test splits. This is also
# commonly called nested cross validation. The leftout data is never used 
# in training at any point, so can function as a good approximation for
# how the model may perform on new, unseen data.
#
# MAST-ML offers two methods to do this. The first is for the user to specify 
# specific data points in the imported data that should be reserved as left-out
# data. The second method is to perform an automatic nested cross-validation scheme.
# Here, we will showcase the first method where some data points are manually
# selected to function as left-out data.

In [3]:
# In this tutorial, we will again use the diffusion dataset that we examined in the
# previous tutorials. Here, we use the LocalDatasets module to load in the diffusion dataset.
# We are using the diffusion_data_leaveoutPt.xlsx file as it contains a column denoting
# the data to leave out (those with Pt as the host element).

# To specify which data points are left out, we made a new column in the data file (it can
# have whatever name you want), and then label 0 if the data is not left out data and 1
# if the data is to be used as left-out data. For this example we are using all data where
# Pt is the host element as left out data.

# Need to denote the column name of the target (y-data)
target = 'E_regression'

# There are columns in the data file not used as features or target. We need to
# list them here in the parameter extra_columns
extra_columns = ['Material compositions 1', 'Material compositions 2', 'is_testdata']

# Locate the data file. The first candidate is the location relative to a local checkout
# (notebook run from a sibling folder of mastml/); the second is the location inside the
# MAST-ML folder cloned in the Colab setup cell above. The first candidate that exists is
# used; if neither exists, the original relative path is passed through unchanged.
data_file_candidates = ['../mastml/data/diffusion_data_leaveoutPt.xlsx',
                        'MAST-ML/mastml/data/diffusion_data_leaveoutPt.xlsx']
data_file_path = next((candidate for candidate in data_file_candidates if os.path.exists(candidate)),
                      data_file_candidates[0])

# Here, we make an instance of our LocalDatasets class. It needs a few parameters:
#   file_path: where the data is stored
#   target: the column name of the y-data
#   extra_columns: list containing extra columns in the data file not used for fitting
#   group_column: column name denoting group labels (only used for LeaveOutGroup CV)
#   testdata_columns: column names denoting left-out data to evaluate using best
#     model from CV tests. This is the manual way to leave out data. In our current case,
#     the column called "is_testdata" denotes which points will be left out
#   as_frame: whether to return data as dataframe. Want this to be true.
d = LocalDatasets(file_path=data_file_path,
                  target=target,
                  extra_columns=extra_columns,
                  group_column='Material compositions 1',
                  testdata_columns=['is_testdata'],
                  as_frame=True)

# Load the data with the load_data() method
data_dict = d.load_data()

# Let's assign each data object to its respective name
X = data_dict['X']
y = data_dict['y']
X_extra = data_dict['X_extra']
groups = data_dict['groups']
X_testdata = data_dict['X_testdata']



In [None]:
# We have one left out data set in this example, so X_testdata is a list with an array containing
# the indices of the left out data, which will be automatically used in our data splitter. We
# see there are 30 left out data points, which correspond to the number of points where Pt is the
# host element.
# The bare expression below is the last line of the cell, so its value (the shape of the
# first, and here only, left-out index array) is displayed as the cell output.
X_testdata[0].shape

In [4]:
# We are going to primarily be evaluating Kernel ridge models in this tutorial. Let's build
# and run a Kernel ridge model using random 5-fold cross validation, where we also specify
# our X_testdata in the splitter evaluate() method (as the leaveout_inds parameter),
# so that it knows to predict the values of the left out data.

# Kernel ridge model with an rbf kernel
model_krr = SklearnModel(model='KernelRidge', kernel='rbf')
# Feature preprocessing with the StandardScaler, returning dataframes
preprocessor = SklearnPreprocessor(preprocessor='StandardScaler', as_frame=True)
# Metrics to report for each evaluation
metrics = ['r2_score', 'mean_absolute_error', 'root_mean_squared_error', 'rmse_over_stdev']
# One repeat of random 5-fold CV. random_state fixes the random fold assignment so that
# re-running the notebook reproduces the same splits (and therefore the same scores).
# NOTE(review): random_state is passed alongside n_repeats/n_splits on the assumption that
# these keywords are forwarded to scikit-learn's RepeatedKFold -- confirm.
splitter = SklearnDataSplitter(splitter='RepeatedKFold', n_repeats=1, n_splits=5, random_state=42)
splitter.evaluate(X=X,
                  y=y,
                  models=[model_krr],
                  preprocessor=preprocessor,
                  metrics=metrics,
                  plots=['Scatter', 'Histogram'],
                  savepath=savepath,
                  X_extra=X_extra,
                  leaveout_inds=X_testdata,
                  verbosity=3)

# When we examine our output directory, we see that now there is a new "split_outer_0"
# directory, and within that directory reside the individual split directories we have
# seen from previous tutorials. There is one split outer directory because there was one
# set of left-out data. For this split_outer_0 directory, the Pt host data are left out, and
# random 5-fold CV is performed on the remaining data to assess model performance.
#
# How did the model do with predicting the Pt data? In the original (unseeded) run, the model
# got an RMSE of 0.282 eV on Pt, while the 5-fold RMSE on the remaining data was 0.153 eV
# (your exact values may differ). The error on the Pt data was higher than the 5-fold CV
# error because random 5-fold CV is an overly optimistic predictor of model performance
# when predicting the values of unseen data from a new group!

In [None]:
#############################################################################################
#
# Task 2: Perform nested cross validation to assess model performance on unseen data
#
#############################################################################################

In [9]:
# Instead of just manually specifying which data should be left-out data, we can have MAST-ML
# do this automatically via nested cross validation. Any data splitter can be used to perform
# nested cross validation. Here, we will perform nested CV with our kernel ridge method using
# both 5-fold and leave out group cross validation. Then, we can compare our previous result
# of predicting on Pt to the left-out data RMSE values from the 5-fold and leave out group
# tests.

# This time, we are not specifying the leaveout_inds parameter. To use nested CV,
# we set nested_CV=True in the evaluate method. The data splitter method is repeated
# for each nesting level, so here we are doing 1 cycle of 5-fold CV. That means that there
# will be 5 level 1 splits, and 5 level 2 splits for each of the level 1 splits, making for
# a total of 5*5 = 25 splits.
# random_state fixes the random fold assignment so that re-running the notebook reproduces
# the same splits.
# NOTE(review): random_state is passed alongside n_repeats/n_splits on the assumption that
# these keywords are forwarded to scikit-learn's RepeatedKFold -- confirm.
splitter = SklearnDataSplitter(splitter='RepeatedKFold', n_repeats=1, n_splits=5, random_state=42)
splitter.evaluate(X=X,
                  y=y,
                  models=[model_krr],
                  preprocessor=preprocessor,
                  metrics=metrics,
                  plots=['Scatter', 'Histogram'],
                  savepath=savepath,
                  X_extra=X_extra,
                  nested_CV=True,
                  verbosity=3)

# For leave out group, recall we need to set the groups parameter in the evaluate method.
# When doing nested CV with leave out group, the total number of splits will be n_groups*n_groups,
# so in our case that is 125 splits (be patient- this run will take about 10 minutes or so).
# NOTE(review): 125 is not a perfect square, so it cannot be exactly n_groups*n_groups --
# confirm the actual number of splits for this dataset.
splitter = SklearnDataSplitter(splitter='LeaveOneGroupOut')
splitter.evaluate(X=X,
                  y=y,
                  groups=groups,
                  models=[model_krr],
                  preprocessor=preprocessor,
                  metrics=metrics,
                  plots=['Scatter', 'Histogram'],
                  savepath=savepath,
                  X_extra=X_extra,
                  nested_CV=True,
                  verbosity=3)

# In the original (unseeded) run, the leave out Pt RMSE was 0.282 eV, and the nested CV left
# out data RMSEs were 0.170 eV for 5-fold and 0.200 eV for leave out group (your exact values
# may differ). It's evident that the leave out group test is more closely aligned with the
# scale of error one should expect when predicting on a new group, and the prediction of Pt
# seems to be a rather difficult one, considering its error is even higher than the nested
# leave out group score.

In [None]:
#####################################################################################
#
# Task 3: Optimize the hyperparameters of our models to create the best model
#
#####################################################################################

In [None]:
# Let's go back to our dataset from the first task where we leave out Pt and
# include hyperparameter optimization for our kernel ridge model. To keep the runs reasonably fast,
# we are only going to optimize the alpha parameter in Kernel ridge, which is the regularization strength.
# This optimization will be done for each train/test split in the random
# 5-fold cross validation stage. Then, the best model will be selected and used to predict the
# left out Pt data.

# The parameters for hyperparameter optimization routines have a defined structure. The parameter
# names need to be separated by semicolons (see the commented part below for designating alpha, gamma,
# and kernel type). The parameter values are also delimited by semicolons (one set per parameter).
# For designating the grid of values to explore, the first number is the lower bound, the second
# number is the upper bound, the third number is grid density, and the fourth value is "lin" or
# "log" to denote linear or logarithmic scale, respectively. The final value is the data type of the
# parameter.
# NOTE(review): with "log" the bounds -5 and 5 are presumably exponents rather than raw alpha
# values -- confirm against the mastml.hyper_opt documentation.

hyperopt = GridSearch(param_names='alpha',
                     param_values='-5 5 100 log float',
                     scoring='root_mean_squared_error')

# Same seeded 5-fold splitter settings as in Task 1, so that the optimized and non-optimized
# models are evaluated on the same random folds and the run is reproducible.
# NOTE(review): random_state is passed alongside n_repeats/n_splits on the assumption that
# these keywords are forwarded to scikit-learn's RepeatedKFold -- confirm.
splitter = SklearnDataSplitter(splitter='RepeatedKFold', n_repeats=1, n_splits=5, random_state=42)
splitter.evaluate(X=X,
                  y=y,
                  models=[model_krr],
                  preprocessor=preprocessor,
                  metrics=metrics,
                  hyperopts=[hyperopt],
                  plots=['Scatter', 'Histogram'],
                  savepath=savepath,
                  X_extra=X_extra,
                  leaveout_inds=X_testdata,
                  verbosity=3)

# Below is an example of optimizing three parameters in the Kernel ridge model. We don't run it here
# because it will take a long time
# hyperopt = GridSearch(param_names='alpha ; gamma ; kernel',
#                     param_values='-5 5 100 log float ; -5 5 100 log float ; linear rbf sigmoid str',
#                     scoring='root_mean_squared_error')

# How does this model perform on the Pt data compared to the one we ran above? In the original
# (unseeded) run, the non-optimized Kernel ridge model gave an RMSE of 0.282 eV. After optimizing
# just the alpha parameter, the RMSE on the Pt data dropped to just 0.169 eV (your exact values
# may differ).

In [None]:
# With everything we've covered here, you could combine nested CV with the hyperparameter optimization
# to obtain a more realistic estimate of how an optimized model will perform on unseen data.

In [None]:
# You've now completed your fifth MAST-ML tutorial notebook! Now that you're more familiar with performing detailed
# model evaluation with nested cross validation and creating optimized models, it is time to move to one of the final
# tutorials in this series, and a subject of great importance when evaluating the efficacy of ML models: predictions
# of model errors, domains, and uncertainty quantification (UQ).
#
# The next example in this notebook series is titled MASTML_Tutorial_6_ErrorAnalysis_UncertaintyQuantification.ipynb, 
# and will guide you through the process of assessing the true and predicted errors of some models on a select dataset
# and detail methods to recalibrate the model uncertainty estimates to more accurately reflect the true model errors.