In [None]:
########################################################################################################################################
#
#  Welcome to the sixth MAST-ML tutorial notebook, MASTML_Tutorial_6_ErrorAnalysis_UncertaintyQuantification.ipynb! 
#  In this notebook tutorial, we will learn about how MAST-ML can be used to: 
#       1. Assess the true and predicted errors of our model, and some useful measures of their statistical distributions
#       2. Explore different methods of quantifying and calibrating model uncertainties. 
#       3. Compare the uncertainty quantification behavior of Bayesian and ensemble-based models.
#
########################################################################################################################################

In [None]:
#####################################
#
# Task 0: Setting up MAST-ML in Colab
#
#####################################

In [None]:
# If you are working on Google Colab and need to install MAST-ML, 
# begin by cloning the relevant branch of MAST-ML to the Colab session
# and install the needed dependencies:

!git clone --single-branch --branch dev_Ryan_2020-12-21 https://github.com/uw-cmg/MAST-ML
!pip install -r MAST-ML/requirements.txt

In [None]:
# Mount your Google Drive in the Colab session so that MAST-ML results can be saved
# to Drive. Anything saved only to the Colab session is deleted when the session ends.
# The Drive is mounted at '/content/drive'; force_remount=True is passed so the mount
# is requested again even if this cell has already been run in the session.
# NOTE: google.colab is only importable inside a Colab runtime; skip this cell when
# running the notebook elsewhere.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

In [None]:
# We need to add the cloned MAST-ML folder to our sys path so that python can find
# the mastml modules. The membership check keeps this cell idempotent: re-running it
# does not append duplicate 'MAST-ML' entries to sys.path.
import sys
if 'MAST-ML' not in sys.path:
    sys.path.append('MAST-ML')

In [1]:
# Here we import the MAST-ML modules used in this tutorial
import os

from mastml.mastml import Mastml
from mastml.datasets import LocalDatasets
from mastml.preprocessing import SklearnPreprocessor
from mastml.models import SklearnModel, EnsembleModel
from mastml.data_splitters import SklearnDataSplitter

Using TensorFlow backend.


In [2]:
# Set the name of the folder that MAST-ML results are saved to. The path is relative
# to the current working directory and points into the mounted Google Drive
# (assumes the working directory is /content on Colab, where the Drive is mounted
# at /content/drive -- TODO confirm when running elsewhere).
SAVEPATH = 'drive/MyDrive/MASTML_tutorial_6_ErrorAnalysis_UncertaintyQuantification'

# Initialize the MAST-ML run and keep the savepath it reports; the `savepath`
# variable is what the evaluation cells below pass to the data splitter.
# NOTE(review): get_savepath is accessed without parentheses, which assumes it is a
# property of Mastml rather than a regular method -- confirm against the installed version.
mastml = Mastml(savepath=SAVEPATH)
savepath = mastml.get_savepath

# When the above command is run, a new folder with the name designated by SAVEPATH is created.
# This is where all of the output for the current MAST-ML run will be saved.
# Note that you can perform multiple runs with the same folder name: the current datetime
# will be appended to the name so that no data is lost or overwritten.

drive/MyDrive/MASTML_results_GettingStarted_1 not empty. Renaming...


In [None]:
###############################################################################################################
#
# Task 1: Assess the true and predicted errors of our model, and some useful measures 
#         of their statistical distributions
#
###############################################################################################################

In [None]:
# In this tutorial, we will again use the diffusion dataset that we examined in the
# previous tutorial. Here, we use the LocalDatasets module to load in the diffusion dataset.

# Locate the data file. On Colab the repository was cloned into 'MAST-ML' in the
# working directory (Task 0), so that location is tried first. The second entry is the
# path relative to a local MAST-ML checkout. If neither file exists, the last candidate
# is used unchanged so that LocalDatasets reports the missing file itself.
data_path_candidates = ['MAST-ML/mastml/data/diffusion_data_selectfeatures.xlsx',
                        '../mastml/data/diffusion_data_selectfeatures.xlsx']
data_path = next((path for path in data_path_candidates if os.path.exists(path)),
                 data_path_candidates[-1])

# Need to denote the column name of the target (y-data)
target = 'E_regression'

# There are columns in the data file not used as features or target. We need to
# list them here in the parameter extra_columns
extra_columns = ['Material compositions 1', 'Material compositions 2']

# Here, we make an instance of our LocalDatasets class. It needs a few parameters:
#   file_path: where the data is stored
#   target: the column name of the y-data
#   extra_columns: list containing extra columns in the data file not used for fitting
#   group_column: column name denoting group labels (only used for LeaveOutGroup CV)
#   testdata_columns: column names denoting left-out data to evaluate using best
#     model from CV tests. This is a manual way to leave out data. It can also be done
#     automatically using nested CV (we will do this in later tutorials)
#   as_frame: whether to return data as dataframe. Want this to be true.
d = LocalDatasets(file_path=data_path,
                  target=target,
                  extra_columns=extra_columns,
                  group_column='Material compositions 1',
                  testdata_columns=None,
                  as_frame=True)

# Load the data with the load_data() method
data_dict = d.load_data()

# Let's assign each data object to its respective name
X = data_dict['X']
y = data_dict['y']
X_extra = data_dict['X_extra']
groups = data_dict['groups']
X_testdata = data_dict['X_testdata']

In [None]:
# Define the preprocessor and the models examined in this tutorial.
# The features are scaled with scikit-learn's StandardScaler (returned as a dataframe).
preprocessor = SklearnPreprocessor(preprocessor='StandardScaler', as_frame=True)

# A random forest with 150 estimators.
model_rf = SklearnModel(model='RandomForestRegressor', n_estimators=150)

# An ensemble of 150 Ridge estimators. EnsembleModel is provided by mastml.models and
# has to be imported together with SklearnModel, otherwise this line raises a NameError.
# NOTE(review): presumably EnsembleModel bags the base Ridge model -- confirm in the MAST-ML docs.
model_ens = EnsembleModel(model='Ridge', n_estimators=150)

# Collect the models in a list so they can be handed to a data splitter together.
models = [model_rf, model_ens]

In [None]:
# Scale the features with StandardScaler and fit a random forest with 150 estimators.
preprocessor = SklearnPreprocessor(preprocessor='StandardScaler', as_frame=True)
model_rf = SklearnModel(model='RandomForestRegressor', n_estimators=150)

# Metrics used to score the model on each split. This list has to be defined before
# it is passed to evaluate() below; previously `metrics` was never assigned, so the
# call raised a NameError.
# NOTE(review): metric names follow the other MAST-ML tutorials -- confirm they are
# all available in the installed MAST-ML version.
metrics = ['r2_score', 'mean_absolute_error', 'root_mean_squared_error', 'rmse_over_stdev']

# 5-fold cross validation, repeated once.
splitter = SklearnDataSplitter(splitter='RepeatedKFold', n_repeats=1, n_splits=5)

# Run the evaluation and write the results under savepath:
#   plots=['Error']: only the error-analysis plots are requested
#   nested_CV=True: run the evaluation with nested cross validation
#   error_method='stdev_weak_learners': how the model error bars are estimated
#     (presumably the standard deviation of the individual estimators' predictions --
#     confirm in the MAST-ML docs)
#   recalibrate_errors=True: also produce recalibrated error estimates
#   verbosity=3: amount of output written for the run
splitter.evaluate(X=X,
                  y=y,
                  models=[model_rf],
                  preprocessor=preprocessor,
                  metrics=metrics,
                  plots=['Error'],
                  savepath=savepath,
                  X_extra=X_extra,
                  nested_CV=True,
                  error_method='stdev_weak_learners',
                  recalibrate_errors=True,
                  verbosity=3)

In [None]:
###########################################################################################
#
# Task 2: Explore different methods of quantifying and calibrating model uncertainties. 
#
###########################################################################################

In [None]:
# Same preprocessing, model and cross validation as in Task 1; only the method used
# to estimate the model errors changes.
preprocessor = SklearnPreprocessor(preprocessor='StandardScaler', as_frame=True)
model_rf = SklearnModel(model='RandomForestRegressor', n_estimators=150)

# Metrics used to score the model on each split. Defined in this cell so that the cell
# runs on its own; without this assignment `metrics` is undefined and evaluate() is
# never reached because of a NameError.
# NOTE(review): metric names follow the other MAST-ML tutorials -- confirm they are
# all available in the installed MAST-ML version.
metrics = ['r2_score', 'mean_absolute_error', 'root_mean_squared_error', 'rmse_over_stdev']

# 5-fold cross validation, repeated once.
splitter = SklearnDataSplitter(splitter='RepeatedKFold', n_repeats=1, n_splits=5)

# Run the evaluation and write the results under savepath:
#   plots=['Error']: only the error-analysis plots are requested
#   nested_CV=True: run the evaluation with nested cross validation
#   error_method='jackknife_after_bootstrap': estimate the model error bars with the
#     jackknife-after-bootstrap method instead of 'stdev_weak_learners'
#   recalibrate_errors=True: also produce recalibrated error estimates
#   verbosity=3: amount of output written for the run
splitter.evaluate(X=X,
                  y=y,
                  models=[model_rf],
                  preprocessor=preprocessor,
                  metrics=metrics,
                  plots=['Error'],
                  savepath=savepath,
                  X_extra=X_extra,
                  nested_CV=True,
                  error_method='jackknife_after_bootstrap',
                  recalibrate_errors=True,
                  verbosity=3)

In [None]:
###############################################################################################################
#
# Task 3: Compare the uncertainty quantification behavior of Bayesian and ensemble-based models.
#
###############################################################################################################

In [None]:
# Define the preprocessor and the models to compare in this task.
preprocessor = SklearnPreprocessor(preprocessor='StandardScaler', as_frame=True)

# Ensemble-based models: a random forest and an ensemble of Ridge estimators,
# each with 150 estimators. EnsembleModel is provided by mastml.models.
model_rf = SklearnModel(model='RandomForestRegressor', n_estimators=150)
model_ens = EnsembleModel(model='Ridge', n_estimators=150)

# Bayesian model: a Gaussian process regressor with a ConstantKernel*RBF kernel and
# 10 restarts of the kernel hyperparameter optimizer.
model_gpr = SklearnModel(model='GaussianProcessRegressor', kernel='ConstantKernel*RBF', n_restarts_optimizer=10)

# Include the Gaussian process model in the list: this task compares Bayesian and
# ensemble-based models, and the list previously contained only the two ensemble models.
# TODO: this cell only builds the model list; the models still need to be passed to a
# data splitter's evaluate() call to generate the comparison.
models = [model_rf, model_ens, model_gpr]

In [None]:
# You've now completed your sixth MAST-ML tutorial notebook! You have almost reached the end of the MAST-ML
# tutorial series. The final tutorial in this series details how to upload your best trained models to 
# the DLHub database for other people to use, and use these shared models to make predictions on new data
# with only a few lines of python code.
#
# The next example in this notebook series is titled MASTML_Tutorial_7_ModelHosting_and_Predictions.ipynb.
# It will guide you through the process of sharing your favorite models on the DLHub model hosting service
# and then using these newly hosted models to make predictions on new data.