In [None]:
############################################################################
#
#  Welcome to the first MAST-ML tutorial notebook! In this notebook, we will
#  perform a first, basic run where we: 
#       1. Import example data of Boston housing prices
#       2. Define a data preprocessor to normalize the data
#       3. Define a linear regression model and kernel ridge model to fit the data
#       4. Evaluate each of our models with 5-fold cross validation
#
############################################################################

In [None]:
# If you are working on Google Colab and need to install MAST-ML, 
# begin by cloning the relevant branch of MAST-ML to the Colab session
# and install the needed dependencies:

!git clone --single-branch --branch dev_Ryan_2020-12-21 https://github.com/uw-cmg/MAST-ML
!pip install -r MAST-ML/requirements.txt

In [None]:
# Mount your Google Drive inside the Colab session so that MAST-ML results can
# be saved to it. Anything saved only to the Colab session itself is deleted
# when the session ends.
from google.colab import drive

drive.mount(
    '/content/drive',
    force_remount=True,
)

In [None]:
# Python needs the MAST-ML folder on sys.path in order to find the mastml
# modules. The entry is only added when it is missing, so re-running this cell
# does not pile up duplicate sys.path entries.
import sys

if 'MAST-ML' not in sys.path:
    sys.path.append('MAST-ML')

In [1]:
# Here we import the MAST-ML modules used in this tutorial
from mastml.mastml import Mastml
from mastml.datasets import SklearnDatasets
from mastml.preprocessing import SklearnPreprocessor
from mastml.models import SklearnModel
from mastml.data_splitters import SklearnDataSplitter

Using TensorFlow backend.


In [2]:
# Name of the folder that the MAST-ML results will be saved to.
SAVEPATH = 'drive/MyDrive/MASTML_results_GettingStarted_1'

# Initializing the MAST-ML run creates a new folder with the name given by
# SAVEPATH. All of the output for the current MAST-ML run is saved there.
# The same folder name can be used for multiple runs: the current datetime is
# appended to the name, so no data is lost or overwritten.
mastml = Mastml(
    savepath=SAVEPATH,
)

# Keep the savepath of this run; it is passed to the data splitter later on.
savepath = mastml.get_savepath

drive/MyDrive/MASTML_results_GettingStarted_1 not empty. Renaming...


In [3]:
# Load the data for this tutorial with the SklearnDatasets module: the model
# dataset of Boston housing prices, split into the X data and the y data.
# NOTE(review): this presumably wraps scikit-learn's load_boston, which was
# removed in scikit-learn 1.2 -- confirm the installed version still provides it.
X, y = (
    SklearnDatasets(as_frame=True)
    .load_boston()
)

In [4]:
# Let's quickly examine the X data (the feature columns) so we know what we're
# dealing with. The notebook shows a truncated view of the rows:
X

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48


In [5]:
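# We have 506 data points (Boston-area neighborhoods, not individual houses) and 13 columns (features).
# The features aren't the most straightforward to discern at first glance, but they include data such as
# the per-capita crime rate (CRIM), the average number of rooms (RM), the age of the housing (AGE), zoning (ZN), etc.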

In [6]:
# From looking at the y-data, we see that the target data (the MEDV column) are home prices, in
# thousands of dollars. This data is from a while ago, which is why the prices are so low by
# today's standards.
y

Unnamed: 0,MEDV
0,24.0
1,21.6
2,34.7
3,33.4
4,36.2
...,...
501,22.4
502,20.6
503,23.9
504,22.0


In [7]:
# The data set already comes with 13 features, so for the purposes of this
# example there is no need to generate any more. We do, however, want to
# normalize the data for improved model fitting. The preprocessor defined here
# is the basic StandardScaler from scikit-learn, which normalizes each column
# to have a mean of zero and a standard deviation of one.
preprocessor = SklearnPreprocessor(
    preprocessor='StandardScaler',
    as_frame=True,
)

In [12]:
# Next, decide which types of models to fit to the Boston housing data. Two
# models are defined here: a basic linear regression model and a kernel ridge
# model with a Gaussian kernel. The "model" field takes a string matching the
# scikit-learn model name, and the remaining arguments are the parameters to
# pass to that model. Default values are used if no parameters are given.
model1 = SklearnModel(
    model='LinearRegression',
)
model2 = SklearnModel(
    model='KernelRidge',
    kernel='rbf',
)

# MAST-ML takes the models as a list:
models = [
    model1,
    model2,
]


In [13]:
# Evaluating the chosen features and models on particular subsets of the data
# is the heart of any MAST-ML run. This basic example runs a random leave-out
# cross validation test: 5-fold CV with a single repeat. MAST-ML outputs data
# and plots for each split, plus some more comprehensive analysis performed
# over all splits. The saved model and preprocessor corresponding to the best
# split are also put in the split directory, and can be imported for use in
# future predictions.
splitter = SklearnDataSplitter(
    splitter='RepeatedKFold',
    n_repeats=1,
    n_splits=5,
)

# Fit and evaluate every model in the list on the splits, saving the output
# under the savepath of this run.
splitter.evaluate(
    X=X,
    y=y,
    models=models,
    preprocessor=preprocessor,
    savepath=savepath,
    verbosity=3,
)

In [None]:
# Now that the MAST-ML run has completed, you can check the contents of the output folder.
# You'll see that two new folders were made inside of it. They are:
#     1.) LinearRegression_RepeatedKFold_SklearnPreprocessor
#     2.) KernelRidge_RepeatedKFold_SklearnPreprocessor
#
# At the top level of each of these folders, you will see a number of .xlsx data files and analysis
# image files. Examine the data plot named 'parity_plot_test.png' for each model
# folder. This plot displays the true vs predicted values for all test data. Which model performed better 
# for this test?
#
# Since MAST-ML is now run in a notebook environment, you can go back and add new models, preprocessors, or even
# add new features to your data and re-run the data splitter defined above. As with the main save directory,
# the data splitter directory names will have the current datetime appended to their base names to prevent data loss.
#
# Congratulations! You've completed your first MAST-ML run. This is just the first in a series of tutorial notebooks.
# There are many other notebooks to explore that will guide you through different aspects of what MAST-ML can offer,
# including the different types of output plots and data files, how to import your own data or download data from
# commonly used online materials databases, fitting different types of models, more complex fits that involve tuning
# hyperparameters, generating and selecting features, as well as in-depth model uncertainty quantification.
#
# The next example in this notebook series is titled MASTML_DataImport_2.ipynb, and will guide you through the process
# of importing data from a local source and of downloading and importing data from an online database.