In [None]:
# Begin by cloning the development branch of MAST-ML to the Colab session
!git clone --single-branch --branch dev_Ryan_2020-12-21 https://github.com/uw-cmg/MAST-ML

In [None]:
# Next, we install the required dependencies of MAST-ML to our Colab session
!pip install -r MAST-ML/requirements.txt

In [1]:
# Sync your Google drive to Colab so that we can save MAST-ML results to our Google
# Drive. If we save to the Colab session, the data will be deleted when the session 
# ends.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

ModuleNotFoundError: No module named 'google'

In [2]:
# We need to add the MAST-ML folder to our sys path so that python can find the modules
import sys
sys.path.append('MAST-ML')

In [3]:
# Here we import the MAST-ML modules used in this tutorial
from mastml.mastml import Mastml
from mastml.datasets import LocalDatasets#, DataCleaning
from mastml.data_cleaning import DataCleaning
from mastml.preprocessing import SklearnPreprocessor
from mastml.models import SklearnModel
from mastml.data_splitters import SklearnDataSplitter, NoSplit
from mastml.feature_selectors import EnsembleModelFeatureSelector, NoSelect

Figshare is an optional dependency. To import data from figshare, manually install figshare via git clone of git clone https://github.com/cognoma/figshare.git
forestci is an optional dependency. To install latest forestci compatabilty with scikit-learn>=0.24, run pip install git+git://github.com/scikit-learn-contrib/forest-confidence-interval.git
XGBoost is an optional dependency. If you want to use XGBoost models, please manually install xgboost package with pip install xgboost. If have error with finding libxgboost.dylib library, dobrew install libomp. If do not have brew on your system, first do ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)" from the Terminal
Keras is an optional dependency. To use keras, do pip install keras tensorflow
matminer and pymatgen are optional dependencies. To use data splitter methods invoking these packages,do pip install matminer pymatgen


In [4]:
# Set the name of the savepath to save MAST-ML results to
SAVEPATH = 'drive/MyDrive/MASTML_results_2021-03-01'

# Initialize the MAST-ML run, write savepath
mastml = Mastml(savepath=SAVEPATH)
savepath = mastml.get_savepath

drive/MyDrive/MASTML_results_2021-03-01 not empty. Renaming...


In [5]:
# Here, we use the LocalDatasets module to load in a dataset. We are using the
# diffusion dataset for this example

# Need to denote the column name of the target (y-data)
target = 'E_regression.1'

# There are columns in the data file not used as features or target. We need to
# list them here in the parameter extra_columns
extra_columns = ['Material compositions 1', 'Material compositions 2', 'Hop activation barrier', 'E_regression']

# Here, we make an instance of our LocalDatasets class. It needs a few parameters:
#   file_path: where the data is stored
#   target: the column name of the y-data
#   extra_columns: list containing extra columns in the data file not used for fitting
#   group_column: column name denoting group labels (only used for LeaveOutGroup CV)
#   testdata_columns: column names denoting left-out data to evaluate using best
#     model from CV tests. This is manual way to leave out data. Can also be done
#     automatically using nested CV (will see below)
#   as_frame: whether to return data as dataframe. Want this to be true.
d = LocalDatasets(file_path='mastml/data/figshare_7418492/All_Model_Data.xlsx', 
                  target=target, 
                  extra_columns=extra_columns, 
                  group_column='Material compositions 1',
                  testdata_columns=None,
                  as_frame=True)

# Load the data with the load_data() method
data_dict = d.load_data()



In [6]:
# Let's look at the contents of the loaded data_dict
data_dict.keys()

# We see there are 5 keys:
#   X: the X feature matrix (used to fit the ML model)
#   y: the y target data vector (true values)
#   X_extra: matrix of extra information not used in fitting (i.e. not part of X or y)
#   groups: vector of group labels (here, a list of host elements)
#   X_testdata: matrix or vector of left out data. Empty for our current example.

dict_keys(['X', 'y', 'groups', 'X_extra', 'X_testdata'])

In [7]:
# Let's assign each data object to its respective name

X = data_dict['X']
y = data_dict['y']
X_extra = data_dict['X_extra']
groups = data_dict['groups']
X_testdata = data_dict['X_testdata']

In [8]:
# Let's have a look at the X feature matrix. It contains 287 elemental features.
# There are a total of 408 diffusion activation barriers used for fitting
X

Unnamed: 0,valence_composition_average,phi_composition_average,NdValence_composition_average,MiracleRadius_composition_average,GSestFCClatcnt_composition_average,SecondIonizationEnergy_composition_average,IonizationEnergy_composition_average,n_ws^third_composition_average,ThermalConductivity_composition_average,CovalentRadius_composition_average,...,Site2_MendeleevNumber,Site2_ElasticModulus,Site2_Electronegativity,Site2_AtomicWeight,Site2_HeatFusion,Site2_SpecificHeatCapacity,Site2_AtomicRadii,Site2_ThermalExpansionCoefficient,Site2_BCCefflatcnt,Site2_AtomicVolume
0,2.0,4.350,10.0,144.0,4.027313,21.4900,731.00,1.360,429.00,145.0,...,65,80.0,1.93,107.868200,11.30,0.235,1.444,18.9,6.375951,17.075648
1,2.5,4.725,8.5,134.5,3.737485,19.2750,744.50,1.555,264.50,135.5,...,58,208.0,1.88,58.933195,16.20,0.421,1.253,13.0,5.507318,10.995861
2,4.0,4.500,7.5,137.0,3.788936,18.9950,691.90,1.545,261.35,142.0,...,49,259.0,1.66,51.996100,21.00,0.449,1.249,4.9,5.632801,12.092937
3,2.0,4.400,10.0,135.0,3.782567,20.8910,738.20,1.415,415.00,138.5,...,64,124.0,1.90,63.546000,13.60,0.385,1.278,16.5,5.617632,11.829942
4,4.0,4.640,8.0,134.5,3.764269,18.8350,745.15,1.565,254.60,138.5,...,55,211.0,1.83,55.845000,13.81,0.449,1.241,11.8,5.557847,11.777365
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
403,4.5,3.750,3.0,150.5,4.350534,13.7250,662.00,1.525,38.20,169.5,...,47,104.0,1.60,92.906380,30.00,0.265,1.429,7.3,6.625794,18.002133
404,4.5,3.750,2.5,151.5,4.348236,6.5650,710.50,1.520,40.10,172.5,...,48,183.0,1.50,180.947880,36.57,0.140,1.430,6.3,6.618497,18.046730
405,4.0,3.625,2.0,150.0,4.291886,13.3550,659.00,1.465,22.30,167.5,...,43,110.0,1.54,47.867000,14.15,0.523,1.448,8.6,6.458833,17.636317
406,4.0,3.525,2.0,158.0,4.494239,14.0275,651.00,1.430,22.85,175.0,...,45,139.0,1.30,178.490000,27.20,0.140,1.564,5.9,7.046762,22.268711


In [9]:
# Let's look at the groups list. The groups denote the element identity of the host 
# metal
groups

0      Ag
1      Ag
2      Ag
3      Ag
4      Ag
       ..
403    Zr
404    Zr
405    Zr
406    Zr
407    Zr
Name: Material compositions 1, Length: 408, dtype: object

In [10]:
# Let's examine the extra data not used in fitting. Note that the y-data we are 
# fitting to are equal to the E_regression values listed here, minus the E_regression
# value of the pure material. Material compositions 1 are the host elements, which
# are the same values used in the groups list. Material compositions 2 are the solute
# elements (i.e. the diffusing impurity in the host material)
X_extra

Unnamed: 0,Material compositions 1,Material compositions 2,Hop activation barrier,E_regression
0,Ag,Ag,-0.365590,1.824500
1,Ag,Co,-0.675263,1.734358
2,Ag,Cr,-0.047690,2.083639
3,Ag,Cu,-0.428459,1.802300
4,Ag,Fe,0.081529,2.142172
...,...,...,...,...
403,Zr,Nb,-0.478007,2.523770
404,Zr,Ta,-0.054726,2.747650
405,Zr,Ti,-0.022476,2.842630
406,Zr,Hf,-0.104201,2.801830


In [11]:
# Finally, we can look at our manually left-out data set. For this tutorial we
# have left this blank
X_testdata

In [12]:
# If the data contains missing values (this one doesn't), we can clean the data, 
# which corrects missing values and provides some basic analysis of the input data.
# Since there are no missing values the data cleaner will still output some useful
# plots and statistics of our input data

cleaner = DataCleaning()
X, y = cleaner.evaluate(X=X, 
                        y=y, 
                        method='imputation', 
                        strategy='mean', 
                        savepath=savepath)

In [13]:
# Here is where we can specify some of the core MAST-ML options, like how to 
# preprocess our data, what models to use in fitting, and how to select features.
# In addition, we would also specify how to optimize model hyperparameters, but
# we don't need to do that for this first tutorial. Lastly, we can decide which
# metrics to evaluate in our fits.

# Here, we define two models. The first is a random forest model, the second is
# a Gaussian process model. The scikit-learn model name can just be given as
# a string matching the model name in the "model" field. The remaining arguments
# are the parameters to pass to the model. If no parameters are given, default
# values are used.
model1 = SklearnModel(model='RandomForestRegressor', n_estimators=150, max_depth=30)
model2 = SklearnModel(model='GaussianProcessRegressor', kernel='ConstantKernel*RBF', n_restarts_optimizer=10)
# MAST-ML takes a list of the models as input.
models = [model1, model2]

# Here, we define our preprocessing function. We are just going to use the basic
# StandardScaler in scikit-learn to normalize each column to have mean zero and
# standard deviation of one.
preprocessor = SklearnPreprocessor(preprocessor='StandardScaler', as_frame=True)

# Next, we define a couple ways to select features. The first method, NoSelect,
# just uses all features in the X feature set. The second method, EnsembleModelFeatureSelector,
# uses a random forest model to select the top (here, 20) features based on 
# ranked random forest feature importance scores.
selector1 = NoSelect()
selector2 = EnsembleModelFeatureSelector(model=SklearnModel(model='RandomForestRegressor'), n_features_to_select=20)
selectors = [selector1, selector2]

# Finally, we list which metrics we want to evaluate. If none are given, MAST-ML
# will default to evaulating just the mean absolute error. A complete list of 
# metrics can be obtained from calling Metrics()._metric_zoo() in metrics.py.
metrics = ['r2_score', 'mean_absolute_error', 'root_mean_squared_error', 'rmse_over_stdev']


In [14]:
# As a first run, we want to define and run the case where no data split is 
# performed. This represents a full fit to all of the data. We will fit both our
# random forest and Gaussian process model, but just for the case where all
# features are used.

#splitter = NoSplit()
#splitter.evaluate(X=X,
#                  y=y, 
#                  models=models,
#                  preprocessor=preprocessor,
#                  selectors=[selector1],
#                  metrics=metrics,
#                  savepath=savepath,
#                  X_extra=X_extra,
#                  verbosity=3)

In [15]:
# Next, we want to do something a bit more informative. We will do a random 
# leave-out cross validation test (5-fold CV). MAST-ML will output data and plots
# for each split as well as some more comprehensive analysis performed over all
# splits. The saved model and preprocessor corresponding to the best split will
# also be put in the split directory, which can be imported for use in future
# predictions 

#splitter = SklearnDataSplitter(splitter='RepeatedKFold', n_repeats=1, n_splits=5)
#splitter.evaluate(X=X,
#                  y=y, 
#                  models=models,
#                  preprocessor=preprocessor,
#                  selectors=[selector2],
#                  metrics=metrics,
#                  plots=['Scatter', 'Histogram'],
#                  savepath=savepath,
#                  X_extra=X_extra,
#                  verbosity=3)

In [16]:
# Randomly leaving out data isn't the only way to assess model performance. This
# dataset has logical groupings based on which element is the host. So, we can
# do another cross validation test where this time we leave out each host once.
# Let's test just our random forest model with the leave out group CV method.

#splitter = SklearnDataSplitter(splitter='LeaveOneGroupOut')
#splitter.evaluate(X=X,
#                  y=y, 
#                  models=[model1],
#                  preprocessor=preprocessor,
#                  groups=groups,
#                  selectors=[selector2],
#                  metrics=metrics,
#                  plots=['Scatter', 'Histogram'],
#                  savepath=savepath,
#                  X_extra=X_extra,
#                  verbosity=3)

In [17]:
# Now lets use just random forest and the full feature set to do random leave out CV, 
# but let's do a nested variant so we can recalibrate the predicted random forest errors.
# This will allow us to more deeply assess the uncertainty estimates of the 
# random forest model, and see that they would benefit from a correction scheme,
# which in this case is applied following the method of Palmer et al.
#
# Note that this will take longer than previous runs since we are doing nested
# CV. Since we are doing 2 repeats of leave out 1/3 of the data (n_splits=3), and 
# nesting is turned on, means we have (2*3)^2 = 36 total splits

#splitter = SklearnDataSplitter(splitter='RepeatedKFold', n_repeats=2, n_splits=3)
#splitter.evaluate(X=X,
#                  y=y, 
#                  models=[model1],
#                  preprocessor=preprocessor,
#                  selectors=[selector1],
#                  metrics=metrics,
#                  plots=['Error'],
#                  savepath=savepath,
#                  X_extra=X_extra,
#                  nested_CV=True,
#                  error_method='stdev_weak_learners', #stdev_weak_learners, jackknife_after_bootstrap
#                  recalibrate_errors=True,
#                  verbosity=2)

In [18]:
from mastml.data_splitters import LeaveOutClusterCV
from mastml.models import SklearnModel
from sklearn.datasets import make_blobs

In [19]:
splitter = LeaveOutClusterCV("KMeans", n_clusters=15)

In [20]:
splitter.evaluate(X=X, y=y,
                  models=[model1],
                  selectors=[selector1],
                  savepath=savepath)