In [None]:
############################################################################################
#
#  Welcome to the second MAST-ML tutorial notebook, MASTML_Tutorial_2_DataImport.ipynb! 
#  In this notebook, we will learn different ways to download and import data
#  into a MAST-ML run.
#       1. Import model datasets from scikit-learn
#       2. Conduct different data cleaning methods 
#       3. Import and prepare a real dataset that is stored locally
#       4. Download data from various materials databases
#
############################################################################################

In [None]:
#####################################
#
# Task 0: Setting up MAST-ML in Colab
#
#####################################

In [None]:
# If you are working on Google Colab and need to install MAST-ML, 
# begin by cloning the relevant branch of MAST-ML to the Colab session
# and install the needed dependencies:

!git clone --single-branch --branch dev_Ryan_2020-12-21 https://github.com/uw-cmg/MAST-ML
!pip install -r MAST-ML/requirements.txt

In [None]:
# Sync your Google Drive to Colab so that we can save MAST-ML results to our Google
# Drive. If we save to the Colab session, the data will be deleted when the session
# ends.
# Note: this cell only works inside Google Colab, since it imports from google.colab.
from google.colab import drive
# Mount the drive at /content/drive; force_remount=True is passed so that the
# mount is redone even if this cell is run again.
drive.mount('/content/drive', force_remount=True)

In [None]:
# We need to add the MAST-ML folder to our sys path so that python can find the modules
import sys

# Only add the entry if it is not there yet, so that re-running this cell
# does not pile up duplicate 'MAST-ML' entries in sys.path
if 'MAST-ML' not in sys.path:
    sys.path.append('MAST-ML')

In [2]:
# Here we import the MAST-ML modules used in this tutorial
from mastml.mastml import Mastml
from mastml.datasets import SklearnDatasets, LocalDatasets, MatminerDatasets, FigshareDatasets, FoundryDatasets
from mastml.data_cleaning import DataCleaning

In [3]:
# Set the name of the savepath to save MAST-ML results to.
# This is a relative path, so it is resolved against the current working directory.
SAVEPATH = 'drive/MyDrive/MASTML_tutorial_2_DataImport'

# Initialize the MAST-ML run, which creates the savepath folder, and keep the
# resulting savepath so it can be handed to later steps that write output.
# NOTE(review): get_savepath is read without parentheses, so it is presumably a
# property of Mastml rather than a method -- confirm against the Mastml class.
mastml = Mastml(savepath=SAVEPATH)
savepath = mastml.get_savepath

# When the above command is run, a new folder with the name designated SAVEPATH is created.
# This is where all of the output for the current MAST-ML run will be saved to.
# Note that you can perform multiple runs with the same folder name, and the current datetime
# will be appended to the name so that no data is lost or overwritten.

drive/MyDrive/MASTML_tutorial_2_DataImport not empty. Renaming...


In [None]:
#########################################################
#
# Task 1: Import model datasets from scikit-learn
#
#########################################################

In [60]:
# Let's begin by first showing how to load some common model datasets that
# come with the scikit-learn package. As in the first tutorial, we use the SklearnDatasets module
# to load in the data. There are a few regression datasets that come with scikit-learn, such as
# the Boston housing data, the diabetes dataset, and the Friedman data set. For now,
# let's once again load in the Boston housing dataset.
#
# as_frame=True is passed so that the data comes back as pandas objects (X is displayed
# as a dataframe in the next cell).
# NOTE(review): scikit-learn removed load_boston in version 1.2. Presumably this wrapper
# relies on the scikit-learn version pinned in the MAST-ML requirements -- confirm.

X, y = SklearnDatasets(as_frame=True).load_boston()

In [61]:
# Let's quickly examine the X data so we know what we're dealing with.
# (A bare expression on the last line of a cell is rendered as the cell output.)
X

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48


In [None]:
#########################################################
#
# Task 2: Conduct different data cleaning methods 
#
#########################################################

In [62]:
# We have 506 data points (houses) and 13 columns (features). From inspecting the dataset,
# we can see there are no missing values in the dataset. However, many datasets in the real
# world aren't this neat and tidy, and may contain values that are missing. To illustrate
# how we can go about cleaning our data, let's make the first four rows of values in the AGE
# column not-a-number (NaN), which is what they will be if you import a dataset with missing values:
import numpy as np

# Assign through a single .loc call instead of chained indexing (X['AGE'][0:4] = ...).
# Chained assignment triggers pandas' SettingWithCopyWarning and, with pandas
# Copy-on-Write enabled, silently leaves X unchanged.
# X.index[:4] selects the labels of the first four rows by position.
X.loc[X.index[:4], 'AGE'] = np.nan
X

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,,4.0900,1.0,296.0,15.3,396.90,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,,4.9671,2.0,242.0,17.8,396.90,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48


In [63]:
# We can now see that the first four rows of the 'AGE' column are NaN instead
# of their original values. Let's clean the data using a few different methods
# in the DataCleaning class. First, let's copy the X data so we can more easily
# see the changes we make to each dataset
from copy import copy

# Make one separate copy per experiment. A chained assignment
# (X1row = X1col = ... = copy(X)) would bind all five names to the SAME object,
# so a change made through one name would show up under every other name too.
X1row, X1col, X2, X3, X4 = (copy(X) for _ in range(5))

In [64]:
# There are a few different ways we can clean the data:
#   (1) simply remove the data points that have missing values, or
#   (2) perform data imputation, i.e. replace the missing values with the
#       mean (or median) value of the feature.

# Let's start with option (1) and remove the problematic data points (axis=0).
# In the output below we now have 502 rows instead of the original 506:
# the four data points that had missing values in the AGE column have been removed.
X1row, y1 = DataCleaning().remove(X=X1row, y=y, axis=0)
X1row

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33
5,0.02985,0.0,2.18,0.0,0.458,6.430,58.7,6.0622,3.0,222.0,18.7,394.12,5.21
6,0.08829,12.5,7.87,0.0,0.524,6.012,66.6,5.5605,5.0,311.0,15.2,395.60,12.43
7,0.14455,12.5,7.87,0.0,0.524,6.172,96.1,5.9505,5.0,311.0,15.2,396.90,19.15
8,0.21124,12.5,7.87,0.0,0.524,5.631,100.0,6.0821,5.0,311.0,15.2,386.63,29.93
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48


In [65]:
# Note that we could have instead removed the column (feature) containing
# the missing values (axis=1), as in our case all of the missing values were
# part of a single feature.
# In the output below we still have all 506 of our original data points,
# but only 12 of the 13 features, as the AGE feature has been removed.
X1col, y1 = DataCleaning().remove(X=X1col, y=y, axis=1)
X1col

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,4.0900,1.0,296.0,15.3,396.90,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,4.9671,2.0,242.0,17.8,396.90,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,6.0622,3.0,222.0,18.7,396.90,5.33
...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,2.4786,1.0,273.0,21.0,391.99,9.67
502,0.04527,0.0,11.93,0.0,0.573,6.120,2.2875,1.0,273.0,21.0,396.90,9.08
503,0.06076,0.0,11.93,0.0,0.573,6.976,2.1675,1.0,273.0,21.0,396.90,5.64
504,0.10959,0.0,11.93,0.0,0.573,6.794,2.3889,1.0,273.0,21.0,393.45,6.48


In [66]:
# There are smarter things we can do besides simply removing the problematic
# data points. Here, let's use the imputation process to replace the missing
# values with the mean value of the feature column.
# In the output below we keep all 506 of our original data points and all 13
# of our feature columns, and the first four rows of the AGE feature now
# contain the average value for that feature.
X2, y2 = DataCleaning().imputation(X=X2, y=y, strategy='mean')
X2

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,68.621315,4.0900,1.0,296.0,15.3,396.90,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,68.621315,4.9671,2.0,242.0,17.8,396.90,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,68.621315,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,68.621315,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.200000,6.0622,3.0,222.0,18.7,396.90,5.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.100000,2.4786,1.0,273.0,21.0,391.99,9.67
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.700000,2.2875,1.0,273.0,21.0,396.90,9.08
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.000000,2.1675,1.0,273.0,21.0,396.90,5.64
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.300000,2.3889,1.0,273.0,21.0,393.45,6.48


In [67]:
# Finally, let's do imputation again but use the median instead of
# the mean value.
# In the output below we keep all 506 of our original data points and all 13
# of our feature columns, and the first four rows of the AGE feature now
# contain the median value for that feature.
X3, y3 = DataCleaning().imputation(X=X3, y=y, strategy='median')
X3

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,77.7,4.0900,1.0,296.0,15.3,396.90,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,77.7,4.9671,2.0,242.0,17.8,396.90,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,77.7,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,77.7,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48


In [68]:
# MAST-ML provides a convenient method for evaluating a data cleaning routine,
# and saving the output to a dedicated folder in your MAST-ML output directory.
# Let's do mean imputation again on our final remaining dataset (X4), this time
# passing the savepath of the current run, and examine what's contained in the
# saved output.
X4, y4 = DataCleaning().evaluate(
    X=X4,
    y=y,
    method='imputation',
    savepath=savepath,
    strategy='mean',
)

# Check your MAST-ML save directory: you'll see a new folder starting with the name "DataCleaning_..."
# In here, there are numerous Excel data files and a histogram plot. One useful visualization is
# the histogram of the target data you are trying to fit machine learning models to.
# Brief overview of what is contained in these files:
#
#   data_cleaned.xlsx: the cleaned dataset
#   data_columns_with_strings: listed column names that contain string entries. Empty for our dataset
#   data_original.xlsx: the original dataset prior to cleaning
#   data_outliers_all.xlsx: a summary of the possible outlier points for each feature column
#   data_outliers_summary.xlsx: a summary showing, for each data point, the number of feature values which may be outliers
#   histogram_target_values_statistics.xlsx: key summary statistics of the distribution of y target data
#   histogram_target_values.png: histogram plot of the y target data with some basic statistics included
#   histogram_target_values.xlsx: the raw data used to make the histogram plot of the same name

In [None]:
#########################################################################
#
# Task 3: Import and prepare a real dataset that is stored locally 
#
#########################################################################

In [None]:
# Next, we want to move away from the model datasets contained in scikit-learn and learn how to import
# a real materials science dataset. As part of MAST-ML, we have included a dataset in the mastml/data
# folder. The data is contained there as figshare_7418492/All_Model_Data.xlsx. This dataset contains
# calculated migration energies of solute elements diffusing in metal host materials.

In [70]:
# Here, we use the LocalDatasets module to load in the above mentioned diffusion dataset.
import os

# Need to denote the column name of the target (y-data)
target = 'E_regression.1'

# There are columns in the data file not used as features or target. We need to
# list them here in the parameter extra_columns
extra_columns = ['Material compositions 1', 'Material compositions 2', 'Hop activation barrier', 'E_regression']

# The location of the data file depends on where this notebook is run from:
#   - from a folder inside a local MAST-ML checkout: '../mastml/data/...'
#   - from Colab after the clone in Task 0:          'MAST-ML/mastml/data/...'
# Use the first candidate that exists. If none exists, fall back to the first
# candidate so that the resulting error still names the path that was tried.
data_file_candidates = ['../mastml/data/figshare_7418492/All_Model_Data.xlsx',
                        'MAST-ML/mastml/data/figshare_7418492/All_Model_Data.xlsx']
file_path = next((path for path in data_file_candidates if os.path.exists(path)),
                 data_file_candidates[0])

# Here, we make an instance of our LocalDatasets class. It needs a few parameters:
#   file_path: where the data is stored
#   target: the column name of the y-data
#   extra_columns: list containing extra columns in the data file not used for fitting
#   group_column: column name denoting group labels (only used for LeaveOutGroup CV)
#   testdata_columns: column names denoting left-out data to evaluate using best
#     model from CV tests. This is a manual way to leave out data. Can also be done
#     automatically using nested CV (we will do this in later tutorials)
#   as_frame: whether to return data as dataframe. Want this to be true.
d = LocalDatasets(file_path=file_path,
                  target=target,
                  extra_columns=extra_columns,
                  group_column='Material compositions 1',
                  testdata_columns=None,
                  as_frame=True)

# Load the data with the load_data() method
data_dict = d.load_data()



In [71]:
# Let's look at the contents of the loaded data_dict by listing its keys
data_dict.keys()

# We see there are 5 keys (the output may list them in a different order):
#   X: the X feature matrix (used to fit the ML model)
#   y: the y target data vector (true values)
#   X_extra: matrix of extra information not used in fitting (i.e. not part of X or y)
#   groups: vector of group labels (here, a list of host elements)
#   X_testdata: matrix or vector of left out data. Empty for our current example.

dict_keys(['X', 'y', 'groups', 'X_extra', 'X_testdata'])

In [72]:
# Let's unpack each data object from data_dict into a variable of the same name
X, y, X_extra, groups, X_testdata = (
    data_dict[key] for key in ('X', 'y', 'X_extra', 'groups', 'X_testdata')
)

In [73]:
# Let's have a look at the X feature matrix. It contains 287 elemental features.
# There are a total of 408 diffusion activation barriers (one per row) used for fitting.
# Note that X now refers to the diffusion dataset, not the Boston housing data from Task 1.
X

Unnamed: 0,valence_composition_average,phi_composition_average,NdValence_composition_average,MiracleRadius_composition_average,GSestFCClatcnt_composition_average,SecondIonizationEnergy_composition_average,IonizationEnergy_composition_average,n_ws^third_composition_average,ThermalConductivity_composition_average,CovalentRadius_composition_average,...,Site2_MendeleevNumber,Site2_ElasticModulus,Site2_Electronegativity,Site2_AtomicWeight,Site2_HeatFusion,Site2_SpecificHeatCapacity,Site2_AtomicRadii,Site2_ThermalExpansionCoefficient,Site2_BCCefflatcnt,Site2_AtomicVolume
0,2.0,4.350,10.0,144.0,4.027313,21.4900,731.00,1.360,429.00,145.0,...,65,80.0,1.93,107.868200,11.30,0.235,1.444,18.9,6.375951,17.075648
1,2.5,4.725,8.5,134.5,3.737485,19.2750,744.50,1.555,264.50,135.5,...,58,208.0,1.88,58.933195,16.20,0.421,1.253,13.0,5.507318,10.995861
2,4.0,4.500,7.5,137.0,3.788936,18.9950,691.90,1.545,261.35,142.0,...,49,259.0,1.66,51.996100,21.00,0.449,1.249,4.9,5.632801,12.092937
3,2.0,4.400,10.0,135.0,3.782567,20.8910,738.20,1.415,415.00,138.5,...,64,124.0,1.90,63.546000,13.60,0.385,1.278,16.5,5.617632,11.829942
4,4.0,4.640,8.0,134.5,3.764269,18.8350,745.15,1.565,254.60,138.5,...,55,211.0,1.83,55.845000,13.81,0.449,1.241,11.8,5.557847,11.777365
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
403,4.5,3.750,3.0,150.5,4.350534,13.7250,662.00,1.525,38.20,169.5,...,47,104.0,1.60,92.906380,30.00,0.265,1.429,7.3,6.625794,18.002133
404,4.5,3.750,2.5,151.5,4.348236,6.5650,710.50,1.520,40.10,172.5,...,48,183.0,1.50,180.947880,36.57,0.140,1.430,6.3,6.618497,18.046730
405,4.0,3.625,2.0,150.0,4.291886,13.3550,659.00,1.465,22.30,167.5,...,43,110.0,1.54,47.867000,14.15,0.523,1.448,8.6,6.458833,17.636317
406,4.0,3.525,2.0,158.0,4.494239,14.0275,651.00,1.430,22.85,175.0,...,45,139.0,1.30,178.490000,27.20,0.140,1.564,5.9,7.046762,22.268711


In [74]:
# Let's look at the groups list. The groups denote the element identity of the host
# metal, taken from the 'Material compositions 1' column that was passed as group_column.
groups

0      Ag
1      Ag
2      Ag
3      Ag
4      Ag
       ..
403    Zr
404    Zr
405    Zr
406    Zr
407    Zr
Name: Material compositions 1, Length: 408, dtype: object

In [75]:
# Let's examine the extra data not used in fitting (the columns listed in extra_columns).
# Note that the y-data we are fitting to are equal to the E_regression values listed here,
# minus the E_regression value of the pure material. Material compositions 1 are the host
# elements, which are the same values used in the groups list. Material compositions 2 are
# the solute elements (i.e. the diffusing impurity in the host material)
X_extra

Unnamed: 0,Material compositions 1,Material compositions 2,Hop activation barrier,E_regression
0,Ag,Ag,-0.365590,1.824500
1,Ag,Co,-0.675263,1.734358
2,Ag,Cr,-0.047690,2.083639
3,Ag,Cu,-0.428459,1.802300
4,Ag,Fe,0.081529,2.142172
...,...,...,...,...
403,Zr,Nb,-0.478007,2.523770
404,Zr,Ta,-0.054726,2.747650
405,Zr,Ti,-0.022476,2.842630
406,Zr,Hf,-0.104201,2.801830


In [76]:
# Finally, we can look at our manually left-out data set. For this tutorial we
# have left this blank (testdata_columns=None was passed to LocalDatasets above),
# but we will make use of it in a future tutorial
X_testdata

In [None]:
#########################################################################
#
# Task 4: Download and import data from various materials databases 
#
#########################################################################

In [None]:
# You've now completed your second MAST-ML tutorial notebook! Now that you're more familiar with working with datasets
# within MAST-ML, the next thing for us to do is look at feature engineering in more detail. 

# The next example in this notebook series is titled MASTML_Tutorial_3_FeatureEngineering.ipynb, and will guide you through the process
# of generating, preprocessing, and selecting features for a particular model and data split test.