In [None]:
###############################################################################################
#
#  Welcome to the third MAST-ML tutorial notebook, MASTML_Tutorial_3_FeatureEngineering.ipynb! 
#  In this notebook, we will learn different ways to generate, preprocess, and select
#  features:
#       1. Generate features based on material composition
#       2. Generate one-hot encoded features based on group labels
#       3. Preprocess features to be normalized
#       4. Select features using an ensemble model-based approach
#       5. Generate learning curves using a basic feature selection approach
#       6. Select features using forward selection
#
###############################################################################################

In [None]:
#####################################
#
# Task 0: Setting up MAST-ML in Colab
#
#####################################

In [None]:
# If you are working on Google Colab and need to install MAST-ML, 
# begin by cloning the relevant branch of MAST-ML to the Colab session
# and install the needed dependencies:

!git clone --single-branch --branch dev_Ryan_2020-12-21 https://github.com/uw-cmg/MAST-ML
!pip install -r MAST-ML/requirements.txt

In [None]:
# Connect Google Drive to this Colab session. Results written to the Colab
# session itself are deleted when the session ends, so MAST-ML output is
# saved to Google Drive instead.
from google.colab import drive

# Location inside the Colab filesystem where the Drive contents appear
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT, force_remount=True)

In [None]:
# We need to add the MAST-ML folder to our sys path so that python can find the modules.
# The membership check keeps this cell idempotent: re-running it does not pile up
# duplicate 'MAST-ML' entries on sys.path.
import sys

MASTML_REPO_DIR = 'MAST-ML'
if MASTML_REPO_DIR not in sys.path:
    sys.path.append(MASTML_REPO_DIR)

In [1]:
# Here we import the MAST-ML modules used in this tutorial
import os
from copy import copy

import numpy as np

from mastml.mastml import Mastml
from mastml.datasets import LocalDatasets
from mastml.models import SklearnModel
from mastml.preprocessing import SklearnPreprocessor
from mastml.feature_selectors import SklearnFeatureSelector, EnsembleModelFeatureSelector
from mastml.feature_generators import ElementalFeatureGenerator, OneHotGroupGenerator
from mastml.learning_curve import LearningCurve

In [2]:
# Set the name of the savepath to save MAST-ML results to. This is a relative
# path, so it is resolved against the current working directory.
SAVEPATH = 'drive/MyDrive/MASTML_tutorial_3_FeatureEngineering'

# Initialize the MAST-ML run, write savepath. Note that get_savepath is read as an
# attribute (it is not called); the resulting value is what gets passed as
# savepath= to every evaluate() call later in this notebook.
mastml = Mastml(savepath=SAVEPATH)
savepath = mastml.get_savepath

# When the above command is run, a new folder with the name designated by SAVEPATH is created.
# This is where all of the output for the current MAST-ML run will be saved to.
# Note that you can perform multiple runs with the same folder name, and the current datetime
# will be appended to the name so that no data is lost or overwritten.

drive/MyDrive/MASTML_tutorial_3_FeatureEngineering not empty. Renaming...


In [3]:
##########################################################
#
# Task 1: Generate features based on material composition
#
##########################################################

In [4]:
# Let's import the diffusion data used in the previous tutorial again
# using the LocalDatasets class. This time, we are using a version
# where there are no X features, and we will proceed to generate new features
target = 'E_regression'
extra_columns = ['Material compositions 1', 'Material compositions 2', 'Material compositions joined']

# The dataset ships inside the MAST-ML repository, so its relative location depends
# on where this notebook is run from:
#   - '../mastml/data/...'       when the notebook runs from a folder inside the repository
#   - 'MAST-ML/mastml/data/...'  when the repository was cloned into the working
#                                directory (the Colab setup in Task 0)
# Use the first candidate that exists. If none exists, fall back to the original path
# so that the missing file is still reported against that path.
data_file_candidates = ['../mastml/data/diffusion_data_nofeatures.xlsx',
                        'MAST-ML/mastml/data/diffusion_data_nofeatures.xlsx']
data_file = next((path for path in data_file_candidates if os.path.exists(path)),
                 data_file_candidates[0])

d = LocalDatasets(file_path=data_file,
                  target=target,
                  extra_columns=extra_columns,
                  group_column='Material compositions 1',
                  testdata_columns=None,
                  as_frame=True)

# Load the data with the load_data() method
data_dict = d.load_data()

# Designate the X, y, X_extra and groups data
X = data_dict['X']
y = data_dict['y']
X_extra = data_dict['X_extra']
groups = data_dict['groups']




In [5]:
# If we look at the X feature data, we see that it is empty. We need to make some
# features to describe our data!
X

0
1
2
3
4
...
403
404
405
406
407


In [6]:
# This dataset contains material composition strings, so we can build features
# for each composition from the elemental properties of the elements it contains.
# The ElementalFeatureGenerator class does this and takes the following arguments:
#
# composition_df : a dataframe with the composition strings to featurize
# feature_types : list denoting which types of elemental feature operations to make
# remove_constant_columns : bool denoting whether columns with same value for each point are removed
#
# The evaluate() method conducts the feature generation and saves the output to a designated
# folder under the provided savepath. In this folder, the generated data is saved as the file
# generated_features.xlsx

# Step 1: configure the generator
elemental_generator = ElementalFeatureGenerator(
    composition_df=X_extra['Material compositions joined'],
    feature_types=['composition_avg', 'arithmetic_avg', 'max', 'min', 'difference'],
    remove_constant_columns=True,
)

# Step 2: generate the features; X and y are replaced by the returned values
X, y = elemental_generator.evaluate(X=X, y=y, savepath=savepath, make_new_dir=True)

Dropping 5/440 generated columns due to missing values


In [7]:
# Now let's examine what the X data look like:
X

# We can see that we now have 404 generated features based on properties of the elements.
# Each column name combines an elemental property with the operation applied to it
# (for example AtomicNumber_arithmetic_average or valence_max_value).

Unnamed: 0,AtomicNumber_arithmetic_average,AtomicNumber_composition_average,AtomicNumber_difference,AtomicNumber_max_value,AtomicNumber_min_value,AtomicRadii_arithmetic_average,AtomicRadii_composition_average,AtomicRadii_difference,AtomicRadii_max_value,AtomicRadii_min_value,...,phi_arithmetic_average,phi_composition_average,phi_difference,phi_max_value,phi_min_value,valence_arithmetic_average,valence_composition_average,valence_difference,valence_max_value,valence_min_value
0,47.0,47.0,0.0,47.0,47.0,1.4440,1.4440,0.000,1.444,1.444,...,4.350,4.350,0.00,4.35,4.35,2.0,2.0,0.0,2.0,2.0
1,37.0,37.0,20.0,47.0,27.0,1.3485,1.3485,0.191,1.444,1.253,...,4.725,4.725,0.75,5.10,4.35,2.5,2.5,1.0,3.0,2.0
2,35.5,35.5,23.0,47.0,24.0,1.3465,1.3465,0.195,1.444,1.249,...,4.500,4.500,0.30,4.65,4.35,4.0,4.0,4.0,6.0,2.0
3,38.0,38.0,18.0,47.0,29.0,1.3610,1.3610,0.166,1.444,1.278,...,4.400,4.400,0.10,4.45,4.35,2.0,2.0,0.0,2.0,2.0
4,36.5,36.5,21.0,47.0,26.0,1.3425,1.3425,0.203,1.444,1.241,...,4.640,4.640,0.58,4.93,4.35,4.0,4.0,4.0,6.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
403,40.5,40.5,1.0,41.0,40.0,1.5145,1.5145,0.171,1.600,1.429,...,3.750,3.750,0.60,4.05,3.45,4.5,4.5,1.0,5.0,4.0
404,56.5,56.5,33.0,73.0,40.0,1.5150,1.5150,0.170,1.600,1.430,...,3.750,3.750,0.60,4.05,3.45,4.5,4.5,1.0,5.0,4.0
405,31.0,31.0,18.0,40.0,22.0,1.5240,1.5240,0.152,1.600,1.448,...,3.625,3.625,0.35,3.80,3.45,4.0,4.0,0.0,4.0,4.0
406,56.0,56.0,32.0,72.0,40.0,1.5820,1.5820,0.036,1.600,1.564,...,3.525,3.525,0.15,3.60,3.45,4.0,4.0,0.0,4.0,4.0


In [8]:
######################################################################
#
# Task 2: Generate one-hot encoded features based on group labels
#
######################################################################

In [9]:
# We have generated a large feature set based on properties of the elements,
# but we can also add more features based on one-hot encoding of the
# different host groups each data point corresponds to. The host elements
# are contained in the 'Material compositions 1' column in X_extra.

# Pull out that column and view only its unique group labels:
host_elements = X_extra['Material compositions 1']
np.unique(host_elements)

array(['Ag', 'Al', 'Au', 'Ca', 'Cu', 'Fe', 'Ir', 'Mg', 'Mo', 'Ni', 'Pb',
       'Pd', 'Pt', 'W', 'Zr'], dtype=object)

In [10]:
# There are 15 unique groups. The OneHotGroupGenerator class creates a one-hot
# encoded vector for each group. As with the previous feature generation scheme,
# calling evaluate() creates an output folder in which the generated data is
# saved as the file generated_features.xlsx

# Step 1: configure the generator with the group label of every data point
one_hot_generator = OneHotGroupGenerator(groups=X_extra['Material compositions 1'])

# Step 2: generate the one-hot features; X and y are replaced by the returned values
X, y = one_hot_generator.evaluate(X=X, y=y, savepath=savepath, make_new_dir=True)

In [11]:
# Let's look at our generated features once more:
X

# We can now see that we have 419 features, meaning we added 15 new one-hot
# encoded features to our previous set of 404 elemental features. The new
# one-hot encoded features are appended to the right-hand end of the
# dataframe, in columns named 'Material compositions 1_<n>'.

Unnamed: 0,AtomicNumber_arithmetic_average,AtomicNumber_composition_average,AtomicNumber_difference,AtomicNumber_max_value,AtomicNumber_min_value,AtomicRadii_arithmetic_average,AtomicRadii_composition_average,AtomicRadii_difference,AtomicRadii_max_value,AtomicRadii_min_value,...,Material compositions 1_5,Material compositions 1_6,Material compositions 1_7,Material compositions 1_8,Material compositions 1_9,Material compositions 1_10,Material compositions 1_11,Material compositions 1_12,Material compositions 1_13,Material compositions 1_14
0,47.0,47.0,0.0,47.0,47.0,1.4440,1.4440,0.000,1.444,1.444,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,37.0,37.0,20.0,47.0,27.0,1.3485,1.3485,0.191,1.444,1.253,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,35.5,35.5,23.0,47.0,24.0,1.3465,1.3465,0.195,1.444,1.249,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,38.0,38.0,18.0,47.0,29.0,1.3610,1.3610,0.166,1.444,1.278,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,36.5,36.5,21.0,47.0,26.0,1.3425,1.3425,0.203,1.444,1.241,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
403,40.5,40.5,1.0,41.0,40.0,1.5145,1.5145,0.171,1.600,1.429,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
404,56.5,56.5,33.0,73.0,40.0,1.5150,1.5150,0.170,1.600,1.430,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
405,31.0,31.0,18.0,40.0,22.0,1.5240,1.5240,0.152,1.600,1.448,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
406,56.0,56.0,32.0,72.0,40.0,1.5820,1.5820,0.036,1.600,1.564,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [12]:
#######################################################
#
# Task 3: Preprocess features to be normalized
#
#######################################################

In [13]:
# We now have a complete set of reasonable features to use going forward.
# However, our features span different ranges based on their physical meaning,
# which can distort how a machine learning model may behave. It is common
# to normalize or standardize the data in some way prior to fitting. Here,
# we invoke a common approach which is to normalize the data so that
# each feature has mean of 0 and standard deviation of 1. We can do this
# by selecting 'StandardScaler' as the preprocessor in the SklearnPreprocessor class.
# NOTE(review): as_frame=True presumably makes the preprocessor return a dataframe
# (the displayed result below keeps its column names) - confirm against the MAST-ML docs.

preprocessor = SklearnPreprocessor(preprocessor='StandardScaler', as_frame=True)

In [14]:
# If we want to go ahead and normalize our full dataset in one go, we can
# do that now. Alternatively, we will see in the next tutorial that you can
# pass the preprocessor object into a MAST-ML data splitter workflow, where
# the preprocessor will be used in each train/test split to standardize the
# data.

# Note that X is overwritten here: from this point on X holds the normalized values.
X = preprocessor.evaluate(X=X, y=y, savepath=savepath, make_new_dir=True)
Xcopy = copy(X) # Reserve this copy of the full (normalized, 419-feature) dataset for later

# The resulting normalized dataset is saved to the file data_preprocessed.xlsx,
# and the fitted preprocessor class is saved as a .pkl file. This will be useful
# for preprocessing new data used for future predictions or if the model is
# to be uploaded to the model hosting service DLHub

In [15]:
# Let's look at our newly preprocessed data. Every column has been rescaled,
# including the one-hot encoded columns, which no longer hold plain 0/1 values:
X

Unnamed: 0,AtomicNumber_arithmetic_average,AtomicNumber_composition_average,AtomicNumber_difference,AtomicNumber_max_value,AtomicNumber_min_value,AtomicRadii_arithmetic_average,AtomicRadii_composition_average,AtomicRadii_difference,AtomicRadii_max_value,AtomicRadii_min_value,...,Material compositions 1_5,Material compositions 1_6,Material compositions 1_7,Material compositions 1_8,Material compositions 1_9,Material compositions 1_10,Material compositions 1_11,Material compositions 1_12,Material compositions 1_13,Material compositions 1_14
0,0.348272,0.348272,-1.369606,-0.358844,1.218017,0.137574,0.137574,-1.151827,-0.370794,0.928492,...,-0.32969,-0.166457,-0.373718,-0.29173,-0.32969,-0.141421,-0.281718,-0.281718,-0.296648,-0.195366
1,-0.285939,-0.285939,-0.322479,-0.358844,-0.093718,-0.624665,-0.624665,0.196745,-0.370794,-0.866396,...,-0.32969,-0.166457,-0.373718,-0.29173,-0.32969,-0.141421,-0.281718,-0.281718,-0.296648,-0.195366
2,-0.381070,-0.381070,-0.165410,-0.358844,-0.290479,-0.640628,-0.640628,0.224987,-0.370794,-0.903985,...,-0.32969,-0.166457,-0.373718,-0.29173,-0.32969,-0.141421,-0.281718,-0.281718,-0.296648,-0.195366
3,-0.222518,-0.222518,-0.427192,-0.358844,0.037455,-0.524895,-0.524895,0.020230,-0.370794,-0.631463,...,-0.32969,-0.166457,-0.373718,-0.29173,-0.32969,-0.141421,-0.281718,-0.281718,-0.296648,-0.195366
4,-0.317649,-0.317649,-0.270123,-0.358844,-0.159305,-0.672554,-0.672554,0.281472,-0.370794,-0.979164,...,-0.32969,-0.166457,-0.373718,-0.29173,-0.32969,-0.141421,-0.281718,-0.281718,-0.296648,-0.195366
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
403,-0.063965,-0.063965,-1.317250,-0.642580,0.758909,0.700275,0.700275,0.055533,0.528368,0.787532,...,-0.32969,-0.166457,-0.373718,-0.29173,-0.32969,-0.141421,-0.281718,-0.281718,-0.296648,5.118594
404,0.950772,0.950772,0.358153,0.870682,0.758909,0.704265,0.704265,0.048472,0.528368,0.796930,...,-0.32969,-0.166457,-0.373718,-0.29173,-0.32969,-0.141421,-0.281718,-0.281718,-0.296648,5.118594
405,-0.666465,-0.666465,-0.427192,-0.689870,-0.421652,0.776099,0.776099,-0.078618,0.528368,0.966081,...,-0.32969,-0.166457,-0.373718,-0.29173,-0.32969,-0.141421,-0.281718,-0.281718,-0.296648,5.118594
406,0.919061,0.919061,0.305797,0.823393,0.758909,1.239030,1.239030,-0.897646,0.528368,2.056170,...,-0.32969,-0.166457,-0.373718,-0.29173,-0.32969,-0.141421,-0.281718,-0.281718,-0.296648,5.118594


In [16]:
# The data values are very different than they used to be! As a sanity check,
# pull out the first feature and print its mean and standard deviation:
first_feature = X['AtomicNumber_arithmetic_average']
print(np.mean(first_feature), np.std(first_feature))

# Indeed, we can see that the mean is very close to zero and the standard
# deviation is very close to one

2.2653447745108894e-16 1.0000000000000002


In [17]:
##########################################################################
#
# Task 4: Select features using an ensemble model-based approach
#
##########################################################################

In [18]:
# As you saw previously, we now have a lot of features in our feature set- 419
# to be precise! It is unlikely that all of these features will be meaningful
# in whatever model we develop. This is where feature selection methods come
# into play. For the purposes of this tutorial, we will show off a couple
# feature selection routines by selecting features for this full dataset. In
# a later tutorial, we will show how the feature selection process can be
# conducted for every train/test evaluation split, which is a more methodical
# approach to feature engineering to prevent overfitting

# Let's start by using the EnsembleModelFeatureSelector, which fits the data
# using a random forest model, and selects features based on the resulting
# random forest feature importance ranking:
#
# NOTE(review): no random seed is passed to the model here, so the ranking (and
# therefore the selected features) may differ between runs - confirm whether
# SklearnModel accepts a random_state if reproducible output is needed.

model = SklearnModel(model='RandomForestRegressor')
selector = EnsembleModelFeatureSelector(model=model,
                                        n_features_to_select=20)

# Select from Xcopy (the reserved copy of the full preprocessed dataset) rather than
# from X. X is overwritten by this cell, so reading from X would make a re-run of the
# cell select from the already-reduced feature set instead of the full one.
X = selector.evaluate(X=Xcopy, y=y, savepath=savepath, make_new_dir=True)
Xreduced = copy(X)  # Keep the 20-feature set available for the later tasks

# We can see that a full spreadsheet of calculated feature importance rankings
# has been saved in the EnsembleModelFeatureSelector_feature_importances.xlsx file,
# and a text file of the 20 selected features is printed in the selected_features.txt file.
# Also, the resulting selected feature set is saved as selected_features.xlsx

In [None]:
# Let's look at our X data once again (X now holds the output of the
# ensemble-based feature selector):
X

# We can see that we now have just 20 columns (features),
# indicating the 20 features that were ranked most highly from
# this approach

In [None]:
#####################################################################################
#
# Task 5: Generate learning curves to assess behavior with data and feature number
#
#####################################################################################

In [None]:
# We now have our set of 20 selected features, but we can do another piece of 
# analysis to help further our understanding of how we may expect a model
# to behave with the feature set we have. We can do this by making two types
# of learning curves: the first will use our set of 20 features and assess
# how well a model performs as a function of amount of training data. This 
# learning curve can help assess whether additional training data may result
# in improved model performance. The second will use the full training data 
# set and assess how well the model performs as more features are added. 
# This second learning curve can help one assess whether more or fewer features 
# may be needed to optimize model performance.

In [None]:
# Build the learning curves for the current model, the 20 selected features in X,
# and the ensemble-based selector from the previous task.
learning_curve = LearningCurve()
learning_curve.evaluate(model=model, X=X, y=y, savepath=savepath,
                        selector=selector, make_new_dir=True)

# Let's check what was output- it can be found in the newly created
# "LearningCurve_..." folder. The data_learning_curve.png plot shows the
# training and validation mean absolute error (MAE) as a function of number
# of training data points. We can see this set may benefit from additional
# training data, but the validation score isn't changing too much past
# about 250 data points.
#
# If we look at the feature_learning_curve.png, we see the behavior of the
# model as a function of number of features, from just a single feature
# out to the 20 features we selected previously. Here, we can see the
# validation score becomes essentially flat after about the 10th feature.
# This indicates that we likely only need to use the top 10 features for
# our future models.

In [None]:
##########################################################
#
# Task 6: Select features using forward selection
#
##########################################################

In [19]:
# In the previous two tasks, we performed feature selection using ensemble
# models. Now, let's try a slightly more time consuming but often more
# accurate method- forward selection. We will start with the 20 features kept
# by the first round of feature selection (Xreduced), and this time re-select
# the top N_FORWARD_FEATURES features. The learning curve is then performed up
# to that number of features.
# Which method resulted in the lower validation MAE at that number of features?
# Which features did the superior method find resulting in an improved score?
#
# Be patient- this method will take several minutes to complete!

# Number of features kept by forward selection. The previous task suggested that
# about 10 features are useful; raise this value to 10 to compare both methods at
# 10 features, at the cost of a longer run time.
N_FORWARD_FEATURES = 5

model = SklearnModel(model='RandomForestRegressor')
# SequentialFeatureSelector is handed the underlying estimator (model.model),
# not the MAST-ML wrapper object.
selector = SklearnFeatureSelector(selector='SequentialFeatureSelector',
                                  estimator=model.model,
                                  n_features_to_select=N_FORWARD_FEATURES)

# Select from Xreduced rather than from X: X is overwritten by this cell, so reading
# from X would make a re-run select from the already-reduced output. Using Xreduced
# also keeps the selection consistent with the learning curve below, which starts
# from the same 20-feature set.
X = selector.evaluate(X=Xreduced, y=y, savepath=savepath, make_new_dir=True)

LearningCurve().evaluate(model=model,
                        X=Xreduced,
                        y=y,
                        savepath=savepath,
                        selector=selector,
                        make_new_dir=True)

found new col MendeleevNumber_composition_average
found new col BCCvolume_padiff_difference
found new col BCCmagmom_min_value
found new col valence_max_value
found new col AtomicRadii_arithmetic_average
found new col MendeleevNumber_composition_average
found new col BCCvolume_padiff_difference
found new col BCCmagmom_min_value
found new col valence_max_value
found new col AtomicRadii_arithmetic_average


In [None]:
# You've now completed your third MAST-ML tutorial notebook! Now that you're more familiar with feature
# engineering methods in MAST-ML, the next task is to examine the breadth of models and tests we can perform 
# on our data.
#
# The next example in this notebook series is titled MASTML_Tutorial_4_Models_and_Tests.ipynb, and will guide you 
# through the process of setting up different types of runs that evaluate a few select model types and perform
# different varieties of data split tests.