In [1]:
# Import needed MASTML modules

from mastml.mastml import Mastml
from mastml.datasets import LocalDatasets, SklearnDatasets, FoundryDatasets, FigshareDatasets, DataUtilities, DataCleaning
from mastml.preprocessing import SklearnPreprocessor, MeanStdevScaler
from mastml.models import SklearnModel
from mastml.data_splitters import SklearnDataSplitter, NoSplit, LeaveOutTwinCV
from mastml.plots import Histogram, Scatter
from mastml.feature_selectors import EnsembleModelFeatureSelector, NoSelect
from mastml.feature_generators import ElementalFeatureGenerator

To import data from figshare, manually install figshare via git clone of git clone https://github.com/cognoma/figshare.git
If you want to use XGBoost models, please manually install xgboost package


In [2]:
# Initialize the MASTML run rooted at 'results/test_output' and keep handles to
# its save path and metadata. savepath is passed to later steps (e.g. the data
# cleaner). The recorded output of this cell shows a "not empty. Renaming..."
# message for the output directory.
# NOTE(review): what exactly gets renamed is not visible here -- confirm in mastml.

mastml = Mastml(savepath='results/test_output')
# Both attributes are read without call parentheses -- presumably properties;
# TODO confirm, otherwise these names would hold bound methods.
savepath = mastml.get_savepath
mastml_metadata = mastml.get_mastml_metadata

results/test_output not empty. Renaming...


In [3]:
# Download diffusion data from Figshare (done previously)

#article_id = 7418492
#FigshareDatasets().download_data(article_id=article_id)

In [4]:
# Read the locally stored diffusion dataset (downloaded from Figshare earlier)
# and obtain the features X and the target y.

target = 'E_regression.1'
# Columns handed to LocalDatasets as extra_columns
# (presumably excluded from the features -- TODO confirm).
extra_columns = [
    'E_regression',
    'Material compositions 1',
    'Material compositions 2',
    'Hop activation barrier',
]
d = LocalDatasets(
    file_path='figshare_7418492/All_Model_Data_missing.xlsx',
    target=target,
    extra_columns=extra_columns,
    as_frame=True,
)
X, y = d.load_data()



In [5]:
# Show the features as loaded; in the recorded output the first rows have no
# value for valence_composition_average (missing data).
X

Unnamed: 0,valence_composition_average,phi_composition_average,NdValence_composition_average,MiracleRadius_composition_average,GSestFCClatcnt_composition_average,SecondIonizationEnergy_composition_average,IonizationEnergy_composition_average,n_ws^third_composition_average,ThermalConductivity_composition_average,CovalentRadius_composition_average,...,Site2_MendeleevNumber,Site2_ElasticModulus,Site2_Electronegativity,Site2_AtomicWeight,Site2_HeatFusion,Site2_SpecificHeatCapacity,Site2_AtomicRadii,Site2_ThermalExpansionCoefficient,Site2_BCCefflatcnt,Site2_AtomicVolume
0,,4.350,10.0,144.0,4.027313,21.4900,731.00,1.360,429.00,145.0,...,65,80.0,1.93,107.868200,11.30,0.235,1.444,18.9,6.375951,17.075648
1,,4.725,8.5,134.5,3.737485,19.2750,744.50,1.555,264.50,135.5,...,58,208.0,1.88,58.933195,16.20,0.421,1.253,13.0,5.507318,10.995861
2,,4.500,7.5,137.0,3.788936,18.9950,691.90,1.545,261.35,142.0,...,49,259.0,1.66,51.996100,21.00,0.449,1.249,4.9,5.632801,12.092937
3,,4.400,10.0,135.0,3.782567,20.8910,738.20,1.415,415.00,138.5,...,64,124.0,1.90,63.546000,13.60,0.385,1.278,16.5,5.617632,11.829942
4,,4.640,8.0,134.5,3.764269,18.8350,745.15,1.565,254.60,138.5,...,55,211.0,1.83,55.845000,13.81,0.449,1.241,11.8,5.557847,11.777365
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
403,4.5,3.750,3.0,150.5,4.350534,13.7250,662.00,1.525,38.20,169.5,...,47,104.0,1.60,92.906380,30.00,0.265,1.429,7.3,6.625794,18.002133
404,4.5,3.750,2.5,151.5,4.348236,6.5650,710.50,1.520,40.10,172.5,...,48,183.0,1.50,180.947880,36.57,0.140,1.430,6.3,6.618497,18.046730
405,4.0,3.625,2.0,150.0,4.291886,13.3550,659.00,1.465,22.30,167.5,...,43,110.0,1.54,47.867000,14.15,0.523,1.448,8.6,6.458833,17.636317
406,4.0,3.525,2.0,158.0,4.494239,14.0275,651.00,1.430,22.85,175.0,...,45,139.0,1.30,178.490000,27.20,0.140,1.564,5.9,7.046762,22.268711


In [6]:
# Some feature values are missing. Run the data cleaner, which fills the gaps
# (imputation with the 'mean' strategy here) and provides some basic analysis
# of the input data.

cleaner = DataCleaning()
X, y = cleaner.evaluate(
    X=X, y=y, method='imputation', strategy='mean', savepath=savepath
)

In [7]:
# Show the features after cleaning; in the recorded output the previously blank
# valence_composition_average entries now read 3.981108.
X

Unnamed: 0,valence_composition_average,phi_composition_average,NdValence_composition_average,MiracleRadius_composition_average,GSestFCClatcnt_composition_average,SecondIonizationEnergy_composition_average,IonizationEnergy_composition_average,n_ws^third_composition_average,ThermalConductivity_composition_average,CovalentRadius_composition_average,...,Site2_MendeleevNumber,Site2_ElasticModulus,Site2_Electronegativity,Site2_AtomicWeight,Site2_HeatFusion,Site2_SpecificHeatCapacity,Site2_AtomicRadii,Site2_ThermalExpansionCoefficient,Site2_BCCefflatcnt,Site2_AtomicVolume
0,3.981108,4.350,10.0,144.0,4.027313,21.4900,731.00,1.360,429.00,145.0,...,65.0,80.0,1.93,107.868200,11.30,0.235,1.444,18.9,6.375951,17.075648
1,3.981108,4.725,8.5,134.5,3.737485,19.2750,744.50,1.555,264.50,135.5,...,58.0,208.0,1.88,58.933195,16.20,0.421,1.253,13.0,5.507318,10.995861
2,3.981108,4.500,7.5,137.0,3.788936,18.9950,691.90,1.545,261.35,142.0,...,49.0,259.0,1.66,51.996100,21.00,0.449,1.249,4.9,5.632801,12.092937
3,3.981108,4.400,10.0,135.0,3.782567,20.8910,738.20,1.415,415.00,138.5,...,64.0,124.0,1.90,63.546000,13.60,0.385,1.278,16.5,5.617632,11.829942
4,3.981108,4.640,8.0,134.5,3.764269,18.8350,745.15,1.565,254.60,138.5,...,55.0,211.0,1.83,55.845000,13.81,0.449,1.241,11.8,5.557847,11.777365
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
403,4.500000,3.750,3.0,150.5,4.350534,13.7250,662.00,1.525,38.20,169.5,...,47.0,104.0,1.60,92.906380,30.00,0.265,1.429,7.3,6.625794,18.002133
404,4.500000,3.750,2.5,151.5,4.348236,6.5650,710.50,1.520,40.10,172.5,...,48.0,183.0,1.50,180.947880,36.57,0.140,1.430,6.3,6.618497,18.046730
405,4.000000,3.625,2.0,150.0,4.291886,13.3550,659.00,1.465,22.30,167.5,...,43.0,110.0,1.54,47.867000,14.15,0.523,1.448,8.6,6.458833,17.636317
406,4.000000,3.525,2.0,158.0,4.494239,14.0275,651.00,1.430,22.85,175.0,...,45.0,139.0,1.30,178.490000,27.20,0.140,1.564,5.9,7.046762,22.268711


In [8]:
# Preprocess the cleaned data: normalize to mean 0 and standard deviation 1.

# Alternative via the scikit-learn wrapper, left disabled:
#preprocessor = SklearnPreprocessor(preprocessor='StandardScaler', as_frame=True)
#X = preprocessor.evaluate(X=X, 
#                          savepath=savepath)

# X is rebound to the scaled result, so the unscaled frame is no longer
# reachable under this name after the cell runs.
preprocessor = MeanStdevScaler(mean=0, stdev=1)
X = preprocessor.fit_transform(X)

In [9]:
# Sanity-check the scaling on one column: expect a mean of ~0 and a std of 1.
valence_scaled = X['valence_composition_average']
print(valence_scaled.mean(), valence_scaled.std())

2.612289469706251e-16 1.0


In [10]:
# Show the features after scaling.
X

Unnamed: 0,valence_composition_average,phi_composition_average,NdValence_composition_average,MiracleRadius_composition_average,GSestFCClatcnt_composition_average,SecondIonizationEnergy_composition_average,IonizationEnergy_composition_average,n_ws^third_composition_average,ThermalConductivity_composition_average,CovalentRadius_composition_average,...,Site2_MendeleevNumber,Site2_ElasticModulus,Site2_Electronegativity,Site2_AtomicWeight,Site2_HeatFusion,Site2_SpecificHeatCapacity,Site2_AtomicRadii,Site2_ThermalExpansionCoefficient,Site2_BCCefflatcnt,Site2_AtomicVolume
0,3.876616e-16,-0.188044,1.526651,0.152672,-0.112276,1.350633,-0.048616,-0.767812,3.858846,0.169038,...,0.445555,-0.578574,0.295503,0.048064,-0.637562,-0.382002,-0.048760,0.182911,-0.240464,-0.328259
1,3.876616e-16,0.401340,0.993825,-0.581063,-0.938868,0.848052,0.142057,0.158312,1.699250,-0.568374,...,0.081045,0.331357,0.162105,-0.824720,-0.277377,0.311258,-1.098431,-0.176227,-1.351695,-1.080428
2,3.876616e-16,0.047709,0.638608,-0.387975,-0.792129,0.784520,-0.600860,0.110818,1.657896,-0.063829,...,-0.387611,0.693908,-0.424847,-0.948446,0.075456,0.415620,-1.120413,-0.669281,-1.191167,-0.944702
3,3.876616e-16,-0.109460,1.526651,-0.542445,-0.810293,1.214720,0.053076,-0.506597,3.675050,-0.335507,...,0.393482,-0.265785,0.215464,-0.742448,-0.468496,0.177079,-0.961039,0.036821,-1.210573,-0.977239
4,3.876616e-16,0.267746,0.816216,-0.581063,-0.862480,0.748216,0.151238,0.205805,1.569280,-0.335507,...,-0.075174,0.352684,0.028707,-0.879799,-0.453059,0.415620,-1.164378,-0.249272,-1.287054,-0.983743
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
403,4.529594e-01,-1.131059,-0.959870,0.654702,0.809553,-0.411238,-1.023165,0.015831,-1.271671,2.070784,...,-0.491757,-0.407962,-0.584924,-0.218789,0.737020,-0.270186,-0.131195,-0.523191,0.079157,-0.213637
404,4.529594e-01,-1.131059,-1.137479,0.731937,0.802997,-2.035835,-0.338156,-0.007916,-1.246727,2.303651,...,-0.439684,0.153636,-0.851720,1.351481,1.219961,-0.736087,-0.125700,-0.584062,0.069822,-0.208120
405,1.649124e-02,-1.327521,-1.315088,0.616084,0.642288,-0.495191,-1.065537,-0.269130,-1.480410,1.915540,...,-0.700048,-0.365309,-0.745002,-1.022091,-0.428067,0.691433,-0.026778,-0.444059,-0.134435,-0.258895
406,1.649124e-02,-1.484690,-1.315088,1.233966,1.219399,-0.342601,-1.178528,-0.435357,-1.473189,2.497707,...,-0.595903,-0.159153,-1.385312,1.307643,0.531200,-0.736087,0.610718,-0.608410,0.617695,0.314209


In [11]:
# Candidate models and feature selectors, consumed by the splitter cells
# further down (which are currently commented out).

# Models: kernel ridge with an RBF kernel, and linear regression.
model1, model2 = (
    SklearnModel(model='KernelRidge', kernel='rbf'),
    SklearnModel(model='LinearRegression'),
)
models = [model1, model2]

# Selectors: no selection, or an ensemble-model selector configured with a
# random forest and k_features=10.
# NOTE(review): no random_state is given for the forest, so the selected
# features may differ between runs -- confirm whether a seed is wanted.
selector1, selector2 = (
    NoSelect(),
    EnsembleModelFeatureSelector(
        model=SklearnModel(model='RandomForestRegressor'),
        k_features=10,
    ),
)
selectors = [selector1, selector2]


In [12]:
# Case: no data split is performed (full fit). Currently disabled -- uncomment the block below to define and run it.

# splitter = NoSplit()
# splitter.evaluate(X=X,
#                   y=y, 
#                   models=models,
#                   selectors=selectors,
#                   savepath=savepath)

In [13]:
# Case: random KFold CV is performed. Currently disabled -- uncomment the block below to define and run it.

# splitter = SklearnDataSplitter(splitter='KFold', shuffle=True)
# splitter.evaluate(X=X,
#                   y=y, 
#                   models=models,
#                   selectors=selectors,
#                   savepath=savepath)

In [14]:
# Case: leave out twins CV is performed. Currently disabled -- uncomment the block below to define and run it.

# splitter = LeaveOutTwinCV(cv='KFold', threshold='1')
# splitter.evaluate(X=X,
#                   y=y, 
#                   models=models,
#                   selectors=selectors,
#                   savepath=savepath)

In [15]:
# Report the installed versions of the main scientific packages, in the order
# sklearn, pandas, numpy, matplotlib, matminer, pymatgen.
import sklearn
import pandas
import numpy
import matplotlib
import pymatgen
import matminer

version_sources = (sklearn, pandas, numpy, matplotlib, matminer, pymatgen)
print(*(pkg.__version__ for pkg in version_sources))

0.23.2 1.1.5 1.19.5 3.3.3 0.6.4 2020.12.31


In [16]:
# Report which Python interpreter is running this kernel.
# sys.executable is used instead of `!which python`: the shell lookup only
# reports the first `python` found on PATH, which need not be the kernel's
# interpreter, and `which` is not available on every platform.
import sys
print(sys.executable)

/Users/averychan/opt/anaconda3/envs/MAST_ML/bin/python


In [18]:
from mastml.tests.unit_tests.test_data_splitters import TestSplitters

In [19]:
# Display the imported TestSplitters class; the recorded output is its fully
# qualified name.
TestSplitters

mastml.tests.unit_tests.test_data_splitters.TestSplitters