In [2]:
# Import needed MASTML modules

from mastml.mastml import Mastml
from mastml.datasets import LocalDatasets, SklearnDatasets, FoundryDatasets, FigshareDatasets, DataUtilities, DataCleaning
from mastml.preprocessing import SklearnPreprocessor
from mastml.models import SklearnModel
from mastml.data_splitters import SklearnDataSplitter, NoSplit
from mastml.plots import Histogram, Scatter
from mastml.feature_selectors import EnsembleModelFeatureSelector, NoSelect
from mastml.feature_generators import ElementalFeatureGenerator



In [3]:
# Initialize the MAST-ML run with 'results/test_output' as its save path, then
# grab the save path and run metadata it exposes for use in later cells.
# NOTE(review): get_savepath / get_mastml_metadata are read without parentheses,
# so they are presumably properties -- confirm against the Mastml class.

mastml = Mastml(savepath='results/test_output')
savepath, mastml_metadata = mastml.get_savepath, mastml.get_mastml_metadata

results/test_output not empty. Renaming...


In [4]:
# Download diffusion data from Figshare. This was already done previously, so
# the call is left commented out; uncomment both lines below to download again.

#article_id = 7418492
#FigshareDatasets().download_data(article_id=article_id)

In [5]:
# Load the diffusion data (previously downloaded from Figshare) from the local
# spreadsheet. `target` names the column to predict; `extra_columns` lists
# columns handed to the loader separately from the features
# (presumably they are kept out of X -- confirm against LocalDatasets).

target = 'E_regression.1'
extra_columns = [
    'E_regression',
    'Material compositions 1',
    'Material compositions 2',
    'Hop activation barrier',
]
d = LocalDatasets(
    file_path='figshare_7418492/All_Model_Data_missing.xlsx',
    target=target,
    extra_columns=extra_columns,
    as_frame=True,
)

X, y = d.load_data()



In [6]:
# Display the feature matrix as loaded, before any cleaning.
X

Unnamed: 0,valence_composition_average,phi_composition_average,NdValence_composition_average,MiracleRadius_composition_average,GSestFCClatcnt_composition_average,SecondIonizationEnergy_composition_average,IonizationEnergy_composition_average,n_ws^third_composition_average,ThermalConductivity_composition_average,CovalentRadius_composition_average,...,Site2_MendeleevNumber,Site2_ElasticModulus,Site2_Electronegativity,Site2_AtomicWeight,Site2_HeatFusion,Site2_SpecificHeatCapacity,Site2_AtomicRadii,Site2_ThermalExpansionCoefficient,Site2_BCCefflatcnt,Site2_AtomicVolume
0,,4.350,10.0,144.0,4.027313,21.4900,731.00,1.360,429.00,145.0,...,65,80.0,1.93,107.868200,11.30,0.235,1.444,18.9,6.375951,17.075648
1,,4.725,8.5,134.5,3.737485,19.2750,744.50,1.555,264.50,135.5,...,58,208.0,1.88,58.933195,16.20,0.421,1.253,13.0,5.507318,10.995861
2,,4.500,7.5,137.0,3.788936,18.9950,691.90,1.545,261.35,142.0,...,49,259.0,1.66,51.996100,21.00,0.449,1.249,4.9,5.632801,12.092937
3,,4.400,10.0,135.0,3.782567,20.8910,738.20,1.415,415.00,138.5,...,64,124.0,1.90,63.546000,13.60,0.385,1.278,16.5,5.617632,11.829942
4,,4.640,8.0,134.5,3.764269,18.8350,745.15,1.565,254.60,138.5,...,55,211.0,1.83,55.845000,13.81,0.449,1.241,11.8,5.557847,11.777365
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
403,4.5,3.750,3.0,150.5,4.350534,13.7250,662.00,1.525,38.20,169.5,...,47,104.0,1.60,92.906380,30.00,0.265,1.429,7.3,6.625794,18.002133
404,4.5,3.750,2.5,151.5,4.348236,6.5650,710.50,1.520,40.10,172.5,...,48,183.0,1.50,180.947880,36.57,0.140,1.430,6.3,6.618497,18.046730
405,4.0,3.625,2.0,150.0,4.291886,13.3550,659.00,1.465,22.30,167.5,...,43,110.0,1.54,47.867000,14.15,0.523,1.448,8.6,6.458833,17.636317
406,4.0,3.525,2.0,158.0,4.494239,14.0275,651.00,1.430,22.85,175.0,...,45,139.0,1.30,178.490000,27.20,0.140,1.564,5.9,7.046762,22.268711


In [7]:
# The loaded data contains missing values. Clean it using imputation with the
# 'mean' strategy; the cleaner also provides some basic analysis of the input
# data. Note that this rebinds X and y, so re-running only this cell operates
# on the already-cleaned data.

cleaner = DataCleaning()
X, y = cleaner.evaluate(
    X=X, y=y,
    method='imputation', strategy='mean',
    savepath=savepath,
)

In [8]:
# Display the feature matrix after cleaning (missing values imputed).
X

Unnamed: 0,valence_composition_average,phi_composition_average,NdValence_composition_average,MiracleRadius_composition_average,GSestFCClatcnt_composition_average,SecondIonizationEnergy_composition_average,IonizationEnergy_composition_average,n_ws^third_composition_average,ThermalConductivity_composition_average,CovalentRadius_composition_average,...,Site2_MendeleevNumber,Site2_ElasticModulus,Site2_Electronegativity,Site2_AtomicWeight,Site2_HeatFusion,Site2_SpecificHeatCapacity,Site2_AtomicRadii,Site2_ThermalExpansionCoefficient,Site2_BCCefflatcnt,Site2_AtomicVolume
0,3.981108,4.350,10.0,144.0,4.027313,21.4900,731.00,1.360,429.00,145.0,...,65.0,80.0,1.93,107.868200,11.30,0.235,1.444,18.9,6.375951,17.075648
1,3.981108,4.725,8.5,134.5,3.737485,19.2750,744.50,1.555,264.50,135.5,...,58.0,208.0,1.88,58.933195,16.20,0.421,1.253,13.0,5.507318,10.995861
2,3.981108,4.500,7.5,137.0,3.788936,18.9950,691.90,1.545,261.35,142.0,...,49.0,259.0,1.66,51.996100,21.00,0.449,1.249,4.9,5.632801,12.092937
3,3.981108,4.400,10.0,135.0,3.782567,20.8910,738.20,1.415,415.00,138.5,...,64.0,124.0,1.90,63.546000,13.60,0.385,1.278,16.5,5.617632,11.829942
4,3.981108,4.640,8.0,134.5,3.764269,18.8350,745.15,1.565,254.60,138.5,...,55.0,211.0,1.83,55.845000,13.81,0.449,1.241,11.8,5.557847,11.777365
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
403,4.500000,3.750,3.0,150.5,4.350534,13.7250,662.00,1.525,38.20,169.5,...,47.0,104.0,1.60,92.906380,30.00,0.265,1.429,7.3,6.625794,18.002133
404,4.500000,3.750,2.5,151.5,4.348236,6.5650,710.50,1.520,40.10,172.5,...,48.0,183.0,1.50,180.947880,36.57,0.140,1.430,6.3,6.618497,18.046730
405,4.000000,3.625,2.0,150.0,4.291886,13.3550,659.00,1.465,22.30,167.5,...,43.0,110.0,1.54,47.867000,14.15,0.523,1.448,8.6,6.458833,17.636317
406,4.000000,3.525,2.0,158.0,4.494239,14.0275,651.00,1.430,22.85,175.0,...,45.0,139.0,1.30,178.490000,27.20,0.140,1.564,5.9,7.046762,22.268711


In [9]:
# Normalize the cleaned features with the 'StandardScaler' preprocessor.
# as_frame=True is requested so the result can be shown as a table below.
# X is rebound to the scaled features, so later cells see the normalized data.

preprocessor = SklearnPreprocessor(preprocessor='StandardScaler', as_frame=True)
X = preprocessor.evaluate(X=X, savepath=savepath)

In [10]:
# Display the feature matrix after preprocessing (normalized values).
X

Unnamed: 0,valence_composition_average,phi_composition_average,NdValence_composition_average,MiracleRadius_composition_average,GSestFCClatcnt_composition_average,SecondIonizationEnergy_composition_average,IonizationEnergy_composition_average,n_ws^third_composition_average,ThermalConductivity_composition_average,CovalentRadius_composition_average,...,Site2_MendeleevNumber,Site2_ElasticModulus,Site2_Electronegativity,Site2_AtomicWeight,Site2_HeatFusion,Site2_SpecificHeatCapacity,Site2_AtomicRadii,Site2_ThermalExpansionCoefficient,Site2_BCCefflatcnt,Site2_AtomicVolume
0,3.881375e-16,-0.188275,1.528525,0.152860,-0.112414,1.352291,-0.048675,-0.768754,3.863583,0.169245,...,0.446102,-0.579284,0.295866,0.048123,-0.638344,-0.382471,-0.048820,0.183136,-0.240759,-0.328662
1,3.881375e-16,0.401833,0.995045,-0.581776,-0.940020,0.849093,0.142231,0.158506,1.701336,-0.569072,...,0.081144,0.331764,0.162304,-0.825732,-0.277718,0.311640,-1.099779,-0.176443,-1.353355,-1.081755
2,3.881375e-16,0.047768,0.639392,-0.388451,-0.793101,0.785483,-0.601598,0.110954,1.659932,-0.063907,...,-0.388087,0.694760,-0.425368,-0.949611,0.075549,0.416130,-1.121789,-0.670103,-1.192629,-0.945862
3,3.881375e-16,-0.109594,1.528525,-0.543111,-0.811287,1.216212,0.053142,-0.507219,3.679562,-0.335919,...,0.393965,-0.266111,0.215729,-0.743359,-0.469071,0.177296,-0.962219,0.036866,-1.212059,-0.978438
4,3.881375e-16,0.268075,0.817218,-0.581776,-0.863539,0.749135,0.151423,0.206058,1.571207,-0.335919,...,-0.075266,0.353117,0.028742,-0.880879,-0.453615,0.416130,-1.165808,-0.249578,-1.288635,-0.984951
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
403,4.535155e-01,-1.132448,-0.961049,0.655505,0.810546,-0.411743,-1.024421,0.015851,-1.273232,2.073327,...,-0.492361,-0.408463,-0.585642,-0.219057,0.737924,-0.270518,-0.131356,-0.523833,0.079254,-0.213899
404,4.535155e-01,-1.132448,-1.138875,0.732836,0.803983,-2.038335,-0.338571,-0.007925,-1.248258,2.306480,...,-0.440224,0.153825,-0.852766,1.353140,1.221458,-0.736991,-0.125854,-0.584779,0.069908,-0.208375
405,1.651149e-02,-1.329151,-1.316702,0.616840,0.643076,-0.495799,-1.066845,-0.269460,-1.482227,1.917892,...,-0.700908,-0.365757,-0.745917,-1.023346,-0.428592,0.692282,-0.026811,-0.444604,-0.134600,-0.259212
406,1.651149e-02,-1.486513,-1.316702,1.235481,1.220896,-0.343022,-1.179975,-0.435892,-1.474998,2.500774,...,-0.596634,-0.159348,-1.387013,1.309249,0.531852,-0.736991,0.611468,-0.609157,0.618454,0.314594


In [11]:
# Define the two models and the two feature-selection strategies to evaluate.
# Every (model, selector) combination is handed to the splitters further down.

model1 = SklearnModel(model='KernelRidge', kernel='rbf')
model2 = SklearnModel(model='LinearRegression')
models = [model1, model2]

# selector1 is the "no selection" baseline; selector2 asks for k_features=10
# using a random-forest model. The forest is seeded with random_state because
# random forests are stochastic: without a fixed seed the selected features
# (and every downstream result) can differ from one run to the next.
selector1 = NoSelect()
selector2 = EnsembleModelFeatureSelector(
    model=SklearnModel(model='RandomForestRegressor', random_state=0),
    k_features=10,
)
selectors = [selector1, selector2]


In [12]:
# Full-fit case: no train/test split is performed. Each model is evaluated
# with each selector, using the shared savepath of this run.

splitter = NoSplit()
splitter.evaluate(X=X, y=y, models=models, selectors=selectors, savepath=savepath)

In [None]:
# Random K-fold cross-validation case. shuffle=True randomizes which rows land
# in which fold, so random_state is fixed to make the fold assignment (and
# therefore the reported CV results) reproducible across kernel restarts.
# The number of folds is not set here, so the splitter's default applies.

splitter = SklearnDataSplitter(splitter='KFold', shuffle=True, random_state=0)
splitter.evaluate(X=X,
                  y=y,
                  models=models,
                  selectors=selectors,
                  savepath=savepath)

In [1]:
# Record the versions of the key libraries used for this run.
# Printed order: sklearn, pandas, numpy, matplotlib, matminer, pymatgen
# (matminer is printed before pymatgen, unlike the import order).
import sklearn
import pandas
import numpy
import matplotlib
import pymatgen
import matminer

print(
    sklearn.__version__,
    pandas.__version__,
    numpy.__version__,
    matplotlib.__version__,
    matminer.__version__,
    pymatgen.__version__,
)

0.24.1 1.2.1 1.19.5 3.3.4 0.6.4 2020.12.31
