In [1]:
# Import needed MASTML modules

from mastml.mastml import Mastml
from mastml.datasets import LocalDatasets, SklearnDatasets, FoundryDatasets, FigshareDatasets, DataUtilities, DataCleaning
from mastml.preprocessing import SklearnPreprocessor, MeanStdevScaler
from mastml.models import SklearnModel
from mastml.data_splitters import SklearnDataSplitter, NoSplit
from mastml.plots import Histogram, Scatter
from mastml.feature_selectors import EnsembleModelFeatureSelector, NoSelect
from mastml.feature_generators import ElementalFeatureGenerator

In [2]:
# Start the MAST-ML run with results/test_output as its save location, then read
# back the run's save path and metadata from the Mastml object. The save path is
# handed to the later cleaning and splitter steps in this notebook.

mastml = Mastml(savepath='results/test_output')
savepath, mastml_metadata = mastml.get_savepath, mastml.get_mastml_metadata

results/test_output not empty. Renaming...


In [3]:
# Download diffusion data from Figshare (done previously).
# The two lines below are intentionally left commented out because the download
# has already been performed; uncomment both to fetch Figshare article 7418492 again.

#article_id = 7418492
#FigshareDatasets().download_data(article_id=article_id)

In [4]:
# Read the locally stored diffusion spreadsheet (obtained from Figshare earlier)
# and unpack the loader's result into the feature frame X and the target y.

# NOTE(review): the '.1' suffix looks like the de-duplicated name of a second
# 'E_regression' column in the spreadsheet -- confirm against the file.
target = 'E_regression.1'

# Passed to the loader as extra_columns; presumably these are kept out of the
# feature matrix -- verify against the LocalDatasets documentation.
extra_columns = [
    'E_regression',
    'Material compositions 1',
    'Material compositions 2',
    'Hop activation barrier',
]

d = LocalDatasets(
    file_path='figshare_7418492/All_Model_Data_missing.xlsx',
    target=target,
    extra_columns=extra_columns,
    as_frame=True,
)

X, y = d.load_data()



In [5]:
# Display the feature matrix as loaded. In the recorded output the first rows have
# no value for valence_composition_average (missing data).
X

Unnamed: 0,valence_composition_average,phi_composition_average,NdValence_composition_average,MiracleRadius_composition_average,GSestFCClatcnt_composition_average,SecondIonizationEnergy_composition_average,IonizationEnergy_composition_average,n_ws^third_composition_average,ThermalConductivity_composition_average,CovalentRadius_composition_average,...,Site2_MendeleevNumber,Site2_ElasticModulus,Site2_Electronegativity,Site2_AtomicWeight,Site2_HeatFusion,Site2_SpecificHeatCapacity,Site2_AtomicRadii,Site2_ThermalExpansionCoefficient,Site2_BCCefflatcnt,Site2_AtomicVolume
0,,4.350,10.0,144.0,4.027313,21.4900,731.00,1.360,429.00,145.0,...,65,80.0,1.93,107.868200,11.30,0.235,1.444,18.9,6.375951,17.075648
1,,4.725,8.5,134.5,3.737485,19.2750,744.50,1.555,264.50,135.5,...,58,208.0,1.88,58.933195,16.20,0.421,1.253,13.0,5.507318,10.995861
2,,4.500,7.5,137.0,3.788936,18.9950,691.90,1.545,261.35,142.0,...,49,259.0,1.66,51.996100,21.00,0.449,1.249,4.9,5.632801,12.092937
3,,4.400,10.0,135.0,3.782567,20.8910,738.20,1.415,415.00,138.5,...,64,124.0,1.90,63.546000,13.60,0.385,1.278,16.5,5.617632,11.829942
4,,4.640,8.0,134.5,3.764269,18.8350,745.15,1.565,254.60,138.5,...,55,211.0,1.83,55.845000,13.81,0.449,1.241,11.8,5.557847,11.777365
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
403,4.5,3.750,3.0,150.5,4.350534,13.7250,662.00,1.525,38.20,169.5,...,47,104.0,1.60,92.906380,30.00,0.265,1.429,7.3,6.625794,18.002133
404,4.5,3.750,2.5,151.5,4.348236,6.5650,710.50,1.520,40.10,172.5,...,48,183.0,1.50,180.947880,36.57,0.140,1.430,6.3,6.618497,18.046730
405,4.0,3.625,2.0,150.0,4.291886,13.3550,659.00,1.465,22.30,167.5,...,43,110.0,1.54,47.867000,14.15,0.523,1.448,8.6,6.458833,17.636317
406,4.0,3.525,2.0,158.0,4.494239,14.0275,651.00,1.430,22.85,175.0,...,45,139.0,1.30,178.490000,27.20,0.140,1.564,5.9,7.046762,22.268711


In [6]:
# The loaded data has missing values. Run the MAST-ML DataCleaning step with
# method='imputation' and strategy='mean' to correct them; per the original note
# it also provides some basic analysis of the input data. The cleaned X and y
# replace the loaded ones, and savepath is passed along for the cleaner's use.

cleaner = DataCleaning()
X, y = cleaner.evaluate(X=X, y=y, method='imputation', strategy='mean', savepath=savepath)

In [7]:
# Display the feature matrix after cleaning. In the recorded output the previously
# missing valence_composition_average entries now read 3.981108.
X

Unnamed: 0,valence_composition_average,phi_composition_average,NdValence_composition_average,MiracleRadius_composition_average,GSestFCClatcnt_composition_average,SecondIonizationEnergy_composition_average,IonizationEnergy_composition_average,n_ws^third_composition_average,ThermalConductivity_composition_average,CovalentRadius_composition_average,...,Site2_MendeleevNumber,Site2_ElasticModulus,Site2_Electronegativity,Site2_AtomicWeight,Site2_HeatFusion,Site2_SpecificHeatCapacity,Site2_AtomicRadii,Site2_ThermalExpansionCoefficient,Site2_BCCefflatcnt,Site2_AtomicVolume
0,3.981108,4.350,10.0,144.0,4.027313,21.4900,731.00,1.360,429.00,145.0,...,65.0,80.0,1.93,107.868200,11.30,0.235,1.444,18.9,6.375951,17.075648
1,3.981108,4.725,8.5,134.5,3.737485,19.2750,744.50,1.555,264.50,135.5,...,58.0,208.0,1.88,58.933195,16.20,0.421,1.253,13.0,5.507318,10.995861
2,3.981108,4.500,7.5,137.0,3.788936,18.9950,691.90,1.545,261.35,142.0,...,49.0,259.0,1.66,51.996100,21.00,0.449,1.249,4.9,5.632801,12.092937
3,3.981108,4.400,10.0,135.0,3.782567,20.8910,738.20,1.415,415.00,138.5,...,64.0,124.0,1.90,63.546000,13.60,0.385,1.278,16.5,5.617632,11.829942
4,3.981108,4.640,8.0,134.5,3.764269,18.8350,745.15,1.565,254.60,138.5,...,55.0,211.0,1.83,55.845000,13.81,0.449,1.241,11.8,5.557847,11.777365
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
403,4.500000,3.750,3.0,150.5,4.350534,13.7250,662.00,1.525,38.20,169.5,...,47.0,104.0,1.60,92.906380,30.00,0.265,1.429,7.3,6.625794,18.002133
404,4.500000,3.750,2.5,151.5,4.348236,6.5650,710.50,1.520,40.10,172.5,...,48.0,183.0,1.50,180.947880,36.57,0.140,1.430,6.3,6.618497,18.046730
405,4.000000,3.625,2.0,150.0,4.291886,13.3550,659.00,1.465,22.30,167.5,...,43.0,110.0,1.54,47.867000,14.15,0.523,1.448,8.6,6.458833,17.636317
406,4.000000,3.525,2.0,158.0,4.494239,14.0275,651.00,1.430,22.85,175.0,...,45.0,139.0,1.30,178.490000,27.20,0.140,1.564,5.9,7.046762,22.268711


In [9]:
# Preprocess the cleaned data to be normalized
#
# MeanStdevScaler(mean=0, stdev=1) is used here instead of the commented-out
# StandardScaler preprocessor kept below for reference.
#
# NOTE(review): in the recorded outputs of the following cells,
# valence_composition_average has mean ~ -0.251 and std ~ 0.00065 after this
# step, and every column appears to be shifted and scaled by the same constants
# (e.g. row 0: 731.00 -> 0.1637, 429.00 -> -0.0085). So the columns are NOT
# individually standardized to mean 0 / stdev 1. It looks like the scaler uses
# dataset-wide statistics -- confirm this is intended before fitting
# scale-sensitive models on X.

#preprocessor = SklearnPreprocessor(preprocessor='StandardScaler', as_frame=True)
#X = preprocessor.evaluate(X=X, 
#                          savepath=savepath)

preprocessor = MeanStdevScaler(mean=0, stdev=1)
X = preprocessor.fit_transform(X)

In [10]:
# Spot-check the scaling on one feature: print that column's mean and standard deviation
valence_col = X['valence_composition_average']
print(valence_col.mean(), valence_col.std())

-0.25092935968119484 0.0006533240342396751


In [11]:
# Display the feature matrix after the scaling step.
X

Unnamed: 0,valence_composition_average,phi_composition_average,NdValence_composition_average,MiracleRadius_composition_average,GSestFCClatcnt_composition_average,SecondIonizationEnergy_composition_average,IonizationEnergy_composition_average,n_ws^third_composition_average,ThermalConductivity_composition_average,CovalentRadius_composition_average,...,Site2_MendeleevNumber,Site2_ElasticModulus,Site2_Electronegativity,Site2_AtomicWeight,Site2_HeatFusion,Site2_SpecificHeatCapacity,Site2_AtomicRadii,Site2_ThermalExpansionCoefficient,Site2_BCCefflatcnt,Site2_AtomicVolume
0,-0.250929,-0.250719,-0.247497,-0.171075,-0.250903,-0.240944,0.163697,-0.252424,-0.008537,-0.170505,...,-0.216130,-0.207575,-0.252099,-0.191681,-0.246755,-0.253066,-0.252376,-0.242421,-0.249564,-0.243461
1,-0.250929,-0.250505,-0.248352,-0.176493,-0.251068,-0.242207,0.171396,-0.252313,-0.102353,-0.175923,...,-0.220122,-0.134575,-0.252128,-0.219590,-0.243961,-0.252960,-0.252485,-0.245786,-0.250059,-0.246929
2,-0.250929,-0.250633,-0.248922,-0.175067,-0.251039,-0.242367,0.141398,-0.252319,-0.104149,-0.172216,...,-0.225255,-0.105489,-0.252253,-0.223546,-0.241223,-0.252944,-0.252488,-0.250405,-0.249987,-0.246303
3,-0.250929,-0.250690,-0.247497,-0.176208,-0.251043,-0.241285,0.167803,-0.252393,-0.016521,-0.174212,...,-0.216700,-0.182481,-0.252116,-0.216959,-0.245444,-0.252980,-0.252471,-0.243790,-0.249996,-0.246453
4,-0.250929,-0.250554,-0.248637,-0.176493,-0.251053,-0.242458,0.171767,-0.252307,-0.107999,-0.174212,...,-0.221833,-0.132864,-0.252156,-0.221351,-0.245324,-0.252944,-0.252492,-0.246470,-0.250030,-0.246483
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
403,-0.250633,-0.251061,-0.251489,-0.167368,-0.250719,-0.245372,0.124346,-0.252330,-0.231414,-0.156532,...,-0.226395,-0.193888,-0.252287,-0.200214,-0.236091,-0.253049,-0.252385,-0.249037,-0.249421,-0.242933
404,-0.250633,-0.251061,-0.251774,-0.166798,-0.250720,-0.249456,0.152006,-0.252333,-0.230330,-0.154821,...,-0.225825,-0.148833,-0.252344,-0.150003,-0.232344,-0.253120,-0.252384,-0.249607,-0.249425,-0.242908
405,-0.250919,-0.251132,-0.252059,-0.167653,-0.250752,-0.245583,0.122635,-0.252364,-0.240482,-0.157673,...,-0.228676,-0.190466,-0.252322,-0.225901,-0.245130,-0.252902,-0.252374,-0.248295,-0.249516,-0.243142
406,-0.250919,-0.251189,-0.252059,-0.163091,-0.250637,-0.245200,0.118072,-0.252384,-0.240168,-0.153396,...,-0.227536,-0.173927,-0.252458,-0.151405,-0.237687,-0.253120,-0.252308,-0.249835,-0.249181,-0.240500


In [None]:
# Define the two models and the two feature selector types to perform.
# Both lists are passed to the data splitters in the cells that follow.

model1 = SklearnModel(model='KernelRidge', kernel='rbf')
model2 = SklearnModel(model='LinearRegression')
models = [model1, model2]

# selector1 applies no feature selection; selector2 keeps k_features=10 features
# using a random forest model.
# The forest is seeded so repeated runs of the notebook select the same features.
# random_state is given as an extra keyword on SklearnModel in the same way
# kernel='rbf' is given for KernelRidge above.
selector1 = NoSelect()
selector2 = EnsembleModelFeatureSelector(model=SklearnModel(model='RandomForestRegressor', random_state=0),
                                         k_features=10)
selectors = [selector1, selector2]


In [None]:
# Full-fit case: use the NoSplit splitter (no data split is performed) and pass it
# the full X and y together with the models and selectors lists; savepath is
# supplied for the run's output.

splitter = NoSplit()
splitter.evaluate(X=X, y=y, models=models, selectors=selectors, savepath=savepath)

In [None]:
# Define and run the case where random KFold CV is performed.
# shuffle=True randomizes fold membership, so random_state is fixed to make the
# folds (and therefore the reported CV results) reproducible across runs. It is
# given as an extra keyword in the same way shuffle is.

splitter = SklearnDataSplitter(splitter='KFold', shuffle=True, random_state=0)
splitter.evaluate(X=X,
                  y=y,
                  models=models,
                  selectors=selectors,
                  savepath=savepath)

In [None]:
# Print the installed versions of the main packages on one line, in the order:
# sklearn, pandas, numpy, matplotlib, matminer, pymatgen
import sklearn, pandas, numpy, matplotlib, pymatgen, matminer
print(*(pkg.__version__ for pkg in (sklearn, pandas, numpy, matplotlib, matminer, pymatgen)))

In [None]:
# Report the Python interpreter that is actually running this kernel.
# sys.executable is used instead of the shell's `which python`, which only shows
# the first `python` on PATH (not necessarily the kernel's interpreter) and is
# unavailable on some platforms.
import sys

kernel_python = sys.executable
print(kernel_python)