In [1]:
# Import needed MASTML modules

from mastml.mastml import Mastml
from mastml.datasets import LocalDatasets, SklearnDatasets, FoundryDatasets, FigshareDatasets, DataUtilities, DataCleaning
from mastml.preprocessing import SklearnPreprocessor, MeanStdevScaler
from mastml.models import SklearnModel
from mastml.data_splitters import SklearnDataSplitter, NoSplit
from mastml.plots import Histogram, Scatter
from mastml.feature_selectors import EnsembleModelFeatureSelector, NoSelect, PearsonSelector, MASTMLFeatureSelector
from mastml.feature_generators import ElementalFeatureGenerator

In [2]:
# Create the MAST-ML run object for the 'results/test_output' directory, then keep
# handles to its save path and run metadata. savepath is passed to the evaluate()
# calls in the cells below.

mastml = Mastml(savepath='results/test_output')
savepath, mastml_metadata = mastml.get_savepath, mastml.get_mastml_metadata

results/test_output not empty. Renaming...


In [3]:
# Download diffusion data from Figshare (done previously)

#article_id = 7418492
#FigshareDatasets().download_data(article_id=article_id)

In [4]:
# Read the diffusion dataset (previously downloaded from Figshare) from the local
# Excel file. 'Material compositions 1' serves as the group column and is also
# listed in extra_columns.

extra_columns = [
    'E_regression',
    'Material compositions 1',
    'Material compositions 2',
    'Hop activation barrier',
]
target = 'E_regression.1'

d = LocalDatasets(
    file_path='figshare_7418492/All_Model_Data_Missing.xlsx',
    target=target,
    extra_columns=extra_columns,
    group_column='Material compositions 1',
    as_frame=True,
)
X, y, groups = d.load_data()




In [5]:
# Display the feature table returned by load_data() as this cell's output
X

Unnamed: 0,valence_composition_average,phi_composition_average,NdValence_composition_average,MiracleRadius_composition_average,GSestFCClatcnt_composition_average,SecondIonizationEnergy_composition_average,IonizationEnergy_composition_average,n_ws^third_composition_average,ThermalConductivity_composition_average,CovalentRadius_composition_average,...,Site2_MendeleevNumber,Site2_ElasticModulus,Site2_Electronegativity,Site2_AtomicWeight,Site2_HeatFusion,Site2_SpecificHeatCapacity,Site2_AtomicRadii,Site2_ThermalExpansionCoefficient,Site2_BCCefflatcnt,Site2_AtomicVolume
0,,4.350,10.0,144.0,4.027313,21.4900,731.00,1.360,429.00,145.0,...,65,80.0,1.93,107.868200,11.30,0.235,1.444,18.9,6.375951,17.075648
1,,4.725,8.5,134.5,3.737485,19.2750,744.50,1.555,264.50,135.5,...,58,208.0,1.88,58.933195,16.20,0.421,1.253,13.0,5.507318,10.995861
2,,4.500,7.5,137.0,3.788936,18.9950,691.90,1.545,261.35,142.0,...,49,259.0,1.66,51.996100,21.00,0.449,1.249,4.9,5.632801,12.092937
3,,4.400,10.0,135.0,3.782567,20.8910,738.20,1.415,415.00,138.5,...,64,124.0,1.90,63.546000,13.60,0.385,1.278,16.5,5.617632,11.829942
4,,4.640,8.0,134.5,3.764269,18.8350,745.15,1.565,254.60,138.5,...,55,211.0,1.83,55.845000,13.81,0.449,1.241,11.8,5.557847,11.777365
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
403,4.5,3.750,3.0,150.5,4.350534,13.7250,662.00,1.525,38.20,169.5,...,47,104.0,1.60,92.906380,30.00,0.265,1.429,7.3,6.625794,18.002133
404,4.5,3.750,2.5,151.5,4.348236,6.5650,710.50,1.520,40.10,172.5,...,48,183.0,1.50,180.947880,36.57,0.140,1.430,6.3,6.618497,18.046730
405,4.0,3.625,2.0,150.0,4.291886,13.3550,659.00,1.465,22.30,167.5,...,43,110.0,1.54,47.867000,14.15,0.523,1.448,8.6,6.458833,17.636317
406,4.0,3.525,2.0,158.0,4.494239,14.0275,651.00,1.430,22.85,175.0,...,45,139.0,1.30,178.490000,27.20,0.140,1.564,5.9,7.046762,22.268711


In [6]:
# Display the group labels returned by load_data() as this cell's output
groups

0      Ag
1      Ag
2      Ag
3      Ag
4      Ag
       ..
403    Zr
404    Zr
405    Zr
406    Zr
407    Zr
Name: Material compositions 1, Length: 408, dtype: object

In [7]:
# The loaded data has missing values. Run the MAST-ML data cleaner with mean
# imputation to fill them in; the cleaned X and y replace the loaded ones.

cleaner = DataCleaning()
X, y = cleaner.evaluate(
    X=X,
    y=y,
    method='imputation',
    strategy='mean',
    savepath=savepath,
)

In [8]:
# Display the feature table again, now as returned by DataCleaning.evaluate()
X

Unnamed: 0,valence_composition_average,phi_composition_average,NdValence_composition_average,MiracleRadius_composition_average,GSestFCClatcnt_composition_average,SecondIonizationEnergy_composition_average,IonizationEnergy_composition_average,n_ws^third_composition_average,ThermalConductivity_composition_average,CovalentRadius_composition_average,...,Site2_MendeleevNumber,Site2_ElasticModulus,Site2_Electronegativity,Site2_AtomicWeight,Site2_HeatFusion,Site2_SpecificHeatCapacity,Site2_AtomicRadii,Site2_ThermalExpansionCoefficient,Site2_BCCefflatcnt,Site2_AtomicVolume
0,3.981108,4.350,10.0,144.0,4.027313,21.4900,731.00,1.360,429.00,145.0,...,65.0,80.0,1.93,107.868200,11.30,0.235,1.444,18.9,6.375951,17.075648
1,3.981108,4.725,8.5,134.5,3.737485,19.2750,744.50,1.555,264.50,135.5,...,58.0,208.0,1.88,58.933195,16.20,0.421,1.253,13.0,5.507318,10.995861
2,3.981108,4.500,7.5,137.0,3.788936,18.9950,691.90,1.545,261.35,142.0,...,49.0,259.0,1.66,51.996100,21.00,0.449,1.249,4.9,5.632801,12.092937
3,3.981108,4.400,10.0,135.0,3.782567,20.8910,738.20,1.415,415.00,138.5,...,64.0,124.0,1.90,63.546000,13.60,0.385,1.278,16.5,5.617632,11.829942
4,3.981108,4.640,8.0,134.5,3.764269,18.8350,745.15,1.565,254.60,138.5,...,55.0,211.0,1.83,55.845000,13.81,0.449,1.241,11.8,5.557847,11.777365
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
403,4.500000,3.750,3.0,150.5,4.350534,13.7250,662.00,1.525,38.20,169.5,...,47.0,104.0,1.60,92.906380,30.00,0.265,1.429,7.3,6.625794,18.002133
404,4.500000,3.750,2.5,151.5,4.348236,6.5650,710.50,1.520,40.10,172.5,...,48.0,183.0,1.50,180.947880,36.57,0.140,1.430,6.3,6.618497,18.046730
405,4.000000,3.625,2.0,150.0,4.291886,13.3550,659.00,1.465,22.30,167.5,...,43.0,110.0,1.54,47.867000,14.15,0.523,1.448,8.6,6.458833,17.636317
406,4.000000,3.525,2.0,158.0,4.494239,14.0275,651.00,1.430,22.85,175.0,...,45.0,139.0,1.30,178.490000,27.20,0.140,1.564,5.9,7.046762,22.268711


In [9]:
# Normalize the cleaned features using the 'StandardScaler' preprocessor;
# the scaled table replaces X.

preprocessor = SklearnPreprocessor(
    preprocessor='StandardScaler',
    as_frame=True,
)
X = preprocessor.evaluate(X=X, savepath=savepath)

#preprocessor = MeanStdevScaler(mean=0, stdev=2, as_frame=True)
#X = preprocessor.evaluate(X=X,
#                         savepath=savepath)

In [10]:
#print(X['valence_composition_average'].mean(), X['valence_composition_average'].std())

In [11]:
#X

In [12]:
# Define the models, feature selectors, metrics and plots used by the splitter cells

# Fixed seed for the random-forest estimators so that repeated runs give the same
# fits. NOTE(review): this relies on SklearnModel forwarding extra keyword arguments
# to the scikit-learn estimator, as the commented KernelRidge(kernel='rbf') example
# below suggests -- confirm against the SklearnModel implementation.
RF_RANDOM_STATE = 42

#model1 = SklearnModel(model='KernelRidge', kernel='rbf')
model2 = SklearnModel(model='RandomForestRegressor', random_state=RF_RANDOM_STATE)
#model2 = SklearnModel(model='LinearRegression')
#models = [model1, model2]
models = [model2]

selector1 = NoSelect()
selector2 = EnsembleModelFeatureSelector(
    model=SklearnModel(model='RandomForestRegressor', random_state=RF_RANDOM_STATE),
    n_features_to_select=10,
)
selector3 = PearsonSelector(
    threshold_between_features=0.9,
    threshold_with_target=0.7,
    flag_highly_correlated_features=True,
    n_features_to_select=15,
)
selector4 = MASTMLFeatureSelector(
    model=SklearnModel(model='LinearRegression'),
    n_features_to_select=1,
)
#selectors = [selector1, selector2]
# Only selector4 (one feature kept) is active for this run
selectors = [selector4]

metrics = ['r2_score', 'mean_absolute_error', 'root_mean_squared_error', 'rmse_over_stdev']

plots = ['Scatter']

In [13]:
# Define and run the case where no data split is performed (full fit)
#
#splitter = NoSplit()
#splitter.evaluate(X=X,
#                  y=y, 
#                  models=models,
#                  selectors=selectors,
#                  metrics=metrics,
#                  savepath=savepath)

In [14]:
# Define and run the case where random RepeatedKFold CV is performed:
# 5 folds repeated twice. random_state is given alongside n_repeats/n_splits so
# that scikit-learn's RepeatedKFold assigns the same folds on every run.

splitter = SklearnDataSplitter(
    splitter='RepeatedKFold',
    n_repeats=2,
    n_splits=5,
    random_state=42,
)
splitter.evaluate(
    X=X,
    y=y,
    models=models,
    selectors=selectors,
    metrics=metrics,
    plots=plots,
    savepath=savepath,
)

after select
(326, 1) (82, 1)
after select
(326, 1) (82, 1)
after select
(326, 1) (82, 1)
after select
(327, 1) (81, 1)
after select
(327, 1) (81, 1)
after select
(326, 1) (82, 1)
after select
(326, 1) (82, 1)
after select
(326, 1) (82, 1)
after select
(327, 1) (81, 1)
after select
(327, 1) (81, 1)


In [None]:
# Define and run the case where LeaveOneGroupOut CV is performed, using the
# group labels loaded from the 'Material compositions 1' column

splitter = SklearnDataSplitter(splitter='LeaveOneGroupOut')
splitter.evaluate(
    X=X,
    y=y,
    groups=groups,
    models=models,
    selectors=selectors,
    metrics=metrics,
    savepath=savepath,
)

In [None]:
# Print the installed versions of the main packages on one line, in the order:
# sklearn, pandas, numpy, matplotlib, matminer, pymatgen
import sklearn, pandas, numpy, matplotlib, pymatgen, matminer
print(*(package.__version__
        for package in (sklearn, pandas, numpy, matplotlib, matminer, pymatgen)))

In [None]:
# Show which Python interpreter this kernel is running. sys.executable is the
# interpreter executing this cell; a shell `which python` reports whatever is
# first on PATH, which can differ from the kernel's interpreter, and the `which`
# command is not available on every platform.
import sys

kernel_python = sys.executable
print(kernel_python)

In [None]:
# Scratch check: build a one-row DataFrame of metric values (written to Excel and
# read back in the following cells).
import pandas as pd

# Named metric_values rather than `d`, so the LocalDatasets object `d` created
# when the data was loaded is not overwritten.
metric_values = {"mae": 1, "rmse": 0.5}
df = pd.DataFrame(metric_values, index=[0])
df

In [None]:
# Write the frame to test.xlsx without the index column. `index` is documented as
# a bool, so pass False explicitly instead of relying on None being falsy.
df.to_excel('test.xlsx', index=False)

In [None]:
# Read test.xlsx back and take its first row as a plain dict.
# The dict is stored as metrics_roundtrip rather than `d`, so the LocalDatasets
# object `d` created when the data was loaded is not overwritten.
df_import = pd.read_excel('test.xlsx')
metrics_roundtrip = df_import.to_dict('records')[0]
print(df_import)
print(metrics_roundtrip)

In [None]:
# Scratch check: for each distinct true value, find the prediction(s) closest to it.
import numpy as np

yt = np.array([1, 1, 2, 3])
yp = np.array([1.2, 0.9, 2, 3])
y_unique = np.unique(yt)
res = abs(yp - yt)
print(res)

# Closest prediction(s) for each distinct true value, keyed by that value.
best_pred_by_target = {}
# The loop variable is deliberately not called `y`: that name holds the regression
# target loaded earlier in the notebook and must not be overwritten.
for target_value in y_unique:
    in_group = yt == target_value
    group_pred = yp[in_group]
    group_res = res[in_group]
    best = group_res.min()
    # Match the best residual only inside this group. Comparing `res == best` over
    # the whole array would also pick up predictions belonging to other true values
    # that happen to have the same residual (here 2 and 3 both have residual 0).
    closest = group_pred[group_res == best]
    best_pred_by_target[int(target_value)] = closest
    print(target_value)
    print(group_pred)
    print(best)
    print(closest)

In [None]:
# Indices of the entries of yt that are smaller than 2 (tuple with one index array)
np.nonzero(yt < 2)

In [None]:
# Scratch check: walk four equal-length lists in lockstep and print one value from
# each per line. Descriptive names replace a/b/c/d so the LocalDatasets object `d`
# created when the data was loaded is not overwritten.
col_a = [1, 2, 3]
col_b = [4, 5, 6]
col_c = [7, 8, 9]
col_d = [10, 11, 12]

# A single four-way zip yields the same values, in the same order, as pairing
# zip(col_a, col_b) with zip(col_c, col_d) and unpacking the nested tuples.
rows = list(zip(col_a, col_b, col_c, col_d))
for i1, i2, j1, j2 in rows:
    print(i1, i2, j1, j2)