In [1]:
# Load sklearn modules
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.base import TransformerMixin, BaseEstimator

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR, LinearSVR

from sklearn.decomposition import PCA, NMF
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RepeatedKFold, cross_val_score, cross_val_predict, train_test_split, GridSearchCV, RandomizedSearchCV

import numpy as np
from pandas import DataFrame
from scipy.stats import randint as sp_randint

# Load featurizers and conversion functions
from matminer.featurizers.composition import ElementProperty, OxidationStates
from matminer.featurizers.structure import DensityFeatures
from matminer.featurizers.conversions import CompositionToOxidComposition, StrToComposition

## Load Dataset

In [2]:
from matminer.datasets.convenience_loaders import load_elastic_tensor

# Columns of the elastic-tensor dataset that are removed right after loading.
unwanted_columns = [
    "volume", "nsites", "compliance_tensor", "elastic_tensor",
    "elastic_tensor_original", "K_Voigt", "G_Voigt", "K_Reuss", "G_Reuss",
]

# Load the dataset as a pandas DataFrame and discard the unwanted columns in one step.
df = load_elastic_tensor().drop(columns=unwanted_columns)

Fetching elastic_tensor_2015.json.gz from https://ndownloader.figshare.com/files/13220603 to C:\Users\tikam\AppData\Local\Programs\Python\Python39\lib\site-packages\matminer\datasets\elastic_tensor_2015.json.gz


Fetching https://ndownloader.figshare.com/files/13220603 in MB: 1.1182079999999999MB [00:00, 62.46MB/s]                


In [3]:
# Separate out the values to be estimated: K_VRH is used as the regression
# target in the cells below, so keep it as an array before df is featurized.
y = df['K_VRH'].values

## Data Preprocessing

In [None]:
# Two chained conversions: first featurize the "formula" column, then the
# "composition" column (presumably produced by the first step -- it is not
# part of the loaded frame as far as this notebook shows).
df = (
    df
    .pipe(StrToComposition().featurize_dataframe, "formula")
    .pipe(CompositionToOxidComposition().featurize_dataframe, "composition")
)

StrToComposition:   0%|          | 0/1181 [00:00<?, ?it/s]

CompositionToOxidComposition:   0%|          | 0/1181 [00:00<?, ?it/s]

In [None]:
from matminer.utils.pipeline import DropExcluded, ItemSelector

## Making a FeatureUnion pipeline for the featurizers

In [None]:
# Columns to remove before regression: the target (K_VRH), the other elastic
# properties, the identifiers, and the raw object columns that the branches
# below consume through ItemSelector.
excluded = ["G_VRH", "K_VRH", "elastic_anisotropy", "formula", "material_id",
            "poisson_ratio", "structure", "composition", "composition_oxid"]

# Featurization transformations. FeatureUnion runs every branch on the same
# input frame and concatenates the outputs column-wise.
featurizer = FeatureUnion(
    transformer_list=[
        # Presumably passes through whatever columns remain once `excluded`
        # has been dropped -- confirm against matminer's DropExcluded.
        ('drop', DropExcluded(excluded)),
        # Density features computed from the "structure" column.
        ('density', Pipeline([
            ('select', ItemSelector("structure")),
            ('density_feat', DensityFeatures())
        ])),
        # Element-property features (magpie preset) from the "composition" column.
        # The step is named 'element_feat' (it used to be a copy-pasted
        # 'oxidation_feat') so that parameter paths such as
        # element__element_feat__<param> describe what the step actually is.
        ('element', Pipeline([
            ('select', ItemSelector("composition")),
            ('element_feat', ElementProperty.from_preset(preset_name="magpie"))
        ])),
        # Oxidation-state features from the "composition_oxid" column.
        ('oxidation', Pipeline([
            ('select', ItemSelector("composition_oxid")),
            ('oxidation_feat', OxidationStates())
        ])),
    ]
)

## Making a Regression Pipeline

In [None]:
# Featurize + ordinary least squares, fitted on the full dataset.
pipeline = Pipeline(steps=[
    ('featurize', featurizer),
    ('regress', LinearRegression()),
])
pipeline.fit(df, y)

# Training-set fit statistics (not a generalization estimate: the model is
# scored on the same rows it was fitted on).
train_r2 = round(pipeline.score(df, y), 3)
train_mse = mean_squared_error(y_true=y, y_pred=pipeline.predict(df))
print('training R2 = ' + str(train_r2))
print('training RMSE = %.3f' % np.sqrt(train_mse))

## Making a Random Forest Pipeline

In [None]:
# Same featurization as before, now feeding a 50-tree random forest with a
# fixed seed; this rebinds `pipeline` to the new model.
forest_steps = [
    ('featurize', featurizer),
    ('regress', RandomForestRegressor(n_estimators=50, random_state=1)),
]
pipeline = Pipeline(forest_steps)
pipeline.fit(df, y)

# Fit statistics on the training data itself.
r2_on_training = pipeline.score(df, y)
rmse_on_training = np.sqrt(mean_squared_error(y_true=y, y_pred=pipeline.predict(df)))
print('training R2 = ' + str(round(r2_on_training, 3)))
print('training RMSE = %.3f' % rmse_on_training)

## Cross Validation

In [None]:
# Feature matrix used by the cross-validation and search cells.
# fit_transform (rather than a bare transform) keeps this cell from depending
# on `featurizer` having been fitted as a side effect of an earlier
# pipeline.fit call, so it still works if those cells are skipped or reordered.
X = featurizer.fit_transform(df)

In [None]:
# Splitter shared by the cross-validation cells: 5 folds repeated 3 times
# (15 train/test splits in total), seeded so the splits are reproducible.
crossvalidation = RepeatedKFold(n_splits=5, n_repeats=3, random_state=1)

In [None]:
# Cross-validated scores for the linear regression model.
lr = LinearRegression()

# scikit-learn reports MSE negated (so that larger is always better); flip the
# sign back before taking the square root to get one RMSE per split.
scores = cross_val_score(lr, X, y, scoring='neg_mean_squared_error', cv=crossvalidation, n_jobs=1)
rmse_scores = np.sqrt(-scores)
r2_scores = cross_val_score(lr, X, y, scoring='r2', cv=crossvalidation, n_jobs=1)

print('Cross-validation results:')
# R2 is a signed score: a negative value means the model is worse than
# predicting the mean. Average the raw values -- taking abs() first would turn
# bad splits into good-looking ones and inflate the reported mean.
print('Folds: %i, mean R2: %.3f' % (len(scores), np.mean(r2_scores)))
print('Folds: %i, mean RMSE: %.3f' % (len(scores), np.mean(rmse_scores)))

In [None]:
# Compute cross-validation scores for the random forest model.
rf = RandomForestRegressor(n_estimators=50, random_state=1)

r2_scores = cross_val_score(rf, X, y, scoring='r2', cv=crossvalidation, n_jobs=1)
# MSE comes back negated from scikit-learn; negate it again to recover the
# per-split RMSE.
scores = cross_val_score(rf, X, y, scoring='neg_mean_squared_error', cv=crossvalidation, n_jobs=1)
rmse_scores = np.sqrt(-scores)

print('Cross-validation results:')
# Report the signed mean of R2: wrapping the scores in abs() would hide any
# split where the model does worse than a constant prediction (R2 < 0).
print('Folds: %i, mean R2: %.3f' % (len(scores), np.mean(r2_scores)))
print('Folds: %i, mean RMSE: %.3f' % (len(scores), np.mean(rmse_scores)))

## Model Selection with Grid Search

In [None]:
# Hold out 20% of the featurized samples as a test set for the search cells
# below; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

In [None]:
# Exhaustive search over the forest size; the n_estimators given to the
# constructor is overridden by each candidate from the grid.
param_grid = [
    {'n_estimators': [10, 15, 20, 25, 30, 50, 100]},
]
rf = RandomForestRegressor(n_estimators=50, random_state=1)
gs = GridSearchCV(estimator=rf, param_grid=param_grid, n_jobs=4, cv=5)
gs.fit(X_train, y_train)

# Printed in order: best mean cross-validated score, the parameters that
# achieved it, and the score of the search object on the held-out test set.
for result in (gs.best_score_, gs.best_params_, gs.score(X_test, y_test)):
    print(result)

## Random Search

In [None]:
# Scale the features, then fit a random forest whose size is tuned by the search.
pipe = Pipeline([
    ('scale', StandardScaler()),
    ('regress', RandomForestRegressor(random_state=1)),
])

# n_estimators is sampled uniformly from the integers in [10, 150).
param_dist = {'regress__n_estimators': sp_randint(10, 150)}

# random_state fixes which candidates are drawn from param_dist; without it
# every run samples different forest sizes and the numbers printed below are
# not reproducible.
gs = RandomizedSearchCV(pipe, param_dist, cv=crossvalidation, n_jobs=-1, random_state=1)
gs.fit(X_train, y_train)

print('best crossval score ' + str(round(gs.best_score_, 3)))
print('best params ' + str(gs.best_params_))

# Predict once per split and reuse the result for the RMSE lines.
train_pred = gs.predict(X_train)
test_pred = gs.predict(X_test)

# get fit statistics
print('training R2 = ' + str(round(gs.score(X_train, y_train), 3)))
print('training RMSE = %.3f' % np.sqrt(mean_squared_error(y_true=y_train, y_pred=train_pred)))
print('test R2 = ' + str(round(gs.score(X_test, y_test), 3)))
print('test RMSE = %.3f' % np.sqrt(mean_squared_error(y_true=y_test, y_pred=test_pred)))