# **Superconductivity Regression Notebook**
Trains models to predict critical temperatures based on features found with *get_featurizers.ipynb*.

*Author: Kirk Kleinsasser*

In [None]:
import dill
#dill.load_session('../data/latest-run.db') #this can load a saved python session so I don't need to rerun computationally expensive cells
%autosave 300 
#autosaves code every five minutes

In [3]:
#general imports:
import warnings #to suppress grid search warnings
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns #heatmaps

#regression models:
from mlens.ensemble import SuperLearner
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor, BaggingRegressor, ExtraTreesRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from xgboost import XGBRegressor

#various ML tools:
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_predict, cross_val_score
from sklearn.metrics import accuracy_score, recall_score, r2_score, mean_absolute_error, mean_squared_error
from skopt import BayesSearchCV #bayesian optimization

#imports the data produced by get_featurizers. Wrapped in a function because some models may want the infinite values kept, while others need them replaced:
def import_data(replace_inf=False):
    """Load the featurized dataset and publish the cleaned data and its train/test split as globals.

    Reads 'supercon_feat.csv', pops the 'Tc' column as the regression target,
    drops the 'name', 'Unnamed: 0' and 'composition' columns plus every column
    that is entirely NaN, coerces each remaining column to float (entries that
    cannot be parsed, and NaN, become 0) and makes a shuffled 85/15 split.

    Args:
        replace_inf: When True, +/-inf entries are converted to NaN first, so
            they end up as 0 after the fill. When False, infinities are left
            in the data.

    Side effects:
        Rebinds the module-level names data, target, train_data, test_data,
        train_target and test_target. Nothing is returned.
    """
    global data, target, train_data, test_data, train_target, test_target #variables that we want to define globally
    data = pd.read_csv('supercon_feat.csv') #read_csv already returns a DataFrame
    target = data.pop('Tc')

    #TODO: add feature for infinite values or otherwise handle for models that cannot handle infinite data
    if replace_inf:
        data = data.replace([np.inf, -np.inf], np.nan)

    #TODO: debug featurizers - NaN is entered when there is an error in the featurizer
    data = data.drop(['name','Unnamed: 0', 'composition'], axis=1)
    #drop columns that are entirely NaN (the original author counted 12 of them).
    #.copy() gives an independent frame, so the per-column writes below do not hit a
    #column-subset of the previous frame (which raises SettingWithCopyWarning)
    data = data[data.columns[data.notnull().any()]].copy()

    for col in data:
        data[col] = pd.to_numeric(data[col], errors='coerce').fillna(0).astype('float')

    #test train split, with shuffle and random state for reproducibility
    train_data, test_data, train_target, test_target = train_test_split(data, target, test_size=0.15, random_state=43, shuffle=True)


## Data Analysis

In [None]:
import_data()
#make a correlation matrix
#assign() returns a new frame, so the extra 'target' column cannot leak into the global
#feature table `data` (pd.DataFrame(data) is not guaranteed to be an independent copy)
matrix_data = data.assign(target=target.values)
corr_matrix = matrix_data.corr()
fig, ax = plt.subplots(figsize=(36, 36), dpi = 480)
#draw explicitly on the axes created above instead of relying on the implicit current axes
ax = sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="YlGnBu", ax=ax)
#NOTE(review): widening the y-limits by half a cell looks like the workaround for heatmaps
#whose first/last rows get clipped on some matplotlib versions - confirm it is still needed
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)

#set various colors
ax.tick_params(colors='grey', which='both')

#save heatmap
figure = ax.get_figure()
figure.savefig('../data/feature_heatmap.png', dpi=480)

## Linear Regression

In [5]:
#reload the data, this time with the infinities replaced
import_data(replace_inf=True)

#fit on the training split, then predict the held-out split
linear = LinearRegression().fit(train_data, train_target)
linear_pred = linear.predict(test_data)

#score the held-out predictions
r_squared = r2_score(test_target, linear_pred)
mse = mean_squared_error(test_target, linear_pred)

print("MSE:", mse)
print("R2 :", r_squared)

MSE: 442.16314939573556
R2 : 0.3918631341771974


## Support Vector Regression

In [None]:
#start searching for the best nonlinear model with grid search

#parameter grid
parameters = {
    "kernel": ["rbf"],
    "C": [1, 10, 100, 1000],
    "epsilon": [0.001, 0.01, 0.1, 1, 10, 100],
    "gamma": [0.001, 0.01, 0.1, 1]
}

#define model, do grid search
reg = SVR(gamma = 'auto') #placeholder gamma only; every grid candidate overrides it
model = GridSearchCV(reg,                    #model
                   param_grid = parameters,   #hyperparameters
                   scoring="neg_mean_squared_error",        #metric for scoring (negated MSE, so higher is better)
                   return_train_score=False,
                   #cv=10,                   #number of folds (commented out, so the library default applies)
                   n_jobs = -1,
                   verbose = 0)

#fit the models
model.fit(train_data,train_target)

print("\nTuned Hyperparameters :", model.best_params_,"\nBest Estimator:",model.best_estimator_)
#with this scorer best_score_ is the mean cross-validated *negative* MSE; flip the sign so the value matches its label
print("MSE :",-model.best_score_)

In [None]:
#refit an RBF-kernel SVR with fixed hyperparameters and score it on the held-out split
svr = SVR(kernel='rbf', C=1, epsilon=0.001, gamma=0.1).fit(train_data, train_target)
svr_pred = svr.predict(test_data)

r_squared = r2_score(test_target, svr_pred)
mse = mean_squared_error(test_target, svr_pred)

print("MSE:", mse)
print("R2 :", r_squared)

In [None]:
#start searching for the best linear model with grid search

#parameter grid
#gamma is deliberately not searched: it is a kernel coefficient for the rbf/poly/sigmoid
#kernels and has no effect on the linear kernel, so listing it only multiplied the number
#of (identical) fits by four
parameters = {
    "kernel": ["linear"],
    "C": [1, 10, 100, 1000],
    "epsilon": [0.001, 0.01, 0.1, 1, 10, 100]
}

#define model, do grid search
reg = SVR()
model = GridSearchCV(reg,                    #model
                   param_grid = parameters,   #hyperparameters
                   scoring="neg_mean_squared_error",        #metric for scoring (negated MSE, so higher is better)
                   return_train_score=False,
                   #cv=10,                   #number of folds (commented out, so the library default applies)
                   n_jobs = -1,
                   verbose = 0)

#fit the models
model.fit(train_data,train_target)

print("\nTuned Hyperparameters :", model.best_params_,"\nBest Estimator:",model.best_estimator_)
#with this scorer best_score_ is the mean cross-validated *negative* MSE; flip the sign so the value matches its label
print("MSE :",-model.best_score_)

In [None]:
#linear-kernel SVR with fixed hyperparameters, trained on the training split
svr = SVR(C=1, epsilon=10, gamma='auto', kernel='linear')
svr.fit(train_data, train_target)

#evaluate on the held-out split
svr_pred = svr.predict(test_data)
mse, r_squared = mean_squared_error(test_target, svr_pred), r2_score(test_target, svr_pred)

print("MSE:", mse)
print("R2 :", r_squared)

## Random Forest Regressor

In [None]:
#parameter grid for the random forest search
parameters = {"n_estimators": [100, 200, 500],
                   "max_depth": [None, 2, 3, 5],
                   "min_samples_split": [2, 5, 10],
                   "min_samples_leaf": [1, 2, 5]}

#define model, do grid search
reg = RandomForestRegressor()
model = GridSearchCV(reg,                    #model
                   param_grid = parameters,   #hyperparameters
                   scoring="neg_mean_squared_error",        #metric for scoring (negated MSE, so higher is better)
                   return_train_score=False,
                   #cv=10,                   #number of folds (commented out, so the library default applies)
                   n_jobs = -1,
                   verbose = 0)

#fit the models
#uses the split published by import_data(); the reg_train_* names used here before are
#not defined anywhere in this notebook
model.fit(train_data,train_target)

print("\nTuned Hyperparameters :", model.best_params_,"\nBest Estimator:",model.best_estimator_)
#with this scorer best_score_ is the mean cross-validated *negative* MSE; flip the sign so the value matches its label
print("MSE :",-model.best_score_)

In [None]:
#not using grid result b/c I don't want to overfit. We get almost identical results with n_estimators = default (100)
rfr = RandomForestRegressor()
#train and evaluate on the split published by import_data(); the reg_* names used here
#before are not defined anywhere in this notebook
rfr.fit(train_data, train_target)
rfr_pred = rfr.predict(test_data)

mse = mean_squared_error(test_target, rfr_pred)
r_squared = r2_score(test_target, rfr_pred)

print("MSE:", mse)
print("R2 :", r_squared)

## Superlearner

In [6]:
# create a list of base-models
def get_models():
    """Return a list of nine fresh, unfitted base regressors for the ensemble.

    The list order is: linear regression, elastic net, linear-kernel SVR,
    decision tree, k-nearest neighbours, AdaBoost, bagging, random forest,
    extra trees. All use default settings except the SVR.
    """
    return [
        LinearRegression(),
        ElasticNet(),
        SVR(C=1, epsilon=10, gamma='auto', kernel='linear'),
        DecisionTreeRegressor(),
        KNeighborsRegressor(),
        AdaBoostRegressor(),
        BaggingRegressor(),
        RandomForestRegressor(),
        ExtraTreesRegressor(),
    ]

In [7]:
def get_super_learner(X):
    """Build an unfitted SuperLearner stacking the base models under a linear meta model.

    Args:
        X: The training features. Only len(X) is used, passed on as the
            ensemble's sample_size.

    Returns:
        A SuperLearner scored with r2_score, using 10 shuffled folds, with the
        models from get_models() added as base learners and a LinearRegression
        added as the meta learner.
    """
    learner = SuperLearner(scorer=r2_score, folds=10, shuffle=True, sample_size=len(X))
    # base learners first, then the meta model that combines their predictions
    learner.add(get_models())
    learner.add_meta(LinearRegression())
    return learner


ensemble = get_super_learner(train_data)

In [9]:
#got tired of non-converging errors. Note: this silences every warning for the rest of the session
warnings.filterwarnings('ignore')

#fit the super learner
ensemble.fit(train_data.values,train_target.values)
#summarize base learners
print(ensemble.data)

#predict on plain arrays too, so predict receives the same input type fit was given
train_pred = ensemble.predict(train_data.values)
test_pred = ensemble.predict(test_data.values)

#obtain scores for the model
training_r2 = r2_score(train_target,train_pred)
test_r2 = r2_score(test_target,test_pred)

training_mse = mean_squared_error(train_target,train_pred)
test_mse = mean_squared_error(test_target,test_pred)

#the second figure in each pair is computed on the held-out test split, not by
#cross-validation, so it is labelled "Test" rather than "CV"
print("\nTraining R2",training_r2,"\nTest R2",test_r2)

print("\nTraining MSE",training_mse,"\nTest MSE",test_mse)

KeyboardInterrupt: 

In [None]:
#bar chart of hard-coded per-model scores, drawn with white text on a dark figure background
with plt.rc_context({
    'xtick.color':'white',
    'ytick.color':'white',
    'axes.titlecolor':'white',
    'figure.facecolor':'#1e1e1e',
    'text.color':'white',
}):
    #model labels and the score plotted for each one (same order in both lists)
    x = ['adaboost', 'dt', 'bag', 'elastic', 'extr', 'kn', 'linear','rfr', 'svm']
    energy = [78, 83, 89, 59, 91, 66, 59, 90, 58]

    #one bar position per model label
    x_pos = list(range(len(x)))

    plt.bar(x_pos, energy, color='green')
    plt.title("Regression Model Scores")
    plt.xlabel("Model",c='white')
    plt.ylabel("R2 Score",c='white')
    plt.xticks(x_pos, x)

    plt.savefig('../data/model.png')

In [None]:
#saves the session to the same path that the commented-out dill.load_session call in the first cell reads
dill.dump_session('../data/latest-run.db') #this can dump a python session so I can resume later, after restarts and such