# Notebook for developing models

In [1]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn

## Data preparation for model training

### Importing data

In [2]:
# read in data
train = pd.read_csv("train.csv", delimiter=",")
test = pd.read_csv("test.csv", delimiter=",")
X_additional = pd.read_csv("additionalAttributes.csv", delimiter=",").drop("Unnamed: 0", axis=1)
X = train.drop(["formation_energy_ev_natom", "bandgap_energy_ev", "id"], axis=1)
y_fe = train.formation_energy_ev_natom
y_be = train.bandgap_energy_ev
X_full = pd.concat([X, X_additional], axis=1)

In [3]:
X_full

Unnamed: 0,spacegroup,number_of_total_atoms,percent_atom_al,percent_atom_ga,percent_atom_in,lattice_vector_1_ang,lattice_vector_2_ang,lattice_vector_3_ang,lattice_angle_alpha_degree,lattice_angle_beta_degree,...,distGaO,distInAl,distInGa,distInO,elInt,qAl,qGa,qIn,qO,xEq
0,33,80.0,0.6250,0.3750,0.0000,9.9523,8.5513,9.1775,90.0026,90.0023,...,2.025,,,,-41.086176,0.349004,0.345264,,-0.231734,4.896453
1,194,80.0,0.6250,0.3750,0.0000,6.1840,6.1838,23.6287,90.0186,89.9980,...,1.775,0.025,0.025,1.775,-38.554098,0.344690,0.347084,,-0.230392,4.905702
2,227,40.0,0.8125,0.1875,0.0000,9.7510,5.6595,13.9630,90.9688,91.1228,...,1.825,0.025,0.025,1.825,-33.957596,0.343019,0.339820,,-0.228280,4.925995
3,167,30.0,0.7500,0.0000,0.2500,5.0036,5.0034,13.5318,89.9888,90.0119,...,0.025,2.925,0.025,0.025,-35.763321,0.337616,,0.375759,-0.231435,4.898196
4,194,80.0,0.0000,0.6250,0.3750,6.6614,6.6612,24.5813,89.9960,90.0006,...,2.025,0.025,3.625,2.025,-36.645650,,0.342521,0.359175,-0.232511,4.869110
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2395,33,40.0,0.7500,0.2500,0.0000,4.9469,8.5014,9.1298,90.0038,90.0023,...,1.775,0.025,0.025,1.775,-39.262887,0.346869,0.345489,,-0.231016,4.905482
2396,167,30.0,0.4167,0.5833,0.0000,4.9566,4.9562,13.4178,89.9938,90.0075,...,2.025,0.025,0.025,2.025,-35.323868,0.345140,0.342118,,-0.228918,4.928134
2397,206,80.0,0.4375,0.5625,0.0000,9.2204,9.2200,9.2199,90.0047,90.0046,...,1.975,0.025,0.025,1.975,-38.566520,0.364599,0.330046,,-0.230109,4.873503
2398,33,80.0,0.3125,0.1875,0.5000,10.6529,9.0954,9.7210,90.0015,89.9996,...,2.125,3.375,3.525,2.125,-39.874295,0.335601,0.329533,0.372616,-0.235314,4.842888


### Remove nan values From data

In [4]:
X_full.columns

Index(['spacegroup', 'number_of_total_atoms', 'percent_atom_al',
       'percent_atom_ga', 'percent_atom_in', 'lattice_vector_1_ang',
       'lattice_vector_2_ang', 'lattice_vector_3_ang',
       'lattice_angle_alpha_degree', 'lattice_angle_beta_degree',
       'lattice_angle_gamma_degree', 'Vatom', 'cAlGa', 'cAlIn', 'cAlO',
       'cGaAl', 'cGaIn', 'cGaO', 'cInAl', 'cInGa', 'cInO', 'distAlGa',
       'distAlIn', 'distAlO', 'distGaAl', 'distGaIn', 'distGaO', 'distInAl',
       'distInGa', 'distInO', 'elInt', 'qAl', 'qGa', 'qIn', 'qO', 'xEq'],
      dtype='object')

* If distance in Nan, it is reasonable to set them to very large value as the atoms are infinitely far from each other
* If Coordination number is Nan then it is reasonable to set it as 0, as no atoms are in vicinity of chosen atom
* If any of the charges have Nan values it is set to 0

In [5]:
fillValues = {'cAlGa':0, 
              'cAlIn':0, 
              'cAlO':0,
              'cGaAl':0, 
              'cGaIn':0, 
              'cGaO':0, 
              'cInAl':0, 
              'cInGa':0, 
              'cInO':0,
              'distAlGa':100,
              'distAlIn':100,
              'distAlO':100,
              'distGaAl':100,
              'distGaIn':100,
              'distGaO':100,
              'distInAl':100,
              'distInGa':100,
              'distInO':100,
              'qAl':0, 
              'qGa':0, 
              'qIn':0, 
              'qO':0}

In [6]:
X_full = X_full.fillna(value=fillValues)
X_full

Unnamed: 0,spacegroup,number_of_total_atoms,percent_atom_al,percent_atom_ga,percent_atom_in,lattice_vector_1_ang,lattice_vector_2_ang,lattice_vector_3_ang,lattice_angle_alpha_degree,lattice_angle_beta_degree,...,distGaO,distInAl,distInGa,distInO,elInt,qAl,qGa,qIn,qO,xEq
0,33,80.0,0.6250,0.3750,0.0000,9.9523,8.5513,9.1775,90.0026,90.0023,...,2.025,100.000,100.000,100.000,-41.086176,0.349004,0.345264,0.000000,-0.231734,4.896453
1,194,80.0,0.6250,0.3750,0.0000,6.1840,6.1838,23.6287,90.0186,89.9980,...,1.775,0.025,0.025,1.775,-38.554098,0.344690,0.347084,0.000000,-0.230392,4.905702
2,227,40.0,0.8125,0.1875,0.0000,9.7510,5.6595,13.9630,90.9688,91.1228,...,1.825,0.025,0.025,1.825,-33.957596,0.343019,0.339820,0.000000,-0.228280,4.925995
3,167,30.0,0.7500,0.0000,0.2500,5.0036,5.0034,13.5318,89.9888,90.0119,...,0.025,2.925,0.025,0.025,-35.763321,0.337616,0.000000,0.375759,-0.231435,4.898196
4,194,80.0,0.0000,0.6250,0.3750,6.6614,6.6612,24.5813,89.9960,90.0006,...,2.025,0.025,3.625,2.025,-36.645650,0.000000,0.342521,0.359175,-0.232511,4.869110
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2395,33,40.0,0.7500,0.2500,0.0000,4.9469,8.5014,9.1298,90.0038,90.0023,...,1.775,0.025,0.025,1.775,-39.262887,0.346869,0.345489,0.000000,-0.231016,4.905482
2396,167,30.0,0.4167,0.5833,0.0000,4.9566,4.9562,13.4178,89.9938,90.0075,...,2.025,0.025,0.025,2.025,-35.323868,0.345140,0.342118,0.000000,-0.228918,4.928134
2397,206,80.0,0.4375,0.5625,0.0000,9.2204,9.2200,9.2199,90.0047,90.0046,...,1.975,0.025,0.025,1.975,-38.566520,0.364599,0.330046,0.000000,-0.230109,4.873503
2398,33,80.0,0.3125,0.1875,0.5000,10.6529,9.0954,9.7210,90.0015,89.9996,...,2.125,3.375,3.525,2.125,-39.874295,0.335601,0.329533,0.372616,-0.235314,4.842888


### Categorize spacegroups

In [7]:
# transform encoding
def encode_spacegroup(X):
    # 1-2 triclinic
    # 3-15 monoclinic
    # 16-74 orthorhombic
    # 75-142 tetragonal
    # 143-167 trigonal
    # 168-194 hexagonal
    # 195-230 cubic
    # [ 33 194 227 167 206  12] are the possible spacegroup values
    # onehot encode each separately
    return pd.get_dummies(X, columns=["spacegroup"])

In [8]:
X_full = encode_spacegroup(X_full)

In [9]:
X_full

Unnamed: 0,number_of_total_atoms,percent_atom_al,percent_atom_ga,percent_atom_in,lattice_vector_1_ang,lattice_vector_2_ang,lattice_vector_3_ang,lattice_angle_alpha_degree,lattice_angle_beta_degree,lattice_angle_gamma_degree,...,qGa,qIn,qO,xEq,spacegroup_12,spacegroup_33,spacegroup_167,spacegroup_194,spacegroup_206,spacegroup_227
0,80.0,0.6250,0.3750,0.0000,9.9523,8.5513,9.1775,90.0026,90.0023,90.0017,...,0.345264,0.000000,-0.231734,4.896453,0,1,0,0,0,0
1,80.0,0.6250,0.3750,0.0000,6.1840,6.1838,23.6287,90.0186,89.9980,120.0025,...,0.347084,0.000000,-0.230392,4.905702,0,0,0,1,0,0
2,40.0,0.8125,0.1875,0.0000,9.7510,5.6595,13.9630,90.9688,91.1228,30.5185,...,0.339820,0.000000,-0.228280,4.925995,0,0,0,0,0,1
3,30.0,0.7500,0.0000,0.2500,5.0036,5.0034,13.5318,89.9888,90.0119,120.0017,...,0.000000,0.375759,-0.231435,4.898196,0,0,1,0,0,0
4,80.0,0.0000,0.6250,0.3750,6.6614,6.6612,24.5813,89.9960,90.0006,119.9893,...,0.342521,0.359175,-0.232511,4.869110,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2395,40.0,0.7500,0.2500,0.0000,4.9469,8.5014,9.1298,90.0038,90.0023,90.0015,...,0.345489,0.000000,-0.231016,4.905482,0,1,0,0,0,0
2396,30.0,0.4167,0.5833,0.0000,4.9566,4.9562,13.4178,89.9938,90.0075,120.0007,...,0.342118,0.000000,-0.228918,4.928134,0,0,1,0,0,0
2397,80.0,0.4375,0.5625,0.0000,9.2204,9.2200,9.2199,90.0047,90.0046,89.9954,...,0.330046,0.000000,-0.230109,4.873503,0,0,0,0,1,0
2398,80.0,0.3125,0.1875,0.5000,10.6529,9.0954,9.7210,90.0015,89.9996,90.0004,...,0.329533,0.372616,-0.235314,4.842888,0,1,0,0,0,0


## Scale attributes to from 0 to 1

In [10]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(X_full)

X_fullMinMax = scaler.transform(X_full)

### Scale using Standard scaler

In [11]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_full)

X_fullStandard = scaler.transform(X_full)

### Separate into Training and validation

In [12]:
from sklearn.model_selection import train_test_split
from sklearn import metrics

trainSize = 0.85

############## Separate Train into Validation and Train for Formation energy ####################
#Without Scaling
#X_train_energ, X_val_energ, y_train_energ, y_val_energ = train_test_split(X_full, y_fe, train_size=trainSize, random_state=1)
#Min-Max Scaling
X_train_energ, X_val_energ, y_train_energ, y_val_energ = train_test_split(X_fullMinMax, y_fe, train_size=trainSize, random_state=1)
#Standard scaling
#X_train_energ, X_val_energ, y_train_energ, y_val_energ = train_test_split(X_fullStandard, y_fe, train_size=trainSize, random_state=1)


############## Separate Train into Validation and Train for Band Gap ####################
#Without Scaling
#X_train_gap, X_val_gap, y_train_gap, y_val_gap = train_test_split(X_full, y_be, train_size=trainSize, random_state=1)
#Min-Max Scaling
X_train_gap, X_val_gap, y_train_gap, y_val_gap = train_test_split(X_fullMinMax, y_be, train_size=trainSize, random_state=1)
#Standard scaling
#X_train_gap, X_val_gap, y_train_gap, y_val_gap = train_test_split(X_fullStandard, y_be, train_size=trainSize, random_state=1)


# Model training

In [13]:
from sklearn.metrics import mean_squared_log_error
def rmsle(y_true, y_pred):
    return np.sqrt(mean_squared_log_error(y_true, y_pred))

def rmsle_exp(y_true, y_pred, **kwargs):
    return np.sqrt(mean_squared_log_error(np.expm1(y_true), np.expm1(y_pred))) 
rmsle_scorer_exp = sklearn.metrics.make_scorer(rmsle_exp, greater_is_better=False)

## KNN

In [17]:
from sklearn.model_selection import cross_val_score
def evaluate_CV(model, X, y, metric=rmsle_scorer, n_folds=5, random_state=None):
    return -cross_val_score(model, X, y, cv=n_folds, scoring=metric).mean()

#### Formation energy

In [22]:
#################### KNN ##############################
from sklearn.neighbors import KNeighborsRegressor
results_df = pd.DataFrame(columns=['model',"Neigbors","Metric","weight",'trainError', 'valError','crossVal'])

for metr in ["manhattan", "minkowski"]:
    for i in range(3, 31, 2):
        for weight in ['uniform', 'distance']:
            knn = KNeighborsRegressor(n_neighbors=i).fit(X_train_energ, y_train_energ)

#            print("Neighbours " + str(i))
#            print("Metric " + metr)

            trainError = rmsle(y_train_energ, knn.predict(X_train_energ))
            valError = rmsle(y_val_energ, knn.predict(X_val_energ))
            cross_Val = evaluate_CV(knn, X_fullMinMax, y_fe)
            results_df = results_df.append({'model': 'KNN',"Neigbors":i,"Metric":metr,"weight":weight,"trainError":trainError, 'valError':valError, 'deltaErrors':abs(trainError-valError), 'crossVal':cross_Val}, ignore_index=True)


In [24]:
results_df.sort_values(by=["crossVal","valError", "deltaErrors"])

Unnamed: 0,model,Neigbors,Metric,weight,trainError,valError,crossVal,deltaErrors
6,KNN,9,manhattan,uniform,0.030834,0.033243,0.034815,0.002409
7,KNN,9,manhattan,distance,0.030834,0.033243,0.034815,0.002409
34,KNN,9,minkowski,uniform,0.030834,0.033243,0.034815,0.002409
35,KNN,9,minkowski,distance,0.030834,0.033243,0.034815,0.002409
4,KNN,7,manhattan,uniform,0.029769,0.033206,0.034843,0.003437
5,KNN,7,manhattan,distance,0.029769,0.033206,0.034843,0.003437
32,KNN,7,minkowski,uniform,0.029769,0.033206,0.034843,0.003437
33,KNN,7,minkowski,distance,0.029769,0.033206,0.034843,0.003437
8,KNN,11,manhattan,uniform,0.031826,0.033542,0.035039,0.001715
9,KNN,11,manhattan,distance,0.031826,0.033542,0.035039,0.001715


4	KNN	7	manhattan	uniform	NaN	0.029758	0.033206	0.003448

#### Band Gap

In [25]:
#################### KNN ##############################
from sklearn.neighbors import KNeighborsRegressor
results_df = pd.DataFrame(columns=['model',"Neigbors","Metric","weight",'trainError', 'valError'])

for metr in ["manhattan", "minkowski"]:
    for i in range(3, 31, 2):
        for weight in ['uniform', 'distance']:
            knn = KNeighborsRegressor(n_neighbors=i).fit(X_train_gap, y_train_gap)

#            print("Neighbours " + str(i))
#            print("Metric " + metr)

            trainError = rmsle(y_train_gap, knn.predict(X_train_gap))
            valError = rmsle(y_val_gap, knn.predict(X_val_gap))
            cross_Val = evaluate_CV(knn, X_fullMinMax, y_be)
            results_df = results_df.append({'model': 'KNN',"Neigbors":i,"Metric":metr,"weight":weight,"trainError":trainError, 'valError':valError, 'deltaErrors':abs(trainError-valError), 'crossVal':cross_Val}, ignore_index=True)


In [26]:
results_df.sort_values(by=["crossVal","valError", "deltaErrors"])

Unnamed: 0,model,Neigbors,Metric,weight,trainError,valError,crossVal,deltaErrors
4,KNN,7,manhattan,uniform,0.087694,0.091703,0.104764,0.004009
5,KNN,7,manhattan,distance,0.087694,0.091703,0.104764,0.004009
32,KNN,7,minkowski,uniform,0.087694,0.091703,0.104764,0.004009
33,KNN,7,minkowski,distance,0.087694,0.091703,0.104764,0.004009
2,KNN,5,manhattan,uniform,0.083395,0.095135,0.104778,0.01174
3,KNN,5,manhattan,distance,0.083395,0.095135,0.104778,0.01174
30,KNN,5,minkowski,uniform,0.083395,0.095135,0.104778,0.01174
31,KNN,5,minkowski,distance,0.083395,0.095135,0.104778,0.01174
0,KNN,3,manhattan,uniform,0.073825,0.100882,0.106668,0.027057
1,KNN,3,manhattan,distance,0.073825,0.100882,0.106668,0.027057


4	KNN	7	manhattan	uniform	NaN	0.087693	0.091661	0.003968

## SVM

#### Formation energy

In [27]:

################ SVR ##################################
from sklearn.svm import SVR

kernels = ['poly', 'rbf']
gammas=['scale', 'auto']
for ker in kernels:
    for gam in gammas:
        svm = SVR(kernel=ker, gamma=gam).fit(X_train_energ, y_train_energ)
        trainError = rmsle(y_train_energ, abs(svm.predict(X_train_energ)))
        valError = rmsle(y_val_energ, abs(svm.predict(X_val_energ)))
        cross_Val = evaluate_CV(svm, X_fullMinMax, y_fe)
        print("Kernel: " + ker)
        print("Gamma: "+ gam)
        print("Training error: "+str(trainError))
        print("Validation error: "+ str(valError))
        print("Cross Val: "+ str(cross_Val))

Kernel: poly
Gamma: scale
Training error: 0.045997317116021175
Validation error: 0.04671477427084467
Cross Val: 0.04732915601765719
Kernel: poly
Gamma: auto
Training error: 0.05416182217671211
Validation error: 0.053642699421981195
Cross Val: 0.05463414256402662
Kernel: rbf
Gamma: scale
Training error: 0.04646652513871875
Validation error: 0.046441673502856246
Cross Val: 0.04756780245129116
Kernel: rbf
Gamma: auto
Training error: 0.046751293936961445
Validation error: 0.04608426523986951
Cross Val: 0.04776180561131257


Kernel: rbf
Gamma: scale
Training error: 0.04645671148891898
Validation error: 0.04642970080484417

#### Band gap

In [28]:

################ SVR ##################################
from sklearn.svm import SVR

kernels = ['poly', 'rbf']
gammas=['scale', 'auto']
for ker in kernels:
    for gam in gammas:
        svm = SVR(kernel=ker, gamma=gam).fit(X_train_gap, y_train_gap)
        trainError = rmsle(y_train_gap, abs(svm.predict(X_train_gap)))
        valError = rmsle(y_val_gap, abs(svm.predict(X_val_gap)))
        cross_Val = evaluate_CV(svm, X_fullMinMax, y_be)
        print("#########################")
        print("Kernel: " + ker)
        print("Gamma: "+ gam)
        print("Training error: "+str(trainError))
        print("Validation error: "+ str(valError))
        print("Cross Val: "+ str(cross_Val))


#########################
Kernel: poly
Gamma: scale
Training error: 0.07575040507886405
Validation error: 0.07160947746484908
Cross Val: 0.08465286297886752
#########################
Kernel: poly
Gamma: auto
Training error: 0.162781233431358
Validation error: 0.1670097942693191
Cross Val: 0.16710570066265953
#########################
Kernel: rbf
Gamma: scale
Training error: 0.08127944103395272
Validation error: 0.07412610645456003
Cross Val: 0.08697960351607728
#########################
Kernel: rbf
Gamma: auto
Training error: 0.09657497005375883
Validation error: 0.08403335666208304
Cross Val: 0.09701846425998542


Kernel: poly
Gamma: scale
Training error: 0.07584564240632702
Validation error: 0.07156753875472792

### Random Forest

#### Formation energy

In [29]:
from sklearn.ensemble import RandomForestRegressor

In [30]:
############### Random Forest ##############################

n_estimatorss = [650,750,850,1000]
max_depths=[30,35,40,45]
min_samples_splits=[4,6,8]
seeds = [1]
results_df = pd.DataFrame(columns=['model',"seed","n_estimators","max_depth","min_samples_split",'TrainError', 'ValError', 'deltaErrors'])

for seed in seeds:
    for n_estimator in n_estimatorss:
        for max_d in max_depths:
            for min_ss in min_samples_splits:
                rf = RandomForestRegressor(criterion='mse', n_estimators=n_estimator, max_depth=max_d, min_samples_split=min_ss, random_state=seed).fit(X_train_energ, y_train_energ)
                trainError = rmsle(y_train_energ, (rf.predict(X_train_energ)))
                valError = rmsle(y_val_energ, (rf.predict(X_val_energ)))

                cross_Val = evaluate_CV(rf, X_fullMinMax, y_fe)
                results_df = results_df.append({'model': 'RF',"seed":seed,"n_estimators":n_estimator,"max_depth":max_d,"min_samples_split":min_ss,"TrainError":trainError, 'ValError':valError, 'deltaErrors':abs(trainError-valError),'crossVal':cross_Val}, ignore_index=True)


In [31]:
results_df.sort_values(by=["crossVal","ValError", "deltaErrors"])

Unnamed: 0,model,seed,n_estimators,max_depth,min_samples_split,TrainError,ValError,deltaErrors,crossVal
39,RF,1,1000,35,4,0.013313,0.030207,0.016894,0.031341
42,RF,1,1000,40,4,0.013313,0.030207,0.016894,0.031341
45,RF,1,1000,45,4,0.013313,0.030207,0.016894,0.031341
36,RF,1,1000,30,4,0.013312,0.030207,0.016894,0.031342
3,RF,1,650,35,4,0.013319,0.030156,0.016837,0.031344
6,RF,1,650,40,4,0.013319,0.030156,0.016837,0.031344
9,RF,1,650,45,4,0.013319,0.030156,0.016837,0.031344
0,RF,1,650,30,4,0.013318,0.030153,0.016835,0.031346
27,RF,1,850,35,4,0.01332,0.030145,0.016825,0.031347
30,RF,1,850,40,4,0.01332,0.030145,0.016825,0.031347


0	RF	1	850	30	4	0.013319	0.030144

#### Band Gap

In [32]:
############### Random Forest ##############################

n_estimatorss = [650,750,850,1000]
max_depths=[30,35,40,45,50]
min_samples_splits=[4,6,8]
seeds = [1]
results_df = pd.DataFrame(columns=['model',"seed","n_estimators","max_depth","min_samples_split",'TrainError', 'ValError', 'deltaErrors'])

for seed in seeds:
    for n_estimator in n_estimatorss:
        for max_d in max_depths:
            for min_ss in min_samples_splits:
                rf = RandomForestRegressor(criterion='mse', n_estimators=n_estimator, max_depth=max_d, min_samples_split=min_ss, random_state=seed).fit(X_train_gap, y_train_gap)
                trainError = rmsle(y_train_gap, (rf.predict(X_train_gap)))
                valError = rmsle(y_val_gap, (rf.predict(X_val_gap)))
                cross_Val = evaluate_CV(rf, X_fullMinMax, y_be)
                results_df = results_df.append({'model': 'RF',"seed":seed,"n_estimators":n_estimator,"max_depth":max_d,"min_samples_split":min_ss,"TrainError":trainError, 'ValError':valError, 'deltaErrors':abs(trainError-valError),'crossVal':cross_Val}, ignore_index=True)


In [33]:
results_df.sort_values(by=["crossVal","ValError", "deltaErrors"])

Unnamed: 0,model,seed,n_estimators,max_depth,min_samples_split,TrainError,ValError,deltaErrors,crossVal
31,RF,1,850,30,6,0.041993,0.080487,0.038494,0.089325
34,RF,1,850,35,6,0.041993,0.080487,0.038494,0.089325
37,RF,1,850,40,6,0.041993,0.080487,0.038494,0.089325
40,RF,1,850,45,6,0.041993,0.080487,0.038494,0.089325
43,RF,1,850,50,6,0.041993,0.080487,0.038494,0.089325
33,RF,1,850,35,4,0.038443,0.080184,0.041741,0.089339
36,RF,1,850,40,4,0.038443,0.080184,0.041741,0.089339
39,RF,1,850,45,4,0.038443,0.080184,0.041741,0.089339
42,RF,1,850,50,4,0.038443,0.080184,0.041741,0.089339
30,RF,1,850,30,4,0.038444,0.080191,0.041747,0.089339


### Neutral Net (Multi-Layer Perceptron regressor)

In [34]:
from sklearn.neural_network import MLPRegressor

#### Formation energy

In [35]:
############### ANN ##############################
hidden_layer_sizes = [(100,),(50,50), (50,100,50)]
activations = ["identity", "logistic", "tanh", "relu"]
solvers=['lbfgs','adam']
learning_rates = ["constant", "invscaling", "adaptive"]
seeds = [1]
results_df = pd.DataFrame(columns=['activation',"hidden_layer_size","solver","learning_rate",'TrainError', 'ValError', 'deltaErrors'])

for activation in activations:
    for solver in solvers:
        for learning_rate in learning_rates:
            for hidden_layer_size in hidden_layer_sizes:
                mlp = MLPRegressor(activation=activation, solver=solver, learning_rate=learning_rate, hidden_layer_sizes=hidden_layer_size).fit(X_train_energ, y_train_energ)
                trainError = rmsle(y_train_energ, abs(mlp.predict(X_train_energ)))
                valError = rmsle(y_val_energ, abs(mlp.predict(X_val_energ)))
                cross_Val = evaluate_CV(mlp, X_fullMinMax, y_fe)
                results_df = results_df.append({'activation':activation,"hidden_layer_size":hidden_layer_size,"solver":solver,"learning_rate":learning_rate,"TrainError":trainError, 'ValError':valError, 'deltaErrors':abs(trainError-valError),'crossVal':cross_Val}, ignore_index=True)


In [36]:
results_df.sort_values(by=["crossVal","ValError", "deltaErrors"])

Unnamed: 0,activation,hidden_layer_size,solver,learning_rate,TrainError,ValError,deltaErrors,crossVal
58,relu,"(50, 50)",lbfgs,invscaling,0.030703,0.030816,0.000114,0.031325
54,relu,"(100,)",lbfgs,constant,0.026771,0.030554,0.003783,0.031591
60,relu,"(100,)",lbfgs,adaptive,0.027959,0.030064,0.002105,0.031689
41,tanh,"(50, 100, 50)",lbfgs,invscaling,0.030880,0.030808,0.000072,0.031788
57,relu,"(100,)",lbfgs,invscaling,0.027824,0.030575,0.002751,0.031805
...,...,...,...,...,...,...,...,...
35,logistic,"(50, 100, 50)",adam,adaptive,0.052076,0.051262,0.000814,0.065256
29,logistic,"(50, 100, 50)",adam,constant,0.053991,0.052534,0.001457,0.066034
23,logistic,"(50, 100, 50)",lbfgs,invscaling,0.086962,0.085022,0.001940,0.086544
26,logistic,"(50, 100, 50)",lbfgs,adaptive,0.086831,0.084814,0.002017,0.086601


#### Band gap

In [37]:
############### ANN ##############################
hidden_layer_sizes = [(100,),(50,50), (50,100,50),(50,100,100,50)]
activations = ["identity", "logistic", "tanh", "relu"]
solvers=['lbfgs','adam']
learning_rates = ["constant", "invscaling", "adaptive"]
seeds = [1]
results_df = pd.DataFrame(columns=['activation',"hidden_layer_size","solver","learning_rate",'TrainError', 'ValError', 'deltaErrors'])

for activation in activations:
    for solver in solvers:
        for learning_rate in learning_rates:
            for hidden_layer_size in hidden_layer_sizes:
                mlp = MLPRegressor(activation=activation, solver=solver, learning_rate=learning_rate, hidden_layer_sizes=hidden_layer_size).fit(X_train_gap, y_train_gap)
                trainError = rmsle(y_train_gap, abs(mlp.predict(X_train_gap)))
                valError = rmsle(y_val_gap, abs(mlp.predict(X_val_gap)))
                cross_Val = evaluate_CV(mlp, X_fullMinMax, y_be)
                results_df = results_df.append({'activation':activation,"hidden_layer_size":hidden_layer_size,"solver":solver,"learning_rate":learning_rate,"TrainError":trainError, 'ValError':valError, 'deltaErrors':abs(trainError-valError),'crossVal':cross_Val}, ignore_index=True)






In [38]:
results_df.sort_values(by=["crossVal","ValError", "deltaErrors"])

Unnamed: 0,activation,hidden_layer_size,solver,learning_rate,TrainError,ValError,deltaErrors,crossVal
41,tanh,"(50, 100, 50)",lbfgs,invscaling,0.076627,0.073540,0.003087,0.082592
43,tanh,"(50, 50)",lbfgs,adaptive,0.076890,0.075026,0.001864,0.082881
38,tanh,"(50, 100, 50)",lbfgs,constant,0.078058,0.071912,0.006146,0.082995
44,tanh,"(50, 100, 50)",lbfgs,adaptive,0.077632,0.072404,0.005228,0.083001
40,tanh,"(50, 50)",lbfgs,invscaling,0.077870,0.073002,0.004868,0.083208
...,...,...,...,...,...,...,...,...
9,identity,"(100,)",adam,constant,0.113267,0.091627,0.021640,0.113185
10,identity,"(50, 50)",adam,constant,0.110986,0.090602,0.020384,0.113203
13,identity,"(50, 50)",adam,invscaling,0.114022,0.093414,0.020608,0.113265
12,identity,"(100,)",adam,invscaling,0.113628,0.092374,0.021254,0.114066
