# Notebook for developing models

In [8]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn

## Data preparation for model training

### Importing data

In [9]:
# read in data
train = pd.read_csv("train.csv", delimiter=",")
test = pd.read_csv("test.csv", delimiter=",")
X_additional = pd.read_csv("additionalAttributes.csv", delimiter=",").drop("Unnamed: 0", axis=1)
X = train.drop(["formation_energy_ev_natom", "bandgap_energy_ev", "id"], axis=1)
y_fe = train.formation_energy_ev_natom
y_be = train.bandgap_energy_ev
X_full = pd.concat([X, X_additional], axis=1)

In [10]:
X_full

Unnamed: 0,spacegroup,number_of_total_atoms,percent_atom_al,percent_atom_ga,percent_atom_in,lattice_vector_1_ang,lattice_vector_2_ang,lattice_vector_3_ang,lattice_angle_alpha_degree,lattice_angle_beta_degree,...,distGaO,distInAl,distInGa,distInO,elInt,qAl,qGa,qIn,qO,xEq
0,33,80.0,0.6250,0.3750,0.0000,9.9523,8.5513,9.1775,90.0026,90.0023,...,2.025,,,,-41.086176,0.349004,0.345264,,-0.231734,4.896453
1,194,80.0,0.6250,0.3750,0.0000,6.1840,6.1838,23.6287,90.0186,89.9980,...,1.775,0.025,0.025,1.775,-38.554098,0.344690,0.347084,,-0.230392,4.905702
2,227,40.0,0.8125,0.1875,0.0000,9.7510,5.6595,13.9630,90.9688,91.1228,...,1.825,0.025,0.025,1.825,-33.957596,0.343019,0.339820,,-0.228280,4.925995
3,167,30.0,0.7500,0.0000,0.2500,5.0036,5.0034,13.5318,89.9888,90.0119,...,0.025,2.925,0.025,0.025,-35.763321,0.337616,,0.375759,-0.231435,4.898196
4,194,80.0,0.0000,0.6250,0.3750,6.6614,6.6612,24.5813,89.9960,90.0006,...,2.025,0.025,3.625,2.025,-36.645650,,0.342521,0.359175,-0.232511,4.869110
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2395,33,40.0,0.7500,0.2500,0.0000,4.9469,8.5014,9.1298,90.0038,90.0023,...,1.775,0.025,0.025,1.775,-39.262887,0.346869,0.345489,,-0.231016,4.905482
2396,167,30.0,0.4167,0.5833,0.0000,4.9566,4.9562,13.4178,89.9938,90.0075,...,2.025,0.025,0.025,2.025,-35.323868,0.345140,0.342118,,-0.228918,4.928134
2397,206,80.0,0.4375,0.5625,0.0000,9.2204,9.2200,9.2199,90.0047,90.0046,...,1.975,0.025,0.025,1.975,-38.566520,0.364599,0.330046,,-0.230109,4.873503
2398,33,80.0,0.3125,0.1875,0.5000,10.6529,9.0954,9.7210,90.0015,89.9996,...,2.125,3.375,3.525,2.125,-39.874295,0.335601,0.329533,0.372616,-0.235314,4.842888


### Remove nan values From data

In [11]:
X_full.columns

Index(['spacegroup', 'number_of_total_atoms', 'percent_atom_al',
       'percent_atom_ga', 'percent_atom_in', 'lattice_vector_1_ang',
       'lattice_vector_2_ang', 'lattice_vector_3_ang',
       'lattice_angle_alpha_degree', 'lattice_angle_beta_degree',
       'lattice_angle_gamma_degree', 'Vatom', 'cAlGa', 'cAlIn', 'cAlO',
       'cGaAl', 'cGaIn', 'cGaO', 'cInAl', 'cInGa', 'cInO', 'distAlGa',
       'distAlIn', 'distAlO', 'distGaAl', 'distGaIn', 'distGaO', 'distInAl',
       'distInGa', 'distInO', 'elInt', 'qAl', 'qGa', 'qIn', 'qO', 'xEq'],
      dtype='object')

* If distance in Nan, it is reasonable to set them to very large value as the atoms are infinitely far from each other
* If Coordination number is Nan then it is reasonable to set it as 0, as no atoms are in vicinity of chosen atom
* If any of the charges have Nan values it is set to 0

In [12]:
fillValues = {'cAlGa':0, 
              'cAlIn':0, 
              'cAlO':0,
              'cGaAl':0, 
              'cGaIn':0, 
              'cGaO':0, 
              'cInAl':0, 
              'cInGa':0, 
              'cInO':0,
              'distAlGa':9999999,
              'distAlIn':9999999,
              'distAlO':9999999,
              'distGaAl':9999999,
              'distGaIn':9999999,
              'distGaO':9999999,
              'distInAl':9999999,
              'distInGa':9999999,
              'distInO':9999999,
              'qAl':0, 
              'qGa':0, 
              'qIn':0, 
              'qO':0}

In [13]:
X_full = X_full.fillna(value=fillValues)
X_full

### Categorize spacegroups

In [16]:
# transform encoding
def encode_spacegroup(X):
    # 1-2 triclinic
    # 3-15 monoclinic
    # 16-74 orthorhombic
    # 75-142 tetragonal
    # 143-167 trigonal
    # 168-194 hexagonal
    # 195-230 cubic
    # [ 33 194 227 167 206  12] are the possible spacegroup values
    # onehot encode each separately
    return pd.get_dummies(X, columns=["spacegroup"])

In [17]:
X_full = encode_spacegroup(X_full)

### Separate into Training and validation

In [20]:
from sklearn.model_selection import train_test_split
from sklearn import metrics

trainSize = 0.85

############## Separate Train into Validation and Train for Formation energy ####################
X_train_energ, X_val_energ, y_train_energ, y_val_energ = train_test_split(X_full, y_fe, train_size=trainSize, random_state=1)


############## Separate Train into Validation and Train for Band Gap ####################
X_train_gap, X_val_gap, y_train_gap, y_val_gap = train_test_split(X_full, y_be, train_size=trainSize, random_state=1)


# Model training

In [21]:
from sklearn.metrics import mean_squared_log_error
def rmsle(y_true, y_pred):
    return np.sqrt(mean_squared_log_error(y_true, y_pred))

## KNN

In [31]:
#################### KNN ##############################
from sklearn.neighbors import KNeighborsRegressor
results_df = pd.DataFrame(columns=['model',"Neigbors","Metric","weight","min_samples_split",'trainError', 'valError'])

for metr in ["manhattan", "minkowski"]:
    for i in range(3, 71, 2):
        for weight in ['uniform', 'distance']:
            knn = KNeighborsRegressor(n_neighbors=i).fit(X_train_energ, y_train_energ)

#            print("Neighbours " + str(i))
#            print("Metric " + metr)

            trainError = rmsle(y_train_energ, knn.predict(X_train_energ))
            valError = rmsle(y_val_energ, knn.predict(X_val_energ))

            results_df = results_df.append({'model': 'KNN',"Neigbors":i,"Metric":metr,"weight":weight,"trainError":trainError, 'valError':valError, 'deltaErrors':abs(trainError-valError)}, ignore_index=True)



In [32]:
results_df.sort_values(by=["valError", "deltaErrors"])

Unnamed: 0,model,Neigbors,Metric,weight,min_samples_split,trainError,valError,deltaErrors
4,KNN,7,manhattan,uniform,,0.032344,0.034206,0.001863
5,KNN,7,manhattan,distance,,0.032344,0.034206,0.001863
72,KNN,7,minkowski,uniform,,0.032344,0.034206,0.001863
73,KNN,7,minkowski,distance,,0.032344,0.034206,0.001863
2,KNN,5,manhattan,uniform,,0.030688,0.034329,0.003641
...,...,...,...,...,...,...,...,...
133,KNN,67,minkowski,distance,,0.052375,0.052961,0.000586
66,KNN,69,manhattan,uniform,,0.052763,0.053360,0.000597
67,KNN,69,manhattan,distance,,0.052763,0.053360,0.000597
134,KNN,69,minkowski,uniform,,0.052763,0.053360,0.000597


## SVC