In [43]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn

In [46]:
# metric
from sklearn.metrics import mean_squared_log_error
def rmsle(y_true, y_pred, **kwargs):
    """Root mean squared logarithmic error.

    Predictions are clipped at zero from below before being handed to
    ``mean_squared_log_error``; the square root of that result is returned.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values; entries below 0 are replaced by 0.
        **kwargs: Accepted but unused.

    Returns:
        The square root of the mean squared log error.
    """
    non_negative_pred = np.clip(y_pred, 0, None)
    msle = mean_squared_log_error(y_true, non_negative_pred)
    return np.sqrt(msle)


# greater_is_better=False makes the scorer report the negated RMSLE,
# so that "higher is better" holds for model selection utilities.
rmsle_scorer = sklearn.metrics.make_scorer(rmsle, greater_is_better=False)

# Special-case metric: not meant for general use. Only appropriate when both
# the targets and the predictions have to be mapped through expm1 before
# scoring (presumably because they live in log1p space -- confirm at call site).
def rmsle_exp(y_true, y_pred, **kwargs):
    """RMSLE computed after applying ``np.expm1`` to targets and predictions.

    Args:
        y_true: Target values; ``np.expm1`` is applied before scoring.
        y_pred: Predicted values; ``np.expm1`` is applied and the result is
            clipped at zero from below before scoring.
        **kwargs: Accepted but unused.

    Returns:
        The square root of the mean squared log error of the transformed
        values.
    """
    restored_true = np.expm1(y_true)
    restored_pred = np.clip(np.expm1(y_pred), 0, None)
    return np.sqrt(mean_squared_log_error(restored_true, restored_pred))


# Negated on purpose (greater_is_better=False), same as rmsle_scorer.
rmsle_scorer_exp = sklearn.metrics.make_scorer(rmsle_exp, greater_is_better=False)

In [48]:
# read in data
# Raw competition tables (comma-separated, which is the read_csv default).
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Extra engineered attributes; the "Unnamed: 0" column is dropped right away.
X_additional = pd.read_csv("additionalAttributes.csv").drop(columns="Unnamed: 0")

# Targets: formation energy and bandgap energy.
y_fe = train["formation_energy_ev_natom"]
y_be = train["bandgap_energy_ev"]

# Feature matrices: everything except the id (and, for train, the two targets).
X = train.drop(columns=["formation_energy_ev_natom", "bandgap_energy_ev", "id"])
test_X = test.drop(columns=["id"])

# Base training features side by side with the additional attributes
# (column-wise concat, aligned on the index).
X_full = pd.concat([X, X_additional], axis="columns")

In [28]:
# transform encoding
def encode_spacegroup(X, spacegroups=(12, 33, 167, 194, 206, 227)):
    """One-hot encode the ``spacegroup`` column with a stable set of columns.

    A plain ``pd.get_dummies`` call only creates indicator columns for the
    values that happen to be present in ``X``. Encoding the training set, the
    test set, or a subset separately could therefore yield different feature
    columns. Here every value in ``spacegroups`` always gets a column (all
    zeros when absent). Any other value found in ``X`` still gets its own
    column, exactly as a plain ``get_dummies`` call would produce.

    Args:
        X: DataFrame containing a ``spacegroup`` column. It is not modified.
        spacegroups: Spacegroup values that must always be represented in the
            output. Defaults to the six values noted as possible for this
            dataset.

    Returns:
        A new DataFrame in which ``spacegroup`` is replaced by
        ``spacegroup_<value>`` indicator columns, ordered by value.

    Raises:
        KeyError: If ``X`` has no ``spacegroup`` column.
    """
    # Crystal systems by spacegroup number, for reference:
    #   1-2     triclinic
    #   3-15    monoclinic
    #   16-74   orthorhombic
    #   75-142  tetragonal
    #   143-167 trigonal
    #   168-194 hexagonal
    #   195-230 cubic
    # Each spacegroup value is one-hot encoded separately (no grouping by system).
    observed = X["spacegroup"].dropna().unique().tolist()
    levels = sorted(set(spacegroups).union(observed))
    # A categorical with fixed levels makes get_dummies emit a column for every
    # level, including the ones that do not occur in this particular frame.
    as_categorical = pd.Categorical(X["spacegroup"], categories=levels)
    return pd.get_dummies(X.assign(spacegroup=as_categorical), columns=["spacegroup"])

In [29]:
# Preview the one-hot encoded training features; the result is only displayed, not assigned.
encode_spacegroup(X)

Unnamed: 0,number_of_total_atoms,percent_atom_al,percent_atom_ga,percent_atom_in,lattice_vector_1_ang,lattice_vector_2_ang,lattice_vector_3_ang,lattice_angle_alpha_degree,lattice_angle_beta_degree,lattice_angle_gamma_degree,spacegroup_12,spacegroup_33,spacegroup_167,spacegroup_194,spacegroup_206,spacegroup_227
0,80.0,0.6250,0.3750,0.0000,9.9523,8.5513,9.1775,90.0026,90.0023,90.0017,0,1,0,0,0,0
1,80.0,0.6250,0.3750,0.0000,6.1840,6.1838,23.6287,90.0186,89.9980,120.0025,0,0,0,1,0,0
2,40.0,0.8125,0.1875,0.0000,9.7510,5.6595,13.9630,90.9688,91.1228,30.5185,0,0,0,0,0,1
3,30.0,0.7500,0.0000,0.2500,5.0036,5.0034,13.5318,89.9888,90.0119,120.0017,0,0,1,0,0,0
4,80.0,0.0000,0.6250,0.3750,6.6614,6.6612,24.5813,89.9960,90.0006,119.9893,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2395,40.0,0.7500,0.2500,0.0000,4.9469,8.5014,9.1298,90.0038,90.0023,90.0015,0,1,0,0,0,0
2396,30.0,0.4167,0.5833,0.0000,4.9566,4.9562,13.4178,89.9938,90.0075,120.0007,0,0,1,0,0,0
2397,80.0,0.4375,0.5625,0.0000,9.2204,9.2200,9.2199,90.0047,90.0046,89.9954,0,0,0,0,1,0
2398,80.0,0.3125,0.1875,0.5000,10.6529,9.0954,9.7210,90.0015,89.9996,90.0004,0,1,0,0,0,0


In [38]:
from sklearn.model_selection import KFold, cross_val_score


def evaluate_CV(model, X, y, metric=rmsle_scorer, n_folds=5, random_state=None):
    """Return the mean cross-validated error of ``model`` on ``(X, y)``.

    The scores returned by ``cross_val_score`` are averaged and negated, so a
    scorer built with ``greater_is_better=False`` (such as the default
    ``rmsle_scorer``) yields a positive error value.

    Args:
        model: Estimator passed to ``cross_val_score``.
        X: Feature matrix.
        y: Target values.
        metric: Scorer passed as ``scoring``.
        n_folds: Number of folds, or anything else ``cross_val_score`` accepts
            as ``cv``.
        random_state: If given and ``n_folds`` is an integer, the folds are
            built with a shuffled ``KFold`` seeded by this value. With the
            default ``None`` the data is split exactly as ``cv=n_folds``
            would (no shuffling). Note that the shuffled path always uses
            plain ``KFold`` (no stratification).

    Returns:
        The negated mean of the per-fold scores.
    """
    cv = n_folds
    # Previously random_state was accepted but ignored; honour it when we are
    # able to build the splitter ourselves.
    if random_state is not None and isinstance(n_folds, (int, np.integer)):
        cv = KFold(n_splits=int(n_folds), shuffle=True, random_state=random_state)
    fold_scores = cross_val_score(model, X, y, cv=cv, scoring=metric)
    return -fold_scores.mean()

In [None]:
# submission file
def save_results(y_fe_pred, y_be_pred, name):
    """Write a submission CSV with both predicted targets.

    The ``id`` column is taken from the module-level ``test`` frame.

    Args:
        y_fe_pred: Predictions for ``formation_energy_ev_natom``.
        y_be_pred: Predictions for ``bandgap_energy_ev``.
        name: Output file name without extension; ``".csv"`` is appended.
    """
    output_path = name + ".csv"
    submission = pd.DataFrame(
        data={
            "id": test.id,
            "formation_energy_ev_natom": y_fe_pred,
            "bandgap_energy_ev": y_be_pred,
        }
    )
    submission.to_csv(output_path, index=False)

In [49]:
# filling values
# Per-column replacement for missing values in the additional attributes.
# NOTE(review): 9999999 looks like a "very far away" sentinel for missing
# distances -- confirm that this is the intended meaning.
fillValues = {
    # c* columns -> 0
    **dict.fromkeys(
        ['cAlGa', 'cAlIn', 'cAlO',
         'cGaAl', 'cGaIn', 'cGaO',
         'cInAl', 'cInGa', 'cInO'],
        0,
    ),
    # dist* columns -> large sentinel
    **dict.fromkeys(
        ['distAlGa', 'distAlIn', 'distAlO',
         'distGaAl', 'distGaIn', 'distGaO',
         'distInAl', 'distInGa', 'distInO'],
        9999999,
    ),
    # q* columns -> 0
    **dict.fromkeys(['qAl', 'qGa', 'qIn', 'qO'], 0),
}

# Only the columns named above are filled; NaNs elsewhere are left alone.
X_full = X_full.fillna(value=fillValues)