# This file contains the preparation of data and many helpful functions

In [2]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn

### The metric for this competition is RMSLE

In [3]:
# metric
from sklearn.metrics import mean_squared_log_error
def rmsle(y_true, y_pred, **kwargs):
    """Root mean squared logarithmic error (the competition metric).

    Predictions are clipped at zero from below before the error is
    computed. Extra keyword arguments are accepted and ignored.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values; negative entries are replaced by 0.

    Returns
    -------
    float
        Square root of the mean squared logarithmic error.
    """
    non_negative_pred = np.clip(y_pred, 0, None)
    msle = mean_squared_log_error(y_true, non_negative_pred)
    return np.sqrt(msle)


# Scorer wrapper: with greater_is_better=False the scorer reports the
# negated RMSLE, so that "higher is better" holds for model selection.
rmsle_scorer = sklearn.metrics.make_scorer(rmsle, greater_is_better=False)

# Should not be used in general: only valid when the target (and thus the prediction) was log1p-transformed
def rmsle_exp(y_true, y_pred, **kwargs):
    """RMSLE for values that live on a log1p-transformed scale.

    Both arguments are mapped back with ``np.expm1`` before the error is
    computed; the back-transformed predictions are clipped at zero from
    below. Extra keyword arguments are accepted and ignored.

    Parameters
    ----------
    y_true : array-like
        Ground-truth targets on the transformed scale.
    y_pred : array-like
        Predictions on the transformed scale.

    Returns
    -------
    float
        Square root of the mean squared logarithmic error of the
        back-transformed values.
    """
    true_original_scale = np.expm1(y_true)
    pred_original_scale = np.clip(np.expm1(y_pred), 0, None)
    return np.sqrt(mean_squared_log_error(true_original_scale, pred_original_scale))


# Scorer wrapper; reports the negated value because greater_is_better=False.
rmsle_scorer_exp = sklearn.metrics.make_scorer(rmsle_exp, greater_is_better=False)

### Reading in data

In [4]:
# read in data
# Raw competition tables.
train = pd.read_csv("train.csv", sep=",")
test = pd.read_csv("test.csv", sep=",")

# Pre-computed additional attributes; the "Unnamed: 0" column is dropped
# right after reading.
X_additional = pd.read_csv("additionalAttributes.csv", sep=",").drop(columns="Unnamed: 0")
X_additional_test = pd.read_csv("additionalAttributesTest.csv", sep=",").drop(columns="Unnamed: 0")

# Feature matrices: everything except the two targets and the id column.
X = train.drop(columns=["formation_energy_ev_natom", "bandgap_energy_ev", "id"])
test_X = test.drop(columns=["id"])

# The two regression targets.
y_fe = train["formation_energy_ev_natom"]
y_be = train["bandgap_energy_ev"]

# Original features and additional attributes side by side (column-wise).
X_full = pd.concat([X, X_additional], axis=1)
X_full_test = pd.concat([test_X, X_additional_test], axis=1)

### Encoding spacegroup

One-hot encoding of the spacegroup was not used in the end because it gave worse scores.

In [5]:
# transform encoding
def encode_spacegroup(X):
    """One-hot encode the ``spacegroup`` column.

    Crystal systems by space-group number, for reference:
        1-2      triclinic
        3-15     monoclinic
        16-74    orthorhombic
        75-142   tetragonal
        143-167  trigonal
        168-194  hexagonal
        195-230  cubic

    The possible spacegroup values are [33, 194, 227, 167, 206, 12];
    each distinct value gets its own indicator column.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing a ``spacegroup`` column.

    Returns
    -------
    pandas.DataFrame
        New frame in which ``spacegroup`` is replaced by
        ``spacegroup_<value>`` indicator columns.
    """
    encoded = pd.get_dummies(X, columns=["spacegroup"])
    return encoded

### Helpful functions for evaluation and saving results

In [6]:
from sklearn.model_selection import cross_val_score
def evaluate_CV(model, X, y, metric=rmsle_scorer, n_folds=5, random_state=None):
    """Return the mean cross-validated error of ``model``.

    The scores returned by ``cross_val_score`` are averaged and negated,
    which turns the negated values of a ``greater_is_better=False``
    scorer (such as the default ``rmsle_scorer``) back into a positive
    error.

    Parameters
    ----------
    model : estimator
        Estimator passed to ``cross_val_score``.
    X, y : array-like
        Features and target.
    metric : scorer
        Scoring object handed to ``cross_val_score``.
    n_folds : int
        Number of cross-validation folds.
    random_state : int or None
        When ``None`` the integer ``n_folds`` is passed straight through
        as ``cv`` (unshuffled folds). When given, the folds are shuffled
        with this seed, so the argument is no longer silently ignored.
        NOTE(review): the shuffled path always uses plain ``KFold``
        (no stratification) — confirm this is fine if a classifier is
        ever passed.

    Returns
    -------
    float
        Negated mean of the per-fold scores.
    """
    if random_state is None:
        cv = n_folds
    else:
        # Local import keeps this cell self-contained.
        from sklearn.model_selection import KFold
        cv = KFold(n_splits=n_folds, shuffle=True, random_state=random_state)
    fold_scores = cross_val_score(model, X, y, cv=cv, scoring=metric)
    return -fold_scores.mean()

In [7]:
# submission file
def save_results(y_fe_pred, y_be_pred, name):
    """Write a submission file ``<name>.csv``.

    The ids are taken from the module-level ``test`` frame; the file is
    written without the index column.

    Parameters
    ----------
    y_fe_pred : array-like
        Predictions for ``formation_energy_ev_natom``.
    y_be_pred : array-like
        Predictions for ``bandgap_energy_ev``.
    name : str
        Output path without the ``.csv`` extension.
    """
    submission_columns = {
        "id": test["id"],
        "formation_energy_ev_natom": y_fe_pred,
        "bandgap_energy_ev": y_be_pred,
    }
    out_path = name + ".csv"
    pd.DataFrame(submission_columns).to_csv(out_path, index=False)

### Filling the missing values in the calculated data

In [8]:
# filling values
# Per-column replacement values for missing entries in the calculated data:
# the "c*" and "q*" columns are filled with 0, the "dist*" columns with 100.
fillValues = {
    **dict.fromkeys(
        ['cAlGa', 'cAlAl', 'cAlIn', 'cAlO',
         'cGaGa', 'cGaAl', 'cGaIn', 'cGaO',
         'cInIn', 'cInAl', 'cInGa', 'cInO'],
        0,
    ),
    **dict.fromkeys(
        ['distAlAl', 'distAlGa', 'distAlIn', 'distAlO',
         'distGaGa', 'distGaAl', 'distGaIn', 'distGaO',
         'distInIn', 'distInAl', 'distInGa', 'distInO'],
        100,
    ),
    **dict.fromkeys(['qAl', 'qGa', 'qIn', 'qO'], 0),
}

# Apply the same fill values to the training and test features.
X_full = X_full.fillna(fillValues)
X_full_test = X_full_test.fillna(fillValues)

### Engineering new features from spacegroup

In [9]:
def set_symmetry(X):
    """Add binary symmetry indicator columns derived from ``spacegroup``.

    Two integer (0/1) columns are written to ``X``:

    * ``centroSym``    -- 1 when the space-group number is in the
      centrosymmetric list below, else 0.
    * ``enantioMorph`` -- 1 when the space-group number is in the
      enantiomorphic list below, else 0.

    The two lists are disjoint, so a row is flagged in at most one column.

    The lookup is vectorised with ``Series.isin`` and therefore works for
    any index. The previous row loop used ``X.spacegroup[i]``, a label
    lookup that fails or misaligns when the index is not 0..n-1.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing a ``spacegroup`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``X`` object, with the two columns added.
    """
    # Inclusive (first, last) ranges of space-group numbers per class.
    enantiomorphic_ranges = [
        (1, 1), (3, 5), (16, 24), (75, 80), (89, 98), (143, 146),
        (149, 155), (168, 173), (177, 182), (195, 199), (207, 214),
    ]
    centrosymmetric_ranges = [
        (2, 2), (10, 15), (47, 74), (83, 88), (123, 142), (147, 148),
        (162, 167), (175, 176), (191, 194), (200, 206), (221, 230),
    ]

    def _expand(ranges):
        # Flatten inclusive ranges into an explicit list of group numbers.
        return [number for first, last in ranges for number in range(first, last + 1)]

    spacegroup = X["spacegroup"]
    X["centroSym"] = spacegroup.isin(_expand(centrosymmetric_ranges)).astype("int64")
    X["enantioMorph"] = spacegroup.isin(_expand(enantiomorphic_ranges)).astype("int64")
    return X

In [10]:
# Add the centroSym / enantioMorph indicator columns to both feature sets.
X_full, X_full_test = set_symmetry(X_full), set_symmetry(X_full_test)

### Log-Transformation on features

In [11]:
# apply log transformation to some features
# log1p is applied to the same columns of the training and test features.
# Note: this cell overwrites the columns, so running it twice transforms twice.
for col in (
    "cAlGa", "cAlIn", "cGaAl", "cInAl", "cInGa",
    "distAlO", "distGaAl", "distGaIn", "distGaO",
    "distInAl", "distInGa", "distInO",
    "spacegroup",
):
    X_full[col], X_full_test[col] = np.log1p(X_full[col]), np.log1p(X_full_test[col])

### Standardizing features

In [12]:
# standardize features
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training features only, then apply it to both sets.
scaler = StandardScaler().fit(X_full)

# The test frame is transformed first; both results reuse the training
# column names, and both get a fresh default index from the DataFrame
# constructor.
X_full_test = pd.DataFrame(scaler.transform(X_full_test), columns=X_full.columns)
X_full = pd.DataFrame(scaler.transform(X_full), columns=X_full.columns)