In [1]:
# IMPORTS
import pandas as pd
import sklearn.model_selection as model_selection
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
import tensorflow as tf
tf.logging.set_verbosity(tf.logging.ERROR)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
class Dataset():
    def __init__(self, train_path, test_path):
        self.train_df = pd.read_csv(train_path)
        self.test_df = pd.read_csv(test_path)
        self.encoders = dict()
        self.train_data = None
        self.targets = None
        self.test_data = None
        self.X_train = None
        self.Y_train = None
        self.X_val = None
        self.Y_val = None
        self.X_test = None
        self.Y_test = None
        
    def split_data(self):
        
        (self.X_train, self.X_test, 
         self.Y_train, self.Y_test) = model_selection.train_test_split(self.train_data, 
                                                                       self.targets, 
                                                                       test_size=.2)
        
        (self.X_train, self.X_val, 
         self.Y_train, self.Y_val) = model_selection.train_test_split(self.X_train, 
                                                                      self.Y_train, 
                                                                      test_size=.2)
        
    
    def prepare_data(self):
        train_df = self.train_df.drop('id', axis=1).copy()
        test_df = self.test_df.drop('id', axis=1).copy()
        
        self.encoders['color'] = OneHotEncoder(sparse=False)
        self.encoders['type'] = LabelEncoder()
        
        self.encoders['color'].fit(train_df['color'].to_numpy().reshape(-1, 1))
        self.encoders['type'].fit(train_df['type'].to_numpy().reshape(-1, 1))
        
        color_hot_train = self.encoders['color'].transform(train_df['color'].to_numpy().reshape(-1, 1))
        color_hot_test = self.encoders['color'].transform(test_df['color'].to_numpy().reshape(-1, 1))
        targets = self.encoders['type'].transform(train_df['type'].to_numpy().reshape(-1, 1))
        
        train_data = np.zeros((train_df.shape[0], train_df.shape[1]+4))
        for i in range(train_data.shape[0]):
            train_data[i] = np.concatenate((train_df.to_numpy()[i][:-2], color_hot_train[i]))
        
        test_data = np.zeros((test_df.shape[0], test_df.shape[1]+5))
        for i in range(test_data.shape[0]):
            test_data[i] = np.concatenate((test_df.to_numpy()[i][:-1], color_hot_test[i]))
        
        self.train_data = train_data
        self.targets = targets
        self.test_data = test_data

In [3]:
dataset = Dataset('train.csv', 'test.csv')
dataset.prepare_data()
dataset.split_data()

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [4]:
class Model():
    def __init__(self, model, dataset):
        self.dataset = dataset
        self.model = model
        self.train_predictions = None
        self.val_predictions = None
        self.test_predictions = None
    
    def fit(self):
        self.model.fit(self.dataset.X_train, self.dataset.Y_train)
    
    def test(self):
        val_predictions = self.model.predict(self.dataset.X_val)
        train_predictions = self.model.predict(self.dataset.X_train)
        self.val_predictions = val_predictions
        self.train_predictions = train_predictions
        
    def predict(self):
        self.test_predictions = self.model.predict(self.dataset.X_test)

In [6]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from keras import layers, models, optimizers
import time
import pickle

In [351]:
def build_model(input_shape, activation, units, lr, optimizer, init, beta1=0.9, beta2=0.999):
    model = models.Sequential()
    model.add(layers.Dense(activation=activation, units=units, init=init, input_shape=(input_shape, ), kernel_regularizer='l2'))
    model.add(layers.Dense(activation=activation, units=units, init=init, kernel_regularizer='l2'))
    model.add(layers.Dense(activation='softmax', units=3, kernel_regularizer='l2'))
    
    model.compile(loss='categorical_crossentropy', optimizer=optimizer(lr=lr, beta_1=beta1, beta_2=beta2))
    return model

In [362]:
learning_rates = [.0001,]
inits = ['glorot_uniform']
activations = ['relu']
beta1s = [0.9]
beta2s = [0.999]
units = [50]

In [363]:
models_results = {}
for lr in learning_rates:
    for init in inits:
        for activation in activations:
            for beta1 in beta1s:
                for beta2 in beta2s:
                    for unit in units:
                        start = time.time()
                        model = build_model(dataset.X_train.shape[1], activation, 
                                            units=unit, lr=lr, optimizer=optimizers.Adam, 
                                            init=init, beta1=beta1, beta2=beta2)

                        model.fit(dataset.X_train, dataset.Y_train, validation_split=0, epochs=3000, verbose=0)

                        parameters = (lr, init, activation, unit, optimizers.Adam, beta1, beta2, 2)

                        models_results[parameters] = {'training': model.predict(dataset.X_train),
                                                      'val': model.predict(dataset.X_val),
                                                      'test': model.predict(dataset.X_test)}

                        print(f'Created model with parameters: {parameters}')
                        print(f'Training took {time.time()-start} ms')

  This is separate from the ipykernel package so we can avoid doing imports until


Created model with parameters: (0.0001, 'glorot_uniform', 'relu', 50, <class 'keras.optimizers.Adam'>, 0.9, 0.999, 2)
Training took 54.95100021362305 ms


In [364]:
def pred_to_hot(y_pred):
    for row in range(y_pred.shape[0]):
        max_index = np.argmax(y_pred[row])
        for col in range(y_pred.shape[1]):
            y_pred[row, col] = np.floor(y_pred[row, col]) if col != max_index else np.ceil(y_pred[row, col])

    return y_pred

In [365]:
accuracies = {}
for param_set in models_results.keys():
    model = models_results[param_set]
    set_accuracies = {}
    for key, y_true in zip(model.keys(), [dataset.Y_train, dataset.Y_val, dataset.Y_test]):
        array = model[key].copy()
        set_accuracies[key] = accuracy_score(y_true, pred_to_hot(array))
    accuracies[param_set] = set_accuracies

In [366]:
pd.DataFrame(accuracies).T.sort_values(by=['test'], ascending=[False])

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,test,training,val
0.0001,glorot_uniform,relu,50,<class 'keras.optimizers.Adam'>,0.9,0.999,2,0.76,0.720339,0.683333


In [407]:
models_to_solve = {}
for lr in learning_rates:
    for init in inits:
        for activation in activations:
            for beta1 in beta1s:
                for beta2 in beta2s:
                    for unit in units:
                        model = build_model(dataset.X_train.shape[1], activation, 
                                            units=unit, lr=lr, optimizer=optimizers.Adam, 
                                            init=init, beta1=beta1, beta2=beta2)
                        model.fit(dataset.train_data, dataset.targets, validation_split=0, epochs=3000, verbose=0)
                        parameters = (lr, init, activation, unit, optimizers.Adam, beta1, beta2, 2)
                        models_to_solve[parameters] = model
                        print(f'Trained model with parameters: {parameters}')

  This is separate from the ipykernel package so we can avoid doing imports until


Trained model with parameters: (0.0001, 'glorot_uniform', 'relu', 50, <class 'keras.optimizers.Adam'>, 0.9, 0.999, 2)


In [13]:
parameters = {'max_depth': [5, 10, 20, 30, 50], 
              'n_estimators': [5, 15, 30, 50, 70, 100], 
              'min_samples_split': [5, 10, 25, 40, 55, 70]}

forest = RandomForestClassifier(random_state=42)
random_search = RandomizedSearchCV(estimator=forest, param_distributions=parameters, scoring='accuracy', 
                                   cv=5, n_jobs=-1, random_state=42)
random_search.fit(dataset.X_train, dataset.Y_train)
random_search.best_params_

{'n_estimators': 70, 'min_samples_split': 10, 'max_depth': 30}

In [14]:
final_forest = RandomForestClassifier(n_estimators=70, max_depth=30, min_samples_split=10, random_state=42)
final_forest.fit(dataset.X_train, dataset.Y_train)

print(f'Train accuracy: {accuracy_score(dataset.Y_train, final_forest.predict(dataset.X_train))}')
print(f'Train accuracy: {accuracy_score(dataset.Y_val, final_forest.predict(dataset.X_val))}')
print(f'Train accuracy: {accuracy_score(dataset.Y_test, final_forest.predict(dataset.X_test))}')

Train accuracy: 0.9279661016949152
Train accuracy: 0.7166666666666667
Train accuracy: 0.7066666666666667


In [15]:
final_forest.fit(dataset.train_data, dataset.targets)
predictions = final_forest.predict(dataset.test_data)
dataset.test_df['type'] = dataset.encoders['type'].inverse_transform(predictions)
dataset.test_df.head()

Unnamed: 0,id,bone_length,rotting_flesh,hair_length,has_soul,color,type
0,3,0.471774,0.387937,0.706087,0.698537,black,Ghoul
1,6,0.427332,0.645024,0.565558,0.451462,white,Goblin
2,9,0.549602,0.491931,0.660387,0.449809,black,Ghoul
3,10,0.638095,0.682867,0.471409,0.356924,white,Ghost
4,13,0.361762,0.583997,0.377256,0.276364,black,Ghost


In [16]:
dataset.test_df[['id', 'type']].set_index('id').to_csv('submission.csv')

In [17]:
dataset.test_df['type'].value_counts()

Ghost     193
Ghoul     178
Goblin    158
Name: type, dtype: int64

In [19]:
dataset.train_df['type'].value_counts()

Ghoul     129
Goblin    125
Ghost     117
Name: type, dtype: int64