# Intro to ML Project
## Task 3
### Jan Bauer, Alaisha Sharma

In [16]:
import numpy as np
import pandas as pd

import tensorflow as tf
import matplotlib.pyplot as plt

from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils, to_categorical

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
# same seed for consistency
seed = 1
np.random.seed(seed)

In [3]:
train_data = pd.read_hdf("data/train.h5", "train")
train_data.head()

Unnamed: 0,y,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x111,x112,x113,x114,x115,x116,x117,x118,x119,x120
0,3,0.692628,0.476915,-0.979932,0.744277,0.539924,0.820458,0.851063,-0.121848,-0.58813,...,0.78694,0.75143,-0.576014,-0.452984,0.014936,-0.606131,0.533646,-0.957278,0.154,-0.777874
1,4,0.65978,0.261427,-0.983456,0.608041,0.539439,0.823413,0.765966,-0.270752,-0.624442,...,0.783914,0.714355,-0.56981,-0.359184,0.277566,-0.654566,0.529314,-0.957973,0.229013,-0.777375
2,1,0.705061,0.388275,-0.981143,0.628974,0.417311,0.813629,0.831153,-0.324068,-0.644861,...,0.808604,0.687422,-0.57439,-0.382889,0.123676,-0.642053,0.351704,-0.970116,0.205652,-0.757741
3,3,0.743044,0.508975,-0.979041,0.763926,0.407026,0.717283,0.842587,-0.25668,-0.729415,...,0.790318,0.674969,-0.687551,-0.459537,0.074573,-0.647956,0.421374,-0.961151,0.301331,-0.771479
4,4,0.66937,0.376581,-0.973238,0.657474,0.527102,0.757041,0.765099,-0.202327,-0.622846,...,0.79821,0.744071,-0.525544,-0.358159,0.30512,-0.693324,0.457829,-0.964764,0.19501,-0.791609


In [4]:
X_train = train_data.iloc[:,1:]
X_train.head()

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,...,x111,x112,x113,x114,x115,x116,x117,x118,x119,x120
0,0.692628,0.476915,-0.979932,0.744277,0.539924,0.820458,0.851063,-0.121848,-0.58813,0.984151,...,0.78694,0.75143,-0.576014,-0.452984,0.014936,-0.606131,0.533646,-0.957278,0.154,-0.777874
1,0.65978,0.261427,-0.983456,0.608041,0.539439,0.823413,0.765966,-0.270752,-0.624442,0.985527,...,0.783914,0.714355,-0.56981,-0.359184,0.277566,-0.654566,0.529314,-0.957973,0.229013,-0.777375
2,0.705061,0.388275,-0.981143,0.628974,0.417311,0.813629,0.831153,-0.324068,-0.644861,0.984268,...,0.808604,0.687422,-0.57439,-0.382889,0.123676,-0.642053,0.351704,-0.970116,0.205652,-0.757741
3,0.743044,0.508975,-0.979041,0.763926,0.407026,0.717283,0.842587,-0.25668,-0.729415,0.984553,...,0.790318,0.674969,-0.687551,-0.459537,0.074573,-0.647956,0.421374,-0.961151,0.301331,-0.771479
4,0.66937,0.376581,-0.973238,0.657474,0.527102,0.757041,0.765099,-0.202327,-0.622846,0.985728,...,0.79821,0.744071,-0.525544,-0.358159,0.30512,-0.693324,0.457829,-0.964764,0.19501,-0.791609


In [5]:
y_train = train_data.iloc[:,0:1]
y_train.head()

Unnamed: 0,y
0,3
1,4
2,1
3,3
4,4


In [6]:
X_test = pd.read_hdf("data/test.h5", "test")
X_test.head()

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,...,x111,x112,x113,x114,x115,x116,x117,x118,x119,x120
45324,0.701411,0.187016,-0.982154,0.60705,0.522439,0.845936,0.760097,-0.351103,-0.57456,0.983332,...,0.785956,0.704893,-0.496666,-0.388587,0.222872,-0.593376,0.482908,-0.964796,0.142961,-0.778702
45325,0.760352,0.317576,-0.984779,0.62765,0.526251,0.811506,0.831591,-0.108528,-0.601386,0.980275,...,0.755637,0.663636,-0.561561,-0.397575,0.082542,-0.540654,0.414563,-0.974782,0.095093,-0.801885
45326,0.786375,0.531402,-0.982018,0.689513,0.516748,0.744813,0.796561,-0.236796,-0.664038,0.984294,...,0.801912,0.733216,-0.660644,-0.440287,0.224579,-0.562353,0.48594,-0.963444,0.308921,-0.796694
45327,0.619722,0.343872,-0.978162,0.685251,0.531297,0.755004,0.75323,-0.238413,-0.604895,0.983026,...,0.777083,0.700376,-0.566261,-0.301069,0.187914,-0.627751,0.503113,-0.962118,0.166841,-0.730115
45328,0.724763,0.217592,-0.975856,0.65591,0.438478,0.855352,0.794299,-0.388748,-0.638993,0.982055,...,0.789648,0.67673,-0.529255,-0.349266,0.091318,-0.632296,0.463369,-0.966323,0.032839,-0.78641


In [10]:
scaler = StandardScaler()  
scaler.fit(X_train)  
X_train = scaler.transform(X_train)  
X_test = scaler.transform(X_test)  

In [12]:
encoder = LabelEncoder()
encoder.fit(y_train)
encoded_y = encoder.transform(y_train)
onehot_y = np_utils.to_categorical(encoded_y)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [18]:
nodes = 20
in_dim = X_train.shape[1]
out_dim = onehot_y.shape[1]

In [50]:
classifier = Sequential()
in_dim = X_train.shape[1]
out_dim = onehot_y.shape[1]

# first hidden Layer
classifier.add(Dense(30, activation='relu', kernel_initializer='random_normal', input_dim=in_dim))
# second hidden Layer
classifier.add(Dense(20, activation='relu', kernel_initializer='random_normal'))
# third hidden Layer
classifier.add(Dense(15, activation='relu', kernel_initializer='random_normal'))
# output layer
classifier.add(Dense(out_dim, activation='softmax', kernel_initializer='random_normal'))

classifier.compile(optimizer ='adam',loss='categorical_crossentropy', metrics =['accuracy'])
classifier.fit(X_train, onehot_y, batch_size=10, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x1227cda58>

In [51]:
pred_ker = classifier.predict(X_test)
# find most likely category from soft max
pred_ker = np.argmax(pred_ker, axis=-1)

In [52]:
# convert back to pandas dataframe
X_test = pd.DataFrame(X_test)
pred_submit = pd.DataFrame(list(zip(X_test.index.values + 45324, pred_ker)), columns=['Id', 'y'])
final_submit = pred_submit.to_csv("submit.csv", index=False)