# Project Pipeline
Execute the cells step by step to obtain a prediction and score for your configuration.
## 1. Imports

In [1]:
import os.path
import time

import dask.dataframe as dd
import numpy as np
import sklearn.decomposition as skdecomp
import sklearn.preprocessing as skprep
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

import subsampling

## 2. Prepare Data Sets
### 2.1 Load Data

In [2]:
# Open the full training and test CSV files as dask dataframes.
train_data, test_data = (
    dd.read_csv(csv_path)
    for csv_path in ('data/all_train.csv', 'data/all_test.csv')
)

# Only build the sample set when its file is missing
# (presumably generate_sample_set writes data/all_sample.csv; verify in subsampling).
if not os.path.isfile('data/all_sample.csv'):
    subsampling.generate_sample_set(0.001)

sample_data = dd.read_csv('data/all_sample.csv')

### 2.2 Set Sampling Percentage

In [68]:
# Fraction (not percent) of rows to keep: 0.0001 keeps roughly 0.01 % of the rows.
PERCENTAGE = 0.0001
# Fixed seed so a fresh "Restart & Run All" draws the same rows every time.
SAMPLE_SEED = 0

##########################################

# NOTE: this cell overwrites train_data/test_data with their sampled versions,
# so running it twice in a row samples the sample. Re-run the "Load Data" cell
# first whenever PERCENTAGE is changed.
# The fraction is passed by keyword because dask's sample() expects `frac=`.
train_data = train_data.sample(frac=PERCENTAGE, random_state=SAMPLE_SEED)
test_data = test_data.sample(frac=PERCENTAGE, random_state=SAMPLE_SEED)

## 3. Preprocessing: PCA

In [13]:
NDIM = 5  # number of principal components to keep

##########################################

# Materialise the sample set and fit a PCA with NDIM components on it.
# NOTE(review): the fit uses every column of sample_data, and the fitted `pca`
# is not applied in any of the visible cells. Confirm whether the label column
# should be excluded and where the projection is meant to be used.
pca = skdecomp.PCA(n_components=NDIM).fit(sample_data.compute())
pca

PCA(copy=True, iterated_power='auto', n_components=5, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

## 4. Choose Classifier
### 4.1 Neural Net

In [69]:
HIDDEN_LAYERS = (28, 15)   # neurons in each hidden layer
MAX_ITER = 30              # upper bound on iterations per fit() call
CLASSIFIER_SEED = 0        # fixed seed so repeated runs train the same network

##########################################

# warm_start=True lets each later fit() call continue from the weights of the
# previous call instead of re-initialising the network; the training cell relies
# on this when it fits one window after another.
classifier = MLPClassifier(hidden_layer_sizes=HIDDEN_LAYERS,
                           max_iter=MAX_ITER,
                           warm_start=True,
                           random_state=CLASSIFIER_SEED)

### 4.2 Naive Bayes

In [15]:
# TODO: set up a GaussianNB classifier here as an alternative to the neural net (not implemented yet).

## 5. Training

In [87]:
WINDOW_SIZE = 500   # number of rows handed to the classifier per fit() call
WINDOW_REPS = 10    # NOTE(review): not used by the training loop below

##########################################


def fit_window(rows):
    """Fit the classifier on one window of buffered rows.

    rows -- list of 1-D row arrays; column 0 holds the label and every
            remaining column is a feature.
    """
    window = np.asarray(rows, dtype=float)
    classifier.fit(window[:, 1:], window[:, 0])


# perf_counter() is a monotonic clock meant for measuring elapsed time.
start = time.perf_counter()

# Rows are collected in a plain list and converted to an array once per window;
# growing a numpy array row by row would copy the whole window on every row.
# NOTE(review): with warm_start=True scikit-learn is expected to reject a window
# whose set of labels differs from the first window (e.g. a tiny last window
# holding a single class) -- confirm against the MLPClassifier documentation.
buffered_rows = []
for _, row in train_data.iterrows():
    buffered_rows.append(row.values)
    if len(buffered_rows) == WINDOW_SIZE:
        fit_window(buffered_rows)
        buffered_rows = []
# Train on the remaining rows that did not fill a complete window.
if buffered_rows:
    fit_window(buffered_rows)

print(time.perf_counter() - start)

92.35737633705139


## 6. Prediction

In [88]:
def score_window(rows, labels):
    """Return the confusion matrix of the classifier on one window of rows.

    rows   -- list of 1-D row arrays; column 0 holds the true label and every
              remaining column is a feature.
    labels -- class labels that fix the row/column order of the matrix.
    """
    window = np.asarray(rows, dtype=float)
    prediction = classifier.predict(window[:, 1:])
    # Passing labels= keeps the matrix shape fixed even when a window contains
    # only one class; otherwise a 1x1 result would be broadcast into every cell
    # of conf_mat by the += below.
    return confusion_matrix(window[:, 0], prediction, labels=labels)


class_labels = classifier.classes_
conf_mat = np.zeros((len(class_labels), len(class_labels)))

# Collect rows in a list and evaluate them one window at a time.
buffered_rows = []
for _, row in test_data.iterrows():
    buffered_rows.append(row.values)
    if len(buffered_rows) == WINDOW_SIZE:
        conf_mat += score_window(buffered_rows, class_labels)
        buffered_rows = []
# Score the remaining rows that did not fill a complete window.
if buffered_rows:
    conf_mat += score_window(buffered_rows, class_labels)

# Rows are true classes, columns are predicted classes.
print(conf_mat)

[[ 169.    0.]
 [ 207.   10.]]
