# Project Pipeline
Execute the cells step by step to obtain a prediction and score for your configuration.
To make the widgets work, you might need to run
```
jupyter nbextension enable --py --sys-prefix widgetsnbextension
```
on your system.

## 1. Imports

In [11]:
import numpy as np
import dask.dataframe as dd
import os.path
import time
from ipywidgets import *
from IPython.display import display

from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
import sklearn.preprocessing as skprep
import sklearn.decomposition as skdecomp

import subsampling

## 2. Prepare Data Sets
### 2.1 Load Data
Load training and test data. Furthermore, create a sample set for calculating transformations such as PCA.

In [12]:
# Open the full training and test sets as dask dataframes.
all_train_data = dd.read_csv('data/all_train.csv')
all_test_data = dd.read_csv('data/all_test.csv')

# Generate the sample file only when it does not exist yet.
# NOTE(review): presumably generate_sample_set(0.001) writes
# 'data/all_sample.csv' -- confirm in the subsampling module.
if not os.path.isfile('data/all_sample.csv'):
    subsampling.generate_sample_set(0.001)

# Sample set that perform_pca() fits its transformation on.
sample_data = dd.read_csv('data/all_sample.csv')

### 2.2 Subsample Data

In [17]:
# Working sets used by training and prediction; they start out as the full
# data and are replaced whenever subsample_data() runs.
train_data = all_train_data
test_data = all_test_data


def subsample_data(p):
    """Replace the working train/test sets with a random fraction of the data.

    Args:
        p: fraction of rows to keep, between 0 and 1 (a fraction, not a
            percent, despite the widget label).
    """
    global train_data, test_data
    # Pass the fraction by keyword: dask's ``sample`` only supports ``frac``,
    # and newer releases interpret the first positional argument as the
    # unsupported row count ``n``.
    train_data = all_train_data.sample(frac=p)
    test_data = all_test_data.sample(frac=p)
    print('Subsampled {} of all data.'.format(p))

# Text box for the sampling fraction, bounded to [0, 1.0].
# NOTE: although labelled "Percentage", the value is handed to
# subsample_data() as-is and used there as a fraction.
percentage = BoundedFloatText(
    value=0.0001,
    min=0,
    max=1.0,
    description='Sampling Percentage:'
)
# Bind the box to subsample_data; its value is passed as argument p.
i = interact(subsample_data, p=percentage)

## 3. Preprocessing: PCA

In [18]:
# Most recently created PCA model; None until perform_pca() has run.
pca = None


def perform_pca(n):
    """Fit a PCA with ``n`` components on the materialised sample set.

    The model is stored in the module-level ``pca``.

    NOTE(review): the fit uses every column of ``sample_data``; if that
    frame carries the target in column 0 like the training data appears to,
    the label is part of the decomposition -- confirm this is intended.

    Args:
        n: number of principal components to keep.
    """
    global pca
    start_message = 'Computing PCA with {} principal components.'.format(n)
    print(start_message)

    pca = skdecomp.PCA(n_components=n)
    # compute() turns the dask dataframe into an in-memory frame for sklearn.
    sample_frame = sample_data.compute()
    pca.fit(sample_frame)

    print('PCA done.')

# Slider for the number of principal components (1 .. number of columns).
# NOTE(review): the upper bound is taken from train_data's columns while
# perform_pca() fits on sample_data -- confirm both share the same columns.
ncomponents = IntSlider(
    value=5,
    min=1,
    max=len(train_data.columns),
    step=1,
    description='num components:',
    continuous_update=False,
    layout=Layout(width='80%')
)
# Bind the slider to perform_pca; its value is passed as argument n.
i = interact(perform_pca, n=ncomponents)

## 4. Configure Classifiers
### 4.1 Neural Net

In [19]:
# Shared state of the neural-net configuration widgets.
hidden_layer_box = None  # box holding the per-layer size sliders, if shown
hidden_layers = [20, 20]  # number of units in each hidden layer
max_iter = 30  # max_iter value passed to MLPClassifier
mlp = None  # current MLPClassifier; rebuilt by the slider callbacks

def update_hidden_layer_size(s, l):
    """Set the size of one hidden layer and rebuild the MLP.

    Args:
        s: new number of units for the layer.
        l: index of the layer within ``hidden_layers``.
    """
    global hidden_layers, mlp
    hidden_layers[l] = s

    # A fresh, unfitted classifier replaces the previous one.
    mlp_settings = dict(
        hidden_layer_sizes=hidden_layers,
        max_iter=max_iter,
        warm_start=True,
    )
    mlp = MLPClassifier(**mlp_settings)

def set_max_iter(m):
    """Store the new iteration limit and rebuild the MLP with it.

    Args:
        m: value to use as ``max_iter`` for the classifier.
    """
    global max_iter, mlp
    max_iter = m

    # A fresh, unfitted classifier replaces the previous one.
    mlp_settings = dict(
        hidden_layer_sizes=hidden_layers,
        max_iter=max_iter,
        warm_start=True,
    )
    mlp = MLPClassifier(**mlp_settings)

def configure_hidden_layers(n):
    """Reset the network to ``n`` hidden layers and show one size slider each.

    Every layer starts at 20 units. The previous slider box, if any, is
    closed, the classifier is rebuilt for the new layer count, and a new box
    with ``n`` sliders bound to ``update_hidden_layer_size`` is displayed.

    Args:
        n: number of hidden layers.
    """
    global hidden_layer_box, hidden_layers, mlp

    # Remove the sliders of the previous configuration.
    if hidden_layer_box is not None:
        hidden_layer_box.close()

    hidden_layers = [20] * n

    # Rebuild the classifier right away; otherwise ``mlp`` would keep the old
    # layer configuration until one of the new sliders is moved.
    mlp = MLPClassifier(hidden_layer_sizes=hidden_layers,
                        max_iter=max_iter,
                        warm_start=True)

    # One slider per layer; ``fixed`` pins the layer index for the callback.
    items = [interactive(update_hidden_layer_size,
                         s=IntSlider(
                             value=20,
                             min=1,
                             max=100,
                             step=1,
                             description='Hidden layer {}:'.format(i),
                             continuous_update=False,
                             layout=Layout(width='80%')
                         ),
                         l=fixed(i))
             for i in range(n)]

    hidden_layer_box = VBox(
        [widgets.Label('Hidden layer sizes:', layout=Layout(width='100%'))] + items,
        layout=Layout(border='solid', padding='10px')
    )
    display(hidden_layer_box)

# Slider for the MLP iteration limit (1..500, initially 30).
max_iter_slider =  IntSlider(
    value=30,
    min=1,
    max=500,
    step=1,
    description='max iterations:',
    continuous_update=False,
    layout=Layout(width='80%')
)
# Bind the slider to set_max_iter; its value is passed as argument m.
i = interact(set_max_iter, m=max_iter_slider)

# Slider for the number of hidden layers (1..10, initially 2).
nhidden = IntSlider(
    value=2,
    min=1,
    max=10,
    step=1,
    description='num hidden layers:',
    continuous_update=False,
    layout=Layout(width='80%')
)
# Bind the slider to configure_hidden_layers; its value is passed as n.
i = interact(configure_hidden_layers, n=nhidden)

### 4.2 Naive Bayes

In [15]:
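# TODO: configure the Naive Bayes classifier (GaussianNB is imported above
# but not wired up yet; selecting 'Naive Bayes' currently yields no model).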

## 5. Training

In [16]:
WINDOW_SIZE = 500  # number of rows buffered per window (used by the prediction loop)
WINDOW_REPS = 10  # NOTE(review): not referenced in the visible notebook -- confirm it is still needed
classifier = None  # model chosen via set_classifier(); None when nothing usable is selected

##########################################

def set_classifier(c):
    """Select the model used for training and prediction.

    'Multilayer Perceptron' selects the current ``mlp``; any other option
    leaves no classifier selected (``None``).

    NOTE(review): ``classifier`` is bound to the object ``mlp`` refers to at
    selection time. The slider callbacks rebind ``mlp`` to a new object, so
    the selection has to be repeated after changing the MLP configuration.

    Args:
        c: label of the chosen radio-button option.
    """
    global classifier
    classifier = mlp if c == 'Multilayer Perceptron' else None

# Radio buttons for choosing the model.
classifier_rb = RadioButtons(
    options=['Multilayer Perceptron', 'Naive Bayes'],
    description='Classifier:'
)
# Bind the buttons to set_classifier; the selected label is passed as c.
interact(set_classifier, c=classifier_rb)

def perform_training(*args):
    """Train the selected classifier on ``train_data`` window by window.

    Rows are buffered until ``WINDOW_SIZE`` of them are available and then
    handed to ``classifier.fit``; rows left over at the end are fitted as one
    final, shorter window. Column 0 of each row is used as the target and
    columns 1-28 as the features. The elapsed time is printed at the end.

    Successive ``fit`` calls only accumulate knowledge because the MLP is
    created with ``warm_start=True``; other estimators would restart from
    scratch on every window.

    Args:
        *args: ignored; present so the function can serve as a button
            ``on_click`` callback.
    """
    if classifier is None:
        # E.g. the 'Naive Bayes' option, which selects no model.
        print('No classifier selected; nothing to train.')
        return

    print('Training {}'.format(classifier))
    start_time = time.time()

    def fit_window(rows):
        # Stack the buffered rows into one 2-D array for a single fit call.
        window = np.array(rows, dtype=float)
        classifier.fit(window[:, 1:29], window[:, 0])

    # Buffer rows in a list instead of re-copying an array for every row.
    buffered = []
    for _, row in train_data.iterrows():
        buffered.append(row)
        if len(buffered) == WINDOW_SIZE:
            fit_window(buffered)
            buffered = []

    # Fit the remainder only if there is one: fitting zero samples raises.
    if buffered:
        fit_window(buffered)

    print('Time taken: {}'.format(time.time() - start_time))

# Button that starts the training run.
start = widgets.Button(
    description='Start training',
    button_style='danger'
)
display(start)
# perform_training is invoked on every click.
start.on_click(perform_training)

## 6. Prediction

In [88]:
# Evaluate the trained classifier on test_data window by window and
# accumulate a 2x2 confusion matrix (rows: true class, columns: predicted).
# Column 0 of each row is the true label, columns 1-28 are the features.
iterator = test_data.iterrows()
window = np.zeros((0, 29))
conf_mat = np.zeros((2, 2))

for row in iterator:
    window = np.append(window, [row[1]], axis=0)
    if window.shape[0] == WINDOW_SIZE:
        prediction = classifier.predict(window[:, 1:29])
        # Pin the label set: without it a window containing a single class
        # yields a 1x1 matrix that silently broadcasts into all four cells.
        conf_mat += confusion_matrix(window[:, 0], prediction,
                                     labels=classifier.classes_)
        window = np.zeros((0, 29))

# Score the leftover rows only if there are any; predicting on an empty
# window raises when the row count is a multiple of WINDOW_SIZE.
if window.shape[0] > 0:
    prediction = classifier.predict(window[:, 1:29])
    conf_mat += confusion_matrix(window[:, 0], prediction,
                                 labels=classifier.classes_)

print(conf_mat)

[[ 169.    0.]
 [ 207.   10.]]
