In [49]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import os
import pandas as pd
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Load the training data into feature matrix, class labels, and event ids:

In [64]:
from proj1_helpers import *
# Location of the training CSV, resolved relative to the current working directory.
DATA_TRAIN_PATH = os.path.join(os.getcwd(), '../data/train.csv')
# load_csv_data is not defined in this notebook; presumably it comes from the
# proj1_helpers star-import above. Its three return values are unpacked as the
# class labels (y), the feature matrix (tX) and the event ids (ids).
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

## Model: label conversion, class balancing, cross-validation and training

In [51]:
from nn_model import NNModel
from preprocessing import *
from nn_model import *

In [65]:
def convert_labels(y):
    """Map class labels from {-1, 1} to {0, 1} and return them as integers.

    Parameters
    ----------
    y : array-like
        Class labels. Every entry equal to -1 is replaced by 0; all other
        values are kept as they are.

    Returns
    -------
    numpy.ndarray
        A new integer array holding the converted labels. The input is not
        modified, so re-running the calling cell is safe.
    """
    # Work on a copy so the caller's array is never mutated in place.
    labels = np.array(y, copy=True)
    labels[labels == -1] = 0
    # `np.int` was removed in NumPy 1.24; the builtin `int` is the drop-in replacement.
    return labels.astype(int)

def make_balanced(y, tX):
    """Randomly undersample the majority class so classes 0 and 1 are equally frequent.

    Parameters
    ----------
    y : numpy.ndarray
        1-D array of class labels with values 0 and 1.
    tX : numpy.ndarray
        Feature matrix whose rows correspond to the entries of ``y``.

    Returns
    -------
    tuple
        ``(x_balanced, y_balanced)``: copies of ``tX`` and ``y`` with the
        surplus rows of the larger class removed. Row order of the kept
        samples is preserved.

    Notes
    -----
    Uses the global ``np.random`` state; seed it beforehand for reproducible
    results.
    """
    y = np.asarray(y)
    # flatnonzero always yields a 1-D index array, even for a single match
    # (argwhere(...).squeeze() would collapse that case to a 0-d array).
    idx_zero = np.flatnonzero(y == 0)
    idx_one = np.flatnonzero(y == 1)

    # Undersample whichever class is larger instead of assuming it is class 0.
    if idx_zero.size >= idx_one.size:
        majority, surplus = idx_zero, idx_zero.size - idx_one.size
    else:
        majority, surplus = idx_one, idx_one.size - idx_zero.size

    if majority.size:
        pos_delete = np.random.choice(majority, replace=False, size=surplus)
    else:
        # Nothing to sample from (empty input): delete nothing.
        pos_delete = np.empty(0, dtype=np.intp)

    y_balanced = np.delete(y, pos_delete)
    x_balanced = np.delete(tX, pos_delete, axis=0)
    return (x_balanced, y_balanced)

def shuffle_dataset(x_train, y_train):
    """Apply one random permutation to both the samples and their labels.

    Parameters
    ----------
    x_train : numpy.ndarray
        Samples, indexed along the first axis.
    y_train : numpy.ndarray
        Labels; its first dimension determines the number of samples.

    Returns
    -------
    tuple
        ``(x_train, y_train)`` reordered with the same permutation, so each
        sample stays paired with its label. Uses the global ``np.random`` state.
    """
    n_samples = y_train.shape[0]
    order = np.arange(n_samples)
    np.random.shuffle(order)  # in-place shuffle of the index vector
    return (x_train[order], y_train[order])

In [66]:
# Convert the labels from {-1, 1} to {0, 1}.
y = convert_labels(y)

print('0 class instances: {}, 1 class instances: {}'.format(*[(y == label).sum() for label in (0, 1)]))

# Randomly drop samples so that both classes have the same number of instances.
x_train, y_train = make_balanced(y, tX)

print('0 class instances: {}, 1 class instances: {}'.format(*[(y_train == label).sum() for label in (0, 1)]))

# NOTE(review): in the cells visible in this notebook, tuning and final training
# are called with the full `tX` / `y`, not with this balanced `x_train` / `y_train`
# - confirm whether the balanced set is meant to be used.
x_train, y_train = shuffle_dataset(x_train, y_train)

0 class instances: 164333, 1 class instances: 85667
0 class instances: 85667, 1 class instances: 85667


In [53]:
def cross_validation(tX, y, k=5,
                     lr=0.01, lambda_=10,
                     epochs=10, batch_size=128,
                     momentum=0.9, verbose=1):
    """Estimate validation accuracy with k-fold cross-validation.

    The samples are split, in their given order, into ``k`` contiguous folds.
    For each fold a fresh single-layer ``NNModel`` is trained on the remaining
    folds and evaluated on the held-out one.

    Parameters
    ----------
    tX : numpy.ndarray
        Feature matrix, one row per sample.
    y : numpy.ndarray
        Labels aligned with the rows of ``tX``.
    k : int
        Number of folds.
    lr, lambda_, epochs, batch_size, momentum, verbose
        Forwarded to ``NNModel.train``. ``batch_size`` is clamped to the
        number of training samples of the fold.

    Returns
    -------
    float
        Mean accuracy over the ``k`` validation folds.

    Notes
    -----
    Predictions strictly greater than 0 are counted as class 1.
    NOTE(review): this presumes ``model.predict`` returns signed scores rather
    than probabilities - confirm against ``NNModel``.
    """
    n_samples = y.shape[0]
    folds = np.array_split(np.arange(n_samples), k)
    accuracy = np.zeros(k)
    print("Cross-validation started!")
    for fold_no, val_idx in enumerate(folds):
        # Everything outside the current fold is used for training.
        train_mask = np.ones(n_samples, dtype=bool)
        train_mask[val_idx] = False

        x_train, y_train = tX[train_mask], y[train_mask]
        x_val, y_val = tX[~train_mask], y[~train_mask]
        # Never request a batch larger than the training set of this fold.
        batch_size_ = min(batch_size, x_train.shape[0])

        model = NNModel(x_train.shape[1])
        model.add_layer(1)
        model.train(x_train, y_train,
                    lr=lr, lambda_=lambda_,
                    batch_size=batch_size_,
                    epochs=epochs, verbose=verbose,
                    loss_fun='logistic_reg', momentum=momentum)

        y_pred = (model.predict(x_val) > 0).squeeze()
        accuracy[fold_no] = (y_pred == y_val).mean()
    return accuracy.mean()
        

def hyperparamters_tunning(tX, y,
                           use_transformations,
                           handling_outliers,
                           transform_inplace,
                           max_degrees,
                           pairwise,
                           add_exp,
                           lrs, lambdas,
                           momentum,
                           batch_size,
                           epochs, k=5):
    """Grid-search preprocessing and training settings by cross-validated accuracy.

    Parameters
    ----------
    tX, y
        Raw feature matrix and labels.
    use_transformations, handling_outliers, max_degrees : iterable
        Candidate values for the ``Preprocessing`` constructor arguments
        ``use_transformations``, ``handling_outliers`` and ``max_degree``.
    transform_inplace, pairwise, add_exp
        Fixed options forwarded to ``Preprocessing.preprocess``.
    lrs, lambdas : iterable
        Candidate learning rates and regularisation strengths.
    momentum, batch_size, epochs
        Fixed training options forwarded to ``cross_validation``.
    k : int
        Number of cross-validation folds.

    Returns
    -------
    dict
        Keys ``"transformation"``, ``"handling_outlier"``, ``"max_degree"``,
        ``"accuracy"``, ``"lr"`` and ``"lambda"`` describing the best
        combination found. The settings stay ``None`` (accuracy ``0``) when no
        combination scores above 0.
    """
    from itertools import product

    best = {
        "transformation": None,
        "handling_outlier": None,
        "max_degree": None,
        "accuracy": 0,
        "lr": None,
        "lambda": None,
    }
    for use_transformation, handling_outlier, max_degree in product(
            use_transformations, handling_outliers, max_degrees):
        # The preprocessed features do not depend on lr / lambda_, so they are
        # built once per preprocessing configuration instead of once per
        # (lr, lambda_) pair.
        # NOTE(review): assumes `preprocess` gives the same result when called
        # repeatedly with the same arguments - confirm, especially when
        # transform_inplace is True.
        preprocessing = Preprocessing(
            use_transformations=use_transformation,
            handling_outliers=handling_outlier,
            max_degree=max_degree
        )
        tX_preprocessed = preprocessing.preprocess(data_=tX,
                                                   transform_inplace=transform_inplace,
                                                   pairwise=pairwise,
                                                   add_exp=add_exp,
                                                   )
        for lr, lambda_ in product(lrs, lambdas):
            accuracy = cross_validation(tX_preprocessed, y,
                                        k=k, lr=lr, lambda_=lambda_,
                                        epochs=epochs,
                                        batch_size=batch_size,
                                        momentum=momentum
                                        )
            if best['accuracy'] < accuracy:
                print("Best is updated with accuracy:", accuracy)
                best["transformation"] = use_transformation
                best["handling_outlier"] = handling_outlier
                best["max_degree"] = max_degree
                best["accuracy"] = accuracy
                best["lambda"] = lambda_
                best["lr"] = lr
    return best

In [67]:
# Search grid: list-valued settings are iterated by hyperparamters_tunning,
# scalar settings are passed through unchanged.
use_transformations = [True]
handling_outliers = ['fill_mean']
max_degrees = [15]
lrs = [1]
lambdas = [0]

transform_inplace = False
pairwise = True
add_exp = False

batch_size = 64
epochs = 20
momentum = 0.9

best = hyperparamters_tunning(
    tX, y,
    use_transformations=use_transformations,
    handling_outliers=handling_outliers,
    transform_inplace=transform_inplace,
    max_degrees=max_degrees,
    pairwise=pairwise,
    add_exp=add_exp,
    lrs=lrs,
    lambdas=lambdas,
    momentum=momentum,
    batch_size=batch_size,
    epochs=epochs,
    k=2,
)

# Unpack the winning configuration into individual variables.
use_transformation, handling_outlier, max_degree = (
    best[key] for key in ("transformation", "handling_outlier", "max_degree")
)
accuracy, lambda_, lr = best["accuracy"], best["lambda"], best["lr"]

Cross-validation started!
Training started
>Epoch #20:	[####################]; Loss: 0.39091280887935714
Training ended

Training started
>Epoch #20:	[####################]; Loss: 0.6345803232417156
Training ended

Best is updated with accuracy: 0.8275079999999999


In [84]:
# Settings for the final model. These hard-coded values overwrite the ones
# unpacked from `best` by the hyper-parameter search cell.
use_transformation, handling_outlier, max_degree = True, 'fill_mean', 15
transform_inplace, pairwise = False, True
lambda_, lr, momentum = 0, 1, 0.9
verbose, batch_size, epochs = 1, 64, 25

preprocessing = Preprocessing(
    use_transformations=use_transformation,
    handling_outliers=handling_outlier,
    max_degree=max_degree,
)

# NOTE(review): the search cell passes `add_exp` to preprocess but this call
# does not, so preprocess's own default applies here - confirm they agree.
tX_preprocessed = preprocessing.preprocess(
    data_=tX,
    transform_inplace=transform_inplace,
    pairwise=pairwise,
)

# Single-layer model with one output unit, trained on the full training set.
model = NNModel(tX_preprocessed.shape[1])
model.add_layer(1)
model.train(
    tX_preprocessed, y,
    lr=lr, lambda_=lambda_,
    batch_size=batch_size,
    epochs=epochs, verbose=verbose,
    loss_fun='logistic_reg', momentum=momentum,
)

Training started
>Epoch #25:	[####################]; Loss: 0.8412820323923876
Training ended



In [85]:
def calculate_accuracy(y, x, model):
    """Return the fraction of samples in ``x`` that ``model`` classifies as ``y``.

    Parameters
    ----------
    y : numpy.ndarray
        Reference labels.
    x : numpy.ndarray
        Inputs passed to ``model.predict``.
    model
        Any object exposing ``predict(x)``.

    Returns
    -------
    float
        Mean agreement between the thresholded predictions (output > 0) and ``y``.
    """
    scores = model.predict(x)
    # A strictly positive output counts as the positive class.
    predicted = (scores > 0).squeeze()
    return (predicted == y).mean()


# Accuracy on the same (y, tX_preprocessed) data the model was trained on,
# i.e. a training accuracy, not a held-out estimate.
accuracy = calculate_accuracy(y, tX_preprocessed, model)
print(accuracy)

0.828376


In [86]:
# Set small weights to zero and check how much the training accuracy changes.
# Weights whose absolute value is below this threshold are treated as insignificant.
thrreshold = 0.005
print('number of total parameters = ', model.parameters['W1'].shape[1])
# The count below looks only at row 0 of W1, while the zeroing further down
# applies to every entry of W1.
print('number of significant parameters =', np.sum(np.abs(model.parameters['W1'][0]) >= thrreshold))

# In-place edit of the trained model: re-run the training cell to restore the
# original weights.
model.parameters['W1'][np.abs(model.parameters['W1']) < thrreshold] = 0
accuracy = calculate_accuracy(y, tX_preprocessed, model)
print('accuracy after setting insignificant parameters to 0 =', accuracy)

number of total parameters =  1038
number of significant parameters = 1029
accuracy after setting insignificant parameters to 0 = 0.828368


In [9]:
# One "name: value" line per setting currently held in the notebook namespace.
print_parameters = (
    f'use_transformation: {use_transformation}\n'
    f'handling_outlier: {handling_outlier}\n'
    f'transform_inplace: {transform_inplace}\n'
    f'max_degree: {max_degree}\n'
    f'lambda_: {lambda_}\n'
    f'lr: {lr}\n'
    f'batch_size: {batch_size}\n'
    f'epochs: {epochs}\n'
    f'momentum: {momentum}\n'
)
print(print_parameters)

use_transformation: True
handling_outlier: fill_mean
transform_inplace: False
max_degree: 2
lambda_: 0
lr: 1
batch_size: 64
epochs: 10
momentum: 0.9



## Generate predictions and save output in CSV format for submission:

In [None]:
# Only DATA_TEST_PATH is assigned here. The previous chained assignment also
# overwrote DATA_TRAIN_PATH with the test path, so re-running the training
# load cell afterwards would have read the test file.
DATA_TEST_PATH = os.path.join(os.getcwd(), '../data/test.csv')
# The first return value (labels) is discarded for the test set.
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [None]:
# Build the test features with the same preprocess flags that were used for
# the training features, so the feature layout matches what the model was
# trained on.
x_test = preprocessing.preprocess(data_=tX_test,
                                  transform_inplace=transform_inplace,
                                  pairwise=pairwise)

OUTPUT_PATH = 'prediction.csv' # TODO: fill in desired name of output file for submission
y_pred = model.predict(x_test)
# Use the same decision rule as cross_validation / calculate_accuracy
# (output > 0 means the positive class); this cell previously used 0.5.
# NOTE(review): confirm that model.predict returns signed scores, not probabilities.
res = (y_pred > 0).squeeze()
# The submission uses the original {-1, 1} label encoding.
pred = -np.ones(res.shape)
pred[res] = 1
create_csv_submission(ids_test, pred, OUTPUT_PATH)