In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import os
import pandas as pd
%load_ext autoreload
%autoreload 2

## Load the training data into feature matrix, class labels, and event ids:

In [2]:
from proj1_helpers import *
# Location of the training CSV, resolved against the current working directory
# (so the notebook has to be started from the directory it lives in).
DATA_TRAIN_PATH = os.path.join(os.getcwd(), '../data/train.csv')
# load_csv_data is provided by proj1_helpers; per the heading above it yields the
# class labels (y), the feature matrix (tX) and the event ids (ids).
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

## Our CRAZY machine learning thing :) ...

In [3]:
from nn_model import NNModel
from preprocessing import *
from nn_model import *

In [4]:
def convert_labels(y):
    """Re-encode class labels from {-1, 1} to integer {0, 1}.

    Parameters
    ----------
    y : array-like
        Labels in which the negative class is encoded as -1.

    Returns
    -------
    numpy.ndarray
        A new integer array in which every -1 has been replaced by 0; all
        other values are cast to int unchanged. The input array is left
        untouched, so the calling cell can be re-run safely.
    """
    # Work on a copy: the original version overwrote the caller's array in place.
    labels = np.array(y, copy=True)
    labels[labels == -1] = 0
    # `np.int` was removed in NumPy 1.24; the builtin `int` is its replacement.
    return labels.astype(int)

def make_balanced(y, tX):
    """Randomly undersample the larger class so classes 0 and 1 are equally frequent.

    Parameters
    ----------
    y : numpy.ndarray
        1-D array of labels; samples labelled 0 and 1 are the two classes
        being balanced.
    tX : numpy.ndarray
        Sample matrix whose first axis is aligned with ``y``.

    Returns
    -------
    tuple
        ``(x_balanced, y_balanced)`` - copies of ``tX`` and ``y`` with the
        surplus samples of the majority class removed. Which samples are
        dropped is decided by NumPy's global random generator.
    """
    # flatnonzero always yields a 1-D index array; argwhere(...).squeeze()
    # collapsed to a 0-d array when there was exactly one match, which
    # np.random.choice then treated as a population size instead of an index.
    idx_zero = np.flatnonzero(y == 0)
    idx_one = np.flatnonzero(y == 1)
    surplus = idx_zero.size - idx_one.size
    # Drop from whichever class is over-represented; previously a class-1
    # majority produced a negative sample size and raised.
    majority = idx_zero if surplus >= 0 else idx_one
    pos_delete = np.random.choice(majority, replace=False, size=abs(surplus))
    y_balanced = np.delete(y, pos_delete)
    x_balanced = np.delete(tX, pos_delete, axis=0)
    return (x_balanced, y_balanced)

def shuffle_dataset(x_train, y_train):
    """Reorder samples and labels with one shared random permutation.

    Parameters
    ----------
    x_train : numpy.ndarray
        Sample matrix, indexed along its first axis.
    y_train : numpy.ndarray
        Labels aligned with ``x_train``; its length fixes the permutation size.

    Returns
    -------
    tuple
        ``(x_train, y_train)`` as new arrays in the shuffled order. The
        inputs themselves are not modified.
    """
    order = np.arange(y_train.shape[0])
    # In-place shuffle of the index vector, driven by NumPy's global RNG.
    np.random.shuffle(order)
    return (x_train[order], y_train[order])

def f1_score(y_pred, y_target):
    """F1 score of the positive class (label 1, or ``True``).

    Parameters
    ----------
    y_pred : array-like
        Predicted labels; an entry counts as a positive prediction when it
        compares equal to 1 (so boolean predictions work as well).
    y_target : array-like
        Ground-truth labels, aligned with ``y_pred``.

    Returns
    -------
    float
        ``2 * precision * recall / (precision + recall)``, or ``0.0`` when
        there is no true positive (which also covers the cases where
        precision or recall would divide by zero).
    """
    predicted_pos = np.asarray(y_pred) == 1
    actual_pos = np.asarray(y_target) == 1
    # Parenthesised masks are required: `a == 1 & b == 1` binds as the chained
    # comparison `a == (1 & b) == 1`, which raises for arrays.
    true_pos = np.sum(predicted_pos & actual_pos)
    if true_pos == 0:
        return 0.0
    precision = true_pos / np.sum(predicted_pos)
    recall = true_pos / np.sum(actual_pos)
    # Harmonic mean of precision and recall.
    return float(2 * precision * recall / (precision + recall))

In [5]:
# Re-encode the labels so the negative class is 0 instead of -1.
y = convert_labels(y)

# One template for both class-count reports (before and after balancing).
class_counts = '0 class instances: {}, 1 class instances: {}'
print(class_counts.format((y==0).sum(), (y==1).sum()))

# Undersample the majority class, report the new counts, then shuffle the result.
x_train, y_train = make_balanced(y, tX)
print(class_counts.format((y_train==0).sum(), (y_train==1).sum()))
x_train, y_train = shuffle_dataset(x_train, y_train)

0 class instances: 164333, 1 class instances: 85667
0 class instances: 85667, 1 class instances: 85667


In [6]:
def cross_validation(tX, y, k=5,
                     lr=0.01, lambda_=10,
                     epochs=10, batch_size=128,
                     momentum=0.9, verbose=1):
    """Run k-fold cross-validation of an NNModel trained with the 'logistic_reg' loss.

    The sample indices are split, in their existing order (nothing is
    shuffled here), into ``k`` contiguous folds. Each fold is used once as
    the validation split while a fresh model is trained on the remainder.

    Parameters
    ----------
    tX : numpy.ndarray
        Sample matrix, indexed along its first axis.
    y : numpy.ndarray
        Labels aligned with ``tX``.
    k : int
        Number of folds.
    lr, lambda_, epochs, batch_size, momentum, verbose
        Forwarded to ``NNModel.train``. ``batch_size`` is capped at the
        number of training samples of the fold.

    Returns
    -------
    tuple
        ``(accuracy_avg_test, accuracy_avg_train, f1_score_test, f1_score_train)``:
        the two accuracies are averaged over the folds, the two F1 entries
        are arrays holding one score per fold.
    """
    def _predict_labels(model, x):
        # Model outputs are thresholded at 0 to obtain boolean class predictions.
        return (model.predict(x) > 0).squeeze()

    n_samples = y.shape[0]
    k_folds = np.array_split(np.arange(n_samples), k)
    accuracy_test = np.zeros(k)
    accuracy_train = np.zeros(k)
    f1_score_test = np.zeros(k)
    f1_score_train = np.zeros(k)
    print("Cross-validation started!")
    for i, k_fold in enumerate(k_folds):
        val_mask = np.zeros(n_samples, dtype=bool)
        val_mask[k_fold] = True

        x_train, y_train = tX[~val_mask], y[~val_mask]
        x_val, y_val = tX[val_mask], y[val_mask]
        # A batch cannot hold more samples than the training split provides.
        batch_size_ = min(batch_size, x_train.shape[0])

        model = NNModel(x_train.shape[1])
        model.add_layer(1)
        model.train(x_train, y_train,
                    lr=lr, lambda_=lambda_,
                    batch_size=batch_size_,
                    epochs=epochs, verbose=verbose,
                    loss_fun='logistic_reg', momentum=momentum)

        val_pred = _predict_labels(model, x_val)
        accuracy_test[i] = (val_pred == y_val).mean()
        f1_score_test[i] = f1_score(val_pred, y_val)

        train_pred = _predict_labels(model, x_train)
        accuracy_train[i] = (train_pred == y_train).mean()
        # Train predictions must be scored against the train labels
        # (the previous version compared them with y_val).
        f1_score_train[i] = f1_score(train_pred, y_train)
    accuracy_avg_test = accuracy_test.mean()
    accuracy_avg_train = accuracy_train.mean()
    return accuracy_avg_test, accuracy_avg_train, f1_score_test, f1_score_train
        

def hyperparamters_tunning(tX, y,
                           use_transformations,
                           handling_outliers,
                           transform_inplace,
                           max_degrees,
                           pair_wise,
                           add_exp,
                           lrs, lambdas,
                           momentum,
                           batch_size,
                           epochs, k=5):
    """Grid-search preprocessing options and optimiser settings by cross-validation.

    Every combination of the candidate values is evaluated with
    ``cross_validation`` and the one with the highest average validation
    accuracy is kept (the first one wins on ties).

    Parameters
    ----------
    tX, y
        Raw sample matrix and labels; ``tX`` is passed through
        ``Preprocessing.preprocess`` for each feature configuration.
    use_transformations, handling_outliers, max_degrees : iterable
        Candidate values for the ``use_transformations``,
        ``handling_outliers`` and ``max_degree`` arguments of ``Preprocessing``.
    transform_inplace
        Single value forwarded to ``Preprocessing.preprocess``.
    pair_wise, add_exp : iterable
        Candidate values for the ``pairwise`` and ``add_exp`` arguments of
        ``Preprocessing.preprocess``.
    lrs, lambdas : iterable
        Candidate learning rates and regularisation strengths. These are
        iterated once per feature configuration, so pass re-iterable
        containers such as lists.
    momentum, batch_size, epochs, k
        Forwarded unchanged to ``cross_validation``.

    Returns
    -------
    dict
        The best configuration found, with keys ``"transformation"``,
        ``"handling_outlier"``, ``"max_degree"``, ``"pairwise"``,
        ``"add_exp"``, ``"accuracy"``, ``"lr"`` and ``"lambda"``. Entries
        other than ``"accuracy"`` stay ``None`` if no configuration scored
        above 0.
    """
    from itertools import product

    best = {
        "transformation": None,
        "handling_outlier": None,
        "max_degree": None,
        "pairwise": None,
        "add_exp": None,
        "accuracy": 0,
        "lr": None,
        "lambda": None,
    }
    feature_grid = product(use_transformations, handling_outliers,
                           max_degrees, pair_wise, add_exp)
    for use_transformation, handling_outlier, max_degree, pairwise, add_exp_ in feature_grid:
        # The preprocessed features do not depend on lr / lambda, so they are
        # built once per feature configuration instead of once per grid point.
        preprocessing = Preprocessing(
            use_transformations=use_transformation,
            handling_outliers=handling_outlier,
            max_degree=max_degree
        )
        tX_preprocessed = preprocessing.preprocess(data_=tX,
                                                   transform_inplace=transform_inplace,
                                                   pairwise=pairwise,
                                                   add_exp=add_exp_,
                                                  )
        for lr, lambda_ in product(lrs, lambdas):
            accuracy_test, accuracy_train, f1_test, f1_train = cross_validation(
                tX_preprocessed, y,
                k=k, lr=lr, lambda_=lambda_,
                epochs=epochs,
                batch_size=batch_size,
                momentum=momentum
            )
            print_parameters = (f'use_transformation: {use_transformation}\n'
                                + f'handling_outlier: {handling_outlier}\n'
                                + f'transform_inplace: {transform_inplace}\n'
                                + f'max_degree: {max_degree}\n'
                                + f'pairwise: {pairwise}\n'
                                + f'add_exp: {add_exp_}\n'
                                + f'lr: {lr}\n'
                                + f'lambda_: {lambda_}\n'
                               )
            print(print_parameters)
            print(f'Test accuracy:{accuracy_test}')
            print(f'F1 score test:{f1_test}')
            print(f'Train accuracy:{accuracy_train}\n')
            print(f'F1 score train:{f1_train}\n')
            # Model selection is based on the held-out (validation) accuracy.
            if best['accuracy'] < accuracy_test:
                print("Best is updated with accuracy:", accuracy_test, "\n")
                best["transformation"] = use_transformation
                best["handling_outlier"] = handling_outlier
                best["max_degree"] = max_degree
                best["pairwise"] = pairwise
                best["add_exp"] = add_exp_
                best["accuracy"] = accuracy_test
                best["lambda"] = lambda_
                best["lr"] = lr
    # Returned only after the whole grid has been visited (the return used to
    # sit inside the outermost loop and cut the search short).
    return best

In [7]:
# Candidate values for the grid search (lists are searched exhaustively).
use_transformations = [True, False]
handling_outliers = ['fill_mean']
transform_inplace = False
max_degrees = [2, 5]
pair_wise = [True, False]
add_exp = [True, False]
lrs = [1]
lambdas = [0]
batch_size = 64
epochs = 20
momentum = 0.9

# Pass the `pair_wise` list defined above; the name `pairwise` is not defined
# at this point in a fresh kernel.
best = hyperparamters_tunning(tX, y,
                              use_transformations,
                              handling_outliers,
                              transform_inplace,
                              max_degrees,
                              pair_wise,
                              add_exp,
                              lrs, lambdas,
                              momentum,
                              batch_size, epochs, k=4)
# Unpack the winning configuration for use in the following cells.
use_transformation = best["transformation"]
handling_outlier = best["handling_outlier"]
max_degree = best["max_degree"]
accuracy = best["accuracy"]
lambda_ = best["lambda"]
lr = best["lr"]

Cross-validation started!
Training started


  np.log(1 - pred_sq[target_sq==0]),


>Epoch #1:	[#                   ]; Loss: 28.338664506329096

  np.log(pred_sq[target_sq==1]),


>Epoch #20:	[####################]; Loss: 0.5637425304561026
Training ended

Training started
>Epoch #20:	[####################]; Loss: 0.39913196704501336
Training ended

Best is updated with accuracy: 0.827256
Cross-validation started!
Training started
>Epoch #20:	[####################]; Loss: 0.48481293259765584
Training ended

Training started
>Epoch #20:	[####################]; Loss: 0.47982607460453874
Training ended

Best is updated with accuracy: 0.8274680000000001
Cross-validation started!
Training started
>Epoch #20:	[####################]; Loss: 0.6463261077953752
Training ended

Training started
>Epoch #20:	[####################]; Loss: 0.5635553685760842
Training ended

Cross-validation started!
Training started
>Epoch #20:	[####################]; Loss: 0.5730148386975166
Training ended

Training started
>Epoch #20:	[####################]; Loss: 0.40728891589968317
Training ended

Cross-validation started!
Training started
>Epoch #20:	[####################]; Loss: 0.493745

In [None]:
# Hand-picked configuration, set directly instead of taken from a search result.

# Preprocessing options
use_transformation = True
handling_outlier = 'fill_mean'
transform_inplace = False
max_degree = 2
pairwise = True
add_exp = True

# Optimiser settings
lr = 1
lambda_ = 0

In [None]:
# Store the currently selected configuration in the `best` dictionary.
# `best` and the variables on the right-hand side must already exist in the kernel.
# (The stray indentation that made this cell an IndentationError has been removed.)
best["transformation"] = use_transformation
best["handling_outlier"] = handling_outlier
best["max_degree"] = max_degree
best["accuracy"] = accuracy
best["lambda"] = lambda_
best["lr"] = lr

In [7]:
# --- Preprocessing options for the final model ---
use_transformation = True
handling_outlier = 'fill_mean'
transform_inplace = False
max_degree = 2
pairwise = True
add_exp = True

# --- Optimiser settings ---
lr = 1
lambda_ = 0
momentum = 0.9
batch_size = 64
epochs = 25
verbose = 1

# Build the feature matrix from the full training data.
preprocessing = Preprocessing(
    use_transformations=use_transformation,
    handling_outliers=handling_outlier,
    max_degree=max_degree,
)
tX_preprocessed = preprocessing.preprocess(
    data_=tX,
    transform_inplace=transform_inplace,
    pairwise=pairwise,
    add_exp=add_exp,
)

# Fit the model on every training sample (no validation split in this cell).
model = NNModel(tX_preprocessed.shape[1])
model.add_layer(1)
model.train(
    tX_preprocessed, y,
    lr=lr,
    lambda_=lambda_,
    batch_size=batch_size,
    epochs=epochs,
    verbose=verbose,
    loss_fun='logistic_reg',
    momentum=momentum,
)

Training started


  np.log(1 - pred_sq[target_sq==0]),
  np.log(pred_sq[target_sq==1]),


>Epoch #25:	[####################]; Loss: 0.39937214519821534
Training ended



In [8]:
def calculate_accuracy(y, x, model):
    """Fraction of samples in ``x`` whose thresholded prediction equals ``y``.

    Parameters
    ----------
    y : numpy.ndarray
        Reference labels, compared element-wise with the boolean predictions.
    x : numpy.ndarray
        Inputs handed to ``model.predict``.
    model
        Any object exposing ``predict(x)`` that returns an array.

    Returns
    -------
    Mean of the element-wise equality between ``y`` and ``predict(x) > 0``
    (after squeezing singleton dimensions from the prediction).
    """
    predicted_positive = (model.predict(x) > 0).squeeze()
    return (predicted_positive == y).mean()


# Accuracy on tX_preprocessed, i.e. the same samples the model above was
# trained on - a training-set figure, not a held-out estimate.
accuracy = calculate_accuracy(y, tX_preprocessed, model)
print(accuracy)

0.82912


In [9]:
# Prune the model: every 'W1' entry whose magnitude is below the cut-off is zeroed in place.
thrreshold = 0.005
first_layer = model.parameters['W1']
is_significant = np.abs(first_layer[0]) >= thrreshold
print('number of total parameters = ', first_layer.shape[1])
print('number of significant parameters =', is_significant.sum())

first_layer[np.abs(first_layer) < thrreshold] = 0
# Re-score the pruned model on the same data to see what the pruning cost.
accuracy = calculate_accuracy(y, tX_preprocessed, model)
print('accuracy after setting insignificant parameters to 0 =', accuracy)

number of total parameters =  1698
number of significant parameters = 1492
accuracy after setting insignificant parameters to 0 = 0.82928


In [10]:
# Summarise the settings of the final model, one "name: value" pair per line.
final_settings = [
    ('use_transformation', use_transformation),
    ('handling_outlier', handling_outlier),
    ('transform_inplace', transform_inplace),
    ('max_degree', max_degree),
    ('lambda_', lambda_),
    ('lr', lr),
    ('batch_size', batch_size),
    ('epochs', epochs),
    ('momentum', momentum),
]
print_parameters = ''.join(f'{name}: {value}\n' for name, value in final_settings)
print(print_parameters)

use_transformation: True
handling_outlier: fill_mean
transform_inplace: False
max_degree: 2
lambda_: 0
lr: 1
batch_size: 64
epochs: 25
momentum: 0.9



## Generate predictions and save output in csv format for submission:

In [11]:
# Path of the test CSV, resolved against the current working directory.
# Assigned on its own: the previous chained assignment also overwrote
# DATA_TRAIN_PATH with the test path, so re-running the training-data cell
# afterwards would have loaded the wrong file.
DATA_TEST_PATH = os.path.join(os.getcwd(), '../data/test.csv')
# The first returned value (labels) is not used for the test set.
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [14]:
# Run the test features through the same preprocessing object and options used for training.
x_test = preprocessing.preprocess(
    data_=tX_test,
    transform_inplace=transform_inplace,
    pairwise=pairwise,
    add_exp=add_exp,
)

OUTPUT_PATH = 'prediction.csv' # TODO: fill in desired name of output file for submission
y_pred = model.predict(x_test)
# Threshold the model output at 0 to get a boolean "positive class" mask.
res = (y_pred > 0).squeeze()
# The submission uses the {-1, 1} label encoding: start from all 1 and flip the negatives.
pred = np.ones(res.shape)
pred[~res] = -1
create_csv_submission(ids_test, pred, OUTPUT_PATH)