In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import os
import pandas as pd
%load_ext autoreload
%autoreload 2

## Load the training data into feature matrix, class labels, and event ids:

In [2]:
from proj1_helpers import *
# Path of the training CSV, resolved relative to the current working directory.
DATA_TRAIN_PATH = os.path.join(os.getcwd(), '../data/train.csv')
# load_csv_data comes from proj1_helpers; its three results are bound here as
# the class labels (y), the feature matrix (tX) and the event ids (ids).
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

## Our CRAZY machine learning thing :) ...

In [3]:
from nn_model import NNModel
from preprocessing import *
from nn_model import *

In [4]:
# Recode the -1 label as 0 so that the two classes are {0, 1}.
y[y == -1] = 0
# The `np.int` alias was removed from NumPy (1.24+); the builtin `int` is the
# supported spelling and yields the same integer dtype.
y = y.astype(int)
# Treat -999 entries of the feature matrix as missing values.
tX[tX == -999] = np.nan

# Disabled alternative: delete rows from tX and y that have more than 8 NaN
# values in tX and label 0 in y.
# num_nan_in_row = np.isnan(tX).sum(axis=1)
# mask = (y==0) & (num_nan_in_row>8)
# y = y[~mask]
# tX = tX[~mask]

# Balance the classes by randomly dropping surplus class-0 rows.
# flatnonzero always returns a 1-D index array (argwhere(...).squeeze() turns
# into a 0-d array when there is exactly one match).
pos_zero = np.flatnonzero(y == 0)
# Clamp at zero: a negative size would make np.random.choice raise when
# class 1 is the majority.
num_delete = max(int((y == 0).sum() - (y == 1).sum()), 0)
pos_delete = np.random.choice(pos_zero, replace=False, size=num_delete)
y_train = np.delete(y, pos_delete)
x_train = np.delete(tX, pos_delete, axis=0)

# Shuffle labels and features with one shared permutation so rows stay aligned.
shuffle = np.arange(y_train.shape[0])
np.random.shuffle(shuffle)

y_train = y_train[shuffle]
x_train = x_train[shuffle]

In [5]:
def cross_validation(tX, y, k=5,
                     lr=0.01, lambda_=10,
                     epochs=10, batch_size=128,
                     momentum=0.9, verbose=1,
                     threshold=0.5):
    """Estimate classification accuracy with k-fold cross-validation.

    The sample indices are split into ``k`` contiguous folds (no shuffling is
    done here). For every fold a fresh single-layer ``NNModel`` is trained on
    the remaining rows with the ``'logistic_reg'`` loss and scored on the
    held-out rows.

    Args:
        tX: feature matrix, one row per sample.
        y: labels, one per row of ``tX``; compared directly against the
            thresholded predictions.
        k: number of folds.
        lr, lambda_, epochs, momentum, verbose: forwarded unchanged to
            ``NNModel.train``.
        batch_size: mini-batch size; clamped to the number of training rows
            of the fold before being passed to ``NNModel.train``.
        threshold: decision threshold applied to ``model.predict``. Defaults
            to 0.5, the value the submission cell of this notebook uses.
            NOTE(review): this assumes ``predict`` returns probabilities, as
            the logistic loss suggests - confirm against ``NNModel``.

    Returns:
        Mean validation accuracy over the ``k`` folds.
    """
    n_samples = y.shape[0]
    folds = np.array_split(np.arange(n_samples), k)
    accuracy = np.zeros(k)
    print("Cross-validation started!")
    for fold_idx, val_idx in enumerate(folds):
        # Boolean mask of the held-out rows for this fold.
        val_mask = np.zeros(n_samples, dtype=bool)
        val_mask[val_idx] = True

        x_train, y_train = tX[~val_mask], y[~val_mask]
        x_val, y_val = tX[val_mask], y[val_mask]

        # Never request a batch larger than the training split; the original
        # computed this value but passed the unclamped batch_size instead.
        fold_batch_size = min(batch_size, x_train.shape[0])

        model = NNModel(x_train.shape[1])
        model.add_layer(1)
        model.train(x_train, y_train,
                    lr=lr, lambda_=lambda_,
                    batch_size=fold_batch_size,
                    epochs=epochs, verbose=verbose,
                    loss_fun='logistic_reg', momentum=momentum)

        y_pred = (model.predict(x_val) > threshold).squeeze()
        accuracy[fold_idx] = (y_pred == y_val).mean()
    return accuracy.mean()
        

def hyperparamters_tunning(tX, y,
                           use_transformations,
                           handling_outliers,
                           transform_inplace,
                           max_degrees,
                           lrs, lambdas,
                           momentum,
                           batch_size,
                           epochs, k=5):
    """Grid-search preprocessing and training settings by cross-validation.

    Every combination of the candidate values is scored with
    ``cross_validation``; the combination with the strictly highest accuracy
    is kept (ties keep the first one encountered).

    Args:
        tX: raw feature matrix handed to ``Preprocessing.preprocess``.
        y: labels forwarded to ``cross_validation``.
        use_transformations: candidate values for the ``use_transformations``
            argument of ``Preprocessing``.
        handling_outliers: candidate values for ``handling_outliers``.
        transform_inplace: forwarded to ``Preprocessing.preprocess``.
        max_degrees: candidate values for ``max_degree``.
        lrs: candidate learning rates.
        lambdas: candidate ``lambda_`` values.
        momentum, batch_size, epochs, k: forwarded to ``cross_validation``.

    Returns:
        dict with keys ``"transformation"``, ``"handling_outlier"``,
        ``"max_degree"``, ``"accuracy"``, ``"lr"`` and ``"lambda"``. All
        entries except ``"accuracy"`` stay ``None`` if no combination scores
        above 0.
    """
    from itertools import product

    best = {
        "transformation": None,
        "handling_outlier": None,
        "max_degree": None,
        "accuracy": 0,
        "lr": None,
        "lambda": None,
    }
    # Materialised once so the pairs can be replayed for every preprocessing
    # configuration, in the same order as the original nested loops.
    training_grid = list(product(lrs, lambdas))

    for use_transformation, handling_outlier, max_degree in product(
            use_transformations, handling_outliers, max_degrees):
        # The preprocessed matrix depends only on these three settings, so it
        # is built once here instead of once per (lr, lambda) pair.
        # NOTE(review): with transform_inplace=True the original would have
        # re-run preprocess on tX for every pair - confirm that was unintended.
        preprocessing = Preprocessing(
            use_transformations=use_transformation,
            handling_outliers=handling_outlier,
            max_degree=max_degree
        )
        tX_preprocessed = preprocessing.preprocess(data_=tX,
                                                   transform_inplace=transform_inplace)

        for lr, lambda_ in training_grid:
            accuracy = cross_validation(tX_preprocessed, y,
                                        k=k, lr=lr, lambda_=lambda_,
                                        epochs=epochs,
                                        batch_size=batch_size,
                                        momentum=momentum)
            if best['accuracy'] < accuracy:
                print("Best is updated with accuracy:", accuracy)
                best["transformation"] = use_transformation
                best["handling_outlier"] = handling_outlier
                best["max_degree"] = max_degree
                best["accuracy"] = accuracy
                best["lambda"] = lambda_
                best["lr"] = lr
    return best

In [6]:
# Candidate values explored by the grid search (one list per setting).
use_transformations = [True]
handling_outliers = ['fill_mean']
max_degrees = [2]
lrs = [1]
lambdas = [0]

# Settings held fixed for every candidate.
transform_inplace = False
batch_size = 64
epochs = 10
momentum = 0.9
# Not referenced by any other cell visible in this notebook.
use_poly_augmentation = True

# Run the search; `best` holds the winning configuration and its accuracy.
best = hyperparamters_tunning(
    tX, y,
    use_transformations=use_transformations,
    handling_outliers=handling_outliers,
    transform_inplace=transform_inplace,
    max_degrees=max_degrees,
    lrs=lrs,
    lambdas=lambdas,
    momentum=momentum,
    batch_size=batch_size,
    epochs=epochs,
    k=5,
)

  13: lambda x: np.log(x-19),
  16: lambda x: np.log(x-25),
  data = np.hstack((self.transformations[col](data[:,col:col+1]), data))
  23: lambda x: np.log(x-29),
  26: lambda x: np.log(x-29)
  self.means = np.nanmean(data[:,:self.numerical_features], axis=0)
  np.subtract(arr, avg, out=arr, casting='unsafe')
  keepdims=keepdims)
  data[:,:self.numerical_features] = (data[:,:self.numerical_features]-self.means)/self.stds


Cross-validation started!
Training started


  np.log(1 - pred_sq[target_sq==0]),
  np.log(pred_sq[target_sq==1]),


>Epoch #7:	[##############      ]; Loss: 0.6128291214149886

KeyboardInterrupt: 

In [None]:
# Unpack the configuration selected by the grid search.
use_transformation, handling_outlier, max_degree = (
    best[key] for key in ("transformation", "handling_outlier", "max_degree")
)
accuracy, lambda_, lr = best["accuracy"], best["lambda"], best["lr"]

# Fixed training settings for the final fit.
verbose = 1
batch_size = 64
epochs = 10
momentum = 0.9
transform_inplace = False

# Preprocess the full training matrix with the selected options.
preprocessing = Preprocessing(
    use_transformations=use_transformation,
    handling_outliers=handling_outlier,
    max_degree=max_degree,
)
tX_preprocessed = preprocessing.preprocess(
    data_=tX,
    transform_inplace=transform_inplace,
)

# Single-layer model trained on all rows with the logistic loss.
model = NNModel(tX_preprocessed.shape[1])
model.add_layer(1)
model.train(
    tX_preprocessed, y,
    lr=lr,
    lambda_=lambda_,
    batch_size=batch_size,
    epochs=epochs,
    verbose=verbose,
    loss_fun='logistic_reg',
    momentum=momentum,
)

In [None]:
# Accuracy of the final model on the data it was trained on.
# The decision threshold is 0.5, the same value the submission cell applies to
# model.predict; the previous threshold of 0 was inconsistent with it.
# NOTE(review): assumes predict returns probabilities - confirm against NNModel.
y_pred = model.predict(tX_preprocessed)
y_pred = (y_pred > 0.5).squeeze()
(y_pred == y).mean()

In [None]:
# Display the shape of the parameter stored under 'W1' in the trained model.
model.parameters['W1'].shape

In [None]:
# Number of 'W1' entries strictly greater than 0.0001.
# NOTE(review): negative weights are never counted here; if the intent is
# "non-negligible weights", the comparison should be on np.abs - confirm.
(model.parameters['W1'] > 0.0001).sum()

In [None]:
# Summary of the configuration used for the final model.
# The first two fields report the selected values (use_transformation,
# handling_outlier) rather than the candidate lists given to the grid search,
# consistent with max_degree, lambda_ and lr below.
print_parameters = (f'use_transformations: {use_transformation}\n'
                    + f'handling_outliers: {handling_outlier}\n'
                    + f'transform_inplace: {transform_inplace}\n'
                    + f'max_degree: {max_degree}\n'
                    + f'lambda_: {lambda_}\n'
                    + f'lr: {lr}\n'
                    + f'batch_size: {batch_size}\n'
                    + f'epochs: {epochs}\n'
                    + f'momentum: {momentum}\n')
print(print_parameters)

## Generate predictions and save output in CSV format for submission:

In [None]:
# Path of the test CSV, resolved relative to the current working directory.
# Only DATA_TEST_PATH is assigned: the previous chained assignment also
# overwrote DATA_TRAIN_PATH, so re-running the training-data cell afterwards
# would have loaded the test file as training data.
DATA_TEST_PATH = os.path.join(os.getcwd(), '../data/test.csv')
# The first result of load_csv_data is not needed for the test set.
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [None]:
# Mark -999 entries as missing, exactly as was done for the training features,
# so the fitted preprocessing sees test data in the same form. np.where builds
# a new array, leaving tX_test untouched and the cell safe to re-run.
tX_test_clean = np.where(tX_test == -999, np.nan, tX_test)
x_test = preprocessing.preprocess(data_=tX_test_clean, transform_inplace=False)

OUTPUT_PATH = 'prediction.csv' # TODO: fill in desired name of output file for submission
y_pred = model.predict(x_test)
# Threshold the model output at 0.5 and flatten to one decision per event.
res = (y_pred > 0.5).squeeze()
# Map the boolean decisions back to the {-1, 1} label convention.
pred = np.where(res, 1.0, -1.0)
create_csv_submission(ids_test, pred, OUTPUT_PATH)