In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import os
import pandas as pd
%load_ext autoreload
%autoreload 2

## Load the training data into feature matrix, class labels, and event ids:

In [2]:
from proj1_helpers import *
DATA_TRAIN_PATH = os.path.join(os.getcwd(), '../data/train.csv')
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

## Our CRAZY machine learning thing :) ...

In [3]:
from nn_model import NNModel
from preprocessing import *
from nn_model import *

In [4]:
y[y==-1] = 0
y = y.astype(np.int)
tX[tX==-999] = np.nan

#Delete rows from tX and y, which has more than 8 nan values in tX and 0 label in y
# num_nan_in_row = np.isnan(tX).sum(axis=1)
# mask = (y==0) & (num_nan_in_row>8)
# y = y[~mask]
# tX = tX[~mask]

pos_zero = np.argwhere(y==0).squeeze()
num_delete = (y==0).sum() - (y==1).sum()
pos_delete = np.random.choice(pos_zero, replace=False, size=num_delete)
y_train = np.delete(y, pos_delete)
x_train = np.delete(tX, pos_delete, axis=0)

shuffle = np.arange(y_train.shape[0])
np.random.shuffle(shuffle)

y_train = y_train[shuffle]
x_train = x_train[shuffle]

In [17]:
def cross_validation(tX, y, k=5, 
                     lr=0.01, lambda_=10, 
                     epochs=10, batch_size=128,
                     momentum=0.9, verbose=1):
    k_folds = np.array_split(np.arange(y.shape[0]), axis=0, indices_or_sections=k)
    accuracy = np.zeros(k)
    i = 0
    print("Cross-validation started!")
    for k_fold in k_folds:
        train_mask = ~np.isin(np.arange(y.shape[0]), k_fold)
        w_init = np.random.randn(1, tX.shape[1]+1)
        
        x_train = tX[train_mask]
        y_train = y[train_mask]
        x_val = tX[~train_mask]
        y_val = y[~train_mask]
        batch_size_ = min(batch_size, x_train.shape[0])
        
        model = NNModel(x_train.shape[1])
        model.add_layer(1)
        model.train(x_train, y_train,
                    lr=lr, lambda_=lambda_,
                    batch_size=batch_size,
                    epochs=epochs, verbose=verbose,
                    loss_fun='logistic_reg', momentum=momentum)
        
        y_pred = model.predict(x_val)
        y_pred = y_pred > 0
        y_pred = y_pred.squeeze()
        accuracy[i] = (y_pred==y_val).mean()
        i += 1
    accuracy_avg = accuracy.mean()
    return accuracy_avg
        

def hyperparamters_tunning(tX, y,
                           use_transformations,
                           handling_outliers,
                           max_degrees,
                           lrs, lambdas,
                           momentum,
                           batch_size,
                           epochs, k=5):
    best = {
        "transformation": None,
        "handling_outlier": None,
        "max_degree": None,
        "accuracy": 0,
        "lr": None,
        "lambda": None,
    }
    for use_transformation in use_transformations:
        for handling_outlier in handling_outliers:
            for max_degree in max_degrees:
                for lr in lrs:
                    for lambda_ in lambdas:
                        preprocessing = Preprocessing(
                            use_transformations=use_transformation,
                            handling_outliers=handling_outlier,
                            max_degree=max_degree
                        )
                        
                        tX_preprocessed = preprocessing.preprocess(data_=tX)
                        accuracy = cross_validation(tX_preprocessed, y,
                                                    k=k, lr=lr, lambda_=lambda_,
                                                    epochs=epochs,
                                                    batch_size=batch_size,
                                                    momentum=momentum
                                                   )
                        if best['accuracy'] < accuracy:
                            print("Best is updated with accuracy:", accuracy)
                            best["transformation"] = use_transformation
                            best["handling_outlier"] = handling_outlier
                            best["max_degree"] = max_degree
                            best["accuracy"] = accuracy
                            best["lambda"] = lambda_
                            best["lr"] = lr
    return best

In [None]:
use_transformations = [False]
handling_outliers = ['fill_mean']
use_poly_augmentation = True
max_degrees = [12]
lrs = [1]
lambdas = [0]
batch_size = 64
epochs = 10
momentum = 0.9
best = hyperparamters_tunning(tX, y,
                              use_transformations,
                              handling_outliers,
                              max_degrees,
                              lrs, lambdas,
                              momentum,
                              batch_size, epochs, k=5)

Cross-validation started!
Training started
>Epoch #10:	[####################]; Loss: 3429.55174153708
Training ended

Training started
>Epoch #4:	[########            ]; Loss: 6213.816296489762

In [None]:
best["models_feat"] = models
use_transformation = best["transformation"]
handling_outlier = best["handling_outlier"]
degree = best["degree"]
accuracy = best["accuracy"]
lambda_ = best["lambda"]
lr = best["lr"]

# Train model with given hyperparameters

In [8]:
from proj1_helpers import *
DATA_TRAIN_PATH = os.path.join(os.getcwd(), '../data/train.csv')
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

preprocessing = Preprocessing()
y[y==-1] = 0
y = y.astype(np.int)
preprocessing.replace_outliers_by_nan(tX)

In [9]:
#Delete rows from tX and y, which has more than 8 nan values in tX and 0 label in y
num_nan_in_row = np.isnan(tX).sum(axis=1)
mask = (y==0) & (num_nan_in_row>8)
y = y[~mask]
tX = tX[~mask]

#Create train and validate sets
num_examples = tX.shape[0]
val_size = 10000
train_samples = np.random.choice(num_examples, replace=False, size=val_size)
mask_val = np.isin(np.arange(tX.shape[0]), train_samples)
x_val, y_val = tX[mask_val], y[mask_val]
x_train, y_train = tX[~mask_val], y[~mask_val]

In [10]:
use_transformation = False
handling_outlier = 'predict'
lr = 0.1
lambda_ = 100
degree = np.arange(2, 5)
epochs_preprocess = 10
epochs = 50
batch_size = 32
units = 1
activation = 'sigmoid'

In [12]:
preprocessing = Preprocessing(
    use_transformations=use_transformation,
    handling_outliers=handling_outlier,
)
train_data, models = preprocessing.preprocess(
    x_train,
    lr=lr,
    lambda_=lambda_,
    batch_size=batch_size,
    epochs=epochs,
    degrees=degree
)

model = NNModel(train_data.shape[1])
model.add_layer(units=units, activation=activation)
model.train(train_data, y_train, lr=lr,
            lambda_=lambda_,
            batch_size=batch_size, epochs=epochs,
            verbose=1)

preprocessing = Preprocessing(
    use_transformations=use_transformation,
    handling_outliers = handling_outlier,
)
val_data, _ = preprocessing.preprocess(
    x_val,
    train=False,
    models=models,
    degrees=degree
)

y_pred = model.predict(val_data)
y_pred = y_pred > 0.5
y_pred = y_pred.squeeze()
accuracy = (y_pred==y_val).mean()
print("Validation arruracy is", accuracy)

Preprocesing started!

[[4.35157936e-04 8.59632332e-01 1.76750908e+00 ... 0.00000000e+00
  1.00000000e+00 0.00000000e+00]
 [1.16103275e-09 2.20675604e+04 2.55959438e+00 ... 1.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [7.17331933e-06 1.01090913e+01 2.55959438e+00 ... 1.00000000e+00
  0.00000000e+00 0.00000000e+00]
 ...
 [2.73713058e-48 2.70517611e+00 1.98021339e-03 ... 0.00000000e+00
  0.00000000e+00 1.00000000e+00]
 [1.51452501e-03 1.78126601e-34 1.66079947e+00 ... 0.00000000e+00
  1.00000000e+00 0.00000000e+00]
 [5.65277906e-07 6.97495144e-22 6.98481874e-15 ... 1.00000000e+00
  0.00000000e+00 0.00000000e+00]]
Training started
>Epoch #50:	[####################]; Loss: nan
Training ended

Training started
>Epoch #50:	[####################]; Loss: nan
Training ended

Training started
>Epoch #33:	[#############       ]; Loss: nan

KeyboardInterrupt: 

In [None]:
pred = model.predict(train_data)

In [None]:
res = pred>0.5

In [None]:
res = res.squeeze()

In [None]:
pred = np.zeros(y_train.shape)

In [None]:
pred[res] = 1

In [None]:
np.mean(pred==y_train)

## Generate predictions and save ouput in csv format for submission:

In [14]:
DATA_TEST_PATH = DATA_TRAIN_PATH = os.path.join(os.getcwd(), '../data/test.csv')
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [15]:
preprocessing = Preprocessing(
    use_transformations=use_transformation,
    handling_outliers = handling_outlier,
)
test_data, _ = preprocessing.preprocess(
    tX_test,
    train=False,
    models=models,
    degrees=degree
)

Preprocesing started!

Preprocessing ended



In [16]:
OUTPUT_PATH = 'prediction.csv' # TODO: fill in desired name of output file for submission
y_pred = model.predict(test_data)
res = y_pred>0.5
res = res.squeeze()
pred = -np.ones(res.shape)
pred[res] = 1
create_csv_submission(ids_test, pred, OUTPUT_PATH)