In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import os
import pandas as pd
%load_ext autoreload
%autoreload 2

## Load the training data into feature matrix, class labels, and event ids:

In [2]:
from proj1_helpers import *
# Training CSV, resolved relative to the directory the notebook is run from.
DATA_TRAIN_PATH = os.path.join(os.getcwd(), '../data/train.csv')
# Unpacks into labels (y), feature matrix (tX) and event ids (ids).
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

## Model selection: cross-validated, regularized logistic regression

In [3]:
from preprocessing import *
from logistic_regression import *
import pandas as pd

In [4]:
def cross_validation(tX, y, k=5,
                     lr=0.01, lambda_=10,
                     max_iter=5000, batch_size=128):
    """Estimate validation accuracy of regularized logistic regression by k-fold CV.

    Samples are split, in their current order, into ``k`` contiguous folds, so
    shuffle the data beforehand if the rows are ordered. Each fold is held out
    once while a model with freshly drawn random weights is trained on the
    remaining folds.

    Args:
        tX: feature matrix, indexed by sample along the first axis.
        y: labels, one per row of ``tX``.
        k: number of folds; must satisfy ``2 <= k <= len(y)``.
        lr: learning rate forwarded to ``logistic_regression_reg``.
        lambda_: regularization strength forwarded to ``logistic_regression_reg``.
        max_iter: iteration budget forwarded to ``logistic_regression_reg``.
        batch_size: mini-batch size, capped at the size of the training split.

    Returns:
        The mean of the ``k`` per-fold validation accuracies.

    Raises:
        ValueError: if ``k`` would leave a fold with no training or no
            validation samples.
    """
    n_samples = y.shape[0]
    if not 2 <= k <= n_samples:
        raise ValueError(
            "k must be between 2 and the number of samples "
            "({}), got {}".format(n_samples, k)
        )

    folds = np.array_split(np.arange(n_samples), k)
    accuracy = np.zeros(k)
    print("Cross-validation started!")
    for fold_idx, val_indices in enumerate(folds):
        val_mask = np.zeros(n_samples, dtype=bool)
        val_mask[val_indices] = True
        train_mask = ~val_mask

        # One weight more than there are features -- presumably the trainer
        # appends a bias column; confirm against logistic_regression_reg.
        w_init = np.random.randn(1, tX.shape[1] + 1)

        x_train, y_train = tX[train_mask], y[train_mask]
        x_val, y_val = tX[val_mask], y[val_mask]

        # A batch can never be larger than the data it is drawn from.
        batch_size_ = min(batch_size, x_train.shape[0])
        w, _ = logistic_regression_reg(y=y_train, tX=x_train,
                                       initial_w=w_init,
                                       max_iter=max_iter, lambda_=lambda_,
                                       lr=lr, batch_size=batch_size_)
        y_pred = logistic_pred(x_val, w)
        # Flatten both sides: comparing a column vector with a 1-D label array
        # would silently broadcast to an (n, n) matrix and skew the score.
        accuracy[fold_idx] = np.mean(np.ravel(y_pred) == np.ravel(y_val))
    return accuracy.mean()
        

def hyperparamters_tunning(tX, y,
                           use_transformations,
                           handling_outliers,
                           use_poly_augmentation,
                           max_degrees,
                           lrs, lambdas, batch_size, max_iter, k=5):
    """Exhaustive grid search over preprocessing and optimizer settings.

    Every combination of the supplied candidate values is scored with
    ``cross_validation``; the first combination reaching the highest mean
    accuracy is kept (ties do not replace the current best).

    Args:
        tX: raw feature matrix handed to ``Preprocessing.preprocess``.
        y: labels forwarded to ``cross_validation``.
        use_transformations: candidate values for the ``use_transformations``
            argument of ``Preprocessing``.
        handling_outliers: candidate values for the ``handling_outliers``
            argument of ``Preprocessing``.
        use_poly_augmentation: single value forwarded to ``Preprocessing``
            (not searched over).
        max_degrees: candidate values for the ``max_degree`` argument of
            ``Preprocessing``.
        lrs: candidate learning rates.
        lambdas: candidate regularization strengths.
        batch_size: mini-batch size forwarded to ``cross_validation``.
        max_iter: iteration budget forwarded to ``cross_validation``.
        k: number of cross-validation folds.

    Returns:
        dict with keys ``"transformation"``, ``"handling_outlier"``,
        ``"max_degree"``, ``"accuracy"``, ``"lr"`` and ``"lambda"`` describing
        the best configuration. Settings stay ``None`` and accuracy stays 0 if
        no configuration scores above 0.
    """
    best = {
        "transformation": None,
        "handling_outlier": None,
        "max_degree": None,
        "accuracy": 0,
        "lr": None,
        "lambda": None,
    }
    for use_transformation in use_transformations:
        for handling_outlier in handling_outliers:
            for max_degree in max_degrees:
                # The preprocessed features depend only on the three settings
                # above, so build them once here instead of once per
                # (lr, lambda) pair. Assumes preprocess() gives the same result
                # for the same configuration -- TODO confirm.
                preprocessing = Preprocessing(
                    use_transformations=use_transformation,
                    handling_outliers=handling_outlier,
                    use_poly_augmentation=use_poly_augmentation,
                    max_degree=max_degree
                )
                tX_preprocessed = preprocessing.preprocess(data_=tX)

                for lr in lrs:
                    for lambda_ in lambdas:
                        accuracy = cross_validation(tX_preprocessed, y,
                                                    k=k, lr=lr, lambda_=lambda_,
                                                    max_iter=max_iter,
                                                    batch_size=batch_size)
                        if best['accuracy'] < accuracy:
                            print("Best is updated with accuracy:", accuracy)
                            best["transformation"] = use_transformation
                            best["handling_outlier"] = handling_outlier
                            best["max_degree"] = max_degree
                            best["accuracy"] = accuracy
                            best["lambda"] = lambda_
                            best["lr"] = lr
    return best

In [5]:
preprocessing = Preprocessing()

# Recode the negative class from -1 to 0 so the labels are {0, 1}.
y[y==-1] = 0
# Use the builtin int: the `np.int` alias was deprecated in NumPy 1.20 and
# removed in 1.24, where it raises AttributeError.
y = y.astype(int)

# The return value is discarded, so this presumably marks outliers as NaN in
# place -- confirm against Preprocessing.replace_outliers_by_nan.
preprocessing.replace_outliers_by_nan(tX)

# Drop the rows that are labelled 0 and have more than 8 NaN features.
num_nan_in_row = np.isnan(tX).sum(axis=1)
mask = (y==0) & (num_nan_in_row>8)
y = y[~mask]
tX = tX[~mask]

# Shuffle labels and features with the same permutation so they stay aligned.
shuffle = np.arange(y.shape[0])
np.random.shuffle(shuffle)
y = y[shuffle]
tX = tX[shuffle]

In [10]:
# Preprocessing switch that is held fixed during the search.
use_poly_augmentation = True

# Candidate values for each axis of the grid search.
max_degrees = [3, 5]
handling_outliers = ['fill_mean', 'remove']
use_transformations = [False]
lambdas = [0]
lrs = [0.1]

# Training budget and number of cross-validation folds.
batch_size, max_iter, k = 64, 10000, 5

In [None]:
# Run the grid search. Every configuration is scored with k-fold
# cross-validation, training for up to `max_iter` iterations per fold.
# `best` holds the winning settings and their mean validation accuracy.
best = hyperparamters_tunning(tX, y,
                              use_transformations=use_transformations,
                              handling_outliers=handling_outliers,
                              use_poly_augmentation=use_poly_augmentation,
                              max_degrees=max_degrees,
                              lrs=lrs, lambdas=lambdas,
                              batch_size=batch_size,
                              max_iter=max_iter, k=k)

Cross-validation started!
>Iter #9679:	[################### ]; Loss: 80.72686959128592, 0.11

In [8]:
# Unpack the winning grid-search configuration into individual variables.
(use_transformation,
 handling_outlier,
 max_degree,
 accuracy,
 lambda_,
 lr) = (best[key] for key in ("transformation",
                              "handling_outlier",
                              "max_degree",
                              "accuracy",
                              "lambda",
                              "lr"))

In [9]:
# Refit on the whole training set using the best configuration found above.
preprocessing = Preprocessing(
    use_transformations=use_transformation,
    handling_outliers=handling_outlier,
    use_poly_augmentation=use_poly_augmentation,
    max_degree=max_degree
)

tX_preprocessed = preprocessing.preprocess(data_=tX)
# One weight more than there are features -- presumably a bias term added by
# the trainer; confirm against logistic_regression_reg.
w_init = np.random.randn(1, tX_preprocessed.shape[1] + 1)
w, loss = logistic_regression_reg(y=y, tX=tX_preprocessed,
                                  initial_w=w_init,
                                  max_iter=max_iter, lambda_=lambda_,
                                  lr=lr, batch_size=batch_size)
y_pred = logistic_pred(tX_preprocessed, w)
# Accuracy on the training data itself (not a held-out estimate). Both sides
# are flattened so a 2-D prediction array cannot broadcast against the 1-D
# labels and distort the mean.
(np.ravel(y_pred) == np.ravel(y)).mean()

>Iter #99:	[################### ]; Loss: 93.15705613524557, 0.1

0.5954299773890955

## Generate predictions and save output in CSV format for submission:

In [None]:
# Test CSV, resolved relative to the directory the notebook is run from.
# Only DATA_TEST_PATH is assigned: the previous chained assignment also
# overwrote DATA_TRAIN_PATH with the test-set path.
DATA_TEST_PATH = os.path.join(os.getcwd(), '../data/test.csv')
# The first return value is discarded; only features and event ids are kept.
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [None]:
# Build the test features with the same preprocessing configuration that was
# used to fit the final model, so the feature columns line up with `w`.
preprocessing = Preprocessing(
    use_transformations=use_transformation,
    handling_outliers=handling_outlier,
    use_poly_augmentation=use_poly_augmentation,
    max_degree=max_degree
)
# Mirror the training pipeline, which marks outliers as NaN before preprocessing.
preprocessing.replace_outliers_by_nan(tX_test)
# NOTE(review): if the chosen outlier handling drops rows, the predictions
# would no longer line up with `ids_test` -- confirm against Preprocessing.
test_data = preprocessing.preprocess(data_=tX_test)

In [None]:
OUTPUT_PATH = 'prediction.csv' # TODO: fill in desired name of output file for submission
# Predict with the fitted weights `w`, the same way the training cell does.
y_pred = logistic_pred(test_data, w)
# Threshold at 0.5 and flatten to 1-D. Unlike squeeze(), ravel() keeps a
# single-row result as a length-1 array.
res = np.ravel(y_pred > 0.5)
# Map the boolean decisions to labels: 1 where True, -1 where False.
pred = np.where(res, 1.0, -1.0)
create_csv_submission(ids_test, pred, OUTPUT_PATH)