In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

In [16]:
# Import helpers and model implementations
from scripts.proj1_helpers import *
from scripts.implementations import *

In [3]:
# Load the raw training and test sets from disk (sub_sample=False).
# The first value returned for the test file is not used and is discarded.
train_path, test_path = './data/train.csv', './data/test.csv'
y_train_l, x_train_l, ids_train = load_csv_data(train_path, sub_sample=False)
_, x_test_l, ids_test = load_csv_data(test_path, sub_sample=False)

# Entire Pipeline in one go: From Cleaning to Submission

In [4]:
# Work on copies of the loaded arrays so the raw data stays untouched and the
# pipeline can be re-run from this cell without reloading the CSV files.
# This step can be skipped in run.py.
# Important: all following cells have to be executed in order from here on.
x_train, y_train = np.copy(x_train_l), np.copy(y_train_l)
x_test = np.copy(x_test_l)

# Set the seed and apply it to NumPy's global random generator, so that any
# NumPy-based randomness in the following cells (presumably the SGD sampling;
# confirm against the implementation) is reproducible across runs.
seed = 1
np.random.seed(seed)

In [5]:
# STEP 1: Define two functions: one transforms x, the other transforms y

def transform_x_logistic(x):
    """Prepare a raw feature matrix for the three per-subset logistic models.

    Pipeline:
      1. Replace the -999.0 entries of the first column by the mean of the
         remaining entries of that column.
      2. Split the rows into three subsets with `split_x` (depending on the
         jet number).
      3. Standardize each subset independently with `standardize`.
      4. Prepend a column of ones (constant term) to each subset.

    The input array is left untouched; all work is done on a copy.

    Parameters
    ----------
    x : np.ndarray
        2-D feature matrix, one row per sample.

    Returns
    -------
    tuple of np.ndarray
        (tx_0, tx_1, tx_23): the three transformed subsets, each with a
        leading column of ones.
    """
    # Copy first: the imputation below writes into the array, and callers
    # pass the same array to this function more than once.
    x = np.copy(x)

    # Step 1: Replace the -999 values in the first column by the mean
    first_col = x[:, 0]  # view into the local copy
    missing = first_col == -999.0
    if missing.any():
        first_col[missing] = np.mean(first_col[~missing])

    # Step 2: Split data set into 3 datasets depending on jet num
    x_0, x_1, x_23 = split_x(x)

    # Step 3: Remove aberrant values -- not implemented yet (FIXME)

    def _standardize_with_bias(subset):
        # Step 4: standardize; the returned mean and std are not needed here.
        standardized, _, _ = standardize(subset)
        # Step 5.1: add a column with constant terms.
        return np.c_[np.ones((standardized.shape[0], 1)), standardized]

    tx_0 = _standardize_with_bias(x_0)
    tx_1 = _standardize_with_bias(x_1)
    tx_23 = _standardize_with_bias(x_23)

    # Step 5.2 (polynomial basis) and 5.3 (nonlinear expansion) are
    # placeholders: no further feature expansion is applied.

    # Final step: return all transformed data
    return tx_0, tx_1, tx_23

def transform_y_logistic(y, x):
    """Convert -1/1 labels to 0/1 and split them into the three subsets.

    Parameters
    ----------
    y : array-like
        Labels, one per row of `x`. Entries equal to -1 are mapped to 0,
        every other value is mapped to 1 (the encoding used for logistic
        regression).
    x : np.ndarray
        Feature matrix, passed through to `split_y` together with the labels
        (presumably used to select rows by jet number -- confirm in split_y).

    Returns
    -------
    tuple
        (y_0, y_1, y_23) as returned by `split_y`.
    """
    # Step 1: use the `y` argument itself; reading the notebook-level
    # `y_train` here would silently ignore whatever the caller passed in.
    labels = np.asarray(y).reshape(-1)

    # For logistic regression: bring y back to 0 and 1.
    # NOTE(review): the labels reach split_y as a 1-D integer array; confirm
    # whether the downstream solver expects a column vector instead.
    labels = np.where(labels == -1, 0, 1)

    # Step 2: Split data set into 3 datasets depending on jet num
    y_0, y_1, y_23 = split_y(labels, x)

    return y_0, y_1, y_23
    

In [47]:
# STEP 2: Perform cross validation

In [19]:
# STEP 3: Learn on the entire training set, one model per subset

# Labels are split first, then the features are transformed (same order as
# the cells were originally written in).
y_t0, y_t1, y_t23 = transform_y_logistic(y_train, x_train)
tx_t0, tx_t1, tx_t23 = transform_x_logistic(x_train)

# Hyper-parameters shared by the three models
lambda_, gamma, max_iters = 0.02, 0.01, 50000

# Zero-initialised weight column vectors, one entry per column of each
# transformed feature matrix
initial_w_0, initial_w_1, initial_w_23 = (
    np.zeros((tx.shape[1], 1)) for tx in (tx_t0, tx_t1, tx_t23)
)

# Regularized logistic regression, fitted independently on each subset.
# The unregularized alternative would be
#   logistic_regression(y, tx, initial_w, max_iters, gamma)
# Note: `w_2` holds the weights fitted on the `_23` subset; the name is kept
# because the prediction cells below refer to it.
w_0, loss_w0 = reg_logistic_regression_SGD(
    y_t0, tx_t0, lambda_, initial_w_0, max_iters, gamma
)
w_1, loss_w1 = reg_logistic_regression_SGD(
    y_t1, tx_t1, lambda_, initial_w_1, max_iters, gamma
)
w_2, loss_w2 = reg_logistic_regression_SGD(
    y_t23, tx_t23, lambda_, initial_w_23, max_iters, gamma
)


Current iteration=0, loss=68712.23192577629
weights size:0.0004649018270131234
Current iteration=10000, loss=42436.89335814543
weights size:3.2775941773543797
Current iteration=20000, loss=43600.545865174274
weights size:3.125411051280684
Current iteration=30000, loss=42891.93868858164
weights size:3.01827307936436
Current iteration=40000, loss=42217.62105360966
weights size:3.304723935297876
loss=42389.056382095085
Current iteration=0, loss=53591.060334834445
weights size:0.0009666461541747606
Current iteration=10000, loss=43189.19345276145
weights size:1.531386980042974
Current iteration=20000, loss=43426.67455649385
weights size:1.5576740275050802
Current iteration=30000, loss=43303.781382348745
weights size:1.6572118008442722
Current iteration=40000, loss=43192.28533185497
weights size:1.5365246041502734
loss=43118.66437245187
Current iteration=0, loss=50192.99171468878
weights size:0.00043053562790325566
Current iteration=10000, loss=39112.04437969311
weights size:1.48696576582945

In [20]:
# Accuracy of the three fitted models on the training set itself
# (a sanity check on the data used for fitting, not a validation score).
y_prediction_train = predict_labels_datasets_logistic(
    w_0, w_1, w_2, x_train, transform_x_logistic
)
compute_accuracy(y_train, y_prediction_train)

0.73854

In [54]:
# Predict the test labels and write them to "submission.csv" for Kaggle.
# NOTE(review): transform_x_logistic standardizes whatever data it is given,
# so the test set appears to be scaled with its own statistics rather than
# the training ones -- confirm this is intended.
submission_path = "submission.csv"
y_prediction_test = predict_labels_datasets_logistic(
    w_0, w_1, w_2, x_test, transform_x_logistic
)
create_csv_submission(ids_test, y_prediction_test, submission_path)