In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

In [7]:
# --- IMPORTANT this notebook has to be run from inside the /notebook subdirectory for it to import properly
# Import helpers and model implementations
# Path hack.
import sys, os
sys.path.insert(0, os.path.abspath('..'))

from scripts.implementations import *
from scripts.cross_validation import *
from scripts.helpers import *
from scripts.split import *

In [8]:
# Load the raw training and test CSV files (sub_sample=False for both).
train_csv_path = '../data/train.csv'
test_csv_path = '../data/test.csv'

y_train_l, x_train_l, ids_train = load_csv_data(train_csv_path, sub_sample=False)
# The first value returned for the test file is not needed and is discarded.
_, x_test_l, ids_test = load_csv_data(test_csv_path, sub_sample=False)

# Entire Pipeline in one go: From Cleaning to Submission
## Logistic Regression

In [9]:
# Copy the loaded arrays so the originals stay untouched in memory and this
# cell can be re-run to reset the pipeline (this step can be skipped in run.py).
# Important: all following cells have to be executed in order from here on.
x_train, y_train = np.copy(x_train_l), np.copy(y_train_l)
x_test = np.copy(x_test_l)

# Set the seed and actually apply it, so that the stochastic training below is
# reproducible when the cells are run in order from here.
# NOTE(review): assumes the SGD helpers draw from NumPy's global RNG - confirm
# in scripts/implementations.
seed = 1
np.random.seed(seed)

## Creation of a model we can submit
### Step 1
To adapt this pipeline to another model, we only need to change the following two transform functions.

In [18]:
def transform_x_logistic(x, degree=2):
    """Build the three per-jet design matrices used by the logistic models.

    Steps: impute the -999 placeholders of the first column with the mean of
    the valid entries, split the samples into three subsets with `split_x`,
    run `remove_aberrant` on each subset, expand with `build_poly`,
    standardize, and prepend a column of ones.

    Args:
        x: 2-D feature array. It is copied before any processing, so the
            caller's array is never modified.
        degree: polynomial degree forwarded to `build_poly` (default 2, the
            value that used to be hard-coded). When training with another
            degree, use the same degree wherever this function is passed as
            a callback (e.g. `functools.partial(transform_x_logistic,
            degree=3)`), otherwise the weight and feature sizes will differ.

    Returns:
        Tuple `(tx_0, tx_1, tx_23)`: one design matrix per subset, each with
        a leading constant column.
    """
    # Work on a private copy: the imputation below writes through a view and
    # would otherwise silently alter the array owned by the caller.
    x = np.array(x, copy=True)

    # Step 1: Replace the -999 values in the first column by the mean of the
    # remaining values of that column.
    first_col = x[:, 0]  # view into the local copy
    missing = first_col == -999.0
    first_col[missing] = np.mean(first_col[~missing])

    # Step 2: Split data set into 3 datasets depending on jet num
    x_0, x_1, x_23 = split_x(x)

    design_matrices = []
    for subset in (x_0, x_1, x_23):
        # Step 3: Remove aberrant data to be sure to have no outliers left.
        # The return value is ignored, as before - presumably the helper
        # cleans the array in place (verify in scripts).
        remove_aberrant(subset)

        # (opt) Add polynomial basis
        expanded = build_poly(subset, degree)

        # Step 4: Standardize data (the returned mean and std are not used here)
        standardized, _, _ = standardize(expanded)

        # Step 5: Add a column with constant terms
        bias = np.ones((standardized.shape[0], 1))
        design_matrices.append(np.c_[bias, standardized])

    # Final step: return all transformed data
    tx_0, tx_1, tx_23 = design_matrices
    return tx_0, tx_1, tx_23

def transform_y_logistic(y, x):
    """Map labels to {0, 1} for logistic regression and split them like `x`.

    Args:
        y: labels, one per row of `x`. Entries equal to -1 become 0, every
            other entry becomes 1.
        x: feature array forwarded to `split_y`, which decides the subset
            each label goes to (presumably by jet number, mirroring
            `split_x` - verify in scripts).

    Returns:
        Tuple `(y_0, y_1, y_23)` as produced by `split_y`.
    """
    # One label per sample, as a flat array. The previous implementation
    # added an axis and then rebuilt the array element by element, which
    # dropped that axis again, so its result was always 1-D; that shape is
    # kept here. The reshape raises ValueError if `y` holds more than one
    # value per sample, as the element-wise version did.
    labels = np.asarray(y).reshape(len(y))

    # For logistic regression only: bring y back to 0 and 1 (vectorized
    # instead of a Python-level loop over every sample).
    y = np.where(labels == -1, 0, 1)

    # Split data set into 3 datasets depending on jet num
    y_0, y_1, y_23 = split_y(y, x)

    return y_0, y_1, y_23
    

### Step 2
Test performance on the entire dataset (here without cross-validation)

In [19]:
# Hyperparameters shared by the three per-jet models
lambda_ = 0.01
gamma = 0.005
max_iters = 100000

# Build the per-jet targets and design matrices (targets first, as the
# feature transform may alter x_train).
y_t0, y_t1, y_t23 = transform_y_logistic(y_train, x_train)
tx_t0, tx_t1, tx_t23 = transform_x_logistic(x_train)

# Every model starts from an all-zero column vector sized to its design matrix
initial_w_0, initial_w_1, initial_w_23 = (
    np.zeros((design.shape[1], 1)) for design in (tx_t0, tx_t1, tx_t23)
)

# Regularized, stochastic logistic regression - one model per jet subset.
# Alternative optimizers that were tried with the same per-subset pattern:
#   logistic_regression(y, tx, initial_w, max_iters, gamma)
#   logistic_regression_newton(y, tx, lambda_, initial_w, max_iters, gamma)
w_0, loss_w0 = reg_logistic_regression_SGD(
    y_t0, tx_t0, lambda_, initial_w_0, max_iters, gamma
)
w_1, loss_w1 = reg_logistic_regression_SGD(
    y_t1, tx_t1, lambda_, initial_w_1, max_iters, gamma
)
# Note: the weights of the jet 2/3 subset are stored as w_2 (used by later cells)
w_2, loss_w2 = reg_logistic_regression_SGD(
    y_t23, tx_t23, lambda_, initial_w_23, max_iters, gamma
)


Current iteration=0, loss=69329.41963251591
weights size:0.00011500367132997651
Current iteration=10000, loss=39850.58308588407
weights size:3.9466733121429556
Current iteration=20000, loss=38995.848163005365
weights size:5.22011483723575
Current iteration=30000, loss=39090.07414855655
weights size:5.533675671480695
Current iteration=40000, loss=38917.28943327911
weights size:5.348124682223529
Current iteration=50000, loss=39199.924327438
weights size:5.49808260045738
Current iteration=60000, loss=38868.37405440737
weights size:5.118220449471801
Current iteration=70000, loss=38847.56347529571
weights size:5.423796997992938
Current iteration=80000, loss=39037.477273451936
weights size:5.538770820359847
Current iteration=90000, loss=39144.69035506584
weights size:5.426523194866387
loss=39126.00279489747
Current iteration=0, loss=53703.452163828064
weights size:0.000132468746995737
Current iteration=10000, loss=40117.62688569149
weights size:2.595493105977552
Current iteration=20000, loss

### Step 3
Check accuracy of trained model on same training set

In [20]:
# Predict labels for the training set with the three fitted models, then
# compare them with the true training labels.
y_prediction_train = predict_labels_datasets_logistic(
    w_0, w_1, w_2,
    x_train,
    transform_x_logistic,
)
compute_accuracy(y_train, y_prediction_train)

0.77034

### Create a submission file

In [21]:
# Predict the test-set labels with the three fitted models and write them,
# together with the test ids, to the CSV file submitted on Kaggle.
submission_path = "submission.csv"
y_prediction_test = predict_labels_datasets_logistic(
    w_0, w_1, w_2,
    x_test,
    transform_x_logistic,
)
create_csv_submission(ids_test, y_prediction_test, submission_path)


## Results of logistic regression
1. gamma = 0.0005 max_iters = 10000000 lambda = 0 degree = 1 accuracy = 0.745748
2. gamma = 0.0005 max_iters = 10000000 lambda = 0 degree = 2 accuracy = 0.80364 -> Kaggle 0.77300
3. gamma = 0.0005 max_iters = 10000000 lambda = 0 degree = 3 accuracy = 0.791964 -> Kaggle 0.78508
4. gamma = 0.0005 max_iters = 10000000 lambda = 0 degree = 4 accuracy = 0.791584

8. gamma = 0.0005 max_iters = 10000000 lambda = 0 degree = 8 accuracy = 0.766736
