In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

In [2]:
# Import helpers and model implementations
from scripts.proj1_helpers import *
from scripts.implementations import *

In [3]:
# Load the raw training and test sets from CSV (sub_sample=False, presumably the full data).
# The `_l` suffix marks the arrays exactly as loaded; the pipeline below works on copies
# of them, so the CSV files do not have to be re-read between experiments.
y_train_l, x_train_l, ids_train = load_csv_data('./data/train.csv', sub_sample=False)
# The first value returned for the test file is not needed and is discarded.
_, x_test_l, ids_test = load_csv_data('./data/test.csv', sub_sample=False)

# Entire Pipeline in one go: From Cleaning to Submission

In [4]:
# Copy the loaded arrays so the originals (`*_l`) stay untouched and this cell can be
# re-run without reloading the CSV files. This step can be skipped in run.py.
# Important: all following cells have to be executed in order from here on.
x_train, y_train = np.copy(x_train_l), np.copy(y_train_l)
x_test = np.copy(x_test_l)

# Set seed and actually apply it: the stochastic training below is only repeatable
# if the random generator starts from a known state.
# NOTE(review): assumes the helpers draw from NumPy's global generator - confirm.
seed = 1
np.random.seed(seed)

In [5]:
# STEP 1: Define two functions: one transforms x, the other transforms y

def transform_x_logistic(x, degree=8):
    """Build one design matrix per jet group from a raw feature matrix.

    Pipeline, applied to a private copy of ``x``:
      1. impute the -999 placeholders in the first column with the mean of the
         remaining values of that column;
      2. split the rows into three subsets with ``split_x`` (jet 0, jet 1, jets 2-3);
      3. for every subset: ``remove_aberrant``, polynomial expansion with
         ``build_poly``, ``standardize``, then prepend a column of ones.

    Args:
        x: 2-D NumPy array of raw features. It is not modified.
        degree: polynomial degree handed to ``build_poly`` (default 8, the value
            that used to be hard-coded).

    Returns:
        Tuple ``(tx_0, tx_1, tx_23)`` of design matrices, in the order produced
        by ``split_x``.

    NOTE(review): standardisation is computed from the array passed in, so a
    training matrix and a test matrix are each scaled with their own statistics
    - confirm this is intended.
    """
    # Work on a copy: the imputation below assigns into the array, and a
    # transform should not alter the caller's data as a side effect.
    x = np.copy(x)

    # Step 1: replace the -999 values in the first column by the mean of the
    # observed values. Skipped when nothing is missing, and when everything is
    # missing (there is no observed value to average, which would yield NaN).
    missing = x[:, 0] == -999.0
    if missing.any() and not missing.all():
        x[missing, 0] = np.mean(x[~missing, 0])

    # Step 2: split the data set into 3 subsets depending on jet num
    x_0, x_1, x_23 = split_x(x)

    design_matrices = []
    for subset in (x_0, x_1, x_23):
        # Step 3: remove aberrant values (called for its effect on `subset`;
        # any return value is ignored, as before)
        remove_aberrant(subset)

        # Step 4: polynomial basis expansion
        expanded = build_poly(subset, degree)

        # Step 5: standardize; the two extra return values are not needed here
        standardized, _, _ = standardize(expanded)

        # Step 6: add a leading column of constant terms
        bias = np.ones((standardized.shape[0], 1))
        design_matrices.append(np.c_[bias, standardized])

    # A further non-linear expansion was planned at this point but is not implemented.

    # Final step: return all transformed data
    tx_0, tx_1, tx_23 = design_matrices
    return tx_0, tx_1, tx_23

def transform_y_logistic(y, x):
    """Prepare labels for logistic regression and split them per jet group.

    Args:
        y: 1-D NumPy array of labels encoded as -1 / 1.
        x: raw feature matrix belonging to ``y``; it is only forwarded to
            ``split_y`` to decide which subset each label goes to.

    Returns:
        Tuple ``(y_0, y_1, y_23)`` as produced by ``split_y``, with labels
        recoded to 0 / 1 and kept as column vectors.
    """
    # Step 1: add a dimension so y is a column vector (n, 1), matching the
    # (d, 1) weight vectors used for training.
    y = np.expand_dims(y, axis=1)

    # Step 2: logistic regression needs labels in {0, 1}: map -1 -> 0 and every
    # other value -> 1. np.where works element-wise and keeps the (n, 1) shape;
    # the list comprehension used previously rebuilt the array from scalars and
    # thereby flattened it back to (n,), undoing step 1.
    y = np.where(y == -1, 0, 1)

    # Step 3: split data set into 3 datasets depending on jet num
    y_0, y_1, y_23 = split_y(y, x)

    return y_0, y_1, y_23
    

In [6]:
# STEP 2: Perform cross validation

In [9]:
# STEP 3: Learn on entire dataset

# Hyper-parameters shared by the three per-jet models
lambda_ = 0.01
gamma = 0.005
max_iters = 100000

# Labels first, then features: one (y, tx) pair per jet group
y_t0, y_t1, y_t23 = transform_y_logistic(y_train, x_train)
tx_t0, tx_t1, tx_t23 = transform_x_logistic(x_train)

# Zero start vector with one row per column of the matching design matrix
initial_w_0, initial_w_1, initial_w_23 = (
    np.zeros((design.shape[1], 1)) for design in (tx_t0, tx_t1, tx_t23)
)

# Regularized logistic regression, stochastic: one independent fit per jet group.
# Kept as three separate statements so an interrupted run still leaves the
# models that already finished. `w_2` / `loss_w2` belong to the combined
# jet 2-3 group; the name is kept because later cells refer to it.
w_0, loss_w0 = reg_logistic_regression_SGD(
    y_t0, tx_t0, lambda_, initial_w_0, max_iters, gamma
)
w_1, loss_w1 = reg_logistic_regression_SGD(
    y_t1, tx_t1, lambda_, initial_w_1, max_iters, gamma
)
w_2, loss_w2 = reg_logistic_regression_SGD(
    y_t23, tx_t23, lambda_, initial_w_23, max_iters, gamma
)

# Alternative trainers that can be swapped in for the three calls above:
#   logistic_regression(y, tx, initial_w, max_iters, gamma)
#   logistic_regression_newton(y, tx, lambda_, initial_w, max_iters, gamma)



Current iteration=0, loss=69192.96017608918
weights size:0.0001052745053757


  return - np.squeeze((y.T @ np.log(sigmoid(tx @ w)) + (1 - y).T @ np.log(1 - sigmoid(tx @ w))))


Current iteration=10000, loss=nan
weights size:4.329599370255327
Current iteration=20000, loss=nan
weights size:5.092868872450366
Current iteration=30000, loss=nan
weights size:5.118820057681811
Current iteration=40000, loss=nan
weights size:5.119121136132341
Current iteration=50000, loss=nan
weights size:5.276511474182574
Current iteration=60000, loss=39844.33000044996
weights size:5.230755403782653
Current iteration=70000, loss=nan
weights size:5.61076581606541


  return 1.0 / (1 + np.exp(-t))


Current iteration=80000, loss=nan
weights size:52.34333260327412
Current iteration=90000, loss=nan
weights size:21.27181846609832
loss=nan
Current iteration=0, loss=53002.46760433886
weights size:0.0017572519497114356
Current iteration=10000, loss=nan
weights size:4.8807405851300665
Current iteration=20000, loss=nan
weights size:4.218276794376191
Current iteration=30000, loss=nan
weights size:10.541153761265397
Current iteration=40000, loss=nan
weights size:7.169846170402905
Current iteration=50000, loss=nan
weights size:6.762182068486548
Current iteration=60000, loss=nan
weights size:5.221180890419102
Current iteration=70000, loss=nan
weights size:7.607152216550544
Current iteration=80000, loss=nan
weights size:5.109977206778802
Current iteration=90000, loss=nan
weights size:5.934067352569003
loss=nan
Current iteration=0, loss=49549.739484312835
weights size:0.0006207334895275524
Current iteration=10000, loss=inf
weights size:2.898133262474943
Current iteration=20000, loss=nan
weights

In [10]:
# Sanity check: predict on the training features with the three trained weight vectors,
# then score the predictions against the training labels. This is measured on the data
# the models were fitted to, so it is not a held-out estimate.
y_prediction_train = predict_labels_datasets_logistic(w_0, w_1, w_2, x_train, transform_x_logistic)
compute_accuracy(y_train, y_prediction_train)

0.766736

In [79]:
# Predict y_test to submit on kaggle: the test features go through the same
# transform function and the same three weight vectors as the training data.
y_prediction_test = predict_labels_datasets_logistic(w_0, w_1, w_2, x_test, transform_x_logistic)
# Hand the test ids and predictions to the submission helper (target: "submission.csv").
create_csv_submission(ids_test, y_prediction_test, "submission.csv")

# Results:
## Reg_logistic_SGD
- gamma = 0.0005, max_iters = 10000000, lambda = 0, degree = 1: accuracy = 0.745748
- gamma = 0.0005, max_iters = 10000000, lambda = 0, degree = 2: accuracy = 0.80364 -> Kaggle 0.77300
- gamma = 0.0005, max_iters = 10000000, lambda = 0, degree = 3: accuracy = 0.791964 -> Kaggle 0.78508
- gamma = 0.0005, max_iters = 10000000, lambda = 0, degree = 4: accuracy = 0.791584
- gamma = 0.0005, max_iters = 10000000, lambda = 0, degree = 8: accuracy = 0.766736



In [None]:
0.80364 