In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

In [1]:
# Import helpers and model implementations
from implementations import *
from cross_validation import *
from helpers import *
from split import *

In [2]:
# Load the data sets through the project helper `load_csv_data`.
# Each call is unpacked into three values (labels, features, ids); the first
# value returned for the test file is discarded.
# NOTE(review): `sub_sample=False` presumably loads every row rather than a
# subsample -- confirm against `load_csv_data`.
# The `*_l` arrays are kept as loaded; the pipeline cell below works on
# `np.copy` duplicates of them.
y_train_l, x_train_l, ids_train = load_csv_data('../data/train.csv', sub_sample=False)
_, x_test_l, ids_test = load_csv_data('../data/test.csv', sub_sample=False)

# Entire Pipeline in one go: From Cleaning to Submission
## Linear Regression: Solving normal equations (Least squares)

In [3]:
# Copy the loaded arrays so the pipeline below works on duplicates and the
# `*_l` arrays stay as loaded; re-running this cell resets the working arrays.
# This step can be skipped in run.py.
# Important: all following cells have to be executed in order from here on.
x_train, y_train = np.copy(x_train_l), np.copy(y_train_l)
x_test = np.copy(x_test_l)

# Seed value for reproducibility.
# NOTE(review): this only assigns a variable; no cell visible here passes it
# to a random number generator -- confirm where it is consumed.
seed = 1

## Creation of a model we can submit
### Step 1
To adapt this pipeline to another model, all we need to change are the following two transform functions.

In [9]:
# STEP 1: Define two functions: one transforms x, the other transforms y

def transform_x_least_squares(x, deg):
    """Turn a raw feature matrix into three design matrices for least squares.

    Steps, in order:
      1. Replace the -999.0 entries of the first column by the mean of the
         remaining entries of that column.
      2. Split the rows into three subsets with ``split_x`` (by jet number).
      3. Standardize each subset independently with ``standardize``.
      4. Prepend a column of ones (constant term) to each subset.

    The input array is never modified: all work is done on a private copy.
    The function can therefore be called several times on the same array
    (e.g. for fitting and again for prediction) and always sees the raw data.

    Args:
        x: 2-D array of raw features, one row per sample.
        deg: Polynomial degree. Currently ignored because the polynomial
            expansion step is not implemented; it is kept so the function
            matches the ``(x, deg)`` call shape used by its callers.

    Returns:
        A tuple ``(tx_0, tx_1, tx_23)`` holding, for each of the three
        subsets produced by ``split_x``, the standardized features with a
        leading column of ones.
    """
    # Work on a copy: the imputation below writes into the array, and the
    # caller's data must not change as a side effect of this call.
    x = np.copy(x)

    # Step 1: Replace the -999 values in the first column by the mean of the
    # valid entries. `first_col` is a view, so the assignment updates `x`.
    first_col = x[:, 0]
    missing = first_col == -999.0
    first_col[missing] = np.mean(first_col[~missing])

    # Step 2: Split data set into 3 datasets depending on jet num
    x_0, x_1, x_23 = split_x(x)

    # Step 3: Removal of aberrant values -- not implemented.

    # Step 4: Standardize each subset on its own statistics. The returned
    # mean and standard deviation are not needed here and are discarded.
    x_0, _, _ = standardize(x_0)
    x_1, _, _ = standardize(x_1)
    x_23, _, _ = standardize(x_23)

    # Step 5: Feature expansion
    # 5.1 add column with constant terms
    tx_0 = np.c_[np.ones((x_0.shape[0], 1)), x_0]
    tx_1 = np.c_[np.ones((x_1.shape[0], 1)), x_1]
    tx_23 = np.c_[np.ones((x_23.shape[0], 1)), x_23]

    # 5.2 polynomial basis -- not implemented (`deg` is unused).
    # 5.3 nonlinear expansion -- not implemented.

    # Final step: return all transformed data
    return tx_0, tx_1, tx_23

def transform_y_least_squares(y, x):
    """Reshape the targets into a column and split them like the features.

    Args:
        y: Target values; an axis of length one is inserted at position 1.
        x: Feature matrix handed to ``split_y`` together with the targets so
            the split depends on jet num.

    Returns:
        A tuple of the three target subsets produced by ``split_y``.
    """
    # Insert a second axis so each target becomes its own row.
    column_y = np.expand_dims(y, axis=1)

    # Partition the targets into the three jet-num subsets.
    targets_0, targets_1, targets_23 = split_y(column_y, x)

    return targets_0, targets_1, targets_23
    

### Step 2
Train the model on the entire dataset (here without cross-validation)

In [11]:
# Build the per-subset targets and design matrices from the training data.
# The targets are split first, using the raw `x_train`, then the features
# are transformed (the degree argument is 1).
y_t0, y_t1, y_t23 = transform_y_least_squares(y_train, x_train)
tx_t0, tx_t1, tx_t23 = transform_x_least_squares(x_train, 1)

# Fit one least-squares model per subset; each call is unpacked into a
# weight vector and a loss. `w_2` holds the weights for the third subset
# (`y_t23` / `tx_t23`).
w_0, loss_w0 = least_squares(y_t0, tx_t0)
w_1, loss_w1 = least_squares(y_t1, tx_t1)
w_2, loss_w2 = least_squares(y_t23, tx_t23)

### Step 3
Check the accuracy of the trained model on the same training set (an optimistic estimate, since the model has already seen this data)

In [12]:
# Predict labels for the training set with the three fitted weight vectors.
# The transform function and its degree argument (1) are passed along,
# presumably so the helper applies the same preprocessing before predicting
# -- confirm against `predict_labels_datasets`.
y_prediction_train = predict_labels_datasets(w_0, w_1, w_2, x_train, transform_x_least_squares, 1)
# Last expression of the cell: the accuracy is displayed as the cell output.
compute_accuracy(y_train, y_prediction_train)

0.75976

### Create a submission file

In [14]:
# Predict y_test to submit on kaggle, using the same weights, transform
# function and degree argument as for the training-set check.
y_prediction_test = predict_labels_datasets(w_0, w_1, w_2, x_test, transform_x_least_squares, 1)
# Write the test ids and predicted labels to "submission.csv" (relative to
# the notebook's working directory).
create_csv_submission(ids_test, y_prediction_test, "submission.csv")