In [169]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

from scripts.proj1_helpers import *
from scripts.preprocess import *

from scripts.split_data import *
from scripts.model_testing import *
from scripts.cross_validation import *

from scripts.implementations import least_squares_GD
from scripts.implementations import least_squares_SGD
from scripts.implementations import least_squares
from scripts.implementations import ridge_regression
from scripts.implementations import logistic_regression
from scripts.implementations import reg_logistic_regression

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [170]:
# Load the training set. The three values returned by load_csv_data are bound,
# in order, to raw_y, raw_x and ind (presumably labels, feature matrix and
# event ids -- confirm against scripts.proj1_helpers).
raw_y, raw_x, ind = load_csv_data('higgs-data/train.csv')

## * Data Processing
1. Based on PRI_JET_NUM (feature 22), which takes the integer values 0, 1, 2 and 3, we divide the training data into 4 sets. From each set we obtain exactly one model (w0/w1/w2/w3). E.g. for PRI_JET_NUM=0 we get w0.
2. We standardize the data using power terms.

### Creating Subsets

In [171]:
def create_subsets(x, y, column=22):
    """Split a dataset into one subset per distinct value of a feature column.

    Parameters
    ----------
    x : 2-D array
        Feature matrix; rows are samples.
    y : 1-D array
        Per-row targets; indexed with the same row mask as ``x``.
    column : int, optional
        Index of the column of ``x`` whose distinct values define the
        subsets. Defaults to 22, the PRI_JET_NUM column used in this notebook.

    Returns
    -------
    (sets_x, sets_y) : tuple of lists
        ``sets_x[k]`` holds the rows of ``x`` and ``sets_y[k]`` the entries of
        ``y`` that belong to the k-th distinct value. Values are visited in
        the ascending order returned by ``np.unique``.
    """
    keys = x[:, column]

    sets_x = []
    sets_y = []
    for value in np.unique(keys):
        rows = keys == value  # boolean row mask for this value
        sets_x.append(x[rows, :])
        sets_y.append(y[rows])

    return sets_x, sets_y

# One (features, labels) pair per distinct PRI_JET_NUM value.
sets_x, sets_y = create_subsets(raw_x, raw_y)

# Subset k holds the events where PRI_JET_NUM = k.
x0, y0 = sets_x[0], sets_y[0]
x1, y1 = sets_x[1], sets_y[1]
x2, y2 = sets_x[2], sets_y[2]
x3, y3 = sets_x[3], sets_y[3]

# x00 = sets_x[0]
# y00 = sets_y[0]

# x01 = sets_x[1]
# y01 = sets_y[1]

# x10 = sets_x[2]
# y10 = sets_y[2]

# x11 = sets_x[3]
# y11 = sets_y[3]

# x20 = sets_x[4]
# y20 = sets_y[4]

# x21 = sets_x[5]
# y21 = sets_y[5]

# x30 = sets_x[6]
# y30 = sets_y[6]

# x31 = sets_x[7]
# y31 = sets_y[7]


In [54]:
# Display the raw feature matrix (NumPy abbreviates the printed array).
raw_x

array([[-999.   ,   79.589,   23.916, ..., -999.   , -999.   ,    0.   ],
       [ 106.398,   67.49 ,   87.949, ..., -999.   , -999.   ,   47.575],
       [ 117.794,   56.226,   96.358, ..., -999.   , -999.   ,    0.   ],
       ..., 
       [ 108.497,    9.837,   65.149, ..., -999.   , -999.   ,    0.   ],
       [  96.711,   20.006,   66.942, ..., -999.   , -999.   ,   30.863],
       [  92.373,   80.109,   77.619, ..., -999.   , -999.   ,    0.   ]])

### Feature Reduction

In [172]:
# Columns removed from each PRI_JET_NUM subset before standardization.
# Subsets 2 and 3 share the same list.
drop_jet0  = [4, 5, 6, 11, 12, 15, 18, 20, 22, 23, 24, 25, 26, 27, 28, 29]
drop_jet1  = [4, 5, 6, 11, 12, 15, 18, 20, 22, 26, 27, 28]
drop_jet23 = [11, 15, 18, 20, 22, 28]

# Always reduce from the untouched subsets in sets_x rather than from x0..x3
# themselves: np.delete returns a new array, so re-running this cell yields the
# same result instead of deleting a further set of columns each time.
x0 = np.delete(sets_x[0], drop_jet0, axis=1)   # PRI_JET_NUM = 0
x1 = np.delete(sets_x[1], drop_jet1, axis=1)   # PRI_JET_NUM = 1
x2 = np.delete(sets_x[2], drop_jet23, axis=1)  # PRI_JET_NUM = 2
x3 = np.delete(sets_x[3], drop_jet23, axis=1)  # PRI_JET_NUM = 3


# x00 = np.delete(x00, [4, 5, 6, 11, 12, 15, 18, 20, 22, 23, 24, 25, 26, 27, 28, 29], 1)
# x01 = np.delete(x01, [0, 4, 5, 6, 11, 12, 15, 18, 20, 22, 23, 24, 25, 26, 27, 28, 29], 1)
# x10 = np.delete(x10, [4, 5, 6, 11, 12, 15, 18, 20, 22, 26, 27, 28], 1)
# x11 = np.delete(x11, [0, 4, 5, 6, 11, 12, 15, 18, 20, 22, 26, 27, 28], 1)
# x20 = np.delete(x20, [11, 15, 18, 20, 22, 28], 1)
# x21 = np.delete(x21, [0, 11, 15, 18, 20, 22, 28], 1)
# x30 = np.delete(x30, [11, 15, 18, 20, 22, 28], 1)
# x31 = np.delete(x31, [0, 11, 15, 18, 20, 22, 28], 1)

In [86]:
# x10.shape
import pandas as pd

In [142]:
# data = pd.DataFrame(x00)
# # data
# data = 1/1+np.log(1+data)
# data['y'] = y00
# data.corr()

In [143]:
# data = pd.DataFrame(x01)
# data['y'] = y01
# # data
# data.corr()
# # len(data[data.index == -1])
# # len(data) -len(data[data.index == -1])

### Data Standardization Using Power Terms

In [173]:
# Apply the same transformation to every subset: positional arguments 2 and
# True plus with_sqrt=True (presumably polynomial degree 2 -- confirm against
# scripts.preprocess).
standardize_x0, standardize_x1, standardize_x2, standardize_x3 = [
    standardize_with_power_terms(subset, 2, True, with_sqrt=True)
    for subset in (x0, x1, x2, x3)
]

# standardize_x00 = standardize_with_power_terms(x00, 4, True, with_sqrt=False)
# standardize_x01 = standardize_with_power_terms(x01, 4, True, with_sqrt=False)
# standardize_x10 = standardize_with_power_terms(x10, 4, True, with_sqrt=False)
# standardize_x11 = standardize_with_power_terms(x11, 4, True, with_sqrt=False)
# standardize_x20 = standardize_with_power_terms(x20, 4, True, with_sqrt=False)
# standardize_x21 = standardize_with_power_terms(x21, 4, True, with_sqrt=False)
# standardize_x30 = standardize_with_power_terms(x30, 4, True, with_sqrt=False)
# standardize_x31 = standardize_with_power_terms(x31, 4, True, with_sqrt=False)

# standardize_x00 = standardize_with_power_terms(x00, 2, True, with_sqrt=True)
# standardize_x01 = standardize_with_power_terms(x01, 2, True, with_sqrt=True)
# standardize_x10 = standardize_with_power_terms(x10, 2, True, with_sqrt=True)
# standardize_x11 = standardize_with_power_terms(x11, 2, True, with_sqrt=True)
# standardize_x20 = standardize_with_power_terms(x20, 2, True, with_sqrt=True)
# standardize_x21 = standardize_with_power_terms(x21, 2, True, with_sqrt=True)
# standardize_x30 = standardize_with_power_terms(x30, 2, True, with_sqrt=True)
# standardize_x31 = standardize_with_power_terms(x31, 2, True, with_sqrt=True)



# Main Implementations

In [174]:
# Final per-subset inputs: entry k belongs to the subset with PRI_JET_NUM = k.
standardize_x = [
    standardize_x0,
    standardize_x1,
    standardize_x2,
    standardize_x3,
]
sets_y = [y0, y1, y2, y3]

# standardize_x = [standardize_x00, standardize_x01, standardize_x10, standardize_x11, \
#                  standardize_x20, standardize_x21, standardize_x30, standardize_x31]
# sets_y        = [y00, y01, y10, y11, y20, y21, y30, y31]


# Hyper-parameters shared by the training cells below.
max_iters = 5000
gamma     = 2e-6
lambda_   = 1e-6

## 1. Least Squares Using Gradient Descent

In [8]:
# Fit one least-squares GD model per subset and report its training accuracy.
ws_1 = []
for x, y in zip(standardize_x, sets_y):
    # Each subset starts from an all-zero weight vector of matching width.
    w, loss = least_squares_GD(y, x, np.zeros(x.shape[1]), max_iters, gamma)
    accuracy = np.mean(predict_labels(w, x) == y)
    print(accuracy)
    ws_1.append(w)
    

0.73913304575
0.680697410502
0.704063200937
0.682097094387


## 2. Least Squares Using Stochastic Gradient Descent

In [9]:
# Fit one least-squares SGD model per subset and report its training accuracy.
# NOTE(review): no random seed is set in this notebook, so these numbers may
# vary between runs if least_squares_SGD samples randomly -- confirm.
ws_2 = []
for x, y in zip(standardize_x, sets_y):
    # Each subset starts from an all-zero weight vector of matching width.
    w, loss = least_squares_SGD(y, x, np.zeros(x.shape[1]), max_iters, gamma)
    accuracy = np.mean(predict_labels(w, x) == y)
    print(accuracy)
    ws_2.append(w)
    

0.738942880306
0.680142886619
0.704380793585
0.682818985743


## 3. Least Squares Using Normal Equations

In [10]:
# Solve least squares directly for each subset and report training accuracy.
ws_3 = []
for x, y in zip(standardize_x, sets_y):
    w, loss = least_squares(y, x)
    accuracy = np.mean(predict_labels(w, x) == y)
    print(accuracy)
    ws_3.append(w)
    

0.833455105942
0.775572578149
0.804898866591
0.802201768634


## 4. Ridge Regression

In [24]:
# Fit ridge regression (penalty lambda_) per subset and report training accuracy.
ws_4 = []
for x, y in zip(standardize_x, sets_y):
    w, loss = ridge_regression(y, x, lambda_)
    accuracy = np.mean(predict_labels(w, x) == y)
    print(accuracy)
    ws_4.append(w)
    

0.833224905668
0.775753120809
0.804938565672
0.801524995488


## 5. Logistic Regression using Gradient Descent

In [175]:
# Fit logistic regression per subset and report its training accuracy.
ws_5 = []
for x, y in zip(standardize_x, sets_y):
    # Rescale the labels to zero/one for the logistic loss
    # (assumes y holds -1/+1 -- confirm against load_csv_data).
    mapped_y = (y + 1) / 2

    w, loss = logistic_regression(mapped_y, x, np.zeros(x.shape[1]), max_iters, gamma)

    # Accuracy is still measured against the original labels in y.
    accuracy = np.mean(predict_labels(w, x) == y)
    print(accuracy)
    ws_5.append(w)

# Accuracies recorded from an earlier run, kept for comparison:
# 0.840711418934
# 0.794826163211
# 0.822485559459
# 0.812939902545

0.841422037172
0.797701949861
0.823537585105
0.815466522288


## 6. Regularized Logistic Regression using Gradient Descent

In [29]:
# Fit regularized logistic regression per subset and report training accuracy.
ws_6 = []
for x, y in zip(standardize_x, sets_y):
    # Rescale the labels to zero/one for the logistic loss
    # (assumes y holds -1/+1 -- confirm against load_csv_data).
    mapped_y = (y + 1) / 2

    w, loss = reg_logistic_regression(
        mapped_y, x, np.zeros(x.shape[1]), max_iters, gamma, lambda_
    )

    # Accuracy is still measured against the original labels in y.
    accuracy = np.mean(predict_labels(w, x) == y)
    print(accuracy)
    ws_6.append(w)
    

0.840991662747
0.794955122253
0.822068719109
0.81361667569


# Testing and Making a Kaggle Submission

In [51]:
# NOTE(review): `standardize_x10` is not assigned by any active code visible in
# this notebook, so this cell is expected to raise NameError after
# Restart Kernel -> Run All. Confirm whether it can be removed.
standardize_x10.shape
# mask = (raw_x[:, 22] == 1) & (raw_x[:, 0] != -999)
# mask.shape
# len(ws_5[2])

(158095, 61)

In [68]:
# NOTE(review): stale cell -- its execution count (68) is older than the cells
# around it. It expects eight weight vectors in ws_5 and the variables
# standardize_x00 .. standardize_x31, none of which are assigned by the active
# code visible in this notebook: ws_5 is built from the four-entry
# standardize_x list, so ws_5[4] would raise IndexError on a fresh run.
# Confirm whether this cell can be removed in favour of make_submission_file.
y_pred = np.ones(len(raw_y))
w00, w01, w10, w11, w20, w21, w30, w31 = ws_5[0], ws_5[1], ws_5[2], ws_5[3], ws_5[4], ws_5[5], ws_5[6], ws_5[7]

# Each PRI_JET_NUM value (column 22) is split again on whether column 0 is -999.
mask = (raw_x[:, 22] == 0) & (raw_x[:, 0] != -999)
y_pred[mask] = predict_labels(w00, standardize_x00)
mask = (raw_x[:, 22] == 0) & (raw_x[:, 0] == -999)
y_pred[mask] = predict_labels(w01, standardize_x01)
mask = (raw_x[:, 22] == 1) & (raw_x[:, 0] != -999)
y_pred[mask] = predict_labels(w10, standardize_x10)
mask = (raw_x[:, 22] == 1) & (raw_x[:, 0] == -999)
y_pred[mask] = predict_labels(w11, standardize_x11)
mask = (raw_x[:, 22] == 2) & (raw_x[:, 0] != -999)
y_pred[mask] = predict_labels(w20, standardize_x20)
mask = (raw_x[:, 22] == 2) & (raw_x[:, 0] == -999)
y_pred[mask] = predict_labels(w21, standardize_x21)
mask = (raw_x[:, 22] == 3) & (raw_x[:, 0] != -999)
y_pred[mask] = predict_labels(w30, standardize_x30)
mask = (raw_x[:, 22] == 3) & (raw_x[:, 0] == -999)
y_pred[mask] = predict_labels(w31, standardize_x31)

create_csv_submission(ind, y_pred, 'prediction.csv')

In [176]:
# Load the test set. This rebinds `ind`, which the training-data cell had bound
# to the third value returned for train.csv; the make_submission_file call in
# this notebook relies on this global `ind` when writing the CSV.
test_y, test_x, ind = load_csv_data('higgs-data/test.csv')

def make_submission_file(x, y, w, filename="prediction.csv", ids=None):
    """Predict labels per PRI_JET_NUM subset and write them to a submission CSV.

    Rows of ``x`` are grouped by the value in column 22 (0, 1, 2 or 3). Each
    group has its subset-specific columns removed, is passed through
    ``standardize_with_power_terms(..., 2, True, with_sqrt=True)`` and is then
    labelled by ``predict_labels`` with the matching entry of ``w``.

    Parameters
    ----------
    x : 2-D array
        Raw feature matrix; column 22 selects which model is applied to a row.
    y : sequence
        Only its length is used, to size the prediction vector.
    w : sequence
        ``w[0]`` .. ``w[3]`` are the weights for column-22 values 0 .. 3.
    filename : str, optional
        Output path handed to ``create_csv_submission``.
    ids : optional
        Ids handed to ``create_csv_submission``. When omitted, the
        module-level ``ind`` is used, which is what this function relied on
        before the parameter existed.

    Notes
    -----
    Rows whose column-22 value is not 0, 1, 2 or 3 keep the initial
    prediction of 1.

    NOTE(review): each subset of ``x`` is standardized by its own call here;
    confirm that this is consistent with how the training features were
    standardized.
    """
    if ids is None:
        ids = ind  # module-level ids set by the data-loading cell

    # Same column lists as used when reducing the training subsets.
    drop_jet0  = [4, 5, 6, 11, 12, 15, 18, 20, 22, 23, 24, 25, 26, 27, 28, 29]
    drop_jet1  = [4, 5, 6, 11, 12, 15, 18, 20, 22, 26, 27, 28]
    drop_jet23 = [11, 15, 18, 20, 22, 28]
    dropped_columns = [drop_jet0, drop_jet1, drop_jet23, drop_jet23]

    # Index eagerly so that a too-short `w` fails before any work is done.
    weights = [w[k] for k in range(4)]

    y_pred = np.ones(len(y))
    for jet_num, (columns, w_k) in enumerate(zip(dropped_columns, weights)):
        mask = x[:, 22] == jet_num
        if not mask.any():
            continue  # no rows for this PRI_JET_NUM value: nothing to predict

        reduced      = np.delete(x[mask, :], columns, 1)
        stand_x      = standardize_with_power_terms(reduced, 2, True, with_sqrt=True)
        y_pred[mask] = predict_labels(w_k, stand_x)

    create_csv_submission(ids, y_pred, filename)

### 5. Testing Logistic Regression

In [177]:
# Write test-set predictions made with the logistic-regression weights (ws_5).
make_submission_file(test_x, test_y, ws_5, "submission-logistic.csv")