In [115]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from implementations import *
from utilities import *
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Load the training data into feature matrix, class labels, and event ids:

In [63]:
from proj1_helpers import *
DATA_TRAIN_PATH = '../data/train.csv'
# Read the training CSV; the three returned values are bound, in order, to the
# class labels (y), the feature matrix (tX) and the event ids (ids).
# NOTE(review): load_csv_data is presumably provided by the star import above — confirm.
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)
N = len(y) # training set cardinality (number of labels)
D = tX.shape[1] # number of columns of tX, i.e. number of parameters ("dimensionality")

# Processing the data

We perform three operations aimed at applying our models on a suitable training dataset.

1) Several observations are incomplete, as shown by the number of "-999" entries in the dataset. 
    To address this, we drop the columns where too many values (say, 70%) are missing, and we impute the column mean for the remaining missing entries.

2) The feature values are large and widely spread, so it is good practice to normalize or standardize the data before fitting the models; indeed, gradient descent (GD) is very sensitive to ill-conditioning. Normalizing divides each feature column by its maximum, whereas standardizing subtracts the column mean and divides by the column standard deviation.

3) Outliers should be detected and eliminated from the training dataset.

In [64]:
print(np.count_nonzero(tX==-999)) # number of entries of tX equal to -999, the value marking missing data

1580052


In [65]:
# Treat the missing entries: missing_values returns a pair whose first element
# replaces tX and whose second element replaces D.
tX, D = missing_values(tX)
# Show the resulting matrix and the new value of D, one per line.
print(tX, D, sep='\n')

[[ 1.38470000e+02  5.16550000e+01  9.78270000e+01 ...  2.15000000e+00
   4.44000000e-01  1.13497000e+02]
 [ 1.60937000e+02  6.87680000e+01  1.03235000e+02 ...  7.25000000e-01
   1.15800000e+00  4.62260000e+01]
 [ 1.21858528e+02  1.62172000e+02  1.25953000e+02 ...  2.05300000e+00
  -2.02800000e+00  4.42510000e+01]
 ...
 [ 1.05457000e+02  6.05260000e+01  7.58390000e+01 ...  1.80000000e+00
  -1.66000000e-01  4.19920000e+01]
 [ 9.49510000e+01  1.93620000e+01  6.88120000e+01 ... -3.27458741e-03
  -1.23928255e-02  0.00000000e+00]
 [ 1.21858528e+02  7.27560000e+01  7.08310000e+01 ... -3.27458741e-03
  -1.23928255e-02  0.00000000e+00]]
23


In [74]:
# Alternative scaling, currently disabled:
# tX = normalize(tX)
# Active scaling step; tX is overwritten with the value returned by standardize_tX.
tX = standardize_tX(tX)

In [75]:
# Display the scaled feature matrix followed by its shape, one per line.
print(tX, tX.shape, sep='\n')

[[ 3.14910656e-01  6.83319669e-02  4.07680272e-01 ...  1.55729751e+00
   3.24824359e-01  4.12510497e-01]
 [ 7.40827026e-01  5.52504823e-01  5.40136414e-01 ...  5.26704866e-01
   8.32993155e-01 -2.73819964e-01]
 [-5.38802302e-16  3.19515553e+00  1.09655998e+00 ...  1.48714489e+00
  -1.43454996e+00 -2.93969845e-01]
 ...
 [-3.10930673e-01  3.19316447e-01 -1.30863670e-01 ...  1.30416949e+00
  -1.09325452e-01 -3.17017229e-01]
 [-5.10097335e-01 -8.45323970e-01 -3.02973380e-01 ...  1.00367341e-17
   1.11117522e-17 -7.45439413e-01]
 [-5.38802302e-16  6.65336083e-01 -2.53522760e-01 ...  1.00367341e-17
   1.11117522e-17 -7.45439413e-01]]
(250000, 23)


In [84]:
alpha = 0.1 # quantile of order 0.1; this means that we will cut the upper and lower 10% tail
# tX is overwritten with the result, so re-running this cell applies the
# outlier treatment again to already-processed data.
# NOTE(review): only tX is passed and y is left untouched, so this presumably
# replaces/clips values rather than dropping rows — confirm in eliminate_outliers.
tX = eliminate_outliers(tX, alpha)

## Models 

Application of the 6 models expected for the project, on the preprocessed data.

## Linear Regression using Gradient Descent


In [85]:
# Settings handed to least_squares_GD, followed by the all-zero starting weights
# (one entry per remaining feature column, D).
gamma = 0.1
max_iters = 500
initial_w = np.zeros(D)

# Fit; the returned pair is kept as the current weights and loss.
w, loss = least_squares_GD(y, tX, initial_w, max_iters, gamma)

In [86]:
# Show the gradient-descent weights and the associated loss, one per line.
print(w, loss, sep='\n')

[ 4.11017176e-01 -3.56174301e-01 -2.35900974e-01  9.81773887e-02
  1.67047733e-01 -2.14679694e-02 -3.85294140e-01  2.38519776e-01
  6.41506572e-02  6.49097262e-01 -2.47293775e-03 -1.84308335e-03
 -2.28074027e-02  4.72584913e-04  3.14827387e-03  3.22785857e-02
 -2.34555689e-04 -1.30268016e-01  3.17139654e-01  3.07209691e-01
 -1.44275634e-03 -1.62969347e-03  8.93457870e-03]
0.36033025514467687


## Linear Regression using Stochastic Gradient Descent

In [87]:
# Settings handed to least_squares_SGD, followed by the all-zero starting weights.
# NOTE(review): no random seed is set in this cell; if least_squares_SGD draws its
# mini-batches at random, the printed result will vary between runs — confirm.
batch_size = 1
gamma = 0.01
max_iters = 500
initial_w = np.zeros(D)

# Fit; overwrites the w and loss left by the previous model.
w, loss = least_squares_SGD(y, tX, initial_w, max_iters, gamma, batch_size)

In [88]:
# Show the stochastic-gradient-descent weights and the associated loss, one per line.
print(w, loss, sep='\n')

[ 0.18440222 -0.25602842  0.06793253  0.17296854  0.12919834 -0.0900654
 -0.02639086 -0.01308734  0.18318065  0.20851757  0.03951859 -0.13360193
  0.04693863  0.00469096 -0.04423778  0.05663676 -0.01453888 -0.00357058
  0.07071319  0.03582717 -0.08705132  0.09015691  0.01803553]
0.4077851354589864


## Least Squares Regression using Normal Equations

In [89]:
# Fit with least_squares on the preprocessed data; this overwrites the w and
# loss left by the previous (SGD) cell.
w, loss = least_squares(y, tX)

In [90]:
# Show the normal-equations weights and the associated loss, one per line.
print(w, loss, sep='\n')

[ 4.46918728e-01 -3.42163482e-01 -3.03194178e-01  1.16321666e-01
  1.90076462e-01 -2.98023804e-02 -1.07793902e+00  3.35966742e-01
  5.58275403e-02  8.76704721e-01 -2.52186900e-03 -1.69497187e-03
  3.86836340e-02  6.00150871e-04  3.26193782e-03  4.72281995e-03
 -9.16840773e-05 -8.47368490e-02  3.69370475e-01  3.46785042e-01
 -1.38464372e-03 -1.73536770e-03  5.08187300e-01]
0.3571475926800008


## Ridge Regression using Normal Equations
We perform a cross-validation for choosing the optimal lambda.

In [118]:
# Grid search over the ridge penalty using k-fold cross-validation.
seed = 1                          # passed to build_k_indices
k_fold = 4                        # number of folds
lambdas = np.logspace(-4, 0, 30)  # 30 candidates, log-spaced between 1e-4 and 1

# splitting data in k fold (one row of sample indices per fold)
k_indices = build_k_indices(y, k_fold, seed)

rmse_tr = []  # one training error per candidate lambda
rmse_te = []  # one test error per candidate lambda

for lambda_ in lambdas:
    tr_loss = 0
    te_loss = 0
    for k in range(k_fold):
        # cross_validation returns three values; the first is discarded and the
        # other two are used as the training / test loss of fold k.
        loss_tr, loss_te = cross_validation(y, tX, k_indices, k, lambda_)[1:]
        tr_loss = tr_loss + loss_tr
        te_loss = te_loss + loss_te
    # Average over the folds, then convert to an RMSE.
    # NOTE(review): the factor 2 assumes the losses are MSE/2 — confirm in cross_validation.
    rmse_tr.append(np.sqrt(2 * tr_loss / k_fold))
    rmse_te.append(np.sqrt(2 * te_loss / k_fold))

# The optimal lambda is the candidate with the smallest cross-validated test RMSE.
best_lambda = lambdas[int(np.argmin(rmse_te))]


In [120]:
print(k_indices.shape)
# Uncomment to dump the per-lambda RMSE lists built by the cross-validation loop (long output):
#print(rmse_tr)
#print(rmse_te)

(4, 62500)
[0.8451559762683958, 0.8451614112036048, 0.8451712453828589, 0.8451887903207852, 0.8452195307897435, 0.8452721869496425, 0.845359949238615, 0.8455016589399731, 0.8457226576988703, 0.8460552278640989, 0.846538927452292, 0.8472210782083571, 0.8481564211773459, 0.8494030230264888, 0.8510115066755363, 0.8530090117407382, 0.8553860748850798, 0.8580973353290767, 0.8610808707604044, 0.8642898100449252, 0.867723077514569, 0.8714443202814338, 0.8755846042342056, 0.8803282157154915, 0.8858805947477761, 0.8924183878534053, 0.900029441431769, 0.9086629740901928, 0.9181148005746428, 0.928058088415232]
[0.8452383894121399, 0.8452437318954308, 0.8452534425190434, 0.8452708237816218, 0.8453013497936661, 0.845353728644392, 0.8454411377574003, 0.8455824050277555, 0.8458028587465765, 0.8461347678977268, 0.8466176765467694, 0.8472988910538661, 0.8482331348804624, 0.849478454283935, 0.8510854482665314, 0.853081228467293, 0.8554562999666743, 0.858165269056161, 0.8611461828162709, 0.86435214545117

## Generate predictions and save output in csv format for submission:

In [10]:
DATA_TEST_PATH = '../data/test.csv' 
# Same loader as for the training set; the first returned value is discarded,
# and the other two are kept as the test feature matrix and the test event ids.
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [31]:
OUTPUT_PATH = '' # TODO: fill in desired name of output file for submission

# The model cells of this notebook store their weights in `w` (there is no
# `weights` variable), so `w` — the result of the most recently run model cell —
# is what gets used for the predictions.
# NOTE(review): the visible cells never apply the training-time preprocessing
# (missing_values / standardize_tX / eliminate_outliers) to tX_test; the test
# features must go through the same transformation before predicting — confirm.
if tX_test.shape[1] != w.shape[0]:
    raise ValueError(
        "tX_test has %d columns but the model has %d weights; apply the same "
        "preprocessing to the test features as to the training features."
        % (tX_test.shape[1], w.shape[0])
    )

y_pred = predict_labels(w, tX_test)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)