In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

## Load the training data into feature matrix, class labels, and event ids:

In [2]:
from proj1_helpers import *
DATA_TRAIN_PATH = '../data/train.csv'
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

In [3]:
tX = np.where(tX==-999, 0, tX)
tX

array([[138.47 ,  51.655,  97.827, ...,   1.24 ,  -2.475, 113.497],
       [160.937,  68.768, 103.235, ...,   0.   ,   0.   ,  46.226],
       [  0.   , 162.172, 125.953, ...,   0.   ,   0.   ,  44.251],
       ...,
       [105.457,  60.526,  75.839, ...,   0.   ,   0.   ,  41.992],
       [ 94.951,  19.362,  68.812, ...,   0.   ,   0.   ,   0.   ],
       [  0.   ,  72.756,  70.831, ...,   0.   ,   0.   ,   0.   ]])

In [4]:
from implementations import *

initial_w = np.zeros(len(tX[0]))
max_iters = 50
gamma = 0.7

weigths, loss = least_squares_GD(y, tX, initial_w, max_iters, gamma)

print(weigths, loss)

[ 1.23787741e+251  4.66890739e+250  8.88960134e+250  1.04096961e+251
  1.70573873e+249  3.20795143e+251 -1.03683380e+249  2.28422123e+249
  2.68495925e+250  2.60163252e+251  1.57891643e+249  3.18680444e+248
  3.13472510e+248  4.91092550e+250 -9.40941505e+246 -2.33678042e+246
  5.63554614e+250 -1.12919545e+247  4.13028665e+247  5.83370652e+250
 -1.39758106e+246  3.18436637e+251  1.69200112e+249  9.98061967e+250
 -3.11081800e+246 -7.29342575e+246  4.04565569e+250 -8.40971505e+246
 -4.40164414e+246  1.54698537e+251] inf


  return e.T@e / (2 * len(y))


In [5]:
tX = standardize_matrix(tX)
len(tX)

250000

In [6]:
tX

array([[ 1.22443381,  0.19387987,  0.74197336, ..., -0.40458096,
        -0.44868057,  0.92798707],
       [ 1.4911326 ,  0.39702299,  0.80617005, ..., -0.41930061,
        -0.41930061,  0.1294339 ],
       [-0.41930061,  1.5057929 ,  1.07584839, ..., -0.41930061,
        -0.41930061,  0.10598928],
       ...,
       [ 0.83254673,  0.29918476,  0.48096063, ..., -0.41930061,
        -0.41930061,  0.0791734 ],
       [ 0.70783326, -0.18946032,  0.3975453 , ..., -0.41930061,
        -0.41930061, -0.41930061],
       [-0.41930061,  0.4443633 ,  0.42151222, ..., -0.41930061,
        -0.41930061, -0.41930061]])

In [7]:
# compute cross validation for training, return the optimal weigths and theirs respective loss for the train and the test datas
def cross_validation(y, tx, k_indices, k, lambda_):
    
    # get k'th subgroup in test, others in train
    training_indices = k_indices[~(np.arange(len(k_indices)) == k)].reshape(-1)
    test_indices = k_indices[k]
    
    tx_train = tx[training_indices]
    tx_test = tx[test_indices]
    y_train = y[training_indices]
    y_test = y[test_indices]
    
    # optimization with ridge_regression
    weigths = ridge_regression(y_train, tx_train, lambda_)
    
    # compute the loss for the train and test datas with the weigths found
    loss_train = compute_mse(y_train, tx_train, weigths)
    loss_test = compute_mse(y_test, tx_test, weigths)
    
    return weigths, loss_train, loss_test

In [9]:
# compute the best hyperparameters for regularized optimization
def best_hyperparameters(y, tx, lambdas, k_fold, seed=1):
    # for each lambda, store the respective loss and weigths
    losses = []
    weigths_all = []
    
    # build k indices for k-fold
    k_indices = build_k_indices(y, k_fold, seed)
    
    # compute cross validation for each lambda
    for lambda_ in lambdas:
        
        # to compute the total loss of each lambda, compute the loss for each iteration of the k-fold and compute the mean
        losses_test_tmp = []
        
        # compute loss for each iteration of the k_fold
        for k in range(k_fold):
            weigths, loss_train, loss_test = cross_validation(y, tx, k_indices, k, lambda_)
            losses_test_tmp.append(loss_test)
            weigths_all.append(weigths)
            
            
        #compute the loss for the specific lambda by taking the mean of the losses of each iteration of the k-fold
        losses.append(np.mean(losses_test_tmp))
        
    # find the optimal hyperparameter lambda by getting the minimum loss
    best_lambda_index = np.argmin(losses)
    optimal_lambda = lambdas[best_lambda_index]
    optimal_weigths = weigths_all[best_lambda_index]
    minimum_loss = losses[best_lambda_index]
    
    return optimal_lambda, optimal_weigths, minimum_loss

In [15]:
_, weights, _ = best_hyperparameters(y, tX, np.logspace(-4,0), 4)

## Do your thing crazy machine learning thing here :) ...

## Generate predictions and save ouput in csv format for submission:

In [11]:
DATA_TEST_PATH = '../data/test.csv' 
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [16]:
## for testing the predictor, modify the tx_test

tX_test = np.where(tX_test==-999, np.NaN, tX_test)
tX_test = tX_test[~np.isnan(tX_test).any(axis=1)]

In [17]:
OUTPUT_PATH = '../data/sample-submission_test.csv' 
y_pred = predict_labels(weights, tX_test)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)