In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from proj1_helpers import *
from implementations import *

%load_ext autoreload
%autoreload 2

## Load the training data into feature matrix, class labels, and event ids:

In [2]:
# Location of the training CSV, relative to the notebook's working directory
DATA_TRAIN_PATH = '../data/train.csv'
# Per the section heading: class labels (y), feature matrix (tX) and event ids (ids)
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

In [44]:
# Location of the test CSV, relative to the notebook's working directory
DATA_TEST_PATH = '../data/test.csv' 
# The first return value is discarded; only the test features and ids are kept
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

## Do your crazy machine learning thing here :) ...

In [4]:
def cross_validation(y, tx, k_indices, k, lambda_, degree):
    """Run a single fold of k-fold cross-validation with ridge regression.

    Row `k` of `k_indices` is held out as the test fold; every other row is
    flattened into one training index set. Both feature sets are expanded with
    `build_poly` before the model is fitted on the training part.

    Args:
        y: labels, indexed with the entries of `k_indices`.
        tx: feature matrix, indexed along its first axis with the same entries.
        k_indices: array with one row of sample indices per fold
            (presumably the output of `build_k_indices` -- confirm).
        k: position of the fold held out for testing.
        lambda_: regularisation parameter forwarded to `ridge_regression`.
        degree: degree forwarded to `build_poly`.

    Returns:
        Tuple `(weights, loss_train, loss_test)`: the fitted weights and the
        values of `compute_mse` on the training and held-out data.
    """
    # Every fold except the k-th one goes into the training set
    in_training = np.arange(len(k_indices)) != k
    fit_rows = k_indices[in_training].reshape(-1)
    held_out_rows = k_indices[k]

    # Feature expansion is applied to each split separately
    poly_fit = build_poly(tx[fit_rows], degree)
    poly_held_out = build_poly(tx[held_out_rows], degree)
    y_fit = y[fit_rows]
    y_held_out = y[held_out_rows]

    # Fit on the training split only
    weights = ridge_regression(y_fit, poly_fit, lambda_)

    # Score the same weights on both splits
    loss_train = compute_mse(y_fit, poly_fit, weights)
    loss_test = compute_mse(y_held_out, poly_held_out, weights)

    return weights, loss_train, loss_test

In [5]:
def best_hyperparameters(y, tx, degrees, lambdas, k_fold, seed=1):
    """Grid-search the polynomial degree and ridge lambda by k-fold cross-validation.

    For every (degree, lambda) pair the held-out loss returned by
    `cross_validation` is averaged over the `k_fold` folds. The best lambda is
    picked per degree, then the degree whose best lambda has the lowest mean
    loss wins. Ties are resolved by `np.argmin`, i.e. the first minimum in
    iteration order.

    Args:
        y: labels.
        tx: feature matrix.
        degrees: indexable sequence of candidate degrees.
        lambdas: indexable sequence of candidate regularisation values.
        k_fold: number of folds.
        seed: seed forwarded to `build_k_indices`.

    Returns:
        Tuple `(opt_degree, opt_lambda)`.
    """
    # The same fold assignment is reused for every candidate pair
    fold_rows = build_k_indices(y, k_fold, seed)

    # One (mean held-out loss, lambda) entry per degree: the best lambda for that degree
    per_degree_best = []
    for degree in degrees:
        mean_losses = [
            np.mean([
                cross_validation(y, tx, fold_rows, fold, lambda_, degree)[2]
                for fold in range(k_fold)
            ])
            for lambda_ in lambdas
        ]
        winner = np.argmin(mean_losses)
        per_degree_best.append((mean_losses[winner], lambdas[winner]))

    # Across degrees, keep the one whose best lambda reached the lowest loss
    best_position = np.argmin([loss for loss, _ in per_degree_best])

    return degrees[best_position], per_degree_best[best_position][1]

In [31]:
def train_models(y, tx, degrees, lambdas, k_fold, seed=1):
    """Fit one ridge-regression model per subset of the training data.

    `group_indices` splits the rows of `tx` into subsets. For each subset,
    `drop_na_columns` is applied to its features, the hyperparameters are
    selected with `best_hyperparameters`, and the final weights are refitted
    on the whole subset with the selected degree and lambda.

    Args:
        y: labels for all training rows.
        tx: feature matrix for all training rows.
        degrees: candidate polynomial degrees.
        lambdas: candidate regularisation values.
        k_fold: number of cross-validation folds.
        seed: seed forwarded to `best_hyperparameters`.

    Returns:
        Tuple `(best_weights, best_degree)`: two lists, aligned with the
        order of the subsets returned by `group_indices`.
    """
    best_weights, best_degree = [], []

    for rows in group_indices(tx):
        labels = y[rows]
        features = drop_na_columns(tx[rows])

        # Model selection by cross-validation, then a refit on the full subset
        degree, lambda_ = best_hyperparameters(labels, features, degrees, lambdas, k_fold, seed)
        expanded = build_poly(features, degree)
        fitted = ridge_regression(labels, expanded, lambda_)

        best_degree.append(degree)
        best_weights.append(fitted)

    return best_weights, best_degree

In [33]:
# Recode the -999 placeholder as NaN (presumably the dataset's marker for a
# missing measurement -- confirm against the data description).
# np.nan is used rather than np.NaN: the capitalised alias was removed in
# NumPy 2.0 and raises AttributeError there.
tX = np.where(tX==-999, np.nan, tX)
# Search grid: degrees 0..4, 20 lambdas log-spaced from 1e-4 to 1, 3 folds
best_weights, best_degree = train_models(y, tX, np.arange(5), np.logspace(-4, 0, 20), 3)

In [35]:
# Display the polynomial degree selected for each training subset
best_degree

[2, 1, 2, 3, 3, 4]

## Generate predictions and save output in csv format for submission:

In [59]:
# Recode the -999 placeholder as NaN in the test features, as is done for the
# training features. np.nan is used rather than np.NaN: the capitalised alias
# was removed in NumPy 2.0 and raises AttributeError there.
tX_test = np.where(tX_test==-999, np.nan, tX_test)

568238

In [46]:
# Split the test rows into subsets with the same helper that train_models applies to the training data
indices_test_group = group_indices(tX_test)

[(array([     0,     14,     15, ..., 568193, 568218, 568220]),),
 (array([     2,      3,      5, ..., 568232, 568235, 568237]),),
 (array([    34,     37,     49, ..., 568180, 568214, 568224]),),
 (array([     1,      6,     10, ..., 568227, 568231, 568236]),),
 (array([    35,    173,    201, ..., 568075, 568189, 568223]),),
 (array([     4,      7,      9, ..., 568230, 568233, 568234]),)]

In [61]:
# Predict each test subset with the weights and degree stored at the same
# position, then write the labels back to the rows the subset came from.
y_pred = np.zeros(len(tX_test))
for i, indice_test_group in enumerate(indices_test_group):
    subset_features = drop_na_columns(tX_test[indice_test_group])
    subset_poly = build_poly(subset_features, best_degree[i])
    y_pred_subset = predict_labels(best_weights[i], subset_poly)
    y_pred[indice_test_group] = y_pred_subset

y_pred

array([-1., -1., -1., ...,  1., -1., -1.])

In [62]:
# Destination of the submission file, relative to the notebook's working directory
OUTPUT_PATH = '../data/sample-submission_test.csv' 
# Write the test ids together with their predicted labels in csv format
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)