In [147]:
import numpy as np
from proj1_helpers import *
from cross_validation import *
from tools import *
from implementations import *

%load_ext autoreload
%autoreload 2
# Seed passed to build_k_indices by every cross-validation cell in this notebook
# (presumably fixes the fold split -- confirm in cross_validation.py).
seed = 10

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Import data

In [99]:
# Relative paths of the training and test CSV files read by load_csv_data.
DATA_TRAIN_PATH = 'data/train.csv'
DATA_TEST_PATH = 'data/test.csv'

In [100]:
# Load the training set once; y and tX are the inputs handed to every
# cross-validation cell in this notebook.
# NOTE(review): presumably y = labels, tX = feature matrix, ids = event ids --
# confirm against proj1_helpers.load_csv_data.
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

## Gradient descent

In [145]:
# Cross-validate least_squares_gd over k_fold folds and report the accuracies.
k_fold = 5
gamma = 0.01
max_iters = 500
k_indices = build_k_indices(y, k_fold, seed)

# One (train, test) accuracy pair is collected per fold.
list_accuracy_train, list_accuracy_test = [], []
for fold in range(k_fold):
    fold_train_acc, fold_test_acc = cross_validation(
        y, tX, k_indices, fold, least_squares_gd,
        initial_w=None, max_iters=max_iters, gamma=gamma,
    )
    list_accuracy_train.append(fold_train_acc)
    list_accuracy_test.append(fold_test_acc)

# Per-fold report, then mean and variance of the test accuracies.
for fold, (fold_train_acc, fold_test_acc) in enumerate(zip(list_accuracy_train, list_accuracy_test)):
    print("{}: Training accuracy: {} - Test accuracy : {}".format(fold, fold_train_acc, fold_test_acc))
print("Average test accuracy: {}".format(np.mean(list_accuracy_test)))
print("Variance test accuracy: {}".format(np.var(list_accuracy_test)))

0: Training accuracy: 0.75853 - Test accuracy : 0.7551
1: Training accuracy: 0.75843 - Test accuracy : 0.75974
2: Training accuracy: 0.75819 - Test accuracy : 0.75912
3: Training accuracy: 0.757445 - Test accuracy : 0.75656
4: Training accuracy: 0.758455 - Test accuracy : 0.7594
Average test accuracy: 0.7579839999999999
Variance test accuracy: 3.344863999999969e-06


## Stochastic gradient descent

In [None]:
# Cross-validate least_squares_sgd over k_fold folds and report the accuracies.
k_fold = 5
gamma = 0.1
max_iters = 500
k_indices = build_k_indices(y, k_fold, seed)

# One (train, test) accuracy pair is collected per fold.
list_accuracy_train, list_accuracy_test = [], []
for fold in range(k_fold):
    fold_train_acc, fold_test_acc = cross_validation(
        y, tX, k_indices, fold, least_squares_sgd,
        initial_w=None, max_iters=max_iters, gamma=gamma,
    )
    list_accuracy_train.append(fold_train_acc)
    list_accuracy_test.append(fold_test_acc)

# Per-fold report, then mean and variance of the test accuracies.
for fold, (fold_train_acc, fold_test_acc) in enumerate(zip(list_accuracy_train, list_accuracy_test)):
    print("{}: Training accuracy: {} - Test accuracy : {}".format(fold, fold_train_acc, fold_test_acc))
print("Average test accuracy: {}".format(np.mean(list_accuracy_test)))
print("Variance test accuracy: {}".format(np.var(list_accuracy_test)))

## Least squares

In [142]:
# Cross-validate the least_squares solver (no extra hyper-parameters are
# passed) over k_fold folds and report the accuracies.
k_fold = 5
k_indices = build_k_indices(y, k_fold, seed)

# One (train, test) accuracy pair is collected per fold.
list_accuracy_train, list_accuracy_test = [], []
for fold in range(k_fold):
    fold_train_acc, fold_test_acc = cross_validation(y, tX, k_indices, fold, least_squares)
    list_accuracy_train.append(fold_train_acc)
    list_accuracy_test.append(fold_test_acc)

# Per-fold report, then mean and variance of the test accuracies.
for fold, (fold_train_acc, fold_test_acc) in enumerate(zip(list_accuracy_train, list_accuracy_test)):
    print("{}: Training accuracy: {} - Test accuracy : {}".format(fold, fold_train_acc, fold_test_acc))
print("Average test accuracy: {}".format(np.mean(list_accuracy_test)))
print("Variance test accuracy: {}".format(np.var(list_accuracy_test)))

0: Training accuracy: 0.7753 - Test accuracy : 0.77166
1: Training accuracy: 0.77587 - Test accuracy : 0.7757
2: Training accuracy: 0.774895 - Test accuracy : 0.77594
3: Training accuracy: 0.77425 - Test accuracy : 0.77394
4: Training accuracy: 0.775435 - Test accuracy : 0.77788
Average test accuracy: 0.7750239999999999
Variance test accuracy: 4.388863999999989e-06


## Ridge regression

In [141]:
# Ridge regression on the whole dataset (no per-jet split): cross-validate
# with a single lambda_ over k_fold folds and report the accuracies.
k_fold = 5
lambda_ = 0.002
k_indices = build_k_indices(y, k_fold, seed)

# One (train, test) accuracy pair is collected per fold.
list_accuracy_train, list_accuracy_test = [], []
for fold in range(k_fold):
    fold_train_acc, fold_test_acc = cross_validation(
        y, tX, k_indices, fold, ridge_regression, lambda_=lambda_
    )
    list_accuracy_train.append(fold_train_acc)
    list_accuracy_test.append(fold_test_acc)

# Per-fold report, then mean and variance of the test accuracies.
for fold, (fold_train_acc, fold_test_acc) in enumerate(zip(list_accuracy_train, list_accuracy_test)):
    print("{}: Training accuracy: {} - Test accuracy : {}".format(fold, fold_train_acc, fold_test_acc))
print("Average test accuracy: {}".format(np.mean(list_accuracy_test)))
print("Variance test accuracy: {}".format(np.var(list_accuracy_test)))

0: Training accuracy: 0.77417 - Test accuracy : 0.7711
1: Training accuracy: 0.774735 - Test accuracy : 0.77464
2: Training accuracy: 0.773785 - Test accuracy : 0.77516
3: Training accuracy: 0.77315 - Test accuracy : 0.7722
4: Training accuracy: 0.774415 - Test accuracy : 0.77638
Average test accuracy: 0.7738959999999999
Variance test accuracy: 3.8031039999999307e-06


In [139]:
# Ridge regression with the dataset split by jet: the lambdas and degrees
# lists are handed to cross_validation_ridge_regression together
# (presumably one entry per jet group -- confirm in cross_validation.py).
k_fold = 2
lambdas = [0.002, 0.001, 0.001]
degrees = [4, 7, 9]

k_indices = build_k_indices(y, k_fold, seed)

# One (train, test) accuracy pair is collected per fold.
list_accuracy_train, list_accuracy_test = [], []
for fold in range(k_fold):
    fold_train_acc, fold_test_acc = cross_validation_ridge_regression(
        y, tX, k_indices, fold, lambdas, degrees
    )
    list_accuracy_train.append(fold_train_acc)
    list_accuracy_test.append(fold_test_acc)

# Per-fold report, then mean and variance of the test accuracies.
for fold, (fold_train_acc, fold_test_acc) in enumerate(zip(list_accuracy_train, list_accuracy_test)):
    print("{}: Training accuracy: {} - Test accuracy : {}".format(fold, fold_train_acc, fold_test_acc))
print("Average test accuracy: {}".format(np.mean(list_accuracy_test)))
print("Variance test accuracy: {}".format(np.var(list_accuracy_test)))

0: Training accuracy: 0.83796 - Test accuracy : 0.834872
1: Training accuracy: 0.838488 - Test accuracy : 0.834992
Average test accuracy: 0.834932
Variance test accuracy: 3.6000000000005393e-09


### run.py code --> ridge regression

In [15]:
# run.py pipeline: fit one ridge-regression model per jet group on the training
# set, predict the matching test rows, and write the submission CSV.
print('Start predicting...\n')
DATA_TRAIN_PATH = 'data/train.csv'
DATA_TEST_PATH = 'data/test.csv'

y_train, tx_train, ids_train = load_csv_data(DATA_TRAIN_PATH)
_, tx_test, ids_test = load_csv_data(DATA_TEST_PATH)

# Hyper-parameters indexed by jet group: entry i is used for group i.
lambdas = [0.002, 0.001, 0.001]
degrees = [4, 7, 9]

y_pred = np.zeros(tx_test.shape[0])

# dict_jets_*[index] is used below as a row selector for the samples of one
# jet group.
dict_jets_train = group_features_by_jet(tx_train)
dict_jets_test = group_features_by_jet(tx_test)

for index in range(len(dict_jets_train)):
    x_train = tx_train[dict_jets_train[index]]
    x_test = tx_test[dict_jets_test[index]]
    # Slice the labels loaded in THIS cell. The previous code indexed the
    # global `y` created by an earlier notebook cell (undefined when this runs
    # as run.py or on a fresh kernel) and rebound `y_train` to the slice,
    # destroying the full label vector.
    y_train_jet = y_train[dict_jets_train[index]]

    # Data processing: clean train and test together, expand polynomial
    # features to this group's degree, then prepend a column of ones.
    x_train, x_test = process_data(x_train, x_test)
    x_train = build_polynomial_features(x_train, degrees[index])
    x_test = build_polynomial_features(x_test, degrees[index])
    x_train = np.hstack((np.ones((x_train.shape[0], 1)), x_train))
    x_test = np.hstack((np.ones((x_test.shape[0], 1)), x_test))

    # The second value returned by ridge_regression is not needed for prediction.
    weight, _ = ridge_regression(y_train_jet, x_train, lambdas[index])

    temp_test_pred = predict_labels(weight, x_test)

    # Scatter this group's predictions back to their original test-row positions.
    y_pred[dict_jets_test[index]] = temp_test_pred

print('Start generating prediction files...\n')
OUTPUT_PATH = 'data/output_ridge_regression3.csv'
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)

print('Finish!')


Start predicting...

Start generating prediction files...

Finish!


## Logistic regression

In [148]:
# Cross-validate logistic_regression over k_fold folds and report the accuracies.
k_fold = 2
gamma = 0.01
max_iters = 1000

# Split data in k-fold
k_indices = build_k_indices(y, k_fold, seed)

# One (train, test) accuracy pair is collected per fold.
list_accuracy_train, list_accuracy_test = [], []
for fold in range(k_fold):
    fold_train_acc, fold_test_acc = cross_validation(
        y, tX, k_indices, fold, logistic_regression,
        initial_w=None, max_iters=max_iters, gamma=gamma,
    )
    list_accuracy_train.append(fold_train_acc)
    list_accuracy_test.append(fold_test_acc)

# Per-fold report, then mean and variance of the test accuracies.
for fold, (fold_train_acc, fold_test_acc) in enumerate(zip(list_accuracy_train, list_accuracy_test)):
    print("{}: Training accuracy: {} - Test accuracy : {}".format(fold, fold_train_acc, fold_test_acc))
print("Average test accuracy: {}".format(np.mean(list_accuracy_test)))
print("Variance test accuracy: {}".format(np.var(list_accuracy_test)))

0: Training accuracy: 0.725848 - Test accuracy : 0.726168
1: Training accuracy: 0.727192 - Test accuracy : 0.726472
Average test accuracy: 0.7263200000000001
Variance test accuracy: 2.3103999999995586e-08
