In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
#just for the notebook
%run implementations
%load_ext autoreload
%autoreload 2

## Load the training data into feature matrix, class labels, and event ids:

In [2]:
from proj1_helpers import *
# Locations of the challenge CSV files, relative to this notebook.
# Both files must be present in ../data before this cell is run.
DATA_TRAIN_PATH = '../data/train.csv'
DATA_TEST_PATH = '../data/test.csv'
# Training file: labels, feature matrix and event ids.
y, tX, train_ids = load_csv_data(DATA_TRAIN_PATH)
# Test file, loaded with the same helper; its features are bucketed later
# to produce the submission.
submission_y, submission_x, test_ids = load_csv_data(DATA_TEST_PATH)

In [3]:
# Sanity check: dimensions of the training feature matrix.
tX.shape

(250000, 30)

In [4]:
# Polynomial degree constant.
# NOTE(review): DEGREE is not referenced by any other visible cell; the
# per-bucket degree used for training comes from best_degree_lambda.
# Confirm whether this constant is still needed.
DEGREE = 3

In [5]:
# Split the labelled data into a training part and a hold-out part.
# presumably 0.8 is the fraction kept for training and seed=1 makes the
# shuffle reproducible — verify against split_data in the helper module.
x_train, y_train, x_test, y_test = split_data(tX, y, 0.8, seed=1)

In [6]:
# Glue each label vector onto its feature matrix as the last column, so that
# bucketing the rows afterwards keeps every label attached to its sample.
# Not idempotent: re-running this cell appends the labels a second time.
x_train, x_test = (
    np.column_stack(pair) for pair in ((x_train, y_train), (x_test, y_test))
)

In [7]:
# Partition the training rows into buckets; each bucket still carries the
# label as its last column at this point.
buckets = get_buckets(x_train)
# Peel the label column off every training bucket: labels go to y_buckets,
# the remaining feature columns replace the bucket in place.
y_buckets = [bucket[:, -1] for bucket in buckets]
for idx, bucket in enumerate(buckets):
    buckets[idx] = np.delete(bucket, -1, axis=1)

# Hold-out rows are bucketed the same way; the submission features are
# bucketed from a copy and have no label column to remove.
test_buckets = get_buckets(x_test)
submission_x_buckets = get_buckets(submission_x.copy())
# Separate hold-out labels from hold-out features.
test_y_buckets = [bucket[:, -1] for bucket in test_buckets]
for idx, bucket in enumerate(test_buckets):
    test_buckets[idx] = np.delete(bucket, -1, axis=1)

In [8]:
# Normalize every bucket of the training, hold-out and submission splits.
# The number of buckets is taken from the training split for all three.
# NOTE(review): each matrix is passed to normalize() on its own; if the helper
# derives its statistics from its argument, hold-out and submission data are
# scaled differently from the training data — confirm in the helper module.
for bucket_group in (buckets, test_buckets, submission_x_buckets):
    for idx in range(len(buckets)):
        bucket_group[idx] = normalize(bucket_group[idx])

In [9]:
# Prepend a column of ones (the intercept term) to every bucket of the
# training, hold-out and submission splits.
# Not idempotent: re-running this cell adds another column of ones.
for bucket_group in (buckets, test_buckets, submission_x_buckets):
    for idx in range(len(buckets)):
        n_rows = bucket_group[idx].shape[0]
        ones_column = np.ones((n_rows, 1))
        bucket_group[idx] = np.column_stack((ones_column, bucket_group[idx]))

In [10]:
def accuracy(train_buckets, test_buckets, test_y_buckets, weights, degree_and_lambda):
    """Compute, print and return the row-weighted accuracy over all buckets.

    For every bucket index the test features are expanded with ``build_poly``
    at that bucket's degree and scored with ``compute_accuracy``. Each bucket
    score is weighted by the number of rows in the expanded matrix.

    Args:
        train_buckets: Sequence whose length determines how many buckets are
            scored; its contents are not read.
        test_buckets: Per-bucket test feature matrices.
        test_y_buckets: Per-bucket test labels.
        weights: Per-bucket weight vectors.
        degree_and_lambda: Per-bucket dicts; only the "degree" key is read.

    Returns:
        Sum of (bucket score * bucket row count) divided by the total row
        count. ``compute_accuracy`` is presumably the fraction of correct
        predictions, which would make this the overall accuracy — confirm.
    """
    correct_predictions = 0
    len_data = 0
    for i in range(len(train_buckets)):
        degree = degree_and_lambda[i]["degree"]
        # Expand the bucket once and reuse it for both the score and the row
        # count instead of rebuilding the same polynomial matrix three times.
        poly_test = build_poly(test_buckets[i], degree)
        n_rows = len(poly_test)
        rr_accuracy = compute_accuracy(weights[i], poly_test, test_y_buckets[i])
        correct_predictions += rr_accuracy * n_rows
        len_data += n_rows
    total_accuracy = correct_predictions / len_data
    print(f"Accuracy = {total_accuracy}")
    return total_accuracy

In [11]:
# Number of rows in the first hold-out bucket.
len(test_buckets[0])

14738

In [None]:
# Hyper-parameter search, run independently for each training bucket:
# a single candidate degree (9), 5 folds, and ten lambdas evenly spaced
# between 0.1 and 1.
best_degree_lambda = [
    best_degree_selection(
        y=y_buckets[idx],
        x=buckets[idx],
        degrees=[9],
        k_fold=5,
        lambdas=np.linspace(0.1, 1, 10),
    )
    for idx in range(len(buckets))
]

best lambdas : 1.0
best degree 9
best lambdas : 1.0
best degree 9
best lambdas : 1.0
best degree 9
best lambdas : 1.0
best degree 9
best lambdas : 1.0
best degree 9
best lambdas : 1.0
best degree 9
best lambdas : 0.1
best degree 9


In [14]:
# Display the degree / lambda selected for each bucket.
# NOTE(review): the saved output of this cell lists degree 11 and lambda 0.0001,
# which the search cell above (degrees=[9], lambdas between 0.1 and 1) cannot
# produce — the output looks stale; re-run from a fresh kernel to confirm.
best_degree_lambda

[{'degree': 11, 'lambda': 0.0001},
 {'degree': 11, 'lambda': 0.0001},
 {'degree': 11, 'lambda': 0.0001},
 {'degree': 11, 'lambda': 0.0001},
 {'degree': 11, 'lambda': 0.0001},
 {'degree': 11, 'lambda': 0.0001},
 {'degree': 11, 'lambda': 0.0001},
 {'degree': 11, 'lambda': 0.0001}]

In [15]:
# Fit one ridge-regression model per bucket, using the polynomial degree and
# the lambda selected for that bucket. The shape of each weight vector is
# printed as a quick check.
weights = []
for idx, bucket_x in enumerate(buckets):
    chosen = best_degree_lambda[idx]
    expanded = build_poly(bucket_x, chosen["degree"])
    w_rr, loss_rr = ridge_regression(y=y_buckets[idx], tx=expanded, lambda_=chosen["lambda"])
    print(w_rr.shape)
    weights.append(w_rr)

(341,)
(341,)
(341,)
(341,)
(341,)
(341,)
(341,)
(341,)


In [None]:
# Quick shape probe of the polynomial expansion on the first hold-out bucket.
# NOTE(review): the degree is hard-coded to 3 here, while the trained models use
# the per-bucket degree from best_degree_lambda; 3 equals the DEGREE constant
# defined earlier — confirm which was intended.
build_poly(test_buckets[0], 3).shape

In [16]:
# Row-weighted hold-out accuracy of the per-bucket ridge models
# (accuracy() also prints the value).
acc = accuracy(train_buckets=buckets, test_buckets=test_buckets, test_y_buckets=test_y_buckets,
                            weights=weights, degree_and_lambda=best_degree_lambda)

Accuracy = 0.78582


In [None]:
# Reload the test file to obtain `ids_test` for the submission.
# NOTE(review): this reads the same DATA_TEST_PATH file that was already loaded
# near the top of the notebook and rebinds submission_x; submission_x_buckets
# was built from a copy earlier, so it is unaffected by the rebinding.
# presumably sub_sample=False is also the helper's default — verify.
submission_y, submission_x, ids_test = load_csv_data(DATA_TEST_PATH, sub_sample=False)

In [None]:
# Build a three-column id table: event id, feature column 22 and feature
# column 0 of the submission features.
# Going by the variable names, column 22 is PRI_jet_num and column 0 is
# DER_mass_MMC — presumably the features the bucketing keys on; verify.
pri_jet_num_feature = submission_x[:, 22]
der_mass_mmc_col_feature = submission_x[:, 0]
ids_array = np.column_stack((ids_test, pri_jet_num_feature, der_mass_mmc_col_feature))

In [None]:
# Bucket the id rows so they can be paired with the per-bucket predictions.
# presumably get_id_buckets applies the same partitioning and row order as
# get_buckets does for the features — verify in the helper module.
id_buckets = get_id_buckets(ids_array)

In [None]:
# Predict every submission bucket with its own model. The id rows of the
# bucket are stacked to the right of the predictions (column 0 = prediction)
# so that the original event order can be restored afterwards.
# All buckets go through the same loop and are joined with one concatenate,
# instead of special-casing bucket 0 and re-concatenating on every iteration.
bucket_predictions = []
for i in range(len(weights)):
    degree = best_degree_lambda[i]["degree"]
    predictions = predict_labels(weights[i], build_poly(submission_x_buckets[i], degree))
    bucket_predictions.append(np.column_stack((predictions, id_buckets[i])))
submission = np.concatenate(bucket_predictions)

In [None]:
# Order the rows by column 1 and keep only column 0 (the predictions).
# Column 1 is the first column of the stacked id rows — presumably the event
# id, given how ids_array was built; verify that get_id_buckets keeps it first.
# Not idempotent: after this cell `submission` is 1-D, so a re-run fails.
submission = submission[np.argsort(submission[:, 1]), 0]

In [None]:
# Write ids and predictions to the output CSV.
# NOTE(review): the predictions were re-ordered by id in the previous cell while
# ids_test is used exactly as loaded — the two line up only if ids_test is
# already in ascending order; confirm.
create_csv_submission(ids_test, submission, "../data/output.csv")

## Do your crazy machine learning thing here :) ...

## Generate predictions and save output in csv format for submission:

In [None]:
# Reload the test set: the labels are discarded, the features and ids are kept.
# NOTE(review): DATA_TEST_PATH is rebound to the same value it was given near
# the top of the notebook, and the same file has already been loaded before.
DATA_TEST_PATH = '../data/test.csv' # TODO: download test data and supply path here 
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [None]:
# Destination file for the final submission.
OUTPUT_PATH = '../data/submission.csv'
# `weights` is a list holding one weight vector per bucket, each fitted on
# bucketed, normalized and polynomial-expanded features. It therefore cannot
# be applied to the raw `tX_test` matrix in a single predict_labels call.
# Reuse the per-bucket predictions that were already assembled and re-ordered
# by event id in `submission`.
y_pred = submission
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)