In [None]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from utils import calculate_mse
%load_ext autoreload
%autoreload 2

In [None]:
# Configure how IPython echoes cell results: with "last_expr" only the value of
# the final expression in a cell is displayed (see IPython's documentation of
# `ast_node_interactivity`).
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "last_expr"

## Load the training data into feature matrix, class labels, and event ids:

In [None]:
from proj1_helpers import *

# Read the training set: labels, feature matrix and event ids.
DATA_TRAIN_PATH = '../data/train.csv'
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

# N = number of samples, D = number of features (bias column not counted).
N, D = tX.shape

# Variant of the feature matrix with a leading column of ones (bias term).
bias_column = np.ones((N, 1))
tx = np.hstack((bias_column, tX))

## 1. Exploratory data analysis

In [None]:
# Boxplots of the first 100 samples, split over two panels so that the
# individual boxes stay readable.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

n_left = 15  # number of features shown in the left panel
ax1.boxplot(tX[:100, :n_left]);
ax2.boxplot(tX[:100, n_left:]);

# Label each box with its feature (column) index so both panels share one
# consistent numbering.
ax1.set_xticklabels(np.arange(0, n_left));
ax2.set_xticklabels(np.arange(n_left, tX.shape[1]));

ax1.set_title("Features 0-14 (first 100 samples)");
ax2.set_title("Features 15+ (first 100 samples)");
ax1.set_xlabel("Features");
ax2.set_xlabel("Features");
ax1.set_ylabel("Values");

In [None]:
# boxplots of standardized features
# The standardized matrix is built here, from `tX`, so that this cell does not
# depend on a variable left over from an earlier kernel session. `tX` itself is
# not modified.
plot_mean = np.mean(tX, axis=0)
plot_std = np.std(tX, axis=0)
plot_std[plot_std == 0] = 1.0  # constant column: avoid dividing by zero
tX_plot_stand = (tX - plot_mean) / plot_std

fig, ax = plt.subplots()
ax.boxplot(tX_plot_stand);
# One tick per feature, labelled with the 0-based column index.
ax.set_xticklabels(np.arange(tX_plot_stand.shape[1]));
ax.set_xlabel("Features");
ax.set_ylabel("Standardized Values");
ax.set_title("Boxplot of standardized features");

In [None]:
# Snapshot of the feature matrix as it is at this point. When the notebook is
# run top to bottom this is the unstandardized training data; the copy keeps
# the plots unaffected by the in-place normalization of `tX` further down.
tX_plot = tX.copy()

# get feature indices sorted by increasing standard deviation (same order as
# sorting by variance) of the unstandardized training data
s = np.argsort(np.std(tX_plot, axis=0)).tolist()

# feature 12 in [0,1]
# feature 22 in {2,3}
# feature 14, 17 in [-2.5, 2.5]
# feature 20, 18, 25 in [-3.142, 3.142]

In [None]:
# Histograms of the features, grouped by spread: `s` lists the feature indices
# in sorted order and every panel shows one consecutive slice of that list.
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2)
panel_columns = (s[:5], s[5:15], s[15:25], s[25:])

for panel, columns in zip((ax1, ax2, ax3, ax4), panel_columns):
    panel.hist(tX_plot[:, columns], bins=100, histtype='step', stacked=True, fill=False)

ax1.set_title("Histogram of unstandardized features")

# Hide the y tick labels on every panel.
for panel in (ax1, ax2, ax3, ax4):
    panel.set_yticklabels([])

## 2. Preprocessing

In [None]:
# Normalize dataset: every feature column is shifted to zero mean and scaled to
# unit standard deviation. The statistics are kept in `feature_mean` and
# `feature_std` so that the same transform can be applied to other data.
feature_mean = np.mean(tX, axis=0)
feature_std = np.std(tX, axis=0)
# A constant column has zero spread; dividing by 1 instead leaves it at 0
# rather than filling it with NaN/inf.
feature_std[feature_std == 0] = 1.0
# Written back in place, so `tX` stays the same array object.
tX[:] = (tX - feature_mean) / feature_std

# Quick look at the label vector.
print(y)

# Split dataset: the first `split_perc` share of the rows is used for training,
# the remaining rows for validation (rows are taken in file order, no shuffling).
split_perc = 0.7
split_ind = int(len(y) * split_perc)

# Training set
y_train = y[:split_ind]
tX_train = tX[:split_ind]
ids_train = ids[:split_ind]

# Validation set
y_val = y[split_ind:]
tX_val = tX[split_ind:]
ids_val = ids[split_ind:]

## 3. Training

In [None]:
# Set hyperparameters
max_iters = 100
gamma = 0.01
lambda_ = 0.1

# Initialize weights: uniform values in [0, 1), one per feature. A dedicated,
# seeded generator makes the starting point identical on every run without
# touching NumPy's global random state.
SEED = 42
initial_w = np.random.RandomState(SEED).rand(D)

from implementations import *
# Train
# The calls below are kept as a reference for the available training routines;
# the models that are actually evaluated are fitted in the cross-validation cell.
#weights_ls_GD, loss_ls_GD = least_squares_GD(y_train, tX_train, initial_w, max_iters, gamma)
#print(loss_ls_GD)
#weights_ls_SGD, loss_ls_SGD = least_squares_SGD(y_train, tX_train, initial_w, max_iters, gamma)
#weights_ls, loss_ls = least_squares(y_train, tX_train)
# weights_rr, loss_rr = ridge_regression(y_train, tX_train, lambda_)
# weights_lr, loss_lr = logistic_regression(y_train, tX_train, initial_w, max_iters, gamma)
# weights_reg_lr, loss_reg_lr = reg_logistic_regression(y_train, tX_train, lambda_, initial_w, max_iters, gamma)

## 4. Evaluation

In [None]:
from utils import cross_validation, build_k_indices
# Cross-validation: compare gradient-descent least squares with the closed-form
# least squares solution by their mean validation loss over k folds.

k = 5
k_indices = build_k_indices(y, k)
ls_GD_losses = []
ls_losses = []

for k_iteration in range(k):
    # Note: this rebinds tX_train / y_train / tX_val / y_val, replacing the
    # 70/30 split made in the preprocessing cell.
    tX_train, y_train, tX_val, y_val = cross_validation(y, tX, k_indices, k_iteration)
    weights_ls_GD, loss_ls_GD = least_squares_GD(y_train, tX_train, initial_w, max_iters, gamma)
    weights_ls, loss_ls = least_squares(y_train, tX_train)

    y_pred_ls_GD_val = tX_val @ weights_ls_GD
    y_pred_ls_val = tX_val @ weights_ls

    # Each model is scored on its own validation predictions.
    # NOTE(review): calculate_mse_loss is not imported by name in this notebook;
    # presumably it comes from one of the wildcard imports - confirm.
    ls_GD_losses.append(calculate_mse_loss(y_val, y_pred_ls_GD_val))
    ls_losses.append(calculate_mse_loss(y_val, y_pred_ls_val))

ls_GD_losses = np.array(ls_GD_losses)
ls_losses = np.array(ls_losses)

# Average validation loss over the k folds for each model.
ls_GD_mean_loss = np.mean(ls_GD_losses)
ls_mean_loss = np.mean(ls_losses)

print(ls_GD_mean_loss, ls_mean_loss)

## Generate predictions and save output in csv format for submission:

In [None]:
# Load the test set; the first value returned by load_csv_data is discarded and
# only the feature matrix and the ids are kept.
DATA_TEST_PATH = '../data/test.csv'
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [None]:
# TODO: decide real weights
# Fit the final model on the complete training set. least_squares_GD returns a
# (weights, loss) pair, so the result is unpacked and only the weight vector is
# handed to predict_labels.
weights, final_loss = least_squares_GD(y, tX, initial_w, max_iters, gamma)

# NOTE(review): tX is standardized in the preprocessing cell, whereas tX_test
# is used exactly as loaded - the test features most likely need the same
# transform (training mean/std) before predicting; confirm.
OUTPUT_PATH = '../data/submission.csv'
y_pred = predict_labels(weights, tX_test)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)