In [4]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

## Load the training data into feature matrix, class labels, and event ids:

In [25]:
# NOTE(review): wildcard import hides where names come from; load_csv_data is
# presumably provided by proj1_helpers — confirm and consider importing it explicitly.
from proj1_helpers import *
# Path to the zipped training CSV, relative to the notebook's working directory.
DATA_TRAIN_PATH = 'data/train.csv.zip'
# Unpack the loader's three return values; per the section heading these are the
# class labels (y), the feature matrix (tX) and the event ids (ids).
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

# Data cleaning and normalisation

In [3]:
# Reproducible random number generator.
# A fixed seed is required for reproducibility: default_rng() without an
# argument pulls fresh entropy from the OS, so every kernel restart would
# otherwise draw a different sample.
SEED = 42
rng = np.random.default_rng(SEED)

In [4]:
# HiggsML Appendix B encodes "not available" as -999; swap every such entry
# (anything at or below the sentinel) for NaN, in place, so that NaN-aware
# statistics can skip it.
missing_mask = np.less_equal(tX, -999)
tX[missing_mask] = np.nan

In [57]:
# Per-feature statistics computed over the available (non-NaN) entries only.
tX_mean = np.nanmean(tX, axis=0)
tX_std = np.nanstd(tX, axis=0)

# Centre each feature on its mean.
# A ufunc called with `where=` but without `out=` leaves every masked-out
# position as uninitialised memory, so an explicit output buffer is supplied:
# columns whose mean is not finite (e.g. entirely missing) come out as NaN.
norm_tX = np.subtract(
    tX,
    tX_mean,
    out=np.full_like(tX, np.nan, dtype=float),
    where=np.isfinite(tX_mean),
)

# Scale to unit standard deviation, writing into the centred array itself so
# that columns skipped by the mask (zero or undefined spread) keep their
# centred values instead of arbitrary memory contents.
norm_tX = np.divide(norm_tX, tX_std, out=norm_tX, where=tX_std > 0)

## Generate predictions and save output in CSV format for submission:

In [2]:
# Path to the zipped test CSV, relative to the notebook's working directory.
DATA_TEST_PATH = 'data/test.csv.zip'
# Same loader as for the training set; its first return value is discarded here
# and only the feature matrix (tX_test) and the ids (ids_test) are kept.
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [31]:
# TODO: fill in the desired name of the output file for submission; the path is
# still an empty string and is passed unchanged to create_csv_submission below.
OUTPUT_PATH = ''
# NOTE(review): `weights` is not defined in any cell visible here — it must come
# from a model-fitting cell that runs before this one; confirm it exists on a
# fresh Restart-and-Run-All.
# NOTE(review): tX_test is used exactly as loaded, whereas the training features
# had -999 replaced by NaN and were standardised in earlier cells — confirm that
# `weights` expects raw features, otherwise apply the same transform here.
y_pred = predict_labels(weights, tX_test)
# Hand the test ids and their predicted labels to the helper together with the output path.
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)

# Appendix: Data Visualisation

In [8]:
# Bokeh is only needed for this appendix, hence the late import.
from bokeh.io import output_notebook, show
from bokeh.plotting import figure, row
# Configure Bokeh so that show() renders figures inline in the notebook.
output_notebook()

In [19]:
n_points, n_features = norm_tX.shape

# For efficiency, draw 1% of the total row count from each class (about 2% of
# the data overall when there are two classes).  The request is capped at the
# class size because sampling without replacement raises ValueError when asked
# for more rows than the class contains.
per_class = n_points // 100
sample_tX = [
    rng.choice(class_rows, size=min(per_class, len(class_rows)), replace=False)
    for class_rows in (norm_tX[y == label] for label in np.unique(y))
]
assert len(sample_tX) == 2  # check that there are 2 labels

# One row of figures per feature i; each figure in the row plots feature i (x)
# against feature j (y) for every j, i.e. n_features * n_features figures in total.
for i in range(n_features):
    r = []
    for j in range(n_features):
        fig = figure(x_axis_label=f'Feature #{i}', y_axis_label=f'Feature #{j}')
        # sample_tX follows the sorted order of np.unique(y): the smaller label
        # is drawn in orange, the larger one in the default colour with
        # slightly transparent lines.
        fig.cross(sample_tX[0][:,i], sample_tX[0][:,j], color='orange')
        fig.cross(sample_tX[1][:,i], sample_tX[1][:,j], line_alpha=.7)
        r.append(fig)
    show(row(r))