In [None]:
### THIS FIRST PART SHOULD BE IN TRAIN.PY

In [1]:
%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

In [2]:
from preprocessing import load_csv_data, preprocessing_pipeline, split_data_by_categorical_column

In [3]:
from implementations import reg_logistic_regression, svm, ridge_regression
from helpers import predict_labels, compute_accuracy
from cross_validation import k_fold_indices, k_fold_cross_split_data

In [4]:
# Column index handed to split_data_by_categorical_column to partition the
# samples (presumably the PRI_JET_NUM feature, judging by the name — TODO confirm).
PRI_JET_NUM_INDEX = 22
# Seed passed to k_fold_indices when the cross-validation folds are drawn.
SEED = 2019

In [5]:
# Read the training CSV, then partition labels, features and ids by the
# categorical column at PRI_JET_NUM_INDEX.
train_classes, train_data, train_ids = load_csv_data("data/train.csv")
(
    train_classes_jet_num_splits,
    train_data_jet_num_splits,
    train_ids_jet_num_splits,
) = split_data_by_categorical_column(train_classes, train_data, train_ids, PRI_JET_NUM_INDEX)

In [6]:
# Regularisation strengths tried for ridge regression (the grid is not sorted).
POSSIBLE_LAMBDA_VALUES = [1e-6, 5e-5, 2.5e-5, 1e-5, 7.5e-4, 5e-4, 2.5e-4, 1e-4, 0]
# Regularisation strengths tried for regularised logistic regression.
POSSIBLE_LAMBDA_LOG = [0]
# Regularisation strengths intended for the SVM.
POSSIBLE_LAMBDA_SVM = [1e-2]
# Polynomial expansion degrees 5..13 passed to preprocessing_pipeline.
POSSIBLE_DEGREES = np.arange(5, 14)
# The grid search stores three models (ridge=0, logistic=1, svm=2) on the second
# axis; a size of 1 there makes every write to index 1 or 2 raise IndexError.
N_MODELS = 3
# Axes: (jet split, model, degree index, lambda index, (mean, std) of accuracy).
grid_shape = (4, N_MODELS, len(POSSIBLE_DEGREES), len(POSSIBLE_LAMBDA_VALUES), 2)

In [6]:
from tqdm.notebook import tqdm_notebook as tqdm

In [None]:
# Hyperparameter grid search with k-fold cross-validation, run independently for
# every jet-number split and every polynomial degree. Both result matrices are
# indexed as [jet split, model, degree index, lambda index, (mean, std)].

# Positions of the three models on the "model" axis of the result matrices.
RIDGE_MODEL, LOGISTIC_MODEL, SVM_MODEL = 0, 1, 2
N_FOLDS = 5


def _fit_ridge(y, x, lambda_):
    """Fit ridge regression on one fold; returns ridge_regression's (w, loss)."""
    return ridge_regression(y, x, lambda_)


def _fit_logistic(y, x, lambda_):
    """Fit regularised logistic regression on one fold, starting from zero weights."""
    initial_w = np.zeros((x.shape[1],))
    return reg_logistic_regression(y, x, lambda_, initial_w, 150, 3e-1, 1)


def _fit_svm(y, x, lambda_):
    """Fit the SVM on one fold, starting from zero weights."""
    initial_w = np.zeros((x.shape[1],))
    return svm(y, x, lambda_, initial_w, 500, 1e-5, 1e-1)


def _cv_accuracy_stats(fit, lambda_, folds, model_name, skip_failed_folds):
    """Cross-validate one (model, lambda) pair.

    Args:
        fit: callable (y_train, x_train, lambda_) -> (w, loss).
        lambda_: regularisation strength forwarded to `fit`.
        folds: sequence of (x_train, y_train, x_test, y_test) tuples.
        model_name (str): label used in the failure message.
        skip_failed_folds (bool): when True, a fold whose fit or scoring raises
            is reported and left out of the statistics; when False the
            exception propagates.

    Returns:
        ((train_mean, train_std), (validation_mean, validation_std)); every
        entry is NaN when no fold could be evaluated.
    """
    train_scores, validation_scores = [], []
    for x_train, y_train, x_test, y_test in folds:
        try:
            w, _ = fit(y_train, x_train, lambda_)
            train_score = compute_accuracy(predict_labels(w, x_train), y_train)
            validation_score = compute_accuracy(predict_labels(w, x_test), y_test)
        except Exception as err:
            if not skip_failed_folds:
                raise
            # Report instead of silently dropping the fold, so a grid cell that
            # is built from fewer folds (or none) can be traced back.
            print(f'[grid search] {model_name} failed for lambda={lambda_}: {err!r} (fold skipped)')
            continue
        # Append both scores together so the two lists never get out of step.
        train_scores.append(train_score)
        validation_scores.append(validation_score)

    if not train_scores:
        return (np.nan, np.nan), (np.nan, np.nan)
    return ((np.mean(train_scores), np.std(train_scores)),
            (np.mean(validation_scores), np.std(validation_scores)))


# Allocated here rather than from grid_shape so that the model axis is
# guaranteed to hold all three models and the lambda axis the longest grid.
n_jet_splits = len(train_data_jet_num_splits)
n_lambda_slots = max(len(POSSIBLE_LAMBDA_VALUES), len(POSSIBLE_LAMBDA_LOG), len(POSSIBLE_LAMBDA_SVM))
accuracy_grid_shape = (n_jet_splits, 3, len(POSSIBLE_DEGREES), n_lambda_slots, 2)
train_accuracy_matrix = np.zeros(accuracy_grid_shape)
validation_accuracy_matrix = np.zeros(accuracy_grid_shape)

jet_splits = zip(train_classes_jet_num_splits, train_data_jet_num_splits)
for jet_num, (train_classes_split, train_data_split) in enumerate(tqdm(jet_splits, total=n_jet_splits, desc='PRI_JET_NUM')):
    # The same fold indices are reused for every degree and model of this split.
    k_indices = k_fold_indices(train_data_split.shape[0], N_FOLDS, SEED)
    for i, deg in enumerate(tqdm(POSSIBLE_DEGREES, desc='deg loop', leave=False)):
        # Only the expanded feature matrix (first return value) is needed here;
        # the cells that were executed most recently unpack four return values
        # from preprocessing_pipeline. A dedicated name keeps the raw
        # `train_data` loaded earlier in the notebook intact.
        expanded_data, *_ = preprocessing_pipeline(train_data_split, degree=deg)
        # Materialised because the folds are iterated once per lambda value.
        folds = list(k_fold_cross_split_data(train_classes_split, expanded_data, k_indices))

        ### Train Ridge model on fold
        for j, lambda_ in enumerate(tqdm(POSSIBLE_LAMBDA_VALUES, desc='lambda loop', leave=False)):
            train_stats, validation_stats = _cv_accuracy_stats(
                _fit_ridge, lambda_, folds, 'ridge', skip_failed_folds=False)
            train_accuracy_matrix[jet_num, RIDGE_MODEL, i, j] = train_stats
            validation_accuracy_matrix[jet_num, RIDGE_MODEL, i, j] = validation_stats

        # Logistic regression and SVM share a second preprocessing variant.
        expanded_data, *_ = preprocessing_pipeline(train_data_split, degree=deg, norm_first=False)
        folds = list(k_fold_cross_split_data(train_classes_split, expanded_data, k_indices))

        ### Train Log model on fold
        for j, lambda_ in enumerate(tqdm(POSSIBLE_LAMBDA_LOG, desc='log loop', leave=False)):
            train_stats, validation_stats = _cv_accuracy_stats(
                _fit_logistic, lambda_, folds, 'logistic regression', skip_failed_folds=True)
            train_accuracy_matrix[jet_num, LOGISTIC_MODEL, i, j] = train_stats
            validation_accuracy_matrix[jet_num, LOGISTIC_MODEL, i, j] = validation_stats

        ### Train svm model on fold
        # The SVM has its own lambda grid (it previously reused the logistic one).
        for j, lambda_ in enumerate(tqdm(POSSIBLE_LAMBDA_SVM, desc='svm loop', leave=False)):
            train_stats, validation_stats = _cv_accuracy_stats(
                _fit_svm, lambda_, folds, 'svm', skip_failed_folds=True)
            train_accuracy_matrix[jet_num, SVM_MODEL, i, j] = train_stats
            validation_accuracy_matrix[jet_num, SVM_MODEL, i, j] = validation_stats
            

In [7]:
def find_best_hyperparameters(error_values, lambdas, degrees):
    """ Find the hyperparameters whose grid cell holds the highest score.

    Despite its name, `error_values` is maximised, so it must contain a score
    where larger is better (e.g. mean cross-validation accuracy), not an error.
    NaN cells (hyperparameter combinations that could not be evaluated) are
    ignored.

    Args:
        error_values (np.array): 2-D grid of scores indexed as
        [degree index, lambda index].
        lambdas (np.array): Array of different regularization coefficients to try.
        degrees (np.array): Array of different degrees coefficients to try for
        feature expansion.

    Returns:
        (float), (int): Lambda and degree combination that result in the
        highest score.

    Raises:
        ValueError: If `error_values` is not 2-D, or holds no non-NaN value.
    """
    scores = np.asarray(error_values, dtype=float)
    if scores.ndim != 2:
        raise ValueError(
            f'error_values must be a 2-D (degree x lambda) grid, got shape {scores.shape}')
    if np.isnan(scores).all():
        # Also covers an empty grid; there is nothing meaningful to select.
        raise ValueError('error_values contains no valid (non-NaN) score')
    # np.argmax would return the position of a NaN cell; nanargmax skips them.
    best_deg, best_lmbda = np.unravel_index(np.nanargmax(scores), scores.shape)
    # Extract the lambda and degree resulting in the highest accuracy.
    degree_best = degrees[best_deg]
    lambda_best = lambdas[best_lmbda]
    return lambda_best, degree_best

In [None]:
# TODO: save train_accuracy_matrix / validation_accuracy_matrix

#### EVERYTHING ABOVE SHOULD BE IN TRAIN.PY

#####THIS BELOW SHOULD BE IN RUN.PY

In [65]:
# Collects one (w, loss, columns_to_remove, mean, std) tuple per jet split;
# filled by the logistic-regression training loop further down.
models_log = []

In [67]:
# One (degree, gamma) pair per jet split, in split order: the degree goes to
# preprocessing_pipeline and gamma is forwarded to reg_logistic_regression.
best_hyper = [(11, 2e-1), (10, 2e-1), (12, 2e-1), (12, 6e-2)]

In [69]:
# Train the final logistic-regression model of every jet split with its selected
# (degree, gamma) and keep what is needed to preprocess the test data the same way.
# The list is rebuilt here so that re-running this cell cannot append a second
# copy of every model (the prediction loop zips it with the jet splits and
# would silently keep using the stale first entries).
models_log = []
for (deg, gamma), train_classes_split, train_data_split in zip(best_hyper, train_classes_jet_num_splits, train_data_jet_num_splits):
    # np.int was only an alias of the builtin int and no longer exists in NumPy >= 1.24.
    data_split, columns_to_remove, mean, std = preprocessing_pipeline(train_data_split, degree=int(deg), cross_term=True, norm_first=False)
    initial_w = np.zeros((data_split.shape[1],))
    w, loss = reg_logistic_regression(train_classes_split, data_split, 0, initial_w, 500, gamma, 1)
    print(f'Loss: {loss:.3f} Accuracy : {compute_accuracy(predict_labels(w, data_split), train_classes_split)}')
    # columns_to_remove, mean and std are passed back to preprocessing_pipeline at prediction time.
    models_log.append((w, loss, columns_to_remove, mean, std))

0 59474.5837442635
50 34924.1754279905
100 34145.61171942162
converged at iter : 144
Loss: 34043.085 Accuracy : 0.8492688639116031
0 47839.02421297877
50 32278.223862363735
100 31821.801361800313
converged at iter : 125
Loss: 31781.714 Accuracy : 0.8132157226864748
0 30222.982805878484
50 17779.232029577157
100 17355.100065551153
converged at iter : 131
Loss: 17300.824 Accuracy : 0.8495206335973322
0 14610.258514485595
50 10350.443424413483
100 9389.609115668256
150 8873.373044491407
200 8554.4257320503
250 8337.70710705939
300 8182.904650568386
350 8066.056709368944
400 7978.275509149935
450 7910.783176154313
converged at iter : 468
Loss: 7890.391 Accuracy : 0.8521476267821693


In [17]:
# Load the test set; the three returned arrays are split by jet number afterwards.
test_classes, test_data, test_ids = load_csv_data("data/test.csv")

In [18]:
# Partition the test labels, features and ids by the categorical column at
# PRI_JET_NUM_INDEX, mirroring what is done for the training data.
(
    test_classes_jet_num_splits,
    test_data_jet_num_splits,
    test_ids_jet_num_splits,
) = split_data_by_categorical_column(test_classes, test_data, test_ids, PRI_JET_NUM_INDEX)

In [72]:
# Predict every jet split of the test set with its logistic-regression model and
# stack the (id, prediction) pairs of all splits into one two-column array.
results = None
for (w, _, col_to_rm, mean, std), (deg, _), test_classes_split, test_data_split, test_ids_split in zip(models_log, best_hyper, test_classes_jet_num_splits, test_data_jet_num_splits, test_ids_jet_num_splits):
    # Reuse the removed columns, mean and std computed on the training split.
    # np.int was only an alias of the builtin int and no longer exists in NumPy >= 1.24.
    data_split, _, _, _ = preprocessing_pipeline(test_data_split, degree=int(deg), columns_to_remove=col_to_rm, cross_term=True, norm_first=False, mean=mean, std=std)
    pred = predict_labels(w, data_split)
    # One row per sample: column 0 is the id, column 1 the predicted label.
    out = np.stack((test_ids_split, pred), axis=-1)
    results = out if results is None else np.vstack((results, out))

print(results.shape)

(568238, 2)


In [None]:
# Same prediction loop as for the logistic-regression models, using ridge models.
# NOTE(review): models_ridge and best_hyper_ridge are not defined anywhere in the
# visible part of this notebook, so this cell raises NameError on a fresh kernel.
# NOTE(review): this cell overwrites `results`; if it runs before the submission
# cell, the ridge predictions are written to 'results_final_log.csv'.
results = None
for (w, _, col_to_rm, mean, std), (deg, _), test_classes_split, test_data_split, test_ids_split in zip(models_ridge, best_hyper_ridge, test_classes_jet_num_splits, test_data_jet_num_splits, test_ids_jet_num_splits):
    # np.int was only an alias of the builtin int and no longer exists in NumPy >= 1.24.
    data_split, _, _, _ = preprocessing_pipeline(test_data_split, degree=int(deg), columns_to_remove=col_to_rm, norm_first=True, mean=mean, std=std)
    pred = predict_labels(w, data_split)
    # One row per sample: column 0 is the id, column 1 the predicted label.
    out = np.stack((test_ids_split, pred), axis=-1)
    results = out if results is None else np.vstack((results, out))

print(results.shape)

In [73]:
from helpers import create_csv_submission

# Column 0 of `results` holds the sample ids and column 1 the predicted labels.
submission_ids = results[:, 0]
submission_labels = results[:, 1]
create_csv_submission(submission_ids, submission_labels, 'results_final_log.csv')