In [1]:
import numpy as np
from proj1_helpers import *
from cross_validation import *
from tools import *
from implementations import *

%load_ext autoreload
%autoreload 2
# Seed handed to build_k_indices when the cross-validation folds are created.
seed = 10

In [2]:
# Read both CSV files and partition their rows by jet number.

# NOTE(review): absolute, machine-specific locations - adjust before running elsewhere.
DATA_TRAIN_PATH = '/Users/sonychan/Desktop/EPFL/Course2017_Autum/Machine_Learning/project/project1/epfl-ml2017-project1-master/train.csv'
DATA_TEST_PATH = '/Users/sonychan/Desktop/EPFL/Course2017_Autum/Machine_Learning/project/project1/epfl-ml2017-project1-master/test.csv'

# load_csv_data yields three values per file; the first one is discarded for the test file.
y, tx_train, ids_train = load_csv_data(DATA_TRAIN_PATH)
_, tx_test, ids_test = load_csv_data(DATA_TEST_PATH)

# The grouping results are indexed by jet number further down to select rows of each set.
dict_jets_train, dict_jets_test = (
    group_features_by_jet(features) for features in (tx_train, tx_test)
)

In [3]:
# Generate Test parameters

def gen_para(lambdas, degree):
    """Build the hyper-parameter grid as a list of (lambda, degree) tuples.

    Each value in `lambdas` is paired with every integer degree from 1 up to
    and including `degree`. The list is ordered by lambda first (in the order
    given), then by ascending degree.
    """
    return [
        (penalty, power)
        for penalty in lambdas
        for power in range(1, degree + 1)
    ]


# Candidate grid: three ridge penalties, each paired with polynomial degrees 1 through 15.
test_para = gen_para(lambdas=[0.0001, 0.001, 0.01], degree=15)
        

In [6]:
# Cross validation

def cross_validation(y, x, k_indices, k, lambda_):
    """Fit ridge regression on every fold except fold `k` and score both splits.

    Args:
        y: labels, indexed with the sample indices stored in `k_indices`.
        x: features, indexed along the first axis with the same indices.
        k_indices: array whose i-th row holds the sample indices of fold i.
        k: position of the fold that is held out for evaluation.
        lambda_: penalty forwarded to `ridge_regression`.

    Returns:
        Tuple `(accuracy_train, accuracy_test)`: the `compute_accuracy` result
        on the fitting folds and on the held-out fold respectively.
    """
    held_out = k_indices[k]
    # Select every fold row whose position is not k, then flatten them into one index vector.
    keep_rows = np.arange(k_indices.shape[0]) != k
    fitted_on = k_indices[keep_rows].reshape(-1)

    labels_held_out, labels_fit = y[held_out], y[fitted_on]
    feats_held_out, feats_fit = x[held_out], x[fitted_on]

    ################# replace regression method here ##################
    # The loss returned next to the weights is not used here.
    weight, _ = ridge_regression(labels_fit, feats_fit, lambda_)

    accuracy_train = compute_accuracy(predict_labels(weight, feats_fit), labels_fit)
    accuracy_test = compute_accuracy(predict_labels(weight, feats_held_out), labels_held_out)

    return accuracy_train, accuracy_test

In [7]:
# Grid Search
#
# For each jet subset, every (lambda, degree) pair in test_para is scored with k-fold
# cross-validation, and one (jet, score, (lambda, degree)) tuple is appended to resaults.

resaults = []

# Set k_fold_no
k_fold = 5

for index in range(3):
    # Jet-level preparation is done once per jet instead of once per candidate:
    # none of these calls receives lambda_ or degree, so repeating them for all
    # candidates only repeated identical work.
    # NOTE(review): assumes process_data, build_polynomial_features and
    # build_k_indices do not modify their arguments in place - confirm in the helpers.
    x_train_jet = tx_train[dict_jets_train[index]]
    y_train = y[dict_jets_train[index]]
    x_test = tx_test[dict_jets_test[index]]

    # Only the processed training part is needed for the search.
    x_train_jet, _ = process_data(x_train_jet, x_test)

    # Split data in k-fold; same labels, fold count and seed for every candidate.
    k_indices = build_k_indices(y_train, k_fold, seed)

    for lambda_, degree in test_para:
        # Build Poly matrix and prepend a column of ones
        x_train = build_polynomial_features(x_train_jet, degree)
        x_train = np.hstack((np.ones((x_train.shape[0], 1)), x_train))

        list_accuracy_train = []
        list_accuracy_test = []

        for k in range(k_fold):
            a_train, a_test = cross_validation(y_train, x_train, k_indices, k, lambda_)
            list_accuracy_train.append(a_train)
            list_accuracy_test.append(a_test)

        mean_acc = np.mean(list_accuracy_train)
        var_acc = np.var(list_accuracy_train)
        tmean_acc = np.mean(list_accuracy_test)
        # Not printed below; left available in the notebook namespace.
        tvar_acc = np.var(list_accuracy_test)

        # NOTE(review): the ranking score averages train and held-out accuracy;
        # ranking on tmean_acc alone would select purely on held-out performance.
        total_acc = (mean_acc + tmean_acc) / 2

        resaults.append((index, total_acc, (lambda_, degree)))

        # "Var_acc" is the variance of the per-fold *training* accuracies.
        print('Jet_no: {}, train_Avg_acc; {}, test_Avg_acc: {}, total_acc: {},  Var_acc: {} (lambda, degree): ({},{})'.format(
            index, mean_acc, tmean_acc, total_acc, var_acc, lambda_, degree))

        

Jet_no: 0, train_Avg_acc; 0.8442723451105996, test_Avg_acc: 0.8433690321289159, total_acc: 0.8438206886197577,  Var_acc: 4.426088373941631e-07 (lambda, degree): (0.0001,1)
Jet_no: 0, train_Avg_acc; 0.8446201581423282, test_Avg_acc: 0.8434891402262036, total_acc: 0.8440546491842659,  Var_acc: 3.7657753453436884e-07 (lambda, degree): (0.0001,2)
Jet_no: 0, train_Avg_acc; 0.8453282954659193, test_Avg_acc: 0.8442398158342508, total_acc: 0.844784055650085,  Var_acc: 3.412765213043906e-07 (lambda, degree): (0.0001,3)
Jet_no: 0, train_Avg_acc; 0.8464192773496146, test_Avg_acc: 0.8454408968071265, total_acc: 0.8459300870783706,  Var_acc: 2.1786698409910936e-07 (lambda, degree): (0.0001,4)
Jet_no: 0, train_Avg_acc; 0.8471574416975278, test_Avg_acc: 0.8459213291962767, total_acc: 0.8465393854469023,  Var_acc: 3.315089475834197e-07 (lambda, degree): (0.0001,5)
Jet_no: 0, train_Avg_acc; 0.8482559303373035, test_Avg_acc: 0.8470023020718648, total_acc: 0.8476291162045841,  Var_acc: 3.030828035493083e

Jet_no: 1, train_Avg_acc; 0.8086245808614908, test_Avg_acc: 0.8064740778952799, total_acc: 0.8075493293783853,  Var_acc: 3.322895771834427e-07 (lambda, degree): (0.0001,4)
Jet_no: 1, train_Avg_acc; 0.8096530822801136, test_Avg_acc: 0.8076863554294557, total_acc: 0.8086697188547847,  Var_acc: 3.3852663362814853e-07 (lambda, degree): (0.0001,5)
Jet_no: 1, train_Avg_acc; 0.8132737941707505, test_Avg_acc: 0.810729945834408, total_acc: 0.8120018700025793,  Var_acc: 2.222471113114435e-07 (lambda, degree): (0.0001,6)
Jet_no: 1, train_Avg_acc; 0.81513412432293, test_Avg_acc: 0.8123033273149343, total_acc: 0.8137187258189321,  Var_acc: 4.710640830771928e-07 (lambda, degree): (0.0001,7)
Jet_no: 1, train_Avg_acc; 0.8160207634769151, test_Avg_acc: 0.8132447768893474, total_acc: 0.8146327701831313,  Var_acc: 4.884238901815011e-07 (lambda, degree): (0.0001,8)
Jet_no: 1, train_Avg_acc; 0.8170202476141346, test_Avg_acc: 0.8140959504771731, total_acc: 0.8155580990456539,  Var_acc: 5.316674815311619e-07

Jet_no: 2, train_Avg_acc; 0.8507340777502067, test_Avg_acc: 0.8457816377171217, total_acc: 0.8482578577336641,  Var_acc: 5.011590916364501e-07 (lambda, degree): (0.0001,7)
Jet_no: 2, train_Avg_acc; 0.8515336366142817, test_Avg_acc: 0.8468706920319823, total_acc: 0.849202164323132,  Var_acc: 3.653993054724212e-07 (lambda, degree): (0.0001,8)
Jet_no: 2, train_Avg_acc; 0.8524159084642957, test_Avg_acc: 0.8479873173421562, total_acc: 0.8502016129032259,  Var_acc: 3.343040193676872e-07 (lambda, degree): (0.0001,9)
Jet_no: 2, train_Avg_acc; 0.8528604907637167, test_Avg_acc: 0.8483733112765371, total_acc: 0.850616901020127,  Var_acc: 5.661289980508762e-07 (lambda, degree): (0.0001,10)
Jet_no: 2, train_Avg_acc; 0.8525916735594155, test_Avg_acc: 0.8483181692859112, total_acc: 0.8504549214226633,  Var_acc: 3.29790570658785e-07 (lambda, degree): (0.0001,11)
Jet_no: 2, train_Avg_acc; 0.8516163496002207, test_Avg_acc: 0.8465811965811966, total_acc: 0.8490987730907087,  Var_acc: 9.870793551321133e-0

In [16]:
def find_max(resault):
    """Print the best and second-best grid-search entry for every jet.

    Args:
        resault: iterable of `(jet_no, accuracy, params)` tuples.

    Output:
        One line per jet number, in ascending jet order, naming the highest
        accuracy with its params and the runner-up with its params. A jet with
        a single entry gets a line with the maximum only. Nothing is printed
        for empty input.

    Returns:
        None.
    """
    # Materialise once so one-shot iterables can be scanned for every jet.
    rows = list(resault)

    for no in sorted({row[0] for row in rows}):
        # Rank the rows themselves rather than keying a dict by accuracy: a dict
        # silently drops parameter sets that tie on accuracy and then fails
        # when fewer than two distinct accuracies remain. The sort is stable, so
        # tied entries keep their input order.
        ranked = sorted(
            (row for row in rows if row[0] == no),
            key=lambda row: row[1],
            reverse=True,
        )
        best = ranked[0]

        if len(ranked) == 1:
            print("Jet_no:{}, Max_acc: {:.5f} at {}".format(no, best[1], best[2]))
            continue

        runner_up = ranked[1]
        print("Jet_no:{}, Max_acc: {:.5f} at {}, 2nd_acc: {:.5f} at {}".format(
            no, best[1], best[2], runner_up[1], runner_up[2]))
        



In [17]:
# Print, per jet, the two highest-scoring (lambda, degree) entries collected in resaults.
find_max(resaults)

Jet_no:0, Max_acc: 0.84895 at (0.0001, 10), 2nd_acc: 0.84858 at (0.001, 10)
Jet_no:1, Max_acc: 0.81677 at (0.0001, 13), 2nd_acc: 0.81636 at (0.0001, 10)
Jet_no:2, Max_acc: 0.85062 at (0.0001, 10), 2nd_acc: 0.85045 at (0.0001, 11)
