# Project 1 : Higgs Boson classification

<hr style="clear:both">
This notebook was made for the CS-433 class (Machine Learning) at EPFL.
    
**Authors:** \
[Tristan Carruzzo](https://people.epfl.ch/tristan.carruzzo)\
[Victor Dubien](https://people.epfl.ch/victor.dubien)\
[Anne-Valérie Preto](https://people.epfl.ch/anne-valerie.preto)
<hr style="clear:both">

### Contents

In [1]:
import numpy as np
import seaborn as sn
from implementations import *
import math as ma
import matplotlib.pyplot as plt
import random
random.seed(16)

# 1. Pre-processing

## 1.1. Data import

In [2]:
y_train, X_train, ID_train = load_csv_data('train.csv', sub_sample=False)

In [4]:
y_test, X_test, ID_test = load_csv_data('test.csv', sub_sample=False)

In [5]:
y_train[y_train==-1] = 0

## 1.2. Data separation according to column 22 "PRI_jet_num"

In [6]:
#splitting the data
X_train_0 = X_train[(X_train[:,22] == 0),:]
X_train_1 = X_train[(X_train[:,22] == 1),:]
X_train_2 = X_train[(X_train[:,22] == 2),:]
X_train_3 = X_train[(X_train[:,22] == 3),:]
X_test_0 = X_test[(X_test[:,22] == 0),:]
X_test_1 = X_test[(X_test[:,22] == 1),:]
X_test_2 = X_test[(X_test[:,22] == 2),:]
X_test_3 = X_test[(X_test[:,22] == 3),:]
y_train_0 = y_train[X_train[:,22] == 0]
y_train_1 = y_train[X_train[:,22] == 1]
y_train_2 = y_train[X_train[:,22] == 2]
y_train_3 = y_train[X_train[:,22] == 3]
ID_train_0 = ID_train[X_train[:,22] == 0]
ID_train_1 = ID_train[X_train[:,22] == 1]
ID_train_2 = ID_train[X_train[:,22] == 2]
ID_train_3 = ID_train[X_train[:,22] == 3]
ID_test_0 = ID_test[X_test[:,22] == 0]
ID_test_1 = ID_test[X_test[:,22] == 1]
ID_test_2 = ID_test[X_test[:,22] == 2]
ID_test_3 = ID_test[X_test[:,22] == 3]

## 1.3. NaN handling

In [7]:
#detection of columns which are full of -999 or full of 0
col_0 = [22] 
col_1 = [22]
col_2 = [22]
col_3 = [22]
for col in range(0,30):
    if (np.all(X_train_0[:,col]==-999) == True) or (np.all(X_train_0[:,col]==0) == True):
        col_0.append(col)
    if (np.all(X_train_1[:,col]==-999) == True) or (np.all(X_train_1[:,col]==0) == True):
        col_1.append(col)
    if (np.all(X_train_2[:,col]==-999) == True) or (np.all(X_train_2[:,col]==0) == True):
        col_2.append(col)
    if (np.all(X_train_3[:,col]==-999) == True) or (np.all(X_train_3[:,col]==0) == True):
        col_3.append(col)
print(col_0,"\n",col_1,"\n",col_2,"\n",col_3) 

[22, 4, 5, 6, 12, 22, 23, 24, 25, 26, 27, 28, 29] 
 [22, 4, 5, 6, 12, 26, 27, 28] 
 [22] 
 [22]


In [8]:
#Deleting the said columns
X_train_0=np.delete(X_train_0,col_0,axis=1)
X_test_0=np.delete(X_test_0,col_0,axis=1)

X_train_1=np.delete(X_train_1,col_1,axis=1)
X_test_1=np.delete(X_test_1,col_1,axis=1)

X_train_2=np.delete(X_train_2,col_2,axis=1)
X_test_2=np.delete(X_test_2,col_2,axis=1)

X_train_3=np.delete(X_train_3,col_3,axis=1)
X_test_3=np.delete(X_test_3,col_3,axis=1)

Columns 0 of each of the 8 arrays still contains some "-999", we replace them with NaNs in order to deal with them. They will be replaced with the mean of the column

In [9]:
X_train_nan_0 = np.where(X_train_0[:,:] == -999, np.nan, X_train_0[:,:])
X_train_nan_1 = np.where(X_train_1[:,:] == -999, np.nan, X_train_1[:,:])
X_train_nan_2 = np.where(X_train_2[:,:] == -999, np.nan, X_train_2[:,:])
X_train_nan_3 = np.where(X_train_3[:,:] == -999, np.nan, X_train_3[:,:]) 
nanmeantrain0 = np.nanmean(X_train_nan_0[:,0])
nanmeantrain1 = np.nanmean(X_train_nan_1[:,0])
nanmeantrain2 = np.nanmean(X_train_nan_2[:,0])
nanmeantrain3 = np.nanmean(X_train_nan_3[:,0])
X_train_0[:,0] = np.where(X_train_0[:,0] == -999, nanmeantrain0, X_train_0[:,0])
X_train_1[:,0] = np.where(X_train_1[:,0] == -999, nanmeantrain1, X_train_1[:,0])
X_train_2[:,0] = np.where(X_train_2[:,0] == -999, nanmeantrain2, X_train_2[:,0])
X_train_3[:,0] = np.where(X_train_3[:,0] == -999, nanmeantrain3, X_train_3[:,0])
X_test_0[:,0] = np.where(X_test_0[:,0] == -999, nanmeantrain0, X_test_0[:,0])
X_test_1[:,0] = np.where(X_test_1[:,0] == -999, nanmeantrain1, X_test_1[:,0])
X_test_2[:,0] = np.where(X_test_2[:,0] == -999, nanmeantrain2, X_test_2[:,0])
X_test_3[:,0] = np.where(X_test_3[:,0] == -999, nanmeantrain3, X_test_3[:,0])

In [10]:
#keeping copies of the arrays for the third part of the code before modifying them
X_train_0_bis = X_train_0.copy()
X_train_1_bis = X_train_1.copy()
X_train_2_bis = X_train_2.copy()
X_train_3_bis = X_train_3.copy()
X_test_0_bis = X_test_0.copy()
X_test_1_bis = X_test_1.copy()
X_test_2_bis = X_test_2.copy()
X_test_3_bis = X_test_3.copy()

## 1.4. Feature expansion

In [10]:
# degree for each set
expansion_degree_0 = 4
expansion_degree_1 = 7 
expansion_degree_2 = 4 
expansion_degree_3 = 5 

In [11]:
#creation of the new arrays
new_X_train_0 = np.zeros((X_train_0.shape[0], X_train_0.shape[1]*expansion_degree_0))
new_X_train_1 = np.zeros((X_train_1.shape[0], X_train_1.shape[1]*expansion_degree_1))
new_X_train_2 = np.zeros((X_train_2.shape[0], X_train_2.shape[1]*expansion_degree_2))
new_X_train_3 = np.zeros((X_train_3.shape[0], X_train_3.shape[1]*expansion_degree_3))
new_X_test_0 = np.zeros((X_test_0.shape[0], X_test_0.shape[1]*expansion_degree_0))
new_X_test_1 = np.zeros((X_test_1.shape[0], X_test_1.shape[1]*expansion_degree_1))
new_X_test_2 = np.zeros((X_test_2.shape[0], X_test_2.shape[1]*expansion_degree_2))
new_X_test_3 = np.zeros((X_test_3.shape[0], X_test_3.shape[1]*expansion_degree_3))

#replacing the original values in the first columns
new_X_train_0[:,0:X_train_0.shape[1]] = X_train_0
new_X_train_1[:,0:X_train_1.shape[1]] = X_train_1
new_X_train_2[:,0:X_train_2.shape[1]] = X_train_2
new_X_train_3[:,0:X_train_3.shape[1]] = X_train_3
new_X_test_0[:,0:X_test_0.shape[1]] = X_test_0
new_X_test_1[:,0:X_test_1.shape[1]] = X_test_1
new_X_test_2[:,0:X_test_2.shape[1]] = X_test_2
new_X_test_3[:,0:X_test_3.shape[1]] = X_test_3
                      
#placing the values in the array
count=0
for i in range(X_train_0.shape[1]):
    for j in range(2, expansion_degree_0+1):
        new_X_train_0[:,count+X_train_0.shape[1]] = X_train_0[:,i]**j
        new_X_test_0[:, count+X_test_0.shape[1]] = X_test_0[:,i]**j
        count += 1
        
count=0
for i in range(X_train_1.shape[1]):
    for j in range(2, expansion_degree_1+1):
        new_X_train_1[:,count+X_train_1.shape[1]] = X_train_1[:,i]**j
        new_X_test_1[:, count+X_test_1.shape[1]] = X_test_1[:,i]**j
        count += 1

count=0
for i in range(X_train_2.shape[1]):
    for j in range(2, expansion_degree_2+1):
        new_X_train_2[:,count+X_train_2.shape[1]] = X_train_2[:,i]**j
        new_X_test_2[:, count+X_test_2.shape[1]] = X_test_2[:,i]**j
        count += 1
        
count=0
for i in range(X_train_3.shape[1]):
    for j in range(2, expansion_degree_3+1):
        new_X_train_3[:,count+X_train_3.shape[1]] = X_train_3[:,i]**j
        new_X_test_3[:, count+X_test_3.shape[1]] = X_test_3[:,i]**j
        count += 1
        
        
X_train_0 = new_X_train_0
X_train_1 = new_X_train_1
X_train_2 = new_X_train_2
X_train_3 = new_X_train_3
X_test_0 = new_X_test_0
X_test_1 = new_X_test_1
X_test_2 = new_X_test_2
X_test_3 = new_X_test_3

## 1.5. Normalisation

In [12]:
mean_0 = np.mean(X_train_0,axis=0)
std_0 = np.std(X_train_0,axis=0)
X_train_0 = normalize(X_train_0,mean_0,std_0)
X_test_0 = normalize(X_test_0,mean_0,std_0)

mean_1 = np.mean(X_train_1,axis=0)
std_1 = np.std(X_train_1,axis=0)
X_train_1 = normalize(X_train_1,mean_1,std_1)
X_test_1 = normalize(X_test_1,mean_1,std_1)

mean_2 = np.mean(X_train_2,axis=0)
std_2 = np.std(X_train_2,axis=0)
X_train_2 = normalize(X_train_2,mean_2,std_2)
X_test_2 = normalize(X_test_2,mean_2,std_2)

mean_3 = np.mean(X_train_3,axis=0)
std_3 = np.std(X_train_3,axis=0)
X_train_3 = normalize(X_train_3,mean_3,std_3)
X_test_3 = normalize(X_test_3,mean_3,std_3)

## 1.6. K-Fold for cross validation

In [13]:
#splitting the train arrays in equal parts
k_fold_0=11
fold_x_0,fold_y_0 = k_fold_data(X_train_0,y_train_0,k_fold_0)

k_fold_1=8
fold_x_1,fold_y_1 = k_fold_data(X_train_1,y_train_1,k_fold_1)

k_fold_2=7
fold_x_2,fold_y_2 = k_fold_data(X_train_2,y_train_2,k_fold_2)

k_fold_3=6
fold_x_3,fold_y_3 = k_fold_data(X_train_3,y_train_3,k_fold_3)

#rearranging
xs_0 = np.zeros((k_fold_0,fold_x_0.shape[1]*(k_fold_0-1),X_train_0.shape[1])) 
ys_0 = np.zeros((fold_x_0.shape[1]*(k_fold_0-1),k_fold_0))
for i in range(k_fold_0):
    ys_0[:,i] = np.delete(fold_y_0,i,axis=1).reshape((fold_x_0.shape[1]*(k_fold_0-1),),order='F') 
    xs_0[i,:,:] = np.delete(fold_x_0,i,axis=0).reshape((fold_x_0.shape[1]*(k_fold_0-1),X_train_0.shape[1])) 

xs_1 = np.zeros((k_fold_1,fold_x_1.shape[1]*(k_fold_1-1),X_train_1.shape[1])) 
ys_1 = np.zeros((fold_x_1.shape[1]*(k_fold_1-1),k_fold_1))
for i in range(k_fold_1):
    ys_1[:,i] = np.delete(fold_y_1,i,axis=1).reshape((fold_x_1.shape[1]*(k_fold_1-1),),order='F')
    xs_1[i,:,:] = np.delete(fold_x_1,i,axis=0).reshape((fold_x_1.shape[1]*(k_fold_1-1),X_train_1.shape[1])) 

xs_2 = np.zeros((k_fold_2,fold_x_2.shape[1]*(k_fold_2-1),X_train_2.shape[1])) 
ys_2 = np.zeros((fold_x_2.shape[1]*(k_fold_2-1),k_fold_2))
for i in range(k_fold_2):
    ys_2[:,i] = np.delete(fold_y_2,i,axis=1).reshape((fold_x_2.shape[1]*(k_fold_2-1),),order='F')
    xs_2[i,:,:] = np.delete(fold_x_2,i,axis=0).reshape((fold_x_2.shape[1]*(k_fold_2-1),X_train_2.shape[1])) 

xs_3 = np.zeros((k_fold_3,fold_x_3.shape[1]*(k_fold_3-1),X_train_3.shape[1])) 
ys_3 = np.zeros((fold_x_3.shape[1]*(k_fold_3-1),k_fold_3))
for i in range(k_fold_3):
    ys_3[:,i] = np.delete(fold_y_3,i,axis=1).reshape((fold_x_3.shape[1]*(k_fold_3-1),),order='F')
    xs_3[i,:,:] = np.delete(fold_x_3,i,axis=0).reshape((fold_x_3.shape[1]*(k_fold_3-1),X_train_3.shape[1]))

# 2. Base Models

## Mean squared error gradient descent, mean squared error stochastic gradient descent, least squares and ridge regression

### On dataset 0

In [14]:
w_s=[]
losses=[]
initial_w=np.full((X_train_0.shape[1],1), 1e-16) 
max_iters=100
accs=[]
precs=[]
recs=[]
F1s=[]
batch_size = 1 
lambda_ = 1e-3
gamma = 1e-2

for i in range(k_fold_0):
    ys_i = ys_0[:,i].reshape((fold_x_0.shape[1]*(k_fold_0-1),1))
    xs_i = xs_0[i,:,:]

    #1) mean_squared_error_gd
    #gamma = 0.1 
    #w,loss = least_squares_GD(ys_i, xs_i, initial_w, max_iters, gamma)

    #2) mean_squared_error_sgd
    #gamma = 1e-3 
    #w,loss = least_squares_SGD(ys_i, xs_i,initial_w, max_iters, gamma)
        
    #3) least_squares
    #w,loss = least_squares(ys_i, xs_i)

    #4) ridge_regression
    #lambda =  0.001 | Accuracy moyenne : 0.7595 | D = 4
    w,loss = ridge_regression(ys_i, xs_i, lambda_)
    
    #5) logistic regression
    #w,loss = logistic_regression(y_train_0, X_train_0, initial_w, max_iters, gamma)

    #6) reg_logistic_regression
    #w,loss = reg_logistic_regression(y_train_0, X_train_0, lambda_ ,initial_w, max_iters, gamma) 

        
    losses.append(loss)
    
    val_x_i = fold_x_0[i,:,:]
        
    val_y_i = fold_y_0[:,i].reshape((fold_x_0.shape[1],1)) 

    y_pred=val_x_i.dot(w)
    y_pred = compute_sigmoid(y_pred)
    y_pred[y_pred>0.5] = 1
    y_pred[y_pred<=0.5] = 0

    TP = 0
    FP = 0
    TN = 0
    FN = 0
    for pred in range(len(y_pred)):
        if (y_pred[pred] == 1 and val_y_i[pred] == 1):
            TP+=1
        elif (y_pred[pred] == 1 and val_y_i[pred] == 0):
            FP+=1
        elif (y_pred[pred] == 0 and val_y_i[pred] == 1):
            FN+=1
        else :
            TN+=1
    acc = (TP+TN)/len(y_pred)
    prec = TP/(TP+FP)
    rec = TP/(TP+FN)
    F1score = 2*prec*rec/(prec+rec)

    if acc > 0.75 :
        w_s.append(w)

    accs.append(acc)
    precs.append(prec)
    recs.append(recs)
    F1s.append(F1score)
    conf_matrix = np.array([[TP, FP], [FN, TN]])
        
w_s = np.asarray(w_s)
print("lambda = ",lambda_,"| Accuracy moyenne :",np.mean(accs),"| Acc>0.75 :",w_s.shape[0])
print(accs)
w_avg_0 = np.mean(w_s,axis = 0)
print("Norme de w_0 :",np.linalg.norm(w_avg_0),"\n\n")

KernelInterrupted: Execution interrupted by the Jupyter kernel.

### On dataset 1

In [15]:
w_s=[]
losses=[]
initial_w=np.full((X_train_1.shape[1],1), 1e-16) 
max_iters=100
accs=[]
precs=[]
recs=[]
F1s=[]
batch_size = 1 
lambda_ = 1e-16
gamma = 1e-2

for i in range(k_fold_1):
    ys_i = ys_1[:,i].reshape((fold_x_1.shape[1]*(k_fold_1-1),1))
    xs_i = xs_1[i,:,:]

    #1) mean_squared_error_gd
    #gamma = 1e-5
    #w,loss = least_squares_GD(ys_i, xs_i, initial_w, max_iters, gamma)

    #2) mean_squared_error_sgd
    #gamma = 1e-3
    #w,loss = least_squares_SGD(ys_i, xs_i,initial_w, max_iters, gamma)
        
    #3) least_squares
    #w,loss = least_squares(ys_i, xs_i)

    #4) ridge_regression
    #lambda =  1e-10 | Accuracy moyenne : 0.72605 | D = 4
    w,loss = ridge_regression(ys_i, xs_i, lambda_)
    
    #5) logistic regression
    #w,loss = logistic_regression(y_train_0, X_train_0, initial_w, max_iters, gamma)

    #6) reg_logistic_regression
    #w,loss = reg_logistic_regression(y_train_0, X_train_0, lambda_ ,initial_w, max_iters, gamma)

        
    losses.append(loss)

    val_x_i = fold_x_1[i,:,:]
        
    val_y_i = fold_y_1[:,i].reshape((fold_x_1.shape[1],1)) 

    y_pred=val_x_i.dot(w)
    y_pred = compute_sigmoid(y_pred)
    y_pred[y_pred>0.5] = 1
    y_pred[y_pred<=0.5] = 0

    TP = 0
    FP = 0
    TN = 0
    FN = 0
    for pred in range(len(y_pred)):
        if (y_pred[pred] == 1 and val_y_i[pred] == 1):
            TP+=1
        elif (y_pred[pred] == 1 and val_y_i[pred] == 0):
            FP+=1
        elif (y_pred[pred] == 0 and val_y_i[pred] == 1):
            FN+=1
        else :
            TN+=1
    acc = (TP+TN)/len(y_pred)
    prec = TP/(TP+FP)
    rec = TP/(TP+FN)
    F1score = 2*prec*rec/(prec+rec)

    if acc > 0.72 :
        w_s.append(w)

    accs.append(acc)
    precs.append(prec)
    recs.append(recs)
    F1s.append(F1score)
    conf_matrix = np.array([[TP, FP], [FN, TN]])
        
w_s = np.asarray(w_s)
print("lambda = ",lambda_,"| Accuracy moyenne :",np.mean(accs),"| Acc>0.72 :",w_s.shape[0])
print(accs)
w_avg_1 = np.mean(w_s,axis = 0)
print("Norme de w_1 :",np.linalg.norm(w_avg_1),"\n\n")

lambda =  1e-16 | Accuracy moyenne : 0.7359950479727638 | Acc>0.72 : 7
[0.7365108841431961, 0.7368203858454555, 0.7339317032910347, 0.7425977509542969, 0.7439389249974209, 0.7408439079748272, 0.7137109254100897, 0.7396059011657897]
Norme de w_1 : 101065.50376765091 




### On dataset 2

In [16]:
w_s=[]
losses=[]
initial_w=np.full((X_train_2.shape[1],1), 1e-16) 
max_iters=100
accs=[]
precs=[]
recs=[]
F1s=[]
batch_size = 1 
lambda_ = 1e-10
gamma = 1e-2

for i in range(k_fold_2):
    ys_i = ys_2[:,i].reshape((fold_x_2.shape[1]*(k_fold_2-1),1))
    xs_i = xs_2[i,:,:]

    #1) mean_squared_error_gd
    #gamma = 0.1
    #w,loss = least_squares_GD(ys_i, xs_i, initial_w, max_iters, gamma)

    #2) mean_squared_error_sgd
    #gamma = 1e-3 
    #w,loss = least_squares_SGD(ys_i, xs_i,initial_w, max_iters, gamma)
        
    #3) least_squares
    #w,loss = least_squares(ys_i, xs_i)

    #4) ridge_regression
    #lambda =  1e-16 | Accuracy moyenne : 0.72605 | D = 4
    w,loss = ridge_regression(ys_i, xs_i, lambda_)
    
    #5) logistic regression
    #w,loss = logistic_regression(y_train_0, X_train_0, initial_w, max_iters, gamma)

    #6) reg_logistic_regression
    #w,loss = reg_logistic_regression(y_train_0, X_train_0, lambda_ ,initial_w, max_iters, gamma)

        
    losses.append(loss)

    val_x_i = fold_x_2[i,:,:]
        
    val_y_i = fold_y_2[:,i].reshape((fold_x_2.shape[1],1)) 

    y_pred=val_x_i.dot(w)
    y_pred = compute_sigmoid(y_pred)
    y_pred[y_pred>0.5] = 1
    y_pred[y_pred<=0.5] = 0

    TP = 0
    FP = 0
    TN = 0
    FN = 0
    for pred in range(len(y_pred)):
        if (y_pred[pred] == 1 and val_y_i[pred] == 1):
            TP+=1
        elif (y_pred[pred] == 1 and val_y_i[pred] == 0):
            FP+=1
        elif (y_pred[pred] == 0 and val_y_i[pred] == 1):
            FN+=1
        else :
            TN+=1
    acc = (TP+TN)/len(y_pred)
    prec = TP/(TP+FP)
    rec = TP/(TP+FN)
    F1score = 2*prec*rec/(prec+rec)

    if acc > 0.72 :
        w_s.append(w)

    accs.append(acc)
    precs.append(prec)
    recs.append(recs)
    F1s.append(F1score)
    conf_matrix = np.array([[TP, FP], [FN, TN]])
        
w_s = np.asarray(w_s)
print("lambda = ",lambda_,"| Accuracy moyenne :",np.mean(accs),"| Acc>0.72 :",w_s.shape[0])
print(accs)
w_avg_2 = np.mean(w_s,axis = 0)
print("Norme de w_2 :",np.linalg.norm(w_avg_2),"\n\n")

lambda =  1e-10 | Accuracy moyenne : 0.806129538101193 | Acc>0.72 : 7
[0.8086702792830346, 0.8072808114492149, 0.8081144921495067, 0.8089481728497985, 0.8078365985827428, 0.795331388078366, 0.806725024315687]
Norme de w_2 : 24.644642300643167 




### On dataset 3

In [17]:
w_s=[]
losses=[]
initial_w=np.full((X_train_3.shape[1],1), 1e-16) 
max_iters=100
accs=[]
precs=[]
recs=[]
F1s=[]
batch_size = 1 
lambda_ = 1e-16
gamma = 1e-2

for i in range(k_fold_3):
    ys_i = ys_3[:,i].reshape((fold_x_3.shape[1]*(k_fold_3-1),1))
    xs_i = xs_3[i,:,:]

    #1) mean_squared_error_gd
    #gamma = 1e-5 
    #w,loss = least_squares_GD(ys_i, xs_i, initial_w, max_iters, gamma)

    #2) mean_squared_error_sgd
    #gamma = 1e-3 
    #w,loss = least_squares_SGD(ys_i, xs_i,initial_w, max_iters, gamma)
        
    #3) least_squares
    #w,loss = least_squares(ys_i, xs_i)

    #4) ridge_regression
    #lambda =  1e-16 | Accuracy moyenne : 0.72605 | D = 4
    w,loss = ridge_regression(ys_i, xs_i, lambda_)
    
    #5) logistic regression
    #w,loss = logistic_regression(y_train_0, X_train_0, initial_w, max_iters, gamma)

    #6) reg_logistic_regression
    #w,loss = reg_logistic_regression(y_train_0, X_train_0, lambda_ ,initial_w, max_iters, gamma)

        
    losses.append(loss)

    val_x_i = fold_x_3[i,:,:]
        
    val_y_i = fold_y_3[:,i].reshape((fold_x_3.shape[1],1)) 

    y_pred=val_x_i.dot(w)
    y_pred = compute_sigmoid(y_pred)
    y_pred[y_pred>0.5] = 1
    y_pred[y_pred<=0.5] = 0

    TP = 0
    FP = 0
    TN = 0
    FN = 0
    for pred in range(len(y_pred)):
        if (y_pred[pred] == 1 and val_y_i[pred] == 1):
            TP+=1
        elif (y_pred[pred] == 1 and val_y_i[pred] == 0):
            FP+=1
        elif (y_pred[pred] == 0 and val_y_i[pred] == 1):
            FN+=1
        else :
            TN+=1
    acc = (TP+TN)/len(y_pred)
    prec = TP/(TP+FP)
    rec = TP/(TP+FN)
    F1score = 2*prec*rec/(prec+rec)

    if acc > 0.7 :
        w_s.append(w)

    accs.append(acc)
    precs.append(prec)
    recs.append(recs)
    F1s.append(F1score)
    conf_matrix = np.array([[TP, FP], [FN, TN]])
        
w_s = np.asarray(w_s)
print("lambda = ",lambda_,"| Accuracy moyenne :",np.mean(accs),"| Acc>0.7 :",w_s.shape[0])
print(accs)
w_avg_3 = np.mean(w_s,axis = 0)
print("Norme de w_3 :",np.linalg.norm(w_avg_3),"\n\n")

lambda =  1e-16 | Accuracy moyenne : 0.7095289658906334 | Acc>0.7 : 5
[0.7030319436924742, 0.7149431510557661, 0.7165674066053059, 0.7144017325392529, 0.6997834325933947, 0.7084461288576069]
Norme de w_3 : 163.16556669010563 




## 2.2 Predictions

In [18]:
y_test_0 = np.dot(X_test_0,w_avg_0)
y_test_0 = compute_sigmoid(y_test_0)
y_test_0[y_test_0>=0.5] = 1 
y_test_0[y_test_0<0.5] = -1 

y_test_1 = np.dot(X_test_1,w_avg_1)
y_test_1 = compute_sigmoid(y_test_1)
y_test_1[y_test_1>=0.5] = 1 
y_test_1[y_test_1<0.5] = -1 

y_test_2 = np.dot(X_test_2,w_avg_2)
y_test_2 = compute_sigmoid(y_test_2)
y_test_2[y_test_2>=0.5] = 1 
y_test_2[y_test_2<0.5] = -1 

y_test_3 = np.dot(X_test_3,w_avg_3)
y_test_3 = compute_sigmoid(y_test_3)
y_test_3[y_test_3>=0.5] = 1 
y_test_3[y_test_3<0.5] = -1 

## 2.3. Putting everything together and creating the csv

In [19]:
ID = np.concatenate((ID_test_0,ID_test_1,ID_test_2,ID_test_3),axis=0)
ID.shape = [568238,1]
y_test = np.concatenate((y_test_0,y_test_1,y_test_2,y_test_3),axis=0) 
y_to_sort = np.concatenate((ID,y_test),axis=1)
y_sorted = y_to_sort[y_to_sort[:, 0].argsort()]
y_to_submit = y_sorted[:,1]
#create_csv_submission(ID_test, y_to_submit, "method_1.csv")

# 3. Advanced Models

# Regularised logistic regression with binary cross-entropy loss

In [11]:
X_train_0 = X_train_0_bis
X_train_1 = X_train_1_bis
X_train_2 = X_train_2_bis
X_train_3 = X_train_3_bis
X_test_0 = X_test_0_bis
X_test_1 = X_test_1_bis
X_test_2 = X_test_2_bis
X_test_3 = X_test_3_bis

In [12]:
# degree
expansion_degree_0 = 5
expansion_degree_1 = 9 #à changer
expansion_degree_2 = 9 #4
expansion_degree_3 = 9

#creation of the new arrays
new_X_train_0 = np.zeros((X_train_0.shape[0], X_train_0.shape[1]*expansion_degree_0))
new_X_train_1 = np.zeros((X_train_1.shape[0], X_train_1.shape[1]*expansion_degree_1))
new_X_train_2 = np.zeros((X_train_2.shape[0], X_train_2.shape[1]*expansion_degree_2))
new_X_train_3 = np.zeros((X_train_3.shape[0], X_train_3.shape[1]*expansion_degree_3))
new_X_test_0 = np.zeros((X_test_0.shape[0], X_test_0.shape[1]*expansion_degree_0))
new_X_test_1 = np.zeros((X_test_1.shape[0], X_test_1.shape[1]*expansion_degree_1))
new_X_test_2 = np.zeros((X_test_2.shape[0], X_test_2.shape[1]*expansion_degree_2))
new_X_test_3 = np.zeros((X_test_3.shape[0], X_test_3.shape[1]*expansion_degree_3))

#replacing the original values in the first columns
new_X_train_0[:,0:X_train_0.shape[1]] = X_train_0
new_X_train_1[:,0:X_train_1.shape[1]] = X_train_1
new_X_train_2[:,0:X_train_2.shape[1]] = X_train_2
new_X_train_3[:,0:X_train_3.shape[1]] = X_train_3
new_X_test_0[:,0:X_test_0.shape[1]] = X_test_0
new_X_test_1[:,0:X_test_1.shape[1]] = X_test_1
new_X_test_2[:,0:X_test_2.shape[1]] = X_test_2
new_X_test_3[:,0:X_test_3.shape[1]] = X_test_3
                      
#placing the values in the array
count=0
for i in range(X_train_0.shape[1]):
    for j in range(2, expansion_degree_0+1):
        new_X_train_0[:,count+X_train_0.shape[1]] = X_train_0[:,i]**j
        new_X_test_0[:, count+X_test_0.shape[1]] = X_test_0[:,i]**j
        count += 1
        
count=0
for i in range(X_train_1.shape[1]):
    for j in range(2, expansion_degree_1+1):
        new_X_train_1[:,count+X_train_1.shape[1]] = X_train_1[:,i]**j
        new_X_test_1[:, count+X_test_1.shape[1]] = X_test_1[:,i]**j
        count += 1

count=0
for i in range(X_train_2.shape[1]):
    for j in range(2, expansion_degree_2+1):
        new_X_train_2[:,count+X_train_2.shape[1]] = X_train_2[:,i]**j
        new_X_test_2[:, count+X_test_2.shape[1]] = X_test_2[:,i]**j
        count += 1
        
count=0
for i in range(X_train_3.shape[1]):
    for j in range(2, expansion_degree_3+1):
        new_X_train_3[:,count+X_train_3.shape[1]] = X_train_3[:,i]**j
        new_X_test_3[:, count+X_test_3.shape[1]] = X_test_3[:,i]**j
        count += 1
        
        
X_train_0 = new_X_train_0
X_train_1 = new_X_train_1
X_train_2 = new_X_train_2
X_train_3 = new_X_train_3
X_test_0 = new_X_test_0
X_test_1 = new_X_test_1
X_test_2 = new_X_test_2
X_test_3 = new_X_test_3
X_train_0.shape

(99913, 90)

In [13]:
X_train_0, X_val_0, y_train_0, y_val_0 = separation_validation(X_train_0, y_train_0, 0.8)
X_train_1, X_val_1, y_train_1, y_val_1 = separation_validation(X_train_1, y_train_1, 0.8)
X_train_2, X_val_2, y_train_2, y_val_2 = separation_validation(X_train_2, y_train_2, 0.8)
X_train_3, X_val_3, y_train_3, y_val_3 = separation_validation(X_train_3, y_train_3, 0.8)

In [14]:
mean_0 = np.mean(X_train_0, axis=0)
mean_1 = np.mean(X_train_1, axis=0)
mean_2 = np.mean(X_train_2, axis=0)
mean_3 = np.mean(X_train_3, axis=0)
std_0 = np.std(X_train_0, axis=0)
std_1 = np.std(X_train_1, axis=0)
std_2 = np.std(X_train_2, axis=0)
std_3 = np.std(X_train_3, axis=0)
X_train_0 = normalize(X_train_0, mean_0, std_0)
X_train_1 = normalize(X_train_1, mean_1, std_1)
X_train_2 = normalize(X_train_2, mean_2, std_2)
X_train_3 = normalize(X_train_3, mean_3, std_3)
X_val_0 = normalize(X_val_0, mean_0, std_0)
X_val_1 = normalize(X_val_1, mean_1, std_1)
X_val_2 = normalize(X_val_2, mean_2, std_2)
X_val_3 = normalize(X_val_3, mean_3, std_3)
X_test_0 = normalize(X_test_0, mean_0, std_0)
X_test_1 = normalize(X_test_1, mean_1, std_1)
X_test_2 = normalize(X_test_2, mean_2, std_2)
X_test_3 = normalize(X_test_3, mean_3, std_3)

In [15]:
max_iters_0 = 403
lr0 = 0.2
loss_freq = 40
decay = 0.995
lambda_0 = 0.001

w_0, b_0, logger_0 = train_reg_logistic_regression(X_train_0, y_train_0, max_iters_0, lr0, loss_freq, decay, lambda_0)

Step size = 80
  sigmoid = 1/(1+np.exp(-x))
Loss at iter 0: 2.49802
Accuracy at iter 0:  0.5816214187413987
Learning rate : 0.20000
Loss at iter 40: 0.63817
Accuracy at iter 40:  0.746340547979482
Learning rate : 0.17294
Loss at iter 80: 0.50933
Accuracy at iter 80:  0.8131114725384712
Learning rate : 0.14152
Loss at iter 120: 0.48766
Accuracy at iter 120:  0.8216439384461404
Learning rate : 0.11581
Loss at iter 160: 0.47787
Accuracy at iter 160:  0.8250594269986238
Learning rate : 0.09477
Loss at iter 200: 0.47198
Accuracy at iter 200:  0.8271362442136869
Learning rate : 0.07755
Loss at iter 240: 0.46795
Accuracy at iter 240:  0.8284248717627924
Learning rate : 0.06346
Loss at iter 280: 0.46504
Accuracy at iter 280:  0.8290754410108845
Learning rate : 0.05193
Loss at iter 320: 0.46287
Accuracy at iter 320:  0.8298636306768422
Learning rate : 0.04250
Loss at iter 360: 0.46122
Accuracy at iter 360:  0.830038783935944
Learning rate : 0.03478
Loss at iter 400: 0.45994
Accuracy at iter 400

In [16]:
y_pred_0 = logistic_output(X_val_0, w_0, b_0)
y_pred_0[y_pred_0>0.5] = 1
y_pred_0[y_pred_0<=0.5] = 0
TP = 0
FP = 0
TN = 0
FN = 0
for pred in range(len(y_val_0)):
    if (y_pred_0[pred] == 1) & (y_val_0[pred] == 1):
        TP+=1
    elif (y_pred_0[pred] == 1) & (y_val_0[pred] == 0):
        FP+=1
    elif (y_pred_0[pred] == 0) & (y_val_0[pred] == 1):
        FN+=1
    else :
        TN+=1
acc = (TP+TN)/len(y_pred_0)
prec = TP/(TP+FP)
rec = TP/(TP+FN)
F1score = 2*prec*rec/(prec+rec)
print("Validation accuracy : ", acc*100, "%, precision = ", prec, "recall = ", rec, "F1-Score = ", F1score)

Validation accuracy :  82.89546114197067 %, precision =  0.7128125805620005 recall =  0.5454724797790491 F1-Score =  0.6180151989271345


In [17]:
y_test_0 = logistic_output(X_test_0, w_0, b_0)
y_test_0[y_test_0>0.5] = 1
y_test_0[y_test_0<=0.5] = -1

In [18]:
max_iters_1 = 403
lr1 = 0.8
loss_freq = 40
decay = 0.9999
lambda_1 = 0.0001

w_1, b_1, logger_1 = train_reg_logistic_regression(X_train_1, y_train_1, max_iters_1, lr1, loss_freq, decay, lambda_1)

Step size = 80
Loss at iter 0: 4.13413
Accuracy at iter 0:  0.544047714999597
Learning rate : 0.80000
Loss at iter 40: 0.59721
Accuracy at iter 40:  0.7327476424599016
Learning rate : 0.79768
Loss at iter 80: 0.53650
Accuracy at iter 80:  0.7524784395905537
Learning rate : 0.79450
Loss at iter 120: 0.51939
Accuracy at iter 120:  0.7599903280406223
Learning rate : 0.79133
Loss at iter 160: 0.51024
Accuracy at iter 160:  0.7633110340936569
Learning rate : 0.78817
Loss at iter 200: 0.50411
Accuracy at iter 200:  0.7662287418392842
Learning rate : 0.78502
Loss at iter 240: 0.49911
Accuracy at iter 240:  0.7688724107358749
Learning rate : 0.78189
Loss at iter 280: 0.49532
Accuracy at iter 280:  0.7706778431530588
Learning rate : 0.77877
Loss at iter 320: 0.49231
Accuracy at iter 320:  0.7719190779398727
Learning rate : 0.77566
Loss at iter 360: 0.48985
Accuracy at iter 360:  0.773128072862094
Learning rate : 0.77256
Loss at iter 400: 0.48773
Accuracy at iter 400:  0.774320947852019
Learning

In [19]:
y_pred_1 = logistic_output(X_val_1, w_1, b_1)
y_pred_1[y_pred_1>0.5] = 1
y_pred_1[y_pred_1<=0.5] = 0
TP = 0
FP = 0
TN = 0
FN = 0
for pred in range(len(y_val_1)):
    if (y_pred_1[pred] == 1) & (y_val_1[pred] == 1):
        TP+=1
    elif (y_pred_1[pred] == 1) & (y_val_1[pred] == 0):
        FP+=1
    elif (y_pred_1[pred] == 0) & (y_val_1[pred] == 1):
        FN+=1
    else :
        TN+=1
acc = (TP+TN)/len(y_pred_1)
prec = TP/(TP+FP)
rec = TP/(TP+FN)
F1score = 2*prec*rec/(prec+rec)
print("Validation accuracy : ", acc*100, "%, precision = ", prec, "recall = ", rec, "F1-Score = ", F1score)

Validation accuracy :  77.59365529692437 %, precision =  0.7151834583417799 recall =  0.6302250803858521 F1-Score =  0.6700218402810749


In [20]:
y_test_1 = logistic_output(X_test_1, w_1, b_1)
y_test_1[y_test_1>0.5] = 1
y_test_1[y_test_1<=0.5] = -1

In [21]:
max_iters_2 = 403
lr2 = 0.55
loss_freq = 40
decay = 0.999
lambda_2 = 0.0001

w_2, b_2, logger_2 = train_reg_logistic_regression(X_train_2, y_train_2, max_iters_2, lr2, loss_freq, decay, lambda_2)

Step size = 80
Loss at iter 0: 4.56593
Accuracy at iter 0:  0.513187603900454
Learning rate : 0.55000
Loss at iter 40: 0.75417
Accuracy at iter 40:  0.7036945140560256
Learning rate : 0.53427
Loss at iter 80: 0.59318
Accuracy at iter 80:  0.7416073245167853
Learning rate : 0.51331
Loss at iter 120: 0.54950
Accuracy at iter 120:  0.7566930501451505
Learning rate : 0.49317
Loss at iter 160: 0.53045
Accuracy at iter 160:  0.7634667394486763
Learning rate : 0.47383
Loss at iter 200: 0.51855
Accuracy at iter 200:  0.7679329082202317
Learning rate : 0.45524
Loss at iter 240: 0.51053
Accuracy at iter 240:  0.7707862938242811
Learning rate : 0.43738
Loss at iter 280: 0.50473
Accuracy at iter 280:  0.7737637396719848
Learning rate : 0.42022
Loss at iter 320: 0.50017
Accuracy at iter 320:  0.7755998312780686
Learning rate : 0.40374
Loss at iter 360: 0.49641
Accuracy at iter 360:  0.7778081036151155
Learning rate : 0.38790
Loss at iter 400: 0.49318
Accuracy at iter 400:  0.7791479542465821
Learni

In [22]:
y_pred_2 = logistic_output(X_val_2, w_2, b_2)
y_pred_2[y_pred_2>0.5] = 1
y_pred_2[y_pred_2<=0.5] = 0
TP = 0
FP = 0
TN = 0
FN = 0
for pred in range(len(y_val_2)):
    if (y_pred_2[pred] == 1) & (y_val_2[pred] == 1):
        TP+=1
    elif (y_pred_2[pred] == 1) & (y_val_2[pred] == 0):
        FP+=1
    elif (y_pred_2[pred] == 0) & (y_val_2[pred] == 1):
        FN+=1
    else :
        TN+=1
acc = (TP+TN)/len(y_pred_2)
prec = TP/(TP+FP)
rec = TP/(TP+FN)
F1score = 2*prec*rec/(prec+rec)
print("Validation accuracy : ", acc*100, "%, precision = ", prec, "recall = ", rec, "F1-Score = ", F1score)

Validation accuracy :  78.19571258435887 %, precision =  0.7869008974603781 recall =  0.7921953094963475 F1-Score =  0.7895392278953923


In [23]:
y_test_2 = logistic_output(X_test_2, w_2, b_2)
y_test_2[y_test_2>0.5] = 1
y_test_2[y_test_2<=0.5] = -1

In [24]:
max_iters_3 = 403
lr3 = 0.8
loss_freq = 40
decay = 0.9999
lambda_3 = 0.001

w_3, b_3, logger_3 = train_reg_logistic_regression(X_train_3, y_train_3, max_iters_3, lr3, loss_freq, decay, lambda_3)

Step size = 80
Loss at iter 0: 4.73206
Accuracy at iter 0:  0.5045400710619818
Learning rate : 0.80000
Loss at iter 40: 0.83146
Accuracy at iter 40:  0.7114657943714399
Learning rate : 0.79768
Loss at iter 80: 0.70568
Accuracy at iter 80:  0.7345327392702047
Learning rate : 0.79450
Loss at iter 120: 0.64704
Accuracy at iter 120:  0.7477863628672946
Learning rate : 0.79133
Loss at iter 160: 0.60926
Accuracy at iter 160:  0.7568665049912583
Learning rate : 0.78817
Loss at iter 200: 0.58425
Accuracy at iter 200:  0.7637471095820879
Learning rate : 0.78502
Loss at iter 240: 0.56590
Accuracy at iter 240:  0.7669054198860752
Learning rate : 0.78189
Loss at iter 280: 0.55162
Accuracy at iter 280:  0.7687665670294964
Learning rate : 0.77877
Loss at iter 320: 0.54059
Accuracy at iter 320:  0.7704585189780611
Learning rate : 0.77566
Loss at iter 360: 0.53243
Accuracy at iter 360:  0.7725452597146241
Learning rate : 0.77256
Loss at iter 400: 0.52520
Accuracy at iter 400:  0.7735604308837629
Learn

In [25]:
y_pred_3 = logistic_output(X_val_3, w_3, b_3)
y_pred_3[y_pred_3>0.5] = 1
y_pred_3[y_pred_3<=0.5] = 0
TP = 0
FP = 0
TN = 0
FN = 0
for pred in range(len(y_val_3)):
    if (y_pred_3[pred] == 1) & (y_val_3[pred] == 1):
        TP+=1
    elif (y_pred_3[pred] == 1) & (y_val_3[pred] == 0):
        FP+=1
    elif (y_pred_3[pred] == 0) & (y_val_3[pred] == 1):
        FN+=1
    else :
        TN+=1
acc = (TP+TN)/len(y_pred_3)
prec = TP/(TP+FP)
rec = TP/(TP+FN)
F1score = 2*prec*rec/(prec+rec)
print("Validation accuracy : ", acc*100, "%, precision = ", prec, "recall = ", rec, "F1-Score = ", F1score)

Validation accuracy :  77.64493570945183 %, precision =  0.7061611374407583 recall =  0.44510828976848393 F1-Score =  0.5460375629867155


In [26]:
y_test_3 = logistic_output(X_test_3, w_3, b_3)
y_test_3[y_test_3>0.5] = 1
y_test_3[y_test_3<=0.5] = -1

In [27]:
ID = np.concatenate((ID_test_0,ID_test_1,ID_test_2,ID_test_3),axis=0)
ID.shape = [568238,1]
y_test = np.concatenate((y_test_0,y_test_1,y_test_2,y_test_3),axis=0) 
y_test.shape = [568238,1]
y_to_sort = np.concatenate((ID,y_test),axis=1)
y_sorted = y_to_sort[y_to_sort[:, 0].argsort()]
y_to_submit = y_sorted[:,1]

In [28]:
count =0
for i in range(len(y_to_submit)) :
    if y_to_submit[i] == 1:
        count+=1
print(count*100/len(y_to_submit),"% de valeur à 1") #34,27% dans y_train

29.830810329474623 % de valeur à 1


In [29]:
create_csv_submission(ID_test, y_to_submit, "test_final2.csv")

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=a9cac0cd-fc8e-4c76-a346-a5de9c0344c9' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>