## Classification ML project

In [None]:
# Render the pipeline diagram stored next to the notebook.
# SVG is imported from the public `IPython.display` module rather than the
# internal `IPython.core.display` path.
from IPython.display import SVG
SVG(filename='pipeline.svg')

## Loading the Data:

In [None]:
import logistic as log
import split as spl
import least_squares as lst
import helpers as hlp
import pre_processing as pre

In [None]:
# Locations of the training and test CSV files, relative to this notebook.
DATA_TRAIN_PATH = '../data/train.csv' # TODO: download the train data and supply its path here
DATA_TEST_PATH = '../data/test.csv'

In [None]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2
import pandas as pd # cannot use external libraries, just pandas for data exploration

In [None]:
# Load both CSV files through the project helper. Each call returns three
# values, unpacked here as y, tX and ids (presumably labels, feature matrix
# and sample ids -- confirm in helpers.py). The first value returned for the
# test file is discarded.
y, tX, ids = hlp.load_csv_data(DATA_TRAIN_PATH)
_, tX_test, ids_test = hlp.load_csv_data(DATA_TEST_PATH)

In [None]:
# Dimensions of the training matrix and of the label vector, one per line.
print(tX.shape, y.shape, sep='\n')

## Data exploration

In [None]:
# Re-read the raw CSV files with pandas, for exploration only.
data = pd.read_csv(DATA_TRAIN_PATH)
test_data = pd.read_csv(DATA_TEST_PATH)

# Encode the textual labels numerically: 's' -> 1, 'b' -> -1.
# Series.map turns every value that is not a key of `dic` into NaN.
dic = {'s':1,'b':-1}

# Use bracket indexing for column assignment: attribute-style assignment
# (`data.Prediction = ...`) is discouraged by pandas because it silently
# creates a plain attribute when the column does not exist.
data['Prediction'] = data['Prediction'].map(dic)
test_data['Prediction'] = test_data['Prediction'].map(dic)
data.head(10)

In [None]:
# Flag every row that contains the value -999 in at least one column, then
# show how many rows are flagged next to the total number of rows.
mask = data.isin([-999]).any(axis='columns')
print(int(mask.sum()))
print(data.shape[0])

_The vast majority of our data has -999 values: we'd better handle it carefully_

In [None]:
# Disabled: would replace every -999 in `data` by NaN, in place.
#data.replace(to_replace = -999,value = np.nan, inplace = True)

In [None]:
# Disabled: would replace every -999 in `tX` by NaN. The Data Cleaning loop
# further down performs this replacement per group via pre.clean_value instead.
#replace_val = np.nan
#tX = np.where(tX == -999,replace_val,tX)

In [None]:
# Per-feature standard deviation and mean of the training matrix.
# -999 is the sentinel this notebook later replaces by NaN before
# standardising. The cells that would have done that replacement on `tX` are
# commented out, so mask the sentinel here; otherwise the nan-aware
# statistics would count -999 as a real measurement.
tX_masked = np.where(tX == -999, np.nan, tX)
std = np.nanstd(tX_masked, axis=0)
mean = np.nanmean(tX_masked, axis=0)

In [None]:
# Report the dimensions of the matrices that are actually fed to the models.
# The test line previously printed the *training* feature count and took its
# row count from the pandas frame; both numbers now come from `tX_test`.
print('Train set size: {} samples x {} features'.format(tX.shape[0], tX.shape[1]))
print('Test set size: {} samples x {} features'.format(tX_test.shape[0], tX_test.shape[1]))

In [None]:
# Column names, dtypes and non-null counts of the training frame.
data.info()

In [None]:
# Summary statistics of the numeric columns. The -999 entries have not been
# replaced in `data`, so they are included in these figures.
data.describe()

## Data Cleaning

In [None]:
# To trash before submitting: pandas is used only to find which column index
# PRI_jet_num corresponds to.
# NOTE(review): this is the position among the CSV columns of `data`; the
# split cells pass 22 for the feature matrix -- presumably the same column
# once the non-feature columns are dropped; confirm.
np.where(data.columns.values == "PRI_jet_num")

In [None]:
# Split the train and test matrices into groups according to column 22.
# Presumably this is PRI_jet_num, the column looked up in the previous cell;
# confirm. The cleaning loop below unpacks each element of `data_trains` as
# (features, labels) and each element of `data_tests` as (features, _).
# NOTE(review): the meaning of `split=False` is defined in split.py and is
# not visible here (another cell of this notebook calls it with split=True).
data_trains = spl.split_categorical_data(tX,22,labels = y,split = False)
data_tests = spl.split_categorical_data(tX_test,22,split = False)

In [None]:
# Clean and standardise every (train, test) group pair produced by the split
# above, collecting the results in `clean_data_trains` / `clean_data_tests`.

# Placeholders only: both names are reassigned inside the loop.
mean = 0
stdev = 0
clean_data_trains = []
clean_data_tests = []
# NOTE(review): x_train, y_train and x_test stay bound to the LAST group once
# the loop ends, and later cells read them (e.g. `x_train.shape[1]`).
for (x_train,y_train),(x_test,_) in zip(data_trains,data_tests):
    # Presumably drops the zero-variance columns from both matrices -- see
    # pre_processing.clean_variance to confirm.
    x_train,x_test = pre.clean_variance(x_train,x_test)
    
    # Turn the -999 sentinel into NaN before computing statistics.
    x_train = pre.clean_value(x_train,-999,np.nan)
    x_test = pre.clean_value(x_test,-999,np.nan)
    
    # Standardise the train set, then reuse the returned mean / stdev for the
    # test set so both are scaled with the training statistics.
    x_train,mean,stdev =  pre.standardize_data(x_train)
    x_test,_,_ = pre.standardize_data(x_test, mean,stdev)
    
    # Replace the remaining NaN entries by 0.
    # NOTE(review): relies on clean_value matching NaN correctly (a plain
    # `== np.nan` comparison never matches) -- confirm in pre_processing.py.
    x_train = pre.clean_value(x_train,np.nan,0,inplace = True)
    x_test = pre.clean_value(x_test,np.nan,0,inplace = True)
    
    # Prepend a column of ones to each matrix (constant / intercept term).
    x_train = np.c_[np.ones((x_train.shape[0], 1)), x_train]
    x_test = np.c_[np.ones((x_test.shape[0], 1)), x_test]
    
    # The test groups carry no labels, hence the None placeholder.
    clean_data_trains.append((x_train,y_train))
    clean_data_tests.append((x_test,None))

## Data Processing

_We now need to standardize the model functions so that they all take the same parameters (`y`, `x`) as inputs_

In [None]:
# Hyper-parameters shared by the models below.
maxiters = 100
gamma = 0.01
# NOTE(review): `lambda_` was used by methods 2 and 3 without being defined
# anywhere in this notebook (NameError on first call). 1e-3 is only a
# placeholder so the cell runs -- tune it, e.g. by cross-validation.
lambda_ = 1e-3

# Kept so that code relying on a global `init_w` keeps working. It is sized
# for whatever `x_train` currently is (the last group left by the cleaning
# loop), so the models below draw their own start vector per call instead.
init_w = np.random.rand(x_train.shape[1])


def _initial_weights(x):
    """Return a random start vector with one weight per column of `x`."""
    return np.random.rand(x.shape[1])


# Every model takes (y, x) and trains on exactly the data it is given. The
# previous lambdas ignored their arguments and always trained on the global
# `y_train` / `x_train`, so cross-validation never fitted on the folds.

#method 1
def log_reg(y, x):
    """Logistic regression on (y, x); returns what log.logistic_regression returns."""
    return log.logistic_regression(y, x, _initial_weights(x), maxiters, gamma)


#method 2
def reg_log_reg(y, x):
    """Regularised logistic regression on (y, x) with strength `lambda_`."""
    return log.reg_logistic_regression(y, x, lambda_, _initial_weights(x), maxiters, gamma)


#method 3
def ridge(y, x):
    """Ridge regression on (y, x) with strength `lambda_`."""
    return lst.ridge_regression(y, x, lambda_)


# The evaluation and submission cells pair groups with methods through zip(),
# so provide one method per data group; a single entry would make them handle
# the first group only.
methods = [log_reg] * len(clean_data_trains)

_At this point we try the different models defined in the cell above: to do so run the cell below, and check the obtained accuracies._

In [None]:
# Cross-validate the model assigned to each data group and collect the mean
# and standard deviation of the accuracies returned by spl.k_fold_cv.
accuracies_group_means = []
accuracies_group_stds = []

# zip() stops at the shortest input, so a `methods` list shorter than the
# number of groups used to skip the remaining groups silently. Reuse the last
# method for the groups that have none (no-op when the lengths already match,
# and still an empty loop when `methods` is empty).
group_methods = list(methods) + list(methods[-1:]) * (len(clean_data_trains) - len(methods))

# NOTE(review): x_train / y_train are rebound here and stay bound to the last
# group once the loop ends.
for round_,((x_train,y_train),meth) in enumerate(zip(clean_data_trains,group_methods)):
    print("#################################")
    print("**********treating the {i}th group of data:**************".format(i = round_+1))
    # The third argument (2) is forwarded to k_fold_cv as-is -- presumably
    # the number of folds; confirm in split.py.
    acc_mean, acc_std = spl.k_fold_cv(y_train,x_train,2,meth)
    accuracies_group_means.append(acc_mean)
    accuracies_group_stds.append(acc_std)
print("done! Obtained accuracies:",accuracies_group_means)

## Submission

_We now predict the labels of the test data with the models defined two cells above..._

In [None]:
# Train one model per data group on that group's OWN training data, predict
# the matching test group and concatenate the predicted labels.
y_submit = []

# zip() stops at the shortest input: reuse the last method for the groups
# that have none, so that every test group receives predictions.
group_methods = list(methods) + list(methods[-1:]) * (len(clean_data_tests) - len(methods))

# Previously each model was fitted on whatever `x_train` / `y_train` were left
# over from an earlier loop (the last group), not on the group being predicted.
for (x_train,y_train),(x_test,y_test),meth in zip(clean_data_trains,clean_data_tests,group_methods):
    w_fin,loss = meth(y_train,x_train)
    y_test = x_test @ w_fin
    print(y_test)
    # Scores below 0.5 map to -1.0, everything else to 1.0 (the original
    # `-1-0` was a typo for -1.0).
    # NOTE(review): the threshold is applied to the raw linear score; whether
    # 0.5 is the right cut-off depends on the label encoding used for
    # training -- confirm.
    y_test = np.where(y_test < 0.5, -1.0, 1.0)
    y_submit.append(y_test)

# NOTE(review): predictions are stacked group by group; check that this order
# matches `ids_test` before writing the submission file.
y_submit = np.concatenate(y_submit,axis = 0)

_And finally save the results to csv._

In [None]:
# Write the test ids and their predicted labels to "obiwan.csv".
# NOTE(review): assumes `y_submit` is ordered like `ids_test` -- confirm,
# since the predictions were assembled group by group.
hlp.create_csv_submission(ids_test,y_submit,"obiwan.csv")

## Gianni's corner
_desole mec, javais cru que tu modifierais mon code: comme la cellule avait pas changé, javais pensé que tavais pas modifié mon code dans le main, seulement les fctions ==> jai pas pensé a scroller plus bas :/ jy ai quand meme laissé dans le doute_

#### Remove zero variance

In [None]:
#G
# Same split on column 22 as in the Data Cleaning section, but with
# split=True. The cells below index the train result as x_trains[i][0]
# (features) and x_trains[i][1] (labels), and the test result as x_tests[i].
# NOTE(review): what `split=True` changes is defined in split.py -- confirm.
x_trains = spl.split_categorical_data(tX,22,labels = y,split = True)
x_tests = spl.split_categorical_data(tX_test,22,split = True)

In [None]:
#G
# Sanity check: rows x columns of each category's training matrix, followed
# by the total number of rows over the three categories.
category_templates = (
    'Shape of category 0: {} x {}',
    'Shape of category 1: {} x {}',
    'Shape of category 2 and 3: {} x {}',
)
for category, template in enumerate(category_templates):
    print(template.format(*x_trains[category][0].shape))

print('Total: {}'.format(sum(x_trains[category][0].shape[0] for category in range(3))))

In [None]:
#G
# Run pre.clean_variance on every (train, test) pair of groups and keep the
# returned matrices; the training labels are carried along unchanged.
reduced_x_trains, reduced_x_tests = [], []
for i, train_group in enumerate(x_trains):
    tr, te = train_group[0], x_tests[i]

    x_tr, x_te = pre.clean_variance(tr, te, inplace=False)
    # train_group[1] holds the labels of this group
    reduced_x_trains.append((x_tr, train_group[1]))
    reduced_x_tests.append(x_te)

In [None]:
#G
# Sanity check: rows x columns of the reduced train and test matrices,
# category by category (train first, then test).
reduced_templates = (
    ('Shape of category 0 (train): {} x {}', 'Shape of category 0 (test): {} x {}'),
    ('Shape of category 1 (train): {} x {}', 'Shape of category 1 (test): {} x {}'),
    ('Shape of category 2 and 3 (train): {} x {}', 'Shape of category 2 and 3 (test): {} x {}'),
)
for category, (train_template, test_template) in enumerate(reduced_templates):
    print(train_template.format(*reduced_x_trains[category][0].shape))
    print(test_template.format(*reduced_x_tests[category].shape))


In [None]:
#G
# we want to remove the zero-variance columns in the train AND in the test set
# we don't care about labels here
#for (tr, te) in zip(x_trains[0], x_test[0]):
#    blablabla

## Data Processing

#### Data standardization

In [None]:
#G
# Standardise category 0: the train matrix first, then the test matrix using
# the mu_0 / sigma_0 returned for the train matrix.
# NOTE(review): the second call rebinds mu_0 and sigma_0 to whatever
# standardize_data returns -- presumably the values passed in; confirm.
std_train_0, mu_0, sigma_0 = pre.standardize_data(reduced_x_trains[0][0])
std_test_0, mu_0, sigma_0 = pre.standardize_data(reduced_x_tests[0], mu_0, sigma_0)

In [None]:
#G
# Sanity check: rows x columns of the standardised category-0 matrices.
print('Shape of category 0 (std train): {} x {}'.format(*std_train_0.shape))
print('Shape of category 0 (std test): {} x {}'.format(*std_test_0.shape))

## Put your useful trash here

In [None]:
# Scratch example: a small seeded frame of integers drawn from [-1002, -995),
# displayed with every -999 swapped for NaN (`df` itself is left untouched).
np.random.seed(2)
df = pd.DataFrame(
    np.random.randint(low=-1002, high=-995, size=(3, 4)),
    columns=list('ABCD'),
)
df.replace(-999, np.nan)

In [None]:
#Yann
# Leftover submission snippet.
# NOTE(review): this cell cannot run as written. `weights` is never defined in
# this notebook, and `predict_labels` / `create_csv_submission` are used as
# bare names although only the module alias `hlp` is imported (the Submission
# section calls `hlp.create_csv_submission`).
OUTPUT_PATH = '' # TODO: fill in desired name of output file for submission
y_pred = predict_labels(weights, tX_test)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)