## Classification ML project

In [None]:
# Display the pipeline overview diagram stored next to the data files.
# SVG is imported from the public `IPython.display` namespace instead of the
# internal `IPython.core.display` module.
from IPython.display import SVG
SVG(filename='../data/pipeline.svg')

## Loading the Data:

In [None]:
import logistic as log
import split as spl
import least_squares as lst
import helpers as hlp
import pre_processing as pre
import vizu as viz

In [None]:
# Relative locations of the challenge CSV files; download the data sets and
# place them at these paths before running the notebook.
DATA_TRAIN_PATH, DATA_TEST_PATH = '../data/train.csv', '../data/test.csv'

In [None]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2
import pandas as pd # cannot use external libraries, just pandas for data exploration

In [None]:
# Load both CSV files through the project helper. Each call is unpacked into
# three values; for the test file the first value is discarded.
# NOTE(review): presumably (labels, feature matrix, sample ids) - confirm
# against helpers.load_csv_data.
y, tX, ids = hlp.load_csv_data(DATA_TRAIN_PATH)
_, tX_test, ids_test = hlp.load_csv_data(DATA_TEST_PATH)

In [None]:
# Sanity check: dimensions of the feature matrix, then of the label vector,
# one per line.
print(tX.shape, y.shape, sep='\n')

## Data exploration

In [None]:
# Re-read both CSV files with pandas for exploration only, and turn the
# 's' / 'b' values of the Prediction column into +1 / -1.
data = pd.read_csv(DATA_TRAIN_PATH)
test_data = pd.read_csv(DATA_TEST_PATH)
dic = {'s':1,'b':-1}
for frame in (data, test_data):
    # Values absent from `dic` become NaN after the mapping.
    frame['Prediction'] = frame['Prediction'].map(dic)
data.head(10)

In [None]:
# Flag every row holding the -999 sentinel in at least one column, then print
# the number of flagged rows followed by the total number of rows.
mask = data.isin([-999]).any(axis='columns')
print(int(mask.sum()), len(data), sep='\n')

_The vast majority of our samples contain at least one -999 value: we'd better handle them carefully._

In [None]:
# Disabled experiment: would turn the -999 sentinel into NaN directly in the
# pandas frame. The feature matrices get that replacement later, inside the
# cleaning loop (pre.clean_value).
#data.replace(to_replace = -999,value = np.nan, inplace = True)

In [None]:
# Disabled experiment: same sentinel -> NaN replacement, applied to the whole
# numpy feature matrix at once instead of per group.
#replace_val = np.nan
#tX = np.where(tX == -999,replace_val,tX)

In [None]:
# Per-feature spread and average of the training matrix.
# The cells that would replace the -999 sentinel by NaN are disabled, so tX
# still holds -999 here and the NaN-aware reducers would not skip anything.
# The sentinel is therefore masked on a local copy (tX itself is untouched).
tX_masked = np.where(tX == -999, np.nan, tX)
std = np.nanstd(tX_masked, axis=0)
mean = np.nanmean(tX_masked, axis=0)

In [None]:
# Report the dimensions of both feature matrices. The test line reads its own
# matrix (tX_test) instead of borrowing the feature count of the training set.
print('Train set size: {} samples x {} features'.format(tX.shape[0], tX.shape[1]))
print('Test set size: {} samples x {} features'.format(tX_test.shape[0], tX_test.shape[1]))

In [None]:
# Column dtypes and non-null counts of the training frame.
data.info()

In [None]:
# Summary statistics of the training frame. The -999 sentinel has not been
# replaced in `data` (the replacement cell above is disabled), so it is
# included in these statistics.
data.describe()

## Data Cleaning

In [None]:
#totrash before submit: we use pandas to know to which index PRI_jet_num does correspond.
# NOTE(review): this is the position among the columns of `data`, which also
# contains the Prediction column (and presumably an id column); the index used
# on tX in the split cell (22) may therefore be offset from the value shown
# here - verify.
np.where(data.columns.values == "PRI_jet_num")

In [None]:
# Split both feature matrices on the categorical column below (presumably
# PRI_jet_num, per the lookup cell above). The later loops unpack each train
# group as (features, labels) and each test group as (features, indices).
JET_NUM_COLUMN = 22
data_trains = spl.split_categorical_data(tX, JET_NUM_COLUMN, labels=y, split=True)
data_tests = spl.split_categorical_data(tX_test, JET_NUM_COLUMN, split=True)

In [None]:
# Per-group preprocessing. For each (train, test) pair produced by the split:
#   1. pre.clean_variance on both matrices,
#   2. -999 sentinel -> NaN (pre.clean_value),
#   3. standardization, the test matrix reusing the train mean / stdev,
#   4. NaN -> 0 (pre.clean_value),
#   5. polynomial expansion with the degree configured for the group,
#   6. a leading column of ones.
mean = 0
stdev = 0
clean_data_trains = []
clean_data_tests = []
degre_polys = [12,12,13]  # polynomial degree per group, in split order
for i,((x_train,y_train),(x_test,test_indx)) in enumerate(zip(data_trains,data_tests)):
    # Same exception type as plain indexing, but with an actionable message.
    if i >= len(degre_polys):
        raise IndexError("no polynomial degree configured in degre_polys for group {}".format(i))

    x_train,x_test = pre.clean_variance(x_train,x_test)

    x_train = pre.clean_value(x_train,-999,np.nan)
    x_test = pre.clean_value(x_test,-999,np.nan)

    # Disabled PCA experiment, kept for reference (previously parked in a bare
    # string literal that was evaluated on every iteration):
    #   pre.PCA_visualize(x_train,label = i)
    #   mean,eigvecs,eigvals = pre.get_PCA(x_train)
    #   x_test = x_test - mean
    #   x_train = pre.reduce_PCA(eigvecs,x_train,10)
    #   x_test = pre.reduce_PCA(eigvecs,x_test,10)

    # The statistics returned for the train matrix are passed on for the test matrix.
    x_train,mean,stdev =  pre.standardize_data(x_train)
    x_test,_,_ = pre.standardize_data(x_test, mean,stdev)

    # NOTE(review): NaN never compares equal to itself, so this relies on
    # pre.clean_value treating a NaN target specially - confirm in the helper.
    x_train = pre.clean_value(x_train,np.nan,0,inplace = True)
    x_test = pre.clean_value(x_test,np.nan,0,inplace = True)

    x_train = pre.build_poly(x_train, degre_polys[i])
    x_test = pre.build_poly(x_test,degre_polys[i])
    # Prepend a constant column of ones to both matrices.
    x_train = np.c_[np.ones((x_train.shape[0], 1)), x_train]
    x_test = np.c_[np.ones((x_test.shape[0], 1)), x_test]

    clean_data_trains.append((x_train,y_train))
    clean_data_tests.append((x_test,test_indx))

- Ridge regression with optimization of the lambdas

## Data Processing

_We now need to standardize the functions so that they all take the same type of parameters as inputs._

In [None]:
# Hyper-parameters and initial weights of the (currently disabled)
# logistic-regression variants; only the ridge models below are used.
maxiters = 100
gamma = 0.01
lambda_ = 0.1
init_w = np.random.rand(clean_data_tests[0][0].shape[1])
# Each extra weight vector is created only when its group exists, so the cell
# no longer fails when the split yields fewer than three groups.
if len(clean_data_tests) > 1:
    init_w2 = np.random.rand(clean_data_tests[1][0].shape[1])
if len(clean_data_tests) > 2:
    init_w3 = np.random.rand(clean_data_tests[2][0].shape[1])

#reg_log_reg = lambda y,x : log.reg_logistic_regression(y, x, lambda_, init_w2, maxiters, gamma)
#meth2 = lambda  y, x: log.logistic_regression(y,x,init_w2,5,gamma)
#log_reg3 = lambda  y, x: log.logistic_regression(y,x,init_w3,5,gamma)

# Ridge penalty per group, in split order.
ridge_lambdas = [5.17E-5, 0.0013, 0.001389]


def make_ridge(penalty):
    """Return a callable (y, x) that runs lst.ridge_regression with a fixed penalty."""
    return lambda y, x: lst.ridge_regression(y, x, penalty)


# The models do not depend on the data, so all three are always defined; the
# loops that zip `methods` with the groups simply ignore surplus entries.
meth1, meth2, meth3 = (make_ridge(penalty) for penalty in ridge_lambdas)

methods = [meth1,meth2,meth3]

_At this point we try the different models defined in the cell above: to do so run the cell below, and check the obtained accuracies._

Kfold for the methods

In [None]:
# Cross-validate every (group, method) pair with the chosen metric and keep
# what spl.k_fold_cv returns for each group.
# NOTE(review): three values are unpacked from spl.k_fold_cv here, while the
# ridge grid-search and visualization cells unpack two - confirm the helper.
used_metric = hlp.f1
accuracies_group_means, accuracies_group_stds, cutoffs_group = [], [], []
for round_, ((x_train, y_train), meth) in enumerate(zip(clean_data_trains, methods)):
    print("#################################")
    print("**********treating the {i}th group of data:**************".format(i=round_ + 1))
    accuracies, accu_stds, opt_cutoffs = spl.k_fold_cv(
        y_train, x_train, 2, meth, metric=used_metric
    )
    accuracies_group_means.append(accuracies)
    accuracies_group_stds.append(accu_stds)
    cutoffs_group.append(opt_cutoffs)

# Average per group of the collected scores and cutoffs.
print("\n done! Obtained :" + hlp.dico[used_metric], list(map(np.mean, accuracies_group_means)))
print("ideal cutoffs for these groups-methods pairs:", list(map(np.mean, cutoffs_group)))

Kfold for the decision threshold

In [None]:
# Scratch check: three evenly spaced values from -1 to 1.
np.linspace(-1,1,3)

In [None]:
# Scan 20 candidate decision thresholds in [-2, 2] for every (group, method)
# pair and keep the train metric, test metric and best cutoff per group.
cutoffs = np.linspace(-2, 2, 20)
metric_trains, metric_tests, opt_cutoffs = [], [], []
for round_, ((x_train, y_train), meth) in enumerate(zip(clean_data_trains, methods)):
    print("#################################")
    print("**********treating the {i}th group of data:**************".format(i=round_ + 1))
    metric_train, metric_test, opt_cutoff = spl.k_fold_cutoff(
        y_train, x_train, 4, meth, cutoffs, metric=hlp.accuracy
    )
    metric_trains.append(metric_train)
    metric_tests.append(metric_test)
    opt_cutoffs.append(opt_cutoff)

print("Obtained average metric on the test sets: ", list(map(np.mean, metric_tests)))
print("Obtained average metric on the train sets: ", list(map(np.mean, metric_trains)))
#one cutoff per fold, 3 groups ==> k * 3 cutoffs
print("Optimal obtained cutoffs: (k * 3)", opt_cutoffs)

# Results recorded from earlier runs:
#best cutoffs for accuracy as a metric: [[-0.10526315789473695, -0.10526315789473695, 2.0, 0.10526315789473673], [0.10526315789473673, 0.10526315789473673, 0.10526315789473673, 0.10526315789473673], [0.10526315789473673, 0.10526315789473673, 0.10526315789473673, 0.10526315789473673]]
#best cutoffs for f1 as a metric:  [[-0.3157894736842106, -0.3157894736842106, -0.3157894736842106, -0.3157894736842106], [-0.10526315789473695, -0.10526315789473695, -0.10526315789473695, -0.10526315789473695], [-0.10526315789473695, -0.10526315789473695, -0.10526315789473695, -0.10526315789473695]]

## Optimization

#### Ridge

In [None]:
# Ridge grid search: for every subset, evaluate each (lambda, degree) pair with
# spl.k_fold_cv and store the mean of the returned test scores.
# NOTE(review): two values are unpacked from spl.k_fold_cv here, while the
# "Kfold for the methods" cell unpacks three - confirm the helper's signature.
# NOTE(review): x_train comes from clean_data_trains, which the cleaning loop
# already expanded with build_poly - check the second expansion is intended.
degrees = np.arange(1, 3)
lambdas = np.logspace(-5, 0, 2)

accuracies_tot = []

for idx_subset, (x_train, y_train) in enumerate(clean_data_trains):
    print('##### START SUBSET {} #####'.format(idx_subset))
    # Rows follow `lambdas`, columns follow `degrees`.
    accuracies = np.zeros((lambdas.size, degrees.size))
    for idx_deg, deg in enumerate(degrees):
        x_poly = pre.build_poly(x_train, deg)

        for idx_lambda, lambda_ in enumerate(lambdas):
            def ridge(y, x):
                return lst.ridge_regression(y, x, lambda_)

            _, k_accuracies_test = spl.k_fold_cv(y_train, x_poly, 2, ridge)
            accuracies[idx_lambda, idx_deg] = np.mean(k_accuracies_test)

    accuracies_tot.append(accuracies)
    print('##### END SUBSET {} #####'.format(idx_subset))


In [None]:
# Persist the accuracy grids of the ridge grid search to 'acc_ridge.pkl'.
import pickle

with open('acc_ridge.pkl', mode='wb') as pickle_file:
    pickle.dump(accuracies_tot, pickle_file)

In [None]:
import seaborn as sns

# One heatmap per data subset: rows are lambdas, columns are polynomial degrees.
# NOTE: `lambdas` and `degrees` must still be the grids used to fill
# `accuracies_tot`; the visualization cell further down rebinds `lambdas`.
# squeeze=False keeps `axes` two-dimensional even with a single subset.
fig, axes = plt.subplots(1, len(accuracies_tot), figsize=(15, 5), squeeze=False)
for idx_subset, (ax, acc) in enumerate(zip(axes[0], accuracies_tot)):
    sns.heatmap(acc, ax=ax)
    # Every panel gets the lambda tick labels (previously only the first did).
    ax.set_yticklabels(np.round(lambdas, 5), rotation=60)
    ax.set_xticklabels(degrees)
    ax.set_xlabel('degree')
    ax.set_ylabel('lambda')
    ax.set_title('Accuracy - subset {}'.format(idx_subset))

In [None]:
# y: ↓ (lambdas), x: → (degree)
# Print the best (degree, lambda) pair of every subset's accuracy grid.
for nb, acc in enumerate(accuracies_tot):
    print('SUBSET {}'.format(nb))
    # argmax + unravel_index always yields exactly one (row, column) pair; on
    # ties the first maximum wins. The former np.asscalar(np.where(...)) lookup
    # failed on ties, and np.asscalar no longer exists in recent NumPy.
    ymax, xmax = np.unravel_index(np.argmax(acc), acc.shape)

    print('Best degree for subset {}: {}'.format(nb, degrees[xmax]))
    print('Best lambda for subset {}: {}'.format(nb, lambdas[ymax]))
    print('Accuracy: {}'.format(acc[ymax][xmax]))

## Visualization

In [None]:
# Box plots of the per-fold train / test scores for each lambda and subset,
# followed by the project's cross-validation summary plot per subset.
# NOTE(review): rebinding `lambdas` here changes the grid seen by the heatmap
# and best-parameter cells if they are re-run afterwards.
# NOTE(review): two values are unpacked from spl.k_fold_cv here, while the
# "Kfold for the methods" cell unpacks three - confirm the helper's signature.
lambdas = np.logspace(-5, 0, 5)
plt.figure(1, figsize=(15, 11))
for idx_subset, (x_train, y_train) in enumerate(clean_data_trains):
    accuracy_train = np.zeros((len(lambdas)))
    accuracy_test = np.zeros((len(lambdas)))

    for idx_lambda, lambda_ in enumerate(lambdas):
        ridge = lambda y, x: lst.ridge_regression(y,x,lambda_)
        k_accuracies_train, k_accuracies_test = spl.k_fold_cv(y_train, x_train, 4, ridge, hlp.accuracy)
        # The panel is selected by lambda index, so every subset draws into
        # the same panels and train / test boxes share one x position.
        plt.subplot(3,2,idx_lambda+1)
        plt.boxplot(k_accuracies_train, positions = [idx_lambda])
        plt.boxplot(k_accuracies_test, positions = [idx_lambda])
        plt.xlabel("lambda")
        # The metric passed to k_fold_cv is hlp.accuracy, not an RMSE.
        plt.ylabel("accuracy")
        plt.title("cross validation for categorical subset{i}".format(i=idx_subset))
        # update table
        accuracy_train[idx_lambda] = np.mean(k_accuracies_train)
        accuracy_test[idx_lambda] = np.mean(k_accuracies_test)
    viz.cross_validation_visualization(lambdas, accuracy_train, accuracy_test)

## Submission

_We now predict the labels of the test data with the per-group models defined in the "Data Processing" section above._

In [None]:
# Fit each group's model on its training data, predict the matching test
# group, and write the labels back at that group's positions in y_submit.
y_submit = np.zeros(len(tX_test))
# The group sizes must add up to the size of the full test set.
assert len(tX_test) == sum(group_x.shape[0] for group_x, _ in clean_data_tests)
for (x_test, y_indx), (x_train, y_train), meth in zip(clean_data_tests, clean_data_trains, methods):
    w_fin, loss = meth(y_train, x_train)
    y_test = x_test @ w_fin
    # Negative score -> -1, anything else -> +1 (decision threshold at 0).
    y_test = np.where(np.ravel(y_test) < 0, -1, 1.0)
    y_submit[y_indx] = y_test

_And finally save the results to csv._

In [None]:
# Hand the test ids and predicted labels to the project helper, which is
# expected to write them to "anakin.csv".
hlp.create_csv_submission(ids_test,y_submit,"anakin.csv")

## Put your useful trash here

In [None]:
# Scratch check of DataFrame.replace on a small, seeded random frame.
np.random.seed(2)
random_values = np.random.randint(-1002, -995, size=(3, 4))
df = pd.DataFrame(random_values, columns=list('ABCD'))
# Returns a new frame with -999 swapped for NaN; `df` itself is not modified.
df.replace(to_replace=-999, value=np.nan)

In [None]:
#Yann
# NOTE(review): leftover submission template. `predict_labels`, `weights` and
# the unqualified `create_csv_submission` are not defined in any visible cell
# (the submission cell above calls hlp.create_csv_submission), so this cell
# raises NameError under Restart & Run All unless they are defined elsewhere.
OUTPUT_PATH = '' # TODO: fill in desired name of output file for submission
y_pred = predict_labels(weights, tX_test)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)