In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

## Load the training data into feature matrix, class labels, and event ids:

In [2]:
# NOTE(review): the star import hides which names this module contributes
# (load_csv_data is presumably one of them) — consider importing explicitly.
from proj1_helpers import *
DATA_TRAIN_PATH = '../data/train.csv'
# Unpack the loaded training file into class labels (Y), feature matrix (tX) and event ids.
Y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

## Training

In [3]:
# NOTE(review): star imports make it impossible to tell from this notebook which module
# provides the training functions, run_k_fold and preprocess — consider explicit imports.
from implementations import *
from cross_validation import *
from preprocess import *

# Shared seed, passed as `seed=SEED` to every run_k_fold call below.
SEED = 1

### Linear Regression

#### Gradient Descent

In [4]:
# Cross-validate least_squares_GD on the preprocessed training set.

# --- Cross-validation / preprocessing settings ---
k_fold = 5               # forwarded to run_k_fold as k_fold
degree = 3               # forwarded to preprocess (presumably the polynomial expansion degree)
one_hot_enc = True       # forwarded to preprocess
strategy = 'most_freq'   # forwarded to preprocess

# --- Gradient-descent hyper-parameters ---
gamma = 0.1
max_iters = 1000

# Build the design matrix first so that `initial_w` exists before the trainer is defined.
# `one_hot_enc` is now actually passed through; previously the call hard-coded True and
# silently ignored the setting above.
y, tx_train, _ = preprocess(Y, tX, degree=degree, strategy=strategy, one_hot_enc=one_hot_enc, train=True)
initial_w = np.zeros(tx_train.shape[1])  # one zero weight per column of tx_train

# The lambda reads initial_w / max_iters / gamma from the notebook namespace when it is
# called, so it always sees the values set in this cell.
trainer = lambda y, x: least_squares_GD(y, x, initial_w=initial_w, max_iters=max_iters, gamma=gamma)

run_k_fold(y, tx_train, trainer=trainer, compute_loss=compute_mse, k_fold=k_fold, seed=SEED)



Summary: 
	        train loss: 0.3065653039753832
	        test loss: 0.30892820469425386
	        train acc: 0.781954
	        test acc: 0.7818400000000001
	        train var acc: 2.5513240000000265e-06
	        test var acc: 5.521920000000019e-06
	

        train var loss: 1.959380092267875e-06
	        test var loss: 2.1140245696062797e-05
	        


#### Stochastic Gradient Descent

In [6]:
# Cross-validate least_squares_SGD on the preprocessed training set.

# --- Cross-validation / preprocessing settings ---
k_fold = 5               # forwarded to run_k_fold as k_fold
degree = 2               # forwarded to preprocess (presumably the polynomial expansion degree)
one_hot_enc = True       # forwarded to preprocess
strategy = 'most_freq'   # forwarded to preprocess

# --- Stochastic-gradient-descent hyper-parameters ---
# NOTE(review): the summary recorded for this cell shows a loss around 20 with a variance
# above 200 across folds, which looks unstable for this gamma — worth re-tuning; confirm.
gamma = 0.01
max_iters = 1000

# Build the design matrix first so that `initial_w` exists before the trainer is defined.
# `one_hot_enc` is now actually passed through; previously the call hard-coded True and
# silently ignored the setting above.
y, tx_train, _ = preprocess(Y, tX, degree=degree, strategy=strategy, one_hot_enc=one_hot_enc, train=True)
initial_w = np.zeros(tx_train.shape[1])  # one zero weight per column of tx_train

# The lambda reads initial_w / max_iters / gamma from the notebook namespace when it is
# called, so it always sees the values set in this cell.
trainer = lambda y, x: least_squares_SGD(y, x, initial_w=initial_w, max_iters=max_iters, gamma=gamma)

run_k_fold(y, tx_train, trainer=trainer, compute_loss=compute_mse, k_fold=k_fold, seed=SEED)



Summary: 
	        train loss: 20.308648805083973
	        test loss: 21.784521719073616
	        train acc: 0.566392
	        test acc: 0.567372
	        train var acc: 0.010443671386000003
	        test var acc: 0.010281062176000003
	

        train var loss: 244.97894652543442
	        test var loss: 210.74593613097886
	        


### Least Squares

In [7]:
# Cross-validate the closed-form least_squares solver on the preprocessed training set.

# --- Cross-validation / preprocessing settings ---
k_fold = 5               # forwarded to run_k_fold as k_fold
degree = 2               # forwarded to preprocess (presumably the polynomial expansion degree)
one_hot_enc = True       # forwarded to preprocess
strategy = 'most_freq'   # forwarded to preprocess

# least_squares needs no hyper-parameters, so the trainer only forwards the fold data.
trainer = lambda y, x: least_squares(y, x)

# `one_hot_enc` is now actually passed through; previously the call hard-coded True and
# silently ignored the setting above.
y, tx_train, _ = preprocess(Y, tX, degree=degree, strategy=strategy, one_hot_enc=one_hot_enc, train=True, log=False)
run_k_fold(y, tx_train, trainer=trainer, compute_loss=compute_mse, log=False, k_fold=k_fold, seed=SEED)


Summary: 
	        train loss: 0.31514722203195605
	        test loss: 0.37944762192084347
	        train acc: 0.7749179999999999
	        test acc: 0.7746639999999999
	        train var acc: 1.1629600000000736e-07
	        test var acc: 1.0789439999999838e-06
	

        train var loss: 8.519226186985896e-08
	        test var loss: 0.015969889273541156
	        


### Ridge Regression

In [8]:
# Cross-validate ridge_regression on the preprocessed training set.

# --- Cross-validation / preprocessing settings ---
k_fold = 5               # forwarded to run_k_fold as k_fold
degree = 3               # forwarded to preprocess (presumably the polynomial expansion degree)
one_hot_enc = True       # forwarded to preprocess
strategy = 'most_freq'   # forwarded to preprocess

# --- Regularisation strength ---
# NOTE(review): this value looks like the result of a hyper-parameter search that is not
# part of this notebook — record where it came from.
lambda_ = 0.000610540229658532

# The lambda reads `lambda_` from the notebook namespace when it is called.
trainer = lambda y, x: ridge_regression(y, x, lambda_=lambda_)

# `one_hot_enc` is now actually passed through; previously the call hard-coded True and
# silently ignored the setting above.
y, tx_train, _ = preprocess(Y, tX, degree=degree, strategy=strategy, one_hot_enc=one_hot_enc, train=True, log=False)
run_k_fold(y, tx_train, trainer=trainer, compute_loss=compute_mse, log=False, k_fold=k_fold, seed=SEED)


Summary: 
	        train loss: 0.30778000264512695
	        test loss: 0.3067555006354182
	        train acc: 0.783219
	        test acc: 0.783008
	        train var acc: 9.095400000001107e-08
	        test var acc: 2.753695999999972e-06
	

        train var loss: 3.5512923987480334e-08
	        test var loss: 6.13763926601411e-06
	        


### Logistic Regression

In [4]:
# Cross-validate logistic_regression on the preprocessed training set.

# --- Cross-validation / preprocessing settings ---
k_fold = 5               # forwarded to run_k_fold as k_fold
degree = 2               # forwarded to preprocess (presumably the polynomial expansion degree)
one_hot_enc = True       # forwarded to preprocess
strategy = 'most_freq'   # forwarded to preprocess

# --- Gradient-descent hyper-parameters ---
gamma = 1e-6
max_iters = 1000

# --- Logistic-specific switches, forwarded to both preprocess and run_k_fold ---
log = True
threshold = 0.5          # forwarded to run_k_fold (presumably the decision threshold)

# Build the design matrix first so that `initial_w` exists before the trainer is defined.
# `one_hot_enc` is now actually passed through; previously the call hard-coded True and
# silently ignored the setting above.
y, tx_train, _ = preprocess(Y, tX, degree=degree, strategy=strategy, one_hot_enc=one_hot_enc, train=True, log=log)
initial_w = np.zeros(tx_train.shape[1])  # one zero weight per column of tx_train

# Both lambdas read their settings from the notebook namespace when they are called.
trainer = lambda y, x: logistic_regression(y, x, initial_w=initial_w, max_iters=max_iters, gamma=gamma)
compute_loss = lambda y, x, w: compute_loss_logistic_regression(y, x, w)

# NOTE(review): the recorded train loss is about 4x the test loss, which would be consistent
# with a loss that is summed rather than averaged over 4 training folds vs 1 test fold —
# confirm before comparing the two numbers.
run_k_fold(y, tx_train, trainer=trainer, compute_loss=compute_loss, threshold=threshold, log=log, k_fold=k_fold, seed=SEED)


Summary: 
	        train loss: 90936.71779913598
	        test loss: 22746.827701103743
	        train acc: 0.7846409999999999
	        test acc: 0.784464
	        train var acc: 3.942399999999874e-08
	        test var acc: 7.33663999999997e-07
	

        train var loss: 4159.592660704778
	        test var loss: 4378.850173873131
	        


### Regularized Logistic Regression

In [5]:
# Cross-validate reg_logistic_regression on the preprocessed training set.

# --- Cross-validation / preprocessing settings ---
k_fold = 5               # forwarded to run_k_fold as k_fold
degree = 2               # forwarded to preprocess (presumably the polynomial expansion degree)
one_hot_enc = True       # forwarded to preprocess
strategy = 'most_freq'   # forwarded to preprocess

# --- Optimiser / regularisation hyper-parameters ---
lambda_ = 1
gamma = 1e-6
max_iters = 1000

# --- Logistic-specific switches, forwarded to both preprocess and run_k_fold ---
log = True
threshold = 0.5          # forwarded to run_k_fold (presumably the decision threshold)

# Build the design matrix first so that `initial_w` exists before the trainer is defined.
# `one_hot_enc` is now actually passed through; previously the call hard-coded True and
# silently ignored the setting above.
y, tx_train, _ = preprocess(Y, tX, degree=degree, strategy=strategy, one_hot_enc=one_hot_enc, train=True, log=log)
initial_w = np.zeros(tx_train.shape[1])  # one zero weight per column of tx_train

# Both lambdas read their settings from the notebook namespace when they are called.
trainer = lambda y, x: reg_logistic_regression(y, x, initial_w=initial_w, max_iters=max_iters, lambda_=lambda_, gamma=gamma)
# Reported loss = data term + regularisation term, using the same lambda_ as the trainer.
compute_loss = lambda y, x, w: compute_loss_logistic_regression(y, x, w) + loss_reg_logistic_regression(lambda_, w)

# NOTE(review): the recorded train loss is about 4x the test loss, which would be consistent
# with a loss that is summed rather than averaged over 4 training folds vs 1 test fold —
# confirm before comparing the two numbers.
run_k_fold(y, tx_train, trainer=trainer, compute_loss=compute_loss, threshold=threshold, log=log, k_fold=k_fold, seed=SEED)


Summary: 
	        train loss: 90938.46846015878
	        test loss: 22747.321445882797
	        train acc: 0.784635
	        test acc: 0.784456
	        train var acc: 4.03699999999947e-08
	        test var acc: 7.183039999999815e-07
	

        train var loss: 4158.642787075166
	        test var loss: 4377.843362935946
	        


## Generate predictions and save output in csv format for submission:

In [None]:
DATA_TEST_PATH = '../data/test.csv'  # TODO: download the test data and supply its path here

# Load the test file. The first returned value is forwarded to preprocess below, so it gets
# a real name instead of the throwaway `_` (presumably placeholder labels — confirm).
y_test_unused, tX_test_raw, ids_test = load_csv_data(DATA_TEST_PATH)

# Keep the raw features in tX_test_raw so the processed matrix is never fed back into
# preprocess. With train=False the call is unpacked into two values here (the training
# cells unpack three with train=True).
# NOTE(review): degree / strategy / one_hot_enc / log must match the settings used to
# train the weights `w` used for prediction — the CV cells above use other degrees; confirm.
_, tX_test = preprocess(y_test_unused, tX_test_raw, degree=3, strategy='most_freq', train=False, one_hot_enc=True, log=True)

In [None]:
OUTPUT_PATH = '../output/logistic_regression_N_Md_D3_1H_lam1_gam1e-06_trainall_2000.csv'

# No visible cell assigns the weight vector `w` (the run_k_fold calls above do not store a
# result), so after "Restart Kernel -> Run All" this cell cannot run. Fail with an
# actionable message instead of a bare NameError.
# TODO: add a cell that trains the final model on the full training set and stores its
# weights in `w` (the file name above suggests degree 3, lambda 1, gamma 1e-06, 2000
# iterations — confirm).
if 'w' not in globals():
    raise NameError(
        "name 'w' is not defined: train the final model and assign its weights to `w` "
        "before generating predictions"
    )

# Turn the preprocessed test features into predictions and write the submission file.
y_pred = make_prediction(tX_test, w, threshold=0.5, log=True, test=True)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)