In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

## Load the training data into feature matrix, class labels, and event ids:

In [2]:
# NOTE(review): the wildcard import hides where names come from. load_csv_data is
# used below; the submission helpers used at the end of the notebook presumably
# come from here as well — confirm and consider importing them explicitly.
from proj1_helpers import *
DATA_TRAIN_PATH = '../data/train.csv' # path to the training CSV, relative to this notebook
# Y: class labels, tX: feature matrix, ids: event ids (see the heading above).
Y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

## Do your crazy machine learning thing here :) ...

## Preprocess

In [3]:
from preprocess import one_hot_encode, build_poly

# Preprocessing state shared through `global` with preprocess(): it is written
# when preprocess() runs with train=True and read back when train=False.
MEAN = None  # per-column mean of the expanded feature matrix
STD = None  # per-column standard deviation of the expanded feature matrix
VAL_TO_REPLACE_NEG_999 = None  # per-column value substituted for the -999 marker

def preprocess(y, tx, degree=1, strategy='most_freq', log=False, train=True, one_hot_enc=False, one_hot_col=22):
    """Impute, expand and standardise a feature matrix.

    Steps, in order:
      1. optionally map negative labels to 0 (``log=True``),
      2. optionally pull column ``one_hot_col`` out and encode it with
         ``one_hot_encode`` (``one_hot_enc=True``),
      3. replace the missing-value marker -999 column by column,
      4. expand the features with ``build_poly(tx, degree)``,
      5. standardise every column with the training mean / std,
      6. prepend a column of ones (bias term) and append the encoded columns.

    With ``train=True`` the imputation values and the mean / std are computed
    from ``tx`` and stored in the module-level ``VAL_TO_REPLACE_NEG_999``,
    ``MEAN`` and ``STD``. With ``train=False`` those stored values are reused,
    so the call must use the same ``degree`` / ``one_hot_enc`` settings as the
    training call that produced them.

    Parameters
    ----------
    y : array-like
        Labels. Never modified in place; a modified copy is returned when
        ``log`` is true, otherwise ``y`` is returned unchanged.
    tx : np.ndarray, 2-D
        Raw feature matrix. Never modified in place.
    degree : int
        Forwarded to ``build_poly``.
    strategy : {'most_freq', 'min', 'mean'}
        How the replacement for -999 is chosen from the known values of a
        column: most frequent value, minimum minus 0.001, or mean. Only used
        when ``train`` is true.
    log : bool
        If true, negative labels are set to 0 (labels in {0, 1}).
    train : bool
        Fit the preprocessing statistics (true) or reuse stored ones (false).
    one_hot_enc : bool
        Whether to one-hot encode column ``one_hot_col``.
    one_hot_col : int
        Index of the column handed to ``one_hot_encode``; defaults to 22.

    Returns
    -------
    (y, tx) : tuple
        The (possibly remapped) labels and the processed feature matrix.

    Raises
    ------
    ValueError
        If ``train`` is true and ``strategy`` is not a supported name.
    RuntimeError
        If ``train`` is false and no training-mode call has been made yet.
    """
    global MEAN, STD, VAL_TO_REPLACE_NEG_999

    if train:
        # An unknown strategy used to fall through silently and leave the
        # -999 markers in the data.
        if strategy not in ('min', 'most_freq', 'mean'):
            raise ValueError(
                f"unknown strategy {strategy!r}; expected 'min', 'most_freq' or 'mean'")
    elif MEAN is None or STD is None or VAL_TO_REPLACE_NEG_999 is None:
        raise RuntimeError(
            'preprocess(train=False) needs statistics from an earlier preprocess(train=True) call')

    tx = np.copy(tx)

    # For logistic regression, we assume y to be in {0, 1}.
    if log:
        # Work on a copy: the caller's label array is reused by later cells.
        y = np.copy(y)
        y[y < 0] = 0

    if train:
        # One slot per original column. It is sized before the one-hot column
        # is removed, so the last slot stays unused when one_hot_enc is true.
        VAL_TO_REPLACE_NEG_999 = np.zeros(shape=(tx.shape[1],))

    if one_hot_enc:
        one_hot = one_hot_encode(tx[:, one_hot_col])
        tx = np.c_[tx[:, :one_hot_col], tx[:, one_hot_col + 1:]]

    # Unknown values are marked as -999; replace them column by column.
    for col in range(tx.shape[1]):
        feature = tx[:, col]  # a view: writing to it updates tx directly
        missing = feature == -999
        if train:
            known = feature[~missing]
            if strategy == 'min':
                # Slightly below the smallest known value.
                fill_value = np.min(known) - 0.001
            elif strategy == 'most_freq':
                values, counts = np.unique(known, return_counts=True)
                fill_value = values[np.argmax(counts)]
            else:  # 'mean'
                fill_value = np.mean(known)
            VAL_TO_REPLACE_NEG_999[col] = fill_value
        else:
            fill_value = VAL_TO_REPLACE_NEG_999[col]
        feature[missing] = fill_value

    # Add further features to x.
    tx = build_poly(tx, degree)

    # Normalise every column to mean 0 and variance 1.
    if train:
        MEAN = np.mean(tx, axis=0)
        STD = np.std(tx, axis=0)
        # A constant column has std 0; dividing by it would give NaN / inf.
        STD[STD == 0] = 1
    tx -= MEAN
    tx /= STD

    # Prepend the bias column. The row count comes from tx itself, so the
    # function also works when y is only a placeholder.
    tx = np.c_[np.ones(tx.shape[0]), tx]

    # Concatenate the one-hot encoded columns (left unnormalised).
    if one_hot_enc:
        tx = np.c_[tx, one_hot]

    return y, tx

## Training

In [4]:
from implementations import logistic_regression, compute_loss_logistic_regression, compute_mse, least_squares
from cross_validation import build_k_indices, cross_validation, compute_acc

In [5]:
# Seed for NumPy's global RNG; the same value is also passed to
# build_k_indices in the training cells below.
seed = 1

np.random.seed(seed)

### Gradient Descent

In [6]:
from implementations import least_squares_GD

In [10]:
# Grid search for least squares with gradient descent, scored by k-fold
# cross-validation.
k_fold = 5
max_iters_range = [20]
gammas = [0.001, 0.01, 0.1]

# One entry per hyper-parameter combination (ws: one entry per fold).
loss_trs = []
loss_tes = []
acc_trs = []
acc_tes = []
ws = []
hyperparameters = []
one_hot_enc = True

for degree in range(1,5):
    for strategy in ['most_freq', 'min', 'mean']:
        # Preprocessing depends only on (degree, strategy), so do it once here.
        y, tx_train = preprocess(Y, tX, degree, strategy=strategy, one_hot_enc=one_hot_enc)

        k_indices = build_k_indices(y, k_fold, seed)
        for gamma in gammas:
            for max_iters in max_iters_range:
                # np.longdouble is the portable spelling of extended precision;
                # np.float128 does not exist on every platform.
                initial_w = np.zeros(tx_train.shape[1], dtype=np.longdouble)
                # Each fold gets its own copy of the start weights, so a solver
                # that updates them in place cannot carry state between folds.
                # NOTE(review): whether least_squares_GD mutates its argument is
                # not visible from this notebook.
                regression = lambda y, x: least_squares_GD(y, x, initial_w.copy(), max_iters, gamma)

                # Sums over the folds; the losses are MSE values from compute_mse.
                acc_tr_sum = acc_te_sum = loss_tr_sum = loss_te_sum = 0
                for k in range(k_fold):
                    acc_tr, acc_te, loss_tr, loss_te, weight = cross_validation(y, tx_train, k_indices, k, regression, compute_mse, log=False)
                    ws.append(weight)
                    loss_tr_sum += loss_tr
                    loss_te_sum += loss_te
                    acc_tr_sum += acc_tr
                    acc_te_sum += acc_te

                loss_tr_mean = loss_tr_sum / k_fold
                loss_te_mean = loss_te_sum / k_fold
                acc_tr_mean = acc_tr_sum / k_fold
                acc_te_mean = acc_te_sum / k_fold

                loss_trs.append(loss_tr_mean)
                loss_tes.append(loss_te_mean)
                acc_trs.append(acc_tr_mean)
                acc_tes.append(acc_te_mean)
                print(f'Summary: train loss {loss_tr_mean} train acc {acc_tr_mean}, test loss {loss_te_mean} test acc {acc_te_mean}')

                hyperparameters.append({
                    'max_iters': max_iters,
                    'gamma': gamma,
                    'degree': degree,
                    'strategy': strategy,
                    'one_hot_enc': one_hot_enc
                })
                print('hyperparameters', hyperparameters[-1])

Summary: train loss 0.4681205594929926 train acc 0.681248, test loss 0.46762915179648373 test acc 0.681164
hyperparameters {'max_iters': 20, 'gamma': 0.001, 'degree': 1, 'strategy': 'most_freq', 'one_hot_enc': True}
Summary: train loss 0.38693389989744675 train acc 0.71259, test loss 0.3861599581159285 test acc 0.712512
hyperparameters {'max_iters': 20, 'gamma': 0.01, 'degree': 1, 'strategy': 'most_freq', 'one_hot_enc': True}
Summary: train loss 0.34453241658526645 train acc 0.737896, test loss 0.3443027960446142 test acc 0.738348
hyperparameters {'max_iters': 20, 'gamma': 0.1, 'degree': 1, 'strategy': 'most_freq', 'one_hot_enc': True}
Summary: train loss 0.4680605248108254 train acc 0.667247, test loss 0.46758020506090703 test acc 0.667064
hyperparameters {'max_iters': 20, 'gamma': 0.001, 'degree': 1, 'strategy': 'min', 'one_hot_enc': True}
Summary: train loss 0.38723657788448973 train acc 0.7112260000000001, test loss 0.38642376483607105 test acc 0.711192
hyperparameters {'max_iters'

In [16]:
# pandas is not imported by any earlier cell, so import it here to keep the
# notebook runnable after Restart & Run All.
import pandas as pd

# One row per hyper-parameter combination of the gradient-descent grid search,
# with the mean cross-validation test accuracy and mean training loss attached.
hype = pd.DataFrame(hyperparameters)
hype['accur_test'] = acc_tes
hype['loss_train'] = loss_trs

In [17]:
# Display the grid-search summary table (the cell's last expression is rendered).
hype

Unnamed: 0,max_iters,gamma,degree,strategy,one_hot_enc,accur_test,loss_train
0,20,0.001,1,most_freq,True,0.681164,0.4681206
1,20,0.01,1,most_freq,True,0.712512,0.3869339
2,20,0.1,1,most_freq,True,0.738348,0.3445324
3,20,0.001,1,min,True,0.667064,0.4680605
4,20,0.01,1,min,True,0.711192,0.3872366
5,20,0.1,1,min,True,0.739336,0.3441659
6,20,0.001,1,mean,True,0.708572,0.4701394
7,20,0.01,1,mean,True,0.720124,0.3855638
8,20,0.1,1,mean,True,0.738204,0.3443205
9,20,0.001,2,most_freq,True,0.692128,0.4583057


### Stochastic Gradient Descent

In [6]:
from implementations import least_squares_SGD

In [7]:
# Grid search for least squares with stochastic gradient descent, scored by
# k-fold cross-validation.
k_fold = 5
max_iters_range = [20]
gammas = [0.001, 0.01, 0.1]

# One entry per hyper-parameter combination (ws: one entry per fold).
# NOTE(review): these lists reuse the names of the gradient-descent cell and
# overwrite its results.
loss_trs = []
loss_tes = []
acc_trs = []
acc_tes = []
ws = []
hyperparameters_sgd = []
one_hot_enc = True

for degree in range(1,5):
    for strategy in ['most_freq', 'min', 'mean']:
        # Preprocessing depends only on (degree, strategy), so do it once here.
        y, tx_train = preprocess(Y, tX, degree, strategy=strategy, one_hot_enc=one_hot_enc)

        k_indices = build_k_indices(y, k_fold, seed)
        for gamma in gammas:
            for max_iters in max_iters_range:
                # np.longdouble is the portable spelling of extended precision;
                # np.float128 does not exist on every platform.
                initial_w = np.zeros(tx_train.shape[1], dtype=np.longdouble)
                # Each fold gets its own copy of the start weights, so a solver
                # that updates them in place cannot carry state between folds.
                # NOTE(review): whether least_squares_SGD mutates its argument is
                # not visible from this notebook.
                regression = lambda y, x: least_squares_SGD(y, x, initial_w.copy(), max_iters, gamma)

                # Sums over the folds; the losses are MSE values from compute_mse.
                acc_tr_sum = acc_te_sum = loss_tr_sum = loss_te_sum = 0
                for k in range(k_fold):
                    acc_tr, acc_te, loss_tr, loss_te, weight = cross_validation(y, tx_train, k_indices, k, regression, compute_mse, log=False)
                    ws.append(weight)
                    loss_tr_sum += loss_tr
                    loss_te_sum += loss_te
                    acc_tr_sum += acc_tr
                    acc_te_sum += acc_te

                loss_tr_mean = loss_tr_sum / k_fold
                loss_te_mean = loss_te_sum / k_fold
                acc_tr_mean = acc_tr_sum / k_fold
                acc_te_mean = acc_te_sum / k_fold

                loss_trs.append(loss_tr_mean)
                loss_tes.append(loss_te_mean)
                acc_trs.append(acc_tr_mean)
                acc_tes.append(acc_te_mean)
                print(f'Summary: train loss {loss_tr_mean} train acc {acc_tr_mean}, test loss {loss_te_mean} test acc {acc_te_mean}')

                hyperparameters_sgd.append({
                    'max_iters': max_iters,
                    'gamma': gamma,
                    'degree': degree,
                    'strategy': strategy,
                    'one_hot_enc': one_hot_enc
                })
                print('hyperparameters', hyperparameters_sgd[-1])

Summary: train loss 0.47727750013242964 train acc 0.654551, test loss 0.4760869012414026 test acc 0.6539400000000001
hyperparameters {'max_iters': 20, 'gamma': 0.001, 'degree': 1, 'strategy': 'most_freq', 'one_hot_enc': True}
Summary: train loss 0.42593925996573967 train acc 0.6842469999999999, test loss 0.4251306308589764 test acc 0.685048
hyperparameters {'max_iters': 20, 'gamma': 0.01, 'degree': 1, 'strategy': 'most_freq', 'one_hot_enc': True}
Summary: train loss 578796.8672432419 train acc 0.48264200000000007, test loss 715174.763181222 test acc 0.4812600000000001
hyperparameters {'max_iters': 20, 'gamma': 0.1, 'degree': 1, 'strategy': 'most_freq', 'one_hot_enc': True}
Summary: train loss 0.4775133195117001 train acc 0.6447680000000001, test loss 0.4763342565317616 test acc 0.6445000000000001
hyperparameters {'max_iters': 20, 'gamma': 0.001, 'degree': 1, 'strategy': 'min', 'one_hot_enc': True}
Summary: train loss 0.4217461206781293 train acc 0.6820700000000001, test loss 0.41946928

## Generate predictions and save output in csv format for submission:

In [96]:
DATA_TEST_PATH = '../data/test.csv' # path to the test CSV, relative to this notebook
# The first value returned for the test file is only a placeholder for labels.
# It is given a real name because `_` is IPython's "last output" variable.
y_test_placeholder, tX_test_raw, ids_test = load_csv_data(DATA_TEST_PATH)

# preprocess(train=False) reuses the statistics left behind by the most recent
# training-mode call, which after the grid searches above belong to a different
# degree / strategy. Refit them on the training data with the submission
# configuration first.
# NOTE(review): degree 2 + one-hot encoding matches the submission file name and
# the recorded 63-column output; confirm it is the configuration the final
# weights were trained with.
preprocess(Y, tX, degree=2, strategy='most_freq', one_hot_enc=True)
y_test_placeholder, tX_test = preprocess(y_test_placeholder, tX_test_raw, degree=2, strategy='most_freq', train=False, one_hot_enc=True)


(568238, 30) before
(568238, 29) remove col to be one hot


In [97]:
# Sanity check of the processed test matrix: (rows, columns).
tX_test.shape

(568238, 63)

In [98]:
OUTPUT_PATH = '../data/pred_least_squre_normalized_poly2_properonehot_10fold_mean_weight.csv' # destination of the submission CSV
# NOTE(review): `weighted_mean` (the weights passed to make_prediction) is not
# defined in any cell visible in this notebook, and `make_prediction` is not
# imported by name (presumably it comes from the proj1_helpers wildcard import).
# Confirm both exist after Restart & Run All.
y_pred = make_prediction(tX_test, weighted_mean, threshold=0, log=False)
# Write the event ids and their predictions to OUTPUT_PATH.
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)