In [39]:
import numpy as np
from proj1_helpers import *
from cross_validation import *
from tools import *
from implementations import *

# Enable IPython's autoreload extension in mode 2, so edits to the imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Seed handed to build_k_indices by the cross-validation cells further down.
seed = 10

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Import data

In [2]:
# Relative paths of the training and test CSV files passed to load_csv_data.
DATA_TRAIN_PATH = 'data/train.csv'
DATA_TEST_PATH = 'data/test.csv'

In [4]:
# Read the training CSV; load_csv_data returns three values, bound here to
# y, tX (feature matrix, see the shape check below) and ids.
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

In [4]:
# Sanity check of the feature matrix dimensions (recorded output: (250000, 30)).
tX.shape

(250000, 30)

### Correlation

In [26]:
def calculate_correlation(tx, threshold=0.85):
    """Print every ordered pair of distinct features whose correlation is high.

    The Pearson correlation matrix of the columns of ``tx`` is computed with a
    single ``np.corrcoef`` call instead of one call per pair of columns.

    Parameters
    ----------
    tx : array-like of shape (n_samples, n_features)
        Data matrix; each column is treated as one feature.
    threshold : float, optional
        A pair is reported when its correlation coefficient is greater than or
        equal to this value (default 0.85). Only the signed coefficient is
        compared, so strongly negative correlations are not reported.

    Returns
    -------
    None
        Results are printed, one line per ordered pair, in row-major order.
        A symmetric pair therefore appears twice: as (f1, f2) and as (f2, f1).

    Notes
    -----
    A constant column has zero variance, which makes its coefficients NaN.
    NaN never satisfies the threshold comparison, so such columns are simply
    never reported; the floating-point warnings NumPy would otherwise emit
    for them are silenced.
    """
    tx = np.asarray(tx)

    with np.errstate(divide="ignore", invalid="ignore"):
        # atleast_2d: corrcoef collapses to a scalar when there is one column.
        corr = np.atleast_2d(np.corrcoef(tx, rowvar=False))
        # Comparing NaN with the threshold is False, which drops constant columns.
        high = corr >= threshold

    # A feature is never reported as correlated with itself.
    np.fill_diagonal(high, False)

    # argwhere walks the mask row by row: same order as nested feature loops.
    for feature1, feature2 in np.argwhere(high).tolist():
        print("Features {f1} and {f2} are highly correlated: {corr}".format(f1 =feature1, f2 = feature2, corr = corr[feature1, feature2]))

In [27]:
# Build the per-jet grouping of the training samples. Each jets_train[i] is
# used below to select rows of tX.
# NOTE(review): presumably one row selector per jet-count category — confirm
# against group_features_by_jet.
jets_train = group_features_by_jet(tX)

In [28]:
# Number of groups produced by group_features_by_jet (recorded output: 3).
len(jets_train)

3

In [29]:
# Preprocess the rows of jet group 0 with replace_missing_data_by_frequent_value
# and report its highly correlated feature pairs.
# The same subset is passed as both arguments, so the second return value is
# not needed. It is bound to a throwaway name instead of `y`, which would
# silently overwrite the label vector loaded from the CSV.
x, _x_unused = replace_missing_data_by_frequent_value(tX[jets_train[0]], tX[jets_train[0]])
calculate_correlation(x)

  c /= stddev[:, None]
  c /= stddev[None, :]


Features 3 and 8 are highly correlated: 0.9999999999988476
Features 8 and 3 are highly correlated: 0.9999999999988476


In [14]:
# Preprocess the rows of jet group 1 with replace_missing_data_by_frequent_value
# and report its highly correlated feature pairs.
# The same subset is passed as both arguments, so the second return value is
# not needed. It is bound to a throwaway name instead of `y`, which would
# silently overwrite the label vector loaded from the CSV.
x, _x_unused = replace_missing_data_by_frequent_value(tX[jets_train[1]], tX[jets_train[1]])
calculate_correlation(x)

Features 0 and 2 are highly correlated: 0.8278513781506542
Features 2 and 0 are highly correlated: 0.8278513781506542


  c /= stddev[:, None]
  c /= stddev[None, :]


Features 3 and 9 are highly correlated: 0.8632110677076703
Features 3 and 23 are highly correlated: 0.936759033214476
Features 3 and 29 are highly correlated: 0.9367590311733179
Features 9 and 3 are highly correlated: 0.8632110677076704
Features 9 and 23 are highly correlated: 0.9051662981927859
Features 9 and 29 are highly correlated: 0.9051663003822761
Features 23 and 3 are highly correlated: 0.9367590332144761
Features 23 and 9 are highly correlated: 0.9051662981927859
Features 23 and 29 are highly correlated: 0.9999999999990182
Features 29 and 3 are highly correlated: 0.9367590311733179
Features 29 and 9 are highly correlated: 0.9051663003822761
Features 29 and 23 are highly correlated: 0.9999999999990182


In [15]:
# Preprocess the rows of jet group 2 with replace_missing_data_by_frequent_value
# and report its highly correlated feature pairs.
# The same subset is passed as both arguments, so the second return value is
# not needed. It is bound to a throwaway name instead of `y`, which would
# silently overwrite the label vector loaded from the CSV.
x, _x_unused = replace_missing_data_by_frequent_value(tX[jets_train[2]], tX[jets_train[2]])
calculate_correlation(x)

Features 0 and 2 are highly correlated: 0.8127116079837958
Features 2 and 0 are highly correlated: 0.8127116079837958
Features 9 and 21 are highly correlated: 0.9012851815814814
Features 9 and 23 are highly correlated: 0.8789936580551966
Features 9 and 29 are highly correlated: 0.9553001014430405
Features 21 and 9 are highly correlated: 0.9012851815814814
Features 21 and 29 are highly correlated: 0.876350661043983
Features 23 and 9 are highly correlated: 0.8789936580551967
Features 23 and 29 are highly correlated: 0.881239517517324
Features 29 and 9 are highly correlated: 0.9553001014430403
Features 29 and 21 are highly correlated: 0.876350661043983
Features 29 and 23 are highly correlated: 0.881239517517324


### Reimport data

In [41]:
# Reload the training CSV so the model cells below work on y, tX and ids
# exactly as read from disk, independent of anything rebound earlier.
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

## Gradient descent

In [43]:
# 5-fold cross-validation of least_squares_gd.
k_fold = 5
gamma = 0.01
max_iters = 500
gd_options = dict(initial_w=None, max_iters=max_iters, gamma=gamma)

# Fold membership for every sample, built from the notebook-wide seed.
k_indices = build_k_indices(y, k_fold, seed)

# One (training loss, test loss) pair is collected per fold.
losses_train, losses_test = [], []
for k in range(k_fold):
    loss_train, loss_test = cross_validation(y, tX, k_indices, k, least_squares_gd, **gd_options)
    losses_train.append(loss_train)
    losses_test.append(loss_test)

# Per-fold report followed by the mean and variance of the test losses.
for i, (fold_train_loss, fold_test_loss) in enumerate(zip(losses_train, losses_test)):
    print("{} - Training loss: {} - Test loss: {}".format(i, fold_train_loss, fold_test_loss))
test_loss_mean = np.mean(losses_test)
print("Average test loss: {}".format(test_loss_mean))
test_loss_var = np.var(losses_test)
print("Variance test loss: {}".format(test_loss_var))

0 - Training loss: 0.45082341401577236 / Test loss: 0.44919703102221353
1 - Training loss: 0.45045895806996167 / Test loss: 0.4506402874539313
2 - Training loss: 0.4504526625472779 / Test loss: 0.4506652766107049
3 - Training loss: 0.45036133252034155 / Test loss: 0.4510280209824329
4 - Training loss: 0.45038023528767973 / Test loss: 0.45095290877583066
Average test loss: 0.45049670496902267
Variance test loss: 4.4572066316372895e-07


## Stochastic gradient descent

In [44]:
# 5-fold cross-validation of least_squares_sgd.
k_fold = 5
gamma = 0.01
max_iters = 500
sgd_options = dict(initial_w=None, max_iters=max_iters, gamma=gamma)

# Fold membership for every sample, built from the notebook-wide seed.
k_indices = build_k_indices(y, k_fold, seed)

# One (training loss, test loss) pair is collected per fold.
losses_train, losses_test = [], []
for k in range(k_fold):
    loss_train, loss_test = cross_validation(y, tX, k_indices, k, least_squares_sgd, **sgd_options)
    losses_train.append(loss_train)
    losses_test.append(loss_test)

# Per-fold report followed by the mean and variance of the test losses.
for i, (fold_train_loss, fold_test_loss) in enumerate(zip(losses_train, losses_test)):
    print("{} - Training loss: {} - Test loss: {}".format(i, fold_train_loss, fold_test_loss))
test_loss_mean = np.mean(losses_test)
print("Average test loss: {}".format(test_loss_mean))
test_loss_var = np.var(losses_test)
print("Variance test loss: {}".format(test_loss_var))

0 - Training loss: 0.4514420069441826 / Test loss: 0.4505877245451047
1 - Training loss: 0.45047548950479904 / Test loss: 0.45073015408608624
2 - Training loss: 0.45046467399687345 / Test loss: 0.45069018416474915
3 - Training loss: 0.4520666793653859 / Test loss: 0.45222240628465415
4 - Training loss: 0.45049214587484815 / Test loss: 0.4512321791988299
Average test loss: 0.45109252965588487
Variance test loss: 3.688298677555393e-07


## Least squares

In [45]:
# 5-fold cross-validation of the least_squares solver (no hyper-parameters).
k_fold = 5

# Fold membership for every sample, built from the notebook-wide seed.
k_indices = build_k_indices(y, k_fold, seed)

# One (training loss, test loss) pair is collected per fold.
losses_train, losses_test = [], []
for k in range(k_fold):
    loss_train, loss_test = cross_validation(y, tX, k_indices, k, least_squares)
    losses_train.append(loss_train)
    losses_test.append(loss_test)

# Per-fold report followed by the mean and variance of the test losses.
for i, (fold_train_loss, fold_test_loss) in enumerate(zip(losses_train, losses_test)):
    print("{} - Training loss: {} - Test loss: {}".format(i, fold_train_loss, fold_test_loss))
test_loss_mean = np.mean(losses_test)
print("Average test loss: {}".format(test_loss_mean))
test_loss_var = np.var(losses_test)
print("Variance test loss: {}".format(test_loss_var))

0 - Training loss: 0.4508212478 / Test loss: 0.44918415140000006
1 - Training loss: 0.4504567758 / Test loss: 0.4506393482
2 - Training loss: 0.45045048000000004 / Test loss: 0.4506645440000001
3 - Training loss: 0.45035914595000015 / Test loss: 0.4510302876500001
4 - Training loss: 0.45037804955000016 / Test loss: 0.45095455445000016
Average test loss: 0.4504945771400001
Variance test loss: 4.5312557792114635e-07


## Ridge regression

In [46]:
# 5-fold cross-validation of ridge_regression on the full data set
# (no grouping by jet).
k_fold = 5
lambda_ = 0.002

# Fold membership for every sample, built from the notebook-wide seed.
k_indices = build_k_indices(y, k_fold, seed)

# One (training loss, test loss) pair is collected per fold.
losses_train, losses_test = [], []
for k in range(k_fold):
    loss_train, loss_test = cross_validation(
        y, tX, k_indices, k, ridge_regression, lambda_=lambda_
    )
    losses_train.append(loss_train)
    losses_test.append(loss_test)

# Per-fold report followed by the mean and variance of the test losses.
for i, (fold_train_loss, fold_test_loss) in enumerate(zip(losses_train, losses_test)):
    print("{} - Training loss: {} - Test loss: {}".format(i, fold_train_loss, fold_test_loss))
test_loss_mean = np.mean(losses_test)
print("Average test loss: {}".format(test_loss_mean))
test_loss_var = np.var(losses_test)
print("Variance test loss: {}".format(test_loss_var))

0 - Training loss: 0.4508220284027237 / Test loss: 0.44919145429913804
1 - Training loss: 0.45045756218790195 / Test loss: 0.45063940720782214
2 - Training loss: 0.45045126648783357 / Test loss: 0.4506644776432121
3 - Training loss: 0.45035993388755735 / Test loss: 0.45102840171624253
4 - Training loss: 0.45037883718750493 / Test loss: 0.45095304525523394
Average test loss: 0.45049535722432965
Variance test loss: 4.486259432284236e-07


In [40]:
# 5-fold cross-validation of ridge regression with the data grouped by jet.
# lambda_ and degree each carry three entries, handed as-is to
# cross_validation_ridge_regression.
k_fold = 5
lambda_ = [0.002, 0.002, 0.001]
degree = [1, 4, 5]

# Fold membership for every sample, built from the notebook-wide seed.
k_indices = build_k_indices(y, k_fold, seed)

# One (training loss, test loss) pair is collected per fold.
losses_train, losses_test = [], []
for k in range(k_fold):
    fold_result = cross_validation_ridge_regression(y, tX, k_indices, k, lambda_, degree)
    loss_train, loss_test = fold_result
    losses_train.append(loss_train)
    losses_test.append(loss_test)

# Per-fold report followed by the mean and variance of the test losses.
for i, (fold_train_loss, fold_test_loss) in enumerate(zip(losses_train, losses_test)):
    print("{} - Training loss: {} - Test loss: {}".format(i, fold_train_loss, fold_test_loss))
test_loss_mean = np.mean(losses_test)
print("Average test loss: {}".format(test_loss_mean))
test_loss_var = np.var(losses_test)
print("Variance test loss: {}".format(test_loss_var))

0 - Training loss: 0.27641006415678093 - Test loss: 0.28116923876058664
1 - Training loss: 0.27615545557588717 - Test loss: 0.27898587725652396
2 - Training loss: 0.2753791494242234 - Test loss: 0.2819127993284606
3 - Training loss: 0.2760791921759144 - Test loss: 0.27807856503874867
4 - Training loss: 0.2761538794294806 - Test loss: 0.2764520940152006
Average test loss: 0.2793197148799041
Variance test loss: 4.003995090645514e-06


### run.py code --> ridge regression

In [22]:
# Version 4 (labelled "best result" by the author): one ridge regression model
# per jet subset, trained on the full training set and applied to the test set.
print('Start!\n')

DATA_TRAIN_PATH = 'data/train.csv'
DATA_TEST_PATH = 'data/test.csv'

y, tx_train, ids = load_csv_data(DATA_TRAIN_PATH)
_, tx_test, ids_test = load_csv_data(DATA_TEST_PATH)

# Per-subset settings: ridge penalty and polynomial expansion degree.
lambda_ = [0.002,0.002,0.001]
degree = [1,4,5]

# Row selectors for each jet subset, computed separately for train and test.
msks_jet_train = get_jet_masks(tx_train)
msks_jet_test = get_jet_masks(tx_test)

# Predictions for the whole test set, filled in subset by subset.
y_pred = np.zeros(tx_test.shape[0])

for idx in range(len(msks_jet_train)):
    train_mask = msks_jet_train[idx]
    test_mask = msks_jet_test[idx]

    y_train = y[train_mask]

    # Pre-process the train and test rows of this subset together.
    x_train, x_test = process_data(tx_train[train_mask], tx_test[test_mask])

    # Same polynomial expansion on both sides.
    subset_degree = degree[idx]
    phi_train = build_poly(x_train, subset_degree)
    phi_test = build_poly(x_test, subset_degree)

    # Fit on the training subset; the returned loss is not used further.
    weights, loss = ridge_regression(y_train, phi_train, lambda_[idx])

    # Write this subset's predictions into its slots of the global vector.
    y_test_pred = predict_labels(weights, phi_test)
    y_pred[test_mask] = y_test_pred

OUTPUT_PATH = 'data/output_ridge_regression.csv'
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)

print('Finish!')

Start!

Done !


In [42]:
# Version 7: one ridge regression model per jet subset, with a fixed set of
# columns removed from subsets 0 and 1 before pre-processing.
print('Start!\n')

DATA_TRAIN_PATH = 'data/train.csv'
DATA_TEST_PATH = 'data/test.csv'

y, tx_train, ids = load_csv_data(DATA_TRAIN_PATH)
_, tx_test, ids_test = load_csv_data(DATA_TEST_PATH)

# Per-subset settings: ridge penalty and polynomial expansion degree.
lambda_ = [0.002,0.002,0.001]
degree = [1,4,5]

# Row selectors for each jet subset, computed separately for train and test.
dict_jet_train = group_features_by_jet(tx_train)
dict_jet_test = group_features_by_jet(tx_test)

# Column indices removed per subset (author's note: "correlation --> delete
# columns"). A subset without an entry keeps all of its columns.
columns_to_drop = {
    0: [4,5,6,8,12,22,23,24,25,26,27,28],
    1: [4,5,6,12,22,26,27,28,29],
}

# Predictions for the whole test set, filled in subset by subset.
y_pred = np.zeros(tx_test.shape[0])

for idx in range(len(dict_jet_train)):
    train_rows = dict_jet_train[idx]
    test_rows = dict_jet_test[idx]

    x_train = tx_train[train_rows]
    x_test = tx_test[test_rows]
    y_train = y[train_rows]

    # Drop the same columns from both sides so train and test stay aligned.
    if idx in columns_to_drop:
        dropped = columns_to_drop[idx]
        x_train = np.delete(x_train, dropped, 1)
        x_test = np.delete(x_test, dropped, 1)

    # Pre-processing of data (the subset index is passed along).
    x_train, x_test = process_data_ridge_regression(x_train, x_test,idx)

    subset_degree = degree[idx]
    temp_train = build_poly(x_train, subset_degree)
    temp_test = build_poly(x_test, subset_degree)

    # Fit on the training subset; the returned loss is not used further.
    weights, loss = ridge_regression(y_train, temp_train, lambda_[idx])

    # Write this subset's predictions into its slots of the global vector.
    y_test_pred = predict_labels(weights, temp_test)
    y_pred[test_rows] = y_test_pred

# NOTE(review): the version 4 cell of this notebook writes to this same path,
# so whichever cell runs last determines the file's contents.
OUTPUT_PATH = 'data/output_ridge_regression.csv'
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)

print('Finish!')

Start!

Finish!


## Logistic regression

In [10]:
# 10-fold cross-validation of logistic_regression.
# NOTE(review): the output recorded for this cell shows nan training losses,
# test losses around 1e8 and warnings pointing at the sigmoid's np.exp call.
# Presumably the optimisation diverges with these settings on the raw tX —
# confirm before relying on these numbers.
k_fold = 10
gamma = 0.6
max_iters = 100
logistic_options = dict(initial_w=None, max_iters=max_iters, gamma=gamma)

# Fold membership for every sample, built from the notebook-wide seed.
k_indices = build_k_indices(y, k_fold, seed)

# One (training loss, test loss) pair is collected per fold.
losses_train, losses_test = [], []
for k in range(k_fold):
    loss_train, loss_test = cross_validation(y, tX, k_indices, k, logistic_regression, **logistic_options)
    losses_train.append(loss_train)
    losses_test.append(loss_test)

# Per-fold report followed by summary statistics of the test losses.
for i, (fold_train_loss, fold_test_loss) in enumerate(zip(losses_train, losses_test)):
    print("{} - Training loss: {} / Test loss: {}".format(i, fold_train_loss, fold_test_loss))
test_loss_mean = np.mean(losses_test)
print("Average test loss: {}".format(test_loss_mean))
test_loss_var = np.var(losses_test)
print("Variance test loss: {}".format(test_loss_var))
test_loss_min = np.min(losses_test)
print("Min test loss: {}".format(test_loss_min))
test_loss_max = np.max(losses_test)
print("Max test loss: {}".format(test_loss_max))

  return 1.0 / (1 + np.exp(-t))
  pred = sigmoid(np.ones(len(w))+tx.dot(w))


0 - Training loss: nan / Test loss: 339032417.7031971
1 - Training loss: nan / Test loss: 470008119.29838735
2 - Training loss: nan / Test loss: 455406651.5480087
3 - Training loss: nan / Test loss: 560444754.7400097
4 - Training loss: nan / Test loss: 457219234.9831971
5 - Training loss: nan / Test loss: 562455338.7031963
6 - Training loss: nan / Test loss: 613889940.9031962
7 - Training loss: nan / Test loss: 464505568.8200088
8 - Training loss: nan / Test loss: 387523905.06319726
9 - Training loss: nan / Test loss: 696379747.738406
Average test loss: 500686567.9500805
Variance test loss: 1.0362501053044878e+16
Min test loss: 339032417.7031971
Max test loss: 696379747.738406


## Regularized logistic regression