In [1]:
import scripts.proj1_helpers as helper
import implementations as imp
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
%matplotlib inline
%load_ext autoreload
%autoreload 2

## Feature engineering

In [None]:
df = pd.read_csv('train.csv')

In [None]:
dfb = pd.read_csv('test.csv')

In [None]:
# Count the cells equal to the -999.0 sentinel. Indexing a DataFrame with a
# boolean frame keeps its shape (non-matching cells become NaN), so taking
# len() of that result only returns the number of rows.
int((df == -999.0).sum().sum())

In [None]:
for i, c in enumerate(df.columns):
    print(i, c)

In [None]:
df.PRI_jet_num.unique()

## Import the data

boson is -1

not boson is 1

In [2]:
y_train, x_train, ids_train = helper.load_csv_data('train.csv')
y_test, x_test, ids_test = helper.load_csv_data('test.csv')
print(x_train.shape)
print(x_test.shape)

(250000, 30)
(568238, 30)


In [3]:
# y_train - 1 is nonzero wherever the label differs from 1, so this counts
# the samples whose label is not 1.
print('Number of boson:', np.count_nonzero(y_train-1))
# y_train + 1 is nonzero wherever the label differs from -1, so this counts
# the samples whose label is not -1.
print('Number of other:', np.count_nonzero(y_train+1))

Number of boson: 164333
Number of other: 85667


In [4]:
def col_standardize(x):
    """Standardize each column of a 2-D array using per-column statistics.

    The mean and standard deviation are computed one column at a time
    (iterating over ``x.T``), then every column is centered and scaled.

    Args:
        x: 2-D numpy array, one feature per column.

    Returns:
        Array of the same shape as ``x`` with standardized columns.
    """
    centers, scales = [], []
    for column in x.T:
        centers.append(np.mean(column))
        scales.append(np.std(column))
    return (x - centers) / scales

In [5]:
def standardize(x):
    """Center and scale ``x`` along axis 0 (per column for a 2-D array).

    Columns with zero standard deviation are only centered: their scale is
    replaced by 1 so the result is 0 instead of NaN/inf from a 0/0 division.

    Args:
        x: numpy array; statistics are taken along axis 0.

    Returns:
        Array of the same shape as ``x``, standardized along axis 0.
    """
    centered = x - np.mean(x, axis=0)
    scale = np.std(x, axis=0)
    # A constant column has std == 0; dividing by it would yield NaN.
    scale = np.where(scale == 0, 1.0, scale)
    return centered / scale

In [6]:
std_x_train = standardize(x_train)
std_x_test = standardize(x_test)

In [7]:
col_std_x_train = col_standardize(x_train)
col_std_x_test = col_standardize(x_test)

In [8]:
# Recode every negative label to 0, in place: y_train itself is overwritten.
# NOTE(review): from here on y_train contains no negative values, so any
# later cell that still tests y_train against -1 will match nothing.
y_train[y_train < 0] = 0

## Logistic regression per category

In [9]:
x_test[:, 22]

array([ 0.,  1.,  0., ...,  0.,  1.,  0.])

In [10]:
x_test.shape, x_train.shape

((568238, 30), (250000, 30))

In [11]:
# Train one logistic regression per value of the categorical column and use
# it to predict the test rows that have the same value.
cat_col = 22
PRI_jet_nums = np.unique(x_train[:, cat_col])
# Test rows whose category never appears in x_train keep the initial 0.
predictions = np.zeros(x_test.shape[0])

for num in PRI_jet_nums:
    # Row positions must be computed separately for each set: x_train and
    # x_test have different lengths, so train positions do not identify the
    # test rows of this category.
    cat_indices = np.where(x_train[:, cat_col] == num)
    test_cat_indices = np.where(x_test[:, cat_col] == num)

    x_train_cat = x_train[cat_indices]
    x_train_cat = np.delete(x_train_cat, cat_col, axis=1)

    # Drop the columns that are constant within this category (zero std),
    # since they cannot be standardized.
    stds = np.std(x_train_cat, axis=0)
    deleted_cols_ids = np.where(stds == 0)
    x_train_cat = np.delete(x_train_cat, deleted_cols_ids, axis=1)
    x_train_cat = standardize(x_train_cat)

    _, w, _ = imp.logistic_regression(y_train[cat_indices],
                                      x_train_cat,
                                      max_iter=100,
                                      threshold=10**(-4))

    # Apply the same column removals to the test rows of this category.
    x_test_cat = x_test[test_cat_indices]
    x_test_cat = np.delete(x_test_cat, cat_col, axis=1)
    x_test_cat = np.delete(x_test_cat, deleted_cols_ids, axis=1)
    x_test_cat = standardize(x_test_cat)

    # Prepend a column of ones to the test features before predicting.
    x_test_cat = np.hstack((np.ones((x_test_cat.shape[0], 1)), x_test_cat))

    predictions_cat = helper.predict_labels(w, x_test_cat)
    predictions[test_cat_indices] = predictions_cat

Current iteration=0, the loss=69254.41425128581
Current iteration=1, the loss=69208.87608757608
Current iteration=2, the loss=69163.42890145033
Current iteration=3, the loss=69072.76129623411
Current iteration=4, the loss=68937.36926593368
Current iteration=5, the loss=68713.2838731276
Current iteration=6, the loss=68314.71065192547
Current iteration=7, the loss=67621.04252895042
Current iteration=8, the loss=66451.5246190784
Current iteration=9, the loss=64570.46925149276
Current iteration=10, the loss=61647.83631342309
Current iteration=11, the loss=57541.75130472867
Current iteration=12, the loss=52407.475617801836
Current iteration=13, the loss=47021.84335877551
Current iteration=14, the loss=42540.593403390245
Current iteration=15, the loss=39887.175640660265
Current iteration=16, the loss=39141.47178373085
Current iteration=17, the loss=39097.868114203004
Current iteration=18, the loss=39097.642530213176




Current iteration=0, the loss=53749.4049693404
Current iteration=1, the loss=53729.62380884933
Current iteration=2, the loss=53709.882131355895
Current iteration=3, the loss=53690.1799936917
Current iteration=4, the loss=53670.51716911932
Current iteration=5, the loss=53631.28923795276
Current iteration=6, the loss=53572.71115550812
Current iteration=7, the loss=53495.1106710966
Current iteration=8, the loss=53379.7469874574
Current iteration=9, the loss=53209.000522258626
Current iteration=10, the loss=52967.18135011858
Current iteration=11, the loss=52641.57181992406
Current iteration=12, the loss=52189.16134064753
Current iteration=13, the loss=51563.751022929995
Current iteration=14, the loss=50742.713198306636
Current iteration=15, the loss=49707.33528489381
Current iteration=16, the loss=48464.35793509028
Current iteration=17, the loss=47074.45200062036
Current iteration=18, the loss=45642.26197520335
Current iteration=19, the loss=44310.364571219194
Current iteration=20, the los

In [None]:
helper.create_csv_submission(ids_test, predictions, 'separated.csv')

## Logistic regression

In [None]:
niter = 5000

In [None]:
start = time.perf_counter()
l, w, data = imp.stochastic_logistic_regression(y_train, std_x_train, max_iter=niter, batch_size=150, threshold=10**(-4))
end = time.perf_counter()

In [None]:
print("Runtime:", end - start)

In [None]:
len(data)

In [None]:
plt.plot(data[100:])

In [None]:
print("Loss first iteration", data[0])
print("Loss iteration at iteration", niter, data[-1])
print("delta of losses", data[0] - data[-1])

In [None]:
# Prepend a column of ones to the standardized test features. The value is
# rebuilt from x_test rather than from std_x_test itself, so re-running this
# cell does not stack an extra column each time.
std_x_test = np.hstack((np.ones((x_test.shape[0], 1)), standardize(x_test)))

In [None]:
predictions = helper.predict_labels(w, std_x_test)

In [None]:
predictions

In [None]:
helper.create_csv_submission(ids_test, predictions, 'newton.csv')

### Raw least squares (score: ?)

In [None]:
(w, loss) = imp.least_squares(y_train, x_train)
loss

### Std least squares (score: 0.73)

In [None]:
(w, loss) = imp.least_squares(y_train, col_std_x_train)
loss

In [None]:
(w, loss) = imp.least_squares(y_train, std_x_train)
loss

### Eigenvalues reduction (score: 0.62)

In [None]:
a = np.matrix([[1,2],[3,4]])
np.tile(a, 2)

In [None]:
std_x_train = standardize(x_train)

In [None]:
u, s, v = np.linalg.svd(std_x_train, full_matrices=False)
print('u shape:', u.shape)
print('s shape:', s.shape)
print('v shape:', v.shape)

In [None]:
plt.plot(s)
plt.yscale('log')
plt.title('log')
plt.grid(True)
plt.show()

In [None]:
shortened_x_train = u[:, :23] @ np.diag(s[:23]) @ v[:23,:]

In [None]:
shortened_x_train.shape

In [None]:
# NOTE(review): this fits on the raw x_train; shortened_x_train (the
# 23-component SVD reconstruction computed in this section) is not used
# here — confirm which matrix was intended.
(w, loss) = imp.least_squares(y_train, x_train)

In [None]:
loss

In [None]:
y_pred = helper.predict_labels(w, x_test)

In [None]:
# y_pred + 1 is nonzero wherever the prediction differs from -1;
# y_pred - 1 is nonzero wherever it differs from 1.
# NOTE(review): the training-set count cell labels the "y != 1" count as
# boson, the opposite of the labelling used here — confirm which is right.
print('Number of boson:', np.count_nonzero(y_pred+1))
print('Number of other:', np.count_nonzero(y_pred-1))

In [None]:
helper.create_csv_submission(ids_test, y_pred, 'shortened_eigenvalues_submission.csv')

In [None]:
# Number of distinct values in each feature column. `train_data` is not
# defined anywhere in this notebook; the training feature matrix is x_train.
unique_values_per_column_count = [len(set(col)) for col in x_train.T]

In [None]:
unique_values_per_column_count

In [None]:
def y_map(y):
    """Map a label to a float target: -1 becomes 0.0, anything else 0.2.

    Both branches return a float on purpose. ``np.vectorize`` infers its
    output dtype from the first call when ``otypes`` is not given, so an
    integer ``0`` for the first element would make the whole output integer
    and truncate every 0.2 to 0.

    Args:
        y: scalar label.

    Returns:
        0.0 if ``y == -1``, otherwise 0.2.
    """
    if y == -1:
        return 0.0
    else:
        return 0.2

y_train_mapped = np.vectorize(y_map)(y_train)

### PCA

In [None]:
square_train_data = std_x_train.T @ std_x_train

In [None]:
w, v = np.linalg.eigh(square_train_data)

In [None]:
print(w.shape[0])
print(v.shape)

In [None]:
w

In [None]:
def keep_variance(percentage, vec):
    """Return how many trailing entries of ``vec`` best match a share of its sum.

    For each ``i`` from 1 to ``len(vec)``, the sum of the last ``i`` entries is
    divided by the total; the ``i`` whose ratio is closest to ``percentage``
    is returned. Only ``vec`` is read, not any notebook-level variable.

    Args:
        percentage: target fraction of the total, e.g. 0.9.
        vec: 1-D array of values. Presumably eigenvalues sorted ascending, so
            that the trailing entries are the largest — confirm with caller.

    Returns:
        The 1-based count of trailing entries to keep.
    """
    vec = np.asarray(vec)
    total = np.sum(vec)
    # Running sums over the reversed vector: entry i-1 is sum(vec[-i:]).
    tail_sums = np.cumsum(vec[::-1])
    ratio = tail_sums / total
    return np.argmin(np.abs(ratio - percentage)) + 1

In [None]:
index_keeper = keep_variance(0.9, w)
print(index_keeper)

In [None]:
plt.plot(w)
plt.yscale('log')
plt.title('log')
plt.grid(True)
plt.show()

In [None]:
filtered_v = v[:,-index_keeper:]

In [None]:
filtered_v.shape

In [None]:
project_x_train = std_x_train @ filtered_v

In [None]:
# Split the projected samples by class with boolean masks. The first class
# is matched with `<= 0` because an earlier cell recodes negative labels to
# 0; an exact `== -1` test would then match nothing and make the split fail.
tupled_boson = project_x_train[y_train <= 0]
tupled_other = project_x_train[y_train == 1]

In [None]:
tupled_boson.shape

In [None]:
plt.plot(tupled_boson[1000:2000, :1], tupled_boson[1000:2000, 1:], 'bo')
plt.plot(tupled_other[1000:2000, :1], tupled_other[1000:2000, 1:], 'ro')
plt.show()

In [None]:
(weight, loss) = imp.least_squares(y_train, project_x_train)

In [None]:
loss

In [None]:
# Project freshly standardized test features onto the kept eigenvectors.
# std_x_test is not reused: an earlier cell prepends a column of ones to it,
# so its column count no longer matches the rows of filtered_v.
y_pred = helper.predict_labels(weight, standardize(x_test) @ filtered_v)

In [None]:
# y_pred + 1 is nonzero wherever the prediction differs from -1;
# y_pred - 1 is nonzero wherever it differs from 1.
# NOTE(review): the training-set count cell labels the "y != 1" count as
# boson, the opposite of the labelling used here — confirm which is right.
print('Number of boson:', np.count_nonzero(y_pred+1))
print('Number of other:', np.count_nonzero(y_pred-1))

### Polynomial feature

In [None]:
np.array(range(2))

In [None]:
def polynomial_enhancement(x, deg):
    """Build polynomial features by raising every column to powers 0..deg.

    The columns of ``x`` are repeated ``deg + 1`` times side by side; the
    k-th copy (starting at k = 0) is raised element-wise to the power k.

    Args:
        x: 2-D numpy array of shape (n_samples, n_features).
        deg: highest power to generate.

    Returns:
        Array of shape (n_samples, n_features * (deg + 1)).
    """
    n_features = x.shape[1]
    # One exponent per output column: n_features zeros, then ones, twos, ...
    exponents = np.repeat(np.arange(deg + 1), n_features)
    replicated = np.tile(x, deg + 1)
    return np.power(replicated, exponents)

In [None]:
enhanced_col_std_x_train = polynomial_enhancement(std_x_train, 9)
# Build the test features from freshly standardized x_test. std_x_test gets
# a column of ones prepended in an earlier cell, which would give the test
# matrix a different number of columns than the training matrix.
enhanced_col_std_x_test = polynomial_enhancement(standardize(x_test), 9)
enhanced_col_std_x_train.shape

In [None]:
def pseudo_least_squares(y, x, rcond=None):
    """Least-squares weights via an SVD-based pseudo-inverse.

    Singular values at or below ``rcond * largest_singular_value`` are
    treated as zero instead of being inverted. Without that cutoff a
    rank-deficient ``x`` (for example one with duplicated columns) makes
    ``1 / S`` blow up and the weights become meaningless.

    Args:
        y: target vector.
        x: feature matrix.
        rcond: relative cutoff for small singular values. Defaults to
            ``max(x.shape) * machine epsilon`` of the singular-value dtype.

    Returns:
        Tuple ``(w, loss)``: the weights and the value of ``imp.mse(y, x, w)``.
    """
    U, S, V = np.linalg.svd(x, full_matrices=False)
    if rcond is None:
        rcond = max(np.shape(x)) * np.finfo(S.dtype).eps
    cutoff = rcond * S.max() if S.size else 0.0
    # Invert only the singular values above the cutoff; the rest stay 0.
    keep = S > cutoff
    S_inv = np.zeros_like(S)
    S_inv[keep] = 1 / S[keep]
    # Multiply from the right so no (n_features x n_samples) matrix is built.
    w = V.T @ (np.diag(S_inv) @ (U.T @ y))
    loss = imp.mse(y, x, w)
    return (w, loss)

In [None]:
(w, loss) = pseudo_least_squares(y_train, enhanced_col_std_x_train)
loss

In [None]:
y_pred = helper.predict_labels(w, enhanced_col_std_x_test)

In [None]:
helper.create_csv_submission(ids_test, y_pred, 'basic_poly_enhancement_9.csv')

#### Cross validation

In [None]:
acc, loss_train, loss_test, w = imp.cross_validation_v2(y_train, std_x_train, 10, imp.pseudo_least_squares, 5)

In [None]:
acc

In [None]:
# Build the test features from freshly standardized x_test. std_x_test gets
# a column of ones prepended in an earlier cell, so it no longer has the
# same columns as std_x_train, which the cross-validation was run on.
enhanced_col_std_x_test = imp.polynomial_enhancement(standardize(x_test), 5)

In [None]:
y_pred = helper.predict_labels(w, enhanced_col_std_x_test)

In [None]:
helper.create_csv_submission(ids_test, y_pred, '0_8_accuracy_poly_5.csv')

In [None]:
test = []

test.append(1)
test

In [None]:
acc, loss_train, loss_test, w = imp.cross_validation_v2(y_train, std_x_train, 10, imp.pseudo_least_squares, 5)