In [None]:
import scripts.proj1_helpers as helper
import run as imp
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
%matplotlib inline
%load_ext autoreload
%autoreload 2

## Feature engineering

In [None]:
# Read the training CSV into a DataFrame; used by the exploratory cells below.
df = pd.read_csv('train.csv')

In [None]:
# Read the test CSV into a DataFrame.
# NOTE(review): `dfb` is not referenced by any later visible cell.
dfb = pd.read_csv('test.csv')

In [None]:
# Count the cells equal to the -999.0 sentinel value.
# `df[df == -999.0]` only masks non-matching cells to NaN and keeps every row,
# so taking `len(...)` of it returns the row count; sum the boolean mask instead
# (first per column, then across columns).
int((df == -999.0).sum().sum())

In [None]:
# Print every column name next to its positional index.
for position, column_name in enumerate(df.columns):
    print(position, column_name)

In [None]:
# Distinct values taken by the PRI_jet_num column.
df['PRI_jet_num'].unique()

## Import the data

boson is -1

not boson is 1

In [None]:
# Load both CSV files through the project helper, which returns a 3-tuple per file.
# NOTE(review): the (labels, features, ids) order is read off the variable names —
# confirm against helper.load_csv_data.
# `y_test` is not referenced by any later visible cell.
y_train, x_train, ids_train = helper.load_csv_data('train.csv')
y_test, x_test, ids_test = helper.load_csv_data('test.csv')

In [None]:
# Class balance of the training labels: the first line counts entries different
# from 1, the second counts entries different from -1.
# NOTE(review): here the "boson" count is the entries that are NOT 1, whereas the
# prediction-count cells further down report the entries that are NOT -1 as
# "boson". One of the two labellings is swapped — confirm the encoding used by
# helper.load_csv_data.
print('Number of boson:', np.count_nonzero(y_train != 1))
print('Number of other:', np.count_nonzero(y_train != -1))

In [None]:
# Recode every negative label to 0, in place; from here on `y_train` holds no -1.
# NOTE(review): later cells still compare `y_train` against -1 (`y_map` and the
# `tupled_boson` filter). When the notebook is run top to bottom those
# comparisons match nothing.
y_train[y_train < 0] = 0

## Logistic regression per category

In [None]:
# Wall-clock timing around the per-category logistic run (polynomial degree 4).
# NOTE(review): `end - start` is never displayed in the visible cells.
start = time.perf_counter()
preds = imp.train_predict_logistic_cat(
    y_train,
    x_train,
    x_test,
    deg=4,
)
end = time.perf_counter()

In [None]:
# Hand the test ids and the per-category predictions to the submission helper,
# with 'poly_cat_logistic.csv' as the output name.
helper.create_csv_submission(ids_test, preds, 'poly_cat_logistic.csv')

### Cross validation

In [None]:
# Timed 3-fold cross-validation of the per-category logistic model (degree 4).
# NOTE(review): `end - start` is never displayed in the visible cells.
start = time.perf_counter()
accuracy = imp.logistic_cross_validation(
    y_train,
    x_train,
    k_fold=3,
    train_predict_logistic=imp.train_predict_logistic_cat,
    deg=4,
)
end = time.perf_counter()

In [None]:
accuracy

In [None]:
# Arithmetic mean of the values returned by the cross-validation above.
sum(accuracy) / len(accuracy)

## Logistic regression

In [None]:
# Run the single-model logistic pipeline with max_iter=100 and threshold=1.
# NOTE(review): this overwrites `preds` from the per-category run, and the new
# value is not used by any later visible cell.
preds = imp.train_predict_logistic(y_train, x_train, x_test, max_iter=100, threshold=1)

### Cross Validation

In [None]:
# Timed 5-fold cross-validation of the single-model logistic pipeline.
# NOTE(review): `end - start` is never displayed in the visible cells.
start = time.perf_counter()
accuracy = imp.logistic_cross_validation(
    y_train,
    x_train,
    k_fold=5,
    train_predict_logistic=imp.train_predict_logistic,
)
end = time.perf_counter()

In [None]:
accuracy

In [None]:
# Arithmetic mean of the values returned by the cross-validation above.
sum(accuracy) / len(accuracy)

### Raw least squares (score: ?)

In [None]:
# Least squares on the features exactly as loaded; the trailing `loss` displays
# the second element returned by imp.least_squares.
# NOTE(review): run top to bottom, `y_train` has already had its negative labels
# recoded to 0 by the in-place assignment earlier in the notebook.
(w, loss) = imp.least_squares(y_train, x_train)
loss

### Std least squares (score: 0.73)

In [None]:
# NOTE(review): `col_std_x_train` is not assigned anywhere in the visible
# notebook, so this cell raises NameError on a fresh kernel.
(w, loss) = imp.least_squares(y_train, col_std_x_train)
loss

In [None]:
# NOTE(review): `std_x_train` is first assigned in a LATER cell
# (`std_x_train = standardize(x_train)`), so this cell fails under
# Restart & Run All.
(w, loss) = imp.least_squares(y_train, std_x_train)
loss

### Eigenvalues reduction (score: 0.62)

In [None]:
# Scratch check of np.tile: repeats the 2x2 matrix twice along the columns.
# NOTE(review): nothing below uses `a`; looks like a leftover experiment.
a = np.matrix([[1,2],[3,4]])
np.tile(a, 2)

In [None]:
# NOTE(review): `standardize` is called unqualified but is neither defined nor
# imported in the visible cells (only `helper` and `imp` are imported) — confirm
# where it lives and qualify the call.
std_x_train = standardize(x_train)

In [None]:
# Reduced SVD (full_matrices=False) of the standardized training features.
# np.linalg.svd returns the singular values `s` in descending order and the
# third factor already transposed (rows of `v` are right singular vectors).
u, s, v = np.linalg.svd(std_x_train, full_matrices=False)
print('u shape:', u.shape)
print('s shape:', s.shape)
print('v shape:', v.shape)

In [None]:
# Singular-value spectrum on a logarithmic y axis.
# NOTE(review): the title is just 'log' and the axes are unlabeled.
plt.plot(s)
plt.yscale('log')
plt.title('log')
plt.grid(True)
plt.show()

In [None]:
# Rank-23 reconstruction from the 23 largest singular triplets; the result has
# the same shape as `std_x_train`.
# NOTE(review): 23 is a hard-coded cut-off with no stated justification.
shortened_x_train = u[:, :23] @ np.diag(s[:23]) @ v[:23,:]

In [None]:
shortened_x_train.shape

In [None]:
# NOTE(review): this fits on the raw `x_train`; `shortened_x_train` computed
# above is never passed to any model in the visible cells, although the
# submission written below is named 'shortened_eigenvalues_submission.csv'.
(w, loss) = imp.least_squares(y_train, x_train)

In [None]:
loss

In [None]:
# Label predictions for the test features as loaded (no standardization or
# rank reduction is applied to `x_test` in the visible cells).
y_pred = helper.predict_labels(w, x_test)

In [None]:
# Predicted class counts: entries different from -1, then entries different from 1.
# NOTE(review): here "boson" is the entries that are NOT -1, whereas the
# training-label count near the top of the notebook reports the entries that are
# NOT 1 as "boson". One of the two labellings is swapped — confirm the encoding.
print('Number of boson:', np.count_nonzero(y_pred != -1))
print('Number of other:', np.count_nonzero(y_pred != 1))

In [None]:
# Hand the test ids and predictions to the submission helper, with
# 'shortened_eigenvalues_submission.csv' as the output name.
helper.create_csv_submission(ids_test, y_pred, 'shortened_eigenvalues_submission.csv')

In [None]:
# Number of distinct values in each column (iterating the transpose walks the columns).
# NOTE(review): `train_data` is not assigned anywhere in the visible notebook;
# presumably this was meant to be `x_train` — confirm before relying on it.
unique_values_per_column_count = [len(set(col)) for col in train_data.T]

In [None]:
unique_values_per_column_count

In [None]:
def y_map(y):
    """Map a single label to a regression target.

    Returns 0.0 when ``y`` equals -1 and 0.2 for any other value.

    Both branches return a float on purpose: ``np.vectorize`` (used to apply
    this function to the label array) takes its output dtype from the first
    call unless ``otypes`` is given, so returning the integer ``0`` for a
    leading -1 label would produce an integer array and truncate every 0.2
    to 0.
    """
    if y == -1:
        return 0.0
    return 0.2

# Apply y_map element-wise. `otypes` pins a float result: without it,
# np.vectorize takes the output dtype from the first element's result, so a
# leading label that maps to the integer 0 would yield an integer array and
# truncate every 0.2 to 0. It also lets the call work on an empty array.
y_train_mapped = np.vectorize(y_map, otypes=[float])(y_train)

### PCA

In [None]:
# Gram matrix X^T X of the standardized features (one row/column per feature).
square_train_data = std_x_train.T @ std_x_train

In [None]:
# Eigen-decomposition of the symmetric Gram matrix. np.linalg.eigh returns the
# eigenvalues in ascending order, with the matching eigenvectors as columns of `v`.
# NOTE(review): `w` and `v` are also used elsewhere in this notebook for
# regression weights and an SVD factor; whichever cell ran last wins.
w, v = np.linalg.eigh(square_train_data)

In [None]:
# Number of eigenvalues, then the shape of the eigenvector matrix.
print(w.shape[0])
print(v.shape)

In [None]:
w

In [None]:
def keep_variance(percentage, vec):
    """Pick how many trailing entries of ``vec`` to keep for a target share.

    For every ``i`` in ``1 .. len(vec)`` the share ``sum(vec[-i:]) / sum(vec)``
    is computed, and the ``i`` whose share is closest to ``percentage`` is
    returned. "Closest" is by absolute difference, so the achieved share may
    land slightly below or above the target.

    The notebook passes the eigenvalues from ``np.linalg.eigh``, which are in
    ascending order, so the trailing entries are the largest ones.

    Parameters
    ----------
    percentage : float
        Target share of ``sum(vec)``, e.g. ``0.9``.
    vec : array_like, 1-D
        Values to accumulate from the end.

    Returns
    -------
    int
        1-based count of trailing entries to keep.

    Raises
    ------
    ValueError
        If ``vec`` is empty.
    """
    values = np.asarray(vec, dtype=float)
    if values.size == 0:
        raise ValueError("vec must contain at least one value")
    # The length comes from `vec` itself, not from the notebook-level `w`,
    # which other cells rebind to regression weights.
    # tail_sums[i - 1] == sum of the last i entries; one cumulative pass
    # replaces the per-i re-summation.
    tail_sums = np.cumsum(values[::-1])
    ratio = tail_sums / values.sum()
    return np.argmin(np.abs(ratio - percentage)) + 1

In [None]:
# Number of trailing (largest) eigenvalues whose share of the eigenvalue sum is
# closest to 0.9.
index_keeper = keep_variance(0.9, w)
print(index_keeper)

In [None]:
# Eigenvalue spectrum (ascending, as returned by eigh) on a logarithmic y axis.
# NOTE(review): the title is just 'log' and the axes are unlabeled.
plt.plot(w)
plt.yscale('log')
plt.title('log')
plt.grid(True)
plt.show()

In [None]:
# Keep the last `index_keeper` columns of `v`, i.e. the eigenvectors that belong
# to the largest eigenvalues (eigh sorts ascending).
filtered_v = v[:,-index_keeper:]

In [None]:
filtered_v.shape

In [None]:
# Project the standardized training features onto the retained eigenvectors.
project_x_train = std_x_train @ filtered_v

In [None]:
# Split the projected samples by label with boolean masks. This selects the same
# rows as the earlier zip/filter/tolist round trip, without converting the whole
# matrix to Python lists, and yields an empty (0, k) array instead of raising
# IndexError when a label value does not occur.
# NOTE(review): an earlier cell recodes negative labels to 0 in place, so when
# the notebook runs top to bottom `y_train == -1` matches nothing — confirm
# which encoding this section expects.
_projected_rows = np.asarray(project_x_train)
tupled_boson = _projected_rows[y_train == -1]
tupled_other = _projected_rows[y_train == 1]

In [None]:
tupled_boson.shape

In [None]:
# Scatter rows 1000-1999 of each class: first projected component on the x axis
# against the remaining component(s) on the y axis. `tupled_boson` is drawn as
# blue circles, `tupled_other` as red circles.
# NOTE(review): no axis labels, title or legend.
plt.plot(tupled_boson[1000:2000, :1], tupled_boson[1000:2000, 1:], 'bo')
plt.plot(tupled_other[1000:2000, :1], tupled_other[1000:2000, 1:], 'ro')
plt.show()

In [None]:
# Least squares on the projected training features.
(weight, loss) = imp.least_squares(y_train, project_x_train)

In [None]:
loss

In [None]:
# Project the test features with the same eigenvectors, then predict labels.
# NOTE(review): `std_x_test` is not assigned anywhere in the visible notebook,
# so this cell raises NameError on a fresh kernel.
y_pred = helper.predict_labels(weight, std_x_test @ filtered_v)

In [None]:
# Predicted class counts: entries different from -1, then entries different from 1.
# NOTE(review): here "boson" is the entries that are NOT -1, whereas the
# training-label count near the top of the notebook reports the entries that are
# NOT 1 as "boson". One of the two labellings is swapped — confirm the encoding.
print('Number of boson:', np.count_nonzero(y_pred != -1))
print('Number of other:', np.count_nonzero(y_pred != 1))

### Polynomial feature

In [None]:
np.array(range(2))

In [None]:
def polynomial_enhancement(x, deg):
    """Expand a 2-D array with element-wise powers 0..deg of every column.

    The result has ``x.shape[1] * (deg + 1)`` columns laid out in consecutive
    blocks: block ``p`` (for ``p = 0 .. deg``) equals ``x ** p``.

    Block 0 therefore consists of ``x.shape[1]`` identical columns of ones, so
    the output is rank-deficient whenever ``x`` has more than one column.
    """
    n_features = x.shape[1]
    # One exponent per output column: 0 repeated n_features times, then 1, ...
    exponents = np.repeat(np.array(range(deg + 1)), n_features)
    # Tile x side by side (deg + 1 copies) and raise each column to its exponent.
    return np.tile(x, deg + 1) ** exponents

In [None]:
# Degree-9 polynomial expansion of the standardized train and test features;
# the last line displays the shape of the expanded training matrix.
# NOTE(review): `std_x_test` is not assigned anywhere in the visible notebook,
# so the second line raises NameError on a fresh kernel.
enhanced_col_std_x_train = polynomial_enhancement(std_x_train, 9)
enhanced_col_std_x_test = polynomial_enhancement(std_x_test, 9)
enhanced_col_std_x_train.shape

In [None]:
def pseudo_least_squares(y, x):
    """Least-squares fit through the SVD-based pseudo-inverse of ``x``.

    Parameters
    ----------
    y : array_like
        Targets, one entry (or one row) per row of ``x``.
    x : array_like, shape (n_samples, n_features)
        Design matrix.

    Returns
    -------
    tuple
        ``(w, loss)`` where ``w`` is the minimum-norm least-squares solution
        and ``loss`` is whatever ``imp.mse(y, x, w)`` returns for it.

    Notes
    -----
    Singular values at or below ``max(x.shape) * eps * S.max()`` are treated
    as zero, instead of being inverted. Inverting them blindly turns exactly
    or nearly collinear columns (e.g. the repeated columns of ones produced by
    ``polynomial_enhancement``) into inf/NaN or astronomically large weights.
    """
    U, S, Vt = np.linalg.svd(x, full_matrices=False)

    # Relative cut-off, the same convention np.linalg.matrix_rank uses.
    largest = S.max() if S.size else 0.0
    cutoff = max(np.shape(x)) * np.finfo(S.dtype).eps * largest
    keep = S > cutoff
    inv_S = np.zeros_like(S)
    inv_S[keep] = 1.0 / S[keep]

    # Multiply right-to-left: U.T @ y is only n_features long, whereas the
    # left-to-right product would first build an (n_features x n_samples) matrix.
    projected = U.T @ y
    # The double transpose scales row i by inv_S[i] for both 1-D and 2-D `y`.
    w = Vt.T @ (projected.T * inv_S).T
    loss = imp.mse(y, x, w)
    return (w, loss)

In [None]:
# Fit on the degree-9 expansion with the notebook-local pseudo_least_squares and
# display the returned loss.
# NOTE(review): `imp.pseudo_least_squares` is also used further down; two
# definitions of the same routine are in play.
(w, loss) = pseudo_least_squares(y_train, enhanced_col_std_x_train)
loss

In [None]:
# Label predictions for the expanded test features, using the weights fitted above.
y_pred = helper.predict_labels(w, enhanced_col_std_x_test)

In [None]:
# Hand the test ids and predictions to the submission helper, with
# 'basic_poly_enhancement_9.csv' as the output name.
helper.create_csv_submission(ids_test, y_pred, 'basic_poly_enhancement_9.csv')

#### Cross validation

In [None]:
# Cross-validate imp.pseudo_least_squares on the standardized features.
# NOTE(review): 10 and 5 are passed positionally — presumably the fold count and
# the polynomial degree (the test set is expanded with degree 5 below); confirm
# against imp.cross_validation_v2. The call also rebinds `w`.
acc, loss_train, loss_test, w = imp.cross_validation_v2(y_train, std_x_train, 10, imp.pseudo_least_squares, 5)

In [None]:
acc

In [None]:
# Degree-5 expansion of the standardized test features, this time through
# imp.polynomial_enhancement rather than the notebook-local function.
# NOTE(review): `std_x_test` is not assigned anywhere in the visible notebook.
enhanced_col_std_x_test = imp.polynomial_enhancement(std_x_test, 5)

In [None]:
# Label predictions for the degree-5 test expansion, using the `w` returned by
# the cross-validation call.
y_pred = helper.predict_labels(w, enhanced_col_std_x_test)

In [None]:
# Hand the test ids and predictions to the submission helper, with
# '0_8_accuracy_poly_5.csv' as the output name.
helper.create_csv_submission(ids_test, y_pred, '0_8_accuracy_poly_5.csv')

In [None]:
# Scratch check of list.append; `test` is not used by any other visible cell.
# NOTE(review): looks like a leftover experiment.
test = []

test.append(1)
test

In [None]:
# NOTE(review): identical to the cross-validation call a few cells above; it
# re-runs the same computation and rebinds acc, loss_train, loss_test and w.
acc, loss_train, loss_test, w = imp.cross_validation_v2(y_train, std_x_train, 10, imp.pseudo_least_squares, 5)