In [22]:
import scripts.proj1_helpers as helper
import run as run
import implementations as imp
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import logistic as logistic
import minimizers
%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [36]:
def special_preprocess(x_tr, x_test, deg_der=1, deg_pri=1, deg_cat=1):
    x = x_tr.copy()
    stds = np.std(x, axis=0)
    deleted_cols_ids = np.where(stds == 0)
    x = np.delete(x, deleted_cols_ids, axis=1)
    run.mean_spec(x)
    x = run.standardize(x)
    #x = special_poly_enhancement(x, deg_der, deg_pri, deg_cat)
    #x = np.hstack((np.ones((x.shape[0], 1)), x))
    x = run.polynomial_enhancement(x, deg_der)
    
    x_te = x_test.copy()
    x_te = np.delete(x_te, deleted_cols_ids, axis=1)
    run.mean_spec(x_te)
    x_te = run.standardize(x_te)
    #x_te = special_poly_enhancement(x_te, deg_der, deg_pri, deg_cat)
    #x_te = np.hstack((np.ones((x_te.shape[0], 1)), x_te))
    x_te = run.polynomial_enhancement(x_te, deg_der)
    return x, x_te

def special_poly_enhancement(x_tr, deg_der, deg_pri, deg_cat):
    x = x_tr.copy()
    
    arr = np.repeat([1], 30)
    arr[0: 14] = deg_der
    arr[14:] = deg_pri
    arr[22] = deg_cat
    arr = arr.tolist()
    
    x = np.repeat(x, arr, axis=1)
    
    powers = list(map(lambda pow_: range(1, pow_+1), arr))
    powers = [p for list_p in powers for p in list_p]
    return x ** powers

## Import the data

In [3]:
y_train, x_train, ids_train = helper.load_csv_data('train.csv')
y_test, x_test, ids_test = helper.load_csv_data('test.csv')

In [None]:
y_train_logistic = y_train.copy()
y_train_logistic[y_train_logistic < 0] = 0

In [81]:
half_index = int(x_train.shape[0]/2)
x_train_half_1 = x_train[:half_index]
x_train_half_2 = x_train[half_index:]

## Good models

#### Least squares custom degrees (4, 7, 10)

In [82]:
preprocessed_x_train = special_preprocess(x_train, deg_der=4, deg_pri=7, deg_cat=10)

In [83]:
accuracy, loss_tr, loss_te, weigths = run.cross_validation(y_train, preprocessed_x_train, 5, run.pseudo_least_squares, 1, seed=1, compute_loss=imp.rmse)


In [84]:
accuracy

[0.79282, 0.7913, 0.79394, 0.79344, 0.79384]

In [85]:
sum(accuracy) / len(accuracy)

0.793068

#### Least squares custom degrees (8, 4, 13)

In [293]:
preprocessed_x_train, preprocessed_x_test = special_preprocess(x_train, x_test, deg_der=8, deg_pri=4, deg_cat=13)

In [87]:
accuracy, loss_tr, loss_te, weigths = run.cross_validation(y_train, preprocessed_x_train, 5, run.pseudo_least_squares, 1, seed=1, compute_loss=imp.rmse)

In [294]:
w, loss = run.pseudo_least_squares(y_train, preprocessed_x_train)

In [295]:
y_pred = preprocessed_x_test @ w

In [297]:
y_pred = helper.predict_labels(w, preprocessed_x_test)

In [300]:
helper.create_csv_submission(ids_test, y_pred, 'least_squares_8_4_13.csv')

In [88]:
accuracy

[0.80826, 0.80734, 0.81196, 0.8071, 0.80668]

In [89]:
sum(accuracy) / len(accuracy)

0.808268

#### Logistic regression deg=4

In [54]:
preprocessed_x_train = special_preprocess(x_train, deg_der=4, deg_pri=4, deg_cat=4)

In [55]:
newton_minimzer = lambda y, tx, w: minimizers.newton_step(y, tx, w,
            logistic.compute_gradient, logistic.compute_hessian)

In [56]:
accuracy = logistic.logistic_regression(y_train, x_train, 100, 1, newton_minimzer)

Completed logistic regression with loss 92434.4637377


In [None]:
accuracy

### Model ensembling

In [37]:
def model_1(y, x_train, x_test):
    preprocessed_x_train, preprocessed_x_test = special_preprocess(x_train, x_test, deg_der=4, deg_pri=7, deg_cat=10)
    w, loss = run.pseudo_least_squares(y, preprocessed_x_train)
    return preprocessed_x_test @ w

def model_2(y, x_train, x_test):
    preprocessed_x_train,preprocessed_x_test  = special_preprocess(x_train, x_test, deg_der=8, deg_pri=4, deg_cat=13)
    w, loss = run.pseudo_least_squares(y, preprocessed_x_train)
    return preprocessed_x_test @ w

def model_3(y, x_train, x_test):
    preprocessed_x_train,preprocessed_x_test  = special_preprocess(x_train, x_test, deg_der=4, deg_pri=4, deg_cat=4)
    w, loss = run.pseudo_least_squares(y, preprocessed_x_train)
    return preprocessed_x_test @ w

def model_4(y, x_train, x_test):
    preprocessed_x_train,preprocessed_x_test  = special_preprocess(x_train, x_test, deg_der=5, deg_pri=5, deg_cat=5)
    w, loss = run.pseudo_least_squares(y, preprocessed_x_train)
    return preprocessed_x_test @ w

def model_5(y, x_train, x_test):
    preprocessed_x_train,preprocessed_x_test  = special_preprocess(x_train, x_test, deg_der=6, deg_pri=6, deg_cat=6)
    w, loss = run.pseudo_least_squares(y, preprocessed_x_train)
    return preprocessed_x_test @ w

def model_6(y, x_train, x_test):
    preprocessed_x_train,preprocessed_x_test  = special_preprocess(x_train, x_test, deg_der=7, deg_pri=7, deg_cat=7)
    w, loss = run.pseudo_least_squares(y, preprocessed_x_train)
    return preprocessed_x_test @ w

def model_7(y, x_train, x_test):
    preprocessed_x_train,preprocessed_x_test  = special_preprocess(x_train, x_test, deg_der=8, deg_pri=8, deg_cat=8)
    w, loss = run.pseudo_least_squares(y, preprocessed_x_train)
    return preprocessed_x_test @ w

In [43]:
def model_ensembling(models, meta_model, x_train, y_train):
    x = x_train.copy()
    half_index = int(x.shape[0]/2)
    x_half_1 = x[:half_index, :]
    x_half_2 = x[half_index:, :]
    
    y_half_1 = y_train[:half_index]
    y_half_2 = y_train[half_index:]
    
    results_model = [model(y_half_1, x_half_1, x_half_2) for model in models]
    
    new_x = np.array(results_model).T
    
    #new_x[new_x > 0] = 1
    #new_x[new_x <= 0] = -1
    
    #summed = new_x[:1] + new_x[1:]
    #print(new_x.shape)
    
    #print(summed[summed == 0].size / summed.size)
    #new_x = run.standardize(new_x)
    #new_x = np.concatenate((run.polynomial_enhancement(new_x[:, :1], 3), run.polynomial_enhancement(new_x[:, 1:], 4)), axis=1)
    new_x = run.polynomial_enhancement(new_x, 4)[:, 1:]
    #new_x = np.hstack((np.ones((new_x.shape[0], 1)), new_x))
    
    accuracy, loss_tr, loss_te, weigths = run.cross_validation(y_half_2, new_x, 4, meta_model, 1, seed=1, compute_loss=imp.rmse)    
    return accuracy 

In [44]:
accuracies = model_ensembling([model_6, model_7], run.pseudo_least_squares, x_train, y_train)
accuracies

(241,)


[0.807168, 0.804768, 0.807136, 0.807584]

In [40]:
sum(accuracies) / len(accuracies)

0.8066639999999999

In [501]:
def model_ensembling_train(models, meta_model, x_train, y_train, x_test):
    x = x_train.copy()
    half_index = int(x.shape[0]/2)
    x_half_1 = x[:half_index, :]
    x_half_2 = x[half_index:, :]
    
    y_half_1 = y_train[:half_index]
    y_half_2 = y_train[half_index:]
    
    results_model = [model(y_half_1, x_half_1, x_half_2) for model in models]
    
    test_stage_2 = [model(y_half_1, x_half_1, x_test) for model in models]
    
    new_x = np.array(results_model).T
    new_x_test = np.array(test_stage_2).T
    #new_x[new_x > 0] = 1
    #new_x[new_x <= 0] = -1
    
    
    
    #summed = new_x[:1] + new_x[1:]
    
    #print(summed[summed == 0].size / summed.size)
    #new_x = run.standardize(new_x)
    #new_x_test = run.standardize(new_x_test)
    
    new_x = run.polynomial_enhancement(new_x, 4)
    new_x_test = run.polynomial_enhancement(new_x_test, 4)
    
    w, loss = meta_model(y_half_2, new_x)
    
    y_pred = helper.predict_labels(w, new_x_test)
    
    helper.create_csv_submission(ids_test, y_pred, 'model_ensembling_2_6_7.csv')

In [502]:
model_ensembling_train([model_2, model_6, model_7], run.pseudo_least_squares, x_train, y_train, x_test)

### Model ensembling compatibility

In [None]:
def model_ensembling_train(models, meta_model, x_train, y_train, x_test):
    x_tr = x_train.copy()
    half_index = int(x_tr.shape[0]/2)
    x_half_1 = x_tr[:half_index, :]
    x_half_2 = x_tr[half_index:, :]
    
    y_half_1 = y_train[:half_index]
    y_half_2 = y_train[half_index:]
    
    results_model = [model(y_half_1, x_half_1, x_half_2) for model in models]
    
    test_stage_2 = [model(y_half_1, x_half_1, x_test) for model in models]
    
    new_x = np.array(results_model).T
    new_x_test = np.array(test_stage_2).T
    #new_x[new_x > 0] = 1
    #new_x[new_x <= 0] = -1
    
    
    
    #summed = new_x[:1] + new_x[1:]
    
    #print(summed[summed == 0].size / summed.size)
    #new_x = run.standardize(new_x)
    #new_x_test = run.standardize(new_x_test)
    
    new_x = run.polynomial_enhancement(new_x, 4)
    new_x_test = run.polynomial_enhancement(new_x_test, 4)
    
    w, loss = meta_model(y_half_2, new_x)
    
    y_pred = helper.predict_labels(w, new_x_test)
    
    helper.create_csv_submission(ids_test, y_pred, 'model_ensembling_2_6_7.csv')

### Boosting test

In [302]:
test = np.array([1, 2, 3, 4])
np.linalg.pinv(test)

LinAlgError: 1-dimensional array given. Array must be at least two-dimensional

In [None]:
def boosting(y_train, x_train, weights_step=0.01):
    