In [3]:
import scripts.proj1_helpers as helper
import run as run
import implementations as imp
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import cross_validation as cv
from preprocessor import Preprocessor
from least_squares import LeastSquares
from model_ensembler import Model_Ensembler
from logistic import LogisticRegression
%matplotlib inline
%load_ext autoreload
%autoreload 2

### Import data

In [58]:
# Load both CSV files through the project helper; the paths are relative to
# the notebook's working directory.
# presumably each call returns (labels, feature matrix, sample ids) — confirm
# against helper.load_csv_data.
y_train, x_train, ids_train = helper.load_csv_data('train.csv')
y_test, x_test, ids_test = helper.load_csv_data('test.csv')

In [5]:
# Row index that splits the training rows into two halves (floor of n_rows / 2).
half_index = x_train.shape[0] // 2

#### Preprocess option 1

In [59]:
def preprocess_train(x_tr, deg=1):
    """Remove constant columns from the training matrix, then clean and scale it.

    Works on a copy of ``x_tr``. Columns whose standard deviation along
    axis 0 is exactly zero are dropped before the project helpers
    ``run.mean_spec`` and ``run.standardize`` are applied.

    Args:
        x_tr: 2-D array exposing ``.copy()``; the standard deviation is taken
            per column.
        deg: Not used by this function; kept so the signature matches the
            other preprocessing functions in this notebook.

    Returns:
        A ``(features, constant_cols)`` pair: the processed matrix and the
        ``np.where`` result (a tuple of index arrays) identifying the removed
        columns, meant to be handed to ``preprocess_test``.
    """
    features = x_tr.copy()
    constant_cols = np.where(np.std(features, axis=0) == 0)
    features = np.delete(features, constant_cols, axis=1)
    # The return value is ignored, so mean_spec presumably edits `features`
    # in place — confirm against run.mean_spec.
    run.mean_spec(features)
    return run.standardize(features), constant_cols

def preprocess_test(x_te, dependency, deg=1):
    """Apply the training-time column removal to the test matrix, then clean and scale it.

    Args:
        x_te: 2-D array of test features. It is never modified.
        dependency: Column indices to drop, as returned in second position by
            ``preprocess_train`` (anything ``np.delete`` accepts as ``obj``).
        deg: Not used by this function; kept for signature compatibility.

    Returns:
        The processed test matrix.

    NOTE(review): ``run.standardize`` only receives the test matrix here, so
    it presumably scales with test-set statistics rather than the training
    ones — confirm that this is intended.
    """
    # np.delete always returns a new array, so no explicit copy of x_te is
    # needed to protect the caller's data. The per-column std that used to be
    # computed here was never read and has been removed.
    x = np.delete(x_te, dependency, axis=1)
    # The return value is ignored, so mean_spec presumably edits `x` in
    # place — confirm against run.mean_spec.
    run.mean_spec(x)
    x = run.standardize(x)
    return x

In [60]:
# Option 1: derive the columns to drop from the training set and reuse them
# for the test set.
# NOTE(review): this rebinds x_train / x_test to their preprocessed versions,
# so running the cell twice preprocesses already-preprocessed data; reload the
# CSV files before re-running.
x_train, dependency = preprocess_train(x_train)
x_test = preprocess_test(x_test, dependency)

#### Preprocess option 2

In [138]:
def preprocess(x_train, x_test, deg=1):
    """Preprocess the train and test matrices together, then split them apart again.

    The two matrices are stacked row-wise, columns that are constant over the
    stacked data are dropped, and ``run.mean_spec`` / ``run.standardize`` are
    applied to the stacked matrix. Test rows therefore take part in the
    constant-column detection and in whatever the ``run`` helpers compute.

    Args:
        x_train: 2-D array of training features. It is never modified.
        x_test: 2-D array of test features with the same number of columns.
            It is never modified.
        deg: Currently unused; it is the degree for the disabled polynomial
            expansion noted below.

    Returns:
        ``(x_train_processed, x_test_processed)``, with the same row counts
        as the inputs.
    """
    x_train_end = x_train.shape[0]

    # np.vstack allocates a fresh array, so the inputs are already protected
    # from in-place edits and a second .copy() would only double the memory.
    x = np.vstack((x_train, x_test))
    deleted_cols_ids = np.where(np.std(x, axis=0) == 0)
    x = np.delete(x, deleted_cols_ids, axis=1)
    # The return value is ignored, so mean_spec presumably edits `x` in
    # place — confirm against run.mean_spec.
    run.mean_spec(x)
    x = run.standardize(x)
    # Polynomial expansion is disabled for now:
    # x = run.polynomial_enhancement(x, deg)
    return x[:x_train_end], x[x_train_end:]
    

In [139]:
# Option 2: preprocess train and test jointly.
# NOTE(review): this rebinds x_train / x_test, so the cell is not idempotent,
# and it writes to the same names as the option-1 cell — presumably only one
# of the two options is meant to run on freshly loaded data.
x_train, x_test = preprocess(x_train, x_test)

### Test: cross-validate a least-squares model

In [69]:
# Model evaluated by the cross-validation cell that follows.
# NOTE(review): other cells in this notebook build LeastSquares from a
# Preprocessor instead of `degree=` — confirm which constructor signature the
# class currently accepts.
least_square_model_1 = LeastSquares(degree=8)

In [None]:
# Cross-validate least_square_model_1 with a fixed seed; 5 is presumably the
# number of folds (the progress output counts "Step k / 5").
# NOTE(review): the result is named `accuracy` although compute_loss is
# imp.rmse — confirm what cv.cross_validation actually returns.
accuracy = cv.cross_validation(y_train, x_train, 5, least_square_model_1, seed=1, compute_loss=imp.rmse)
accuracy 

Step 1 / 5
Step 2 / 5
Step 3 / 5


### Archived experiments (kept just in case; note that this section redefines `preprocess_train` and `preprocess_test` from above)

In [53]:
def preprocess_train(x_tr, deg=4):
    """Expand a copy of the training matrix with ``run.polynomial_enhancement``.

    NOTE(review): this shares its name with the earlier ``preprocess_train``
    in this notebook; whichever cell ran last wins.

    Args:
        x_tr: Training features; only a copy is handed to the helper.
        deg: Degree forwarded to ``run.polynomial_enhancement``.

    Returns:
        ``(expanded, None)``. The second slot is where other preprocessing
        functions return data for the test-time counterpart; nothing is
        needed here.
    """
    expanded = run.polynomial_enhancement(x_tr.copy(), deg)
    return expanded, None

def preprocess_test(x_te, dependency, deg=4):
    """Expand a copy of the test matrix with ``run.polynomial_enhancement``.

    NOTE(review): this shares its name with the earlier ``preprocess_test``
    in this notebook; whichever cell ran last wins.

    Args:
        x_te: Test features; only a copy is handed to the helper.
        dependency: Accepted for signature compatibility and ignored.
        deg: Degree forwarded to ``run.polynomial_enhancement``.

    Returns:
        Whatever ``run.polynomial_enhancement`` returns for the copied input.
    """
    return run.polynomial_enhancement(x_te.copy(), deg)

def preprocess_train_model_1(x_tr):
    """Call ``preprocess_train`` with the degree fixed to 4.

    Gives the single-argument form that is later passed to ``Preprocessor``.
    """
    return preprocess_train(x_tr, deg=4)


def preprocess_test_model_1(x_te, dependency):
    """Call ``preprocess_test`` with the degree fixed to 4.

    Gives the two-argument form that is later passed to ``Preprocessor``.
    """
    return preprocess_test(x_te, dependency, deg=4)

def preprocess_train_meta_model(x_tr, deg=4):
    """Build element-wise powers 1..deg of every column of ``x_tr``.

    The matrix is repeated ``deg`` times side by side, and the k-th repetition
    is raised to the power k, so the result has ``deg * n_cols`` columns
    ordered as ``[x**1 | x**2 | ... | x**deg]``.

    Args:
        x_tr: 2-D array of features (``x_tr.shape[1]`` is read).
        deg: Highest power to generate.

    Returns:
        ``(powers, None)``: the expanded matrix, and ``None`` because nothing
        has to be carried over to the test-time counterpart.
    """
    features = x_tr.copy()
    n_cols = features.shape[1]
    # One exponent per output column: 1 repeated n_cols times, then 2, ...
    exponents = np.array(range(1, deg + 1)).repeat(n_cols)
    return np.tile(features, deg) ** exponents, None
    
def preprocess_test_meta_model(x_te, dependency, deg=4):
    """Build element-wise powers 1..deg of every column of ``x_te``.

    The matrix is repeated ``deg`` times side by side, and the k-th repetition
    is raised to the power k, so the result has ``deg * n_cols`` columns
    ordered as ``[x**1 | x**2 | ... | x**deg]``.

    Args:
        x_te: 2-D array of features (``x_te.shape[1]`` is read).
        dependency: Accepted for signature compatibility and ignored.
        deg: Highest power to generate.

    Returns:
        The expanded matrix.
    """
    features = x_te.copy()
    n_cols = features.shape[1]
    # One exponent per output column: 1 repeated n_cols times, then 2, ...
    exponents = np.array(range(1, deg + 1)).repeat(n_cols)
    return np.tile(features, deg) ** exponents
    

#preprocess_train_meta_model = lambda x_tr: run.polynomial_enhancement(x_tr, 4), None
#preprocess_test_meta_model = lambda x_te: run.polynomial_enhancement(x_te, 4)

In [65]:
# NOTE(review): least_square_model_1 is assigned in several cells of this
# notebook; the name refers to whichever of those cells ran last.
least_square_model_1 = LeastSquares(degree=4)

In [42]:
# Pair each train-time preprocessing function with its test-time counterpart.
# NOTE(review): preprocess_train / preprocess_test are defined twice in this
# notebook, so preprocessor_1 wraps whichever definitions were executed last.
preprocessor_1 = Preprocessor(preprocess_train, preprocess_test)
preprocessor_2 = Preprocessor(preprocess_train_model_1, preprocess_test_model_1)
preprocessor_meta = Preprocessor(preprocess_train_meta_model, preprocess_test_meta_model)

In [43]:
# One least-squares model per preprocessor: two base models and one meta model.
# NOTE(review): here LeastSquares receives a Preprocessor positionally, while
# other cells pass `degree=` — confirm that both call forms are supported.
least_square_model_1 = LeastSquares(preprocessor_1)
least_square_model_2 = LeastSquares(preprocessor_2)

least_square_meta_model = LeastSquares(preprocessor_meta)

In [34]:
# presumably (list of base models, meta model) — confirm against the
# Model_Ensembler constructor.
model_ensembler = Model_Ensembler([least_square_model_1, least_square_model_2], least_square_meta_model)

In [48]:
# Manual stage 0 of a stacking experiment: fit both base models on the first
# half of the training rows, then predict on the second half.
least_square_model_1 = LeastSquares(preprocessor_1)
least_square_model_1.train(y_train[:half_index], x_train[:half_index])

least_square_model_2 = LeastSquares(preprocessor_2)
least_square_model_2.train(y_train[:half_index], x_train[:half_index])

models = [least_square_model_1, least_square_model_2]

# One column of predictions per base model, with rows aligned to
# x_train[half_index:]. np.column_stack gives that layout whether predict()
# returns 1-D or (n, 1) arrays, whereas np.hstack would join 1-D outputs end
# to end into a single long vector.
stage_0_results = np.column_stack([model.predict(x_train[half_index:]) for model in models])



In [49]:
# Logistic-regression model built on the same preprocessor as the first
# least-squares model. It is not used by any other visible cell.
logistic_regression = LogisticRegression(preprocessor_1, gamma=0.1)

In [66]:
# Cross-validate least_square_model_1 with a fixed seed; 5 is presumably the
# number of folds (the progress output counts "Step k / 5").
# NOTE(review): the result is named `accuracy` although compute_loss is
# imp.rmse — confirm what cv.cross_validation actually returns.
accuracy = cv.cross_validation(y_train, x_train, 5, least_square_model_1, seed=1, compute_loss=imp.rmse)
accuracy 

Step 1 / 5
Step 2 / 5
Step 3 / 5
Step 4 / 5
Step 5 / 5


[0.79205999999999999,
 0.79039999999999999,
 0.79315999999999998,
 0.79247999999999996,
 0.79383999999999999]

In [158]:
# Mean of the per-fold scores.
# NOTE(review): the stored output of this cell (0.463...) does not match the
# mean of the score list displayed by the preceding cross-validation cell
# (about 0.79), so it likely comes from a different, out-of-order run —
# re-run both cells before relying on the number.
sum(accuracy) / len(accuracy)

0.46317199999999997