In [1]:
import scripts.proj1_helpers as helper

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time

#import implementations as imp
import cross_validation as cv
#from preprocessor import Preprocessor
from model_ensembler import ModelEnsembler
import preprocessing

import run
from least_squares import LeastSquares
from logistic import LogisticRegression
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [211]:
def global_preprocess(x_train, x_test, deg=1):
    """Preprocess the train and test features jointly, then split them again.

    The two matrices are stacked row-wise so that the constant-column filter
    and the standardization see train and test rows together.

    Args:
        x_train: training feature matrix (rows are samples).
        x_test: test feature matrix with the same number of columns.
        deg: currently unused; the polynomial enhancement step that
            consumed it is disabled.

    Returns:
        Tuple ``(x_train_processed, x_test_processed)`` holding the first
        ``x_train.shape[0]`` rows and the remaining rows of the processed
        stacked matrix.
    """
    n_train = x_train.shape[0]

    # np.vstack allocates a new array, so the caller's inputs are not touched.
    features = np.vstack((x_train, x_test))

    # Remove columns whose standard deviation over all rows is exactly zero.
    column_stds = np.std(features, axis=0)
    constant_columns = np.flatnonzero(column_stds == 0)
    features = np.delete(features, constant_columns, axis=1)

    # NOTE(review): the return value is discarded; presumably run.mean_spec
    # modifies `features` in place — confirm, otherwise this call is a no-op.
    run.mean_spec(features)
    features = run.standardize(features)
    return features[:n_train], features[n_train:]
    

### Import data

In [2]:
# Load the raw training and test sets; each call is unpacked as (y, x, ids).
y_train, x_train, ids_train = helper.load_csv_data('train.csv')
y_test, x_test, ids_test = helper.load_csv_data('test.csv')
# Recode negative training labels to 0. The prediction cells do the reverse
# mapping (0 -> -1) before writing a submission.
y_train[y_train < 0] = 0

In [77]:
# NOTE(review): same load as the previous cell but WITHOUT recoding negative
# labels to 0, so running this cell leaves y_train with its raw label values.
# Confirm which label encoding the models expect and keep only one of the
# two loading cells.
y_train, x_train, ids_train = helper.load_csv_data('train.csv')
y_test, x_test, ids_test = helper.load_csv_data('test.csv')

#### Preprocess option 1

In [96]:
# Preprocessing variant using the notebook-local global_preprocess helper.
# NOTE(review): x_train/x_test are overwritten, so this cell is not
# idempotent; running it in addition to the preprocessing.preprocess cell
# processes the data twice. Reload the data before switching variants.
x_train, x_test = global_preprocess(x_train, x_test)

In [6]:
# Preprocessing variant using the project's preprocessing module.
# NOTE(review): x_train/x_test are overwritten, so this cell is not
# idempotent; running it after the global_preprocess cell processes the
# data twice. Reload the data before switching variants.
x_train, x_test = preprocessing.preprocess(x_train, x_test)

In [7]:
# Two logistic-regression models that differ only in `degree`.
lr_1 = LogisticRegression(degree=5, gamma=0.1)
lr_2 = LogisticRegression(degree=3, gamma=0.1)
# Model passed as the second ModelEnsembler argument
# (presumably the meta learner fitted on the base models' outputs — TODO confirm).
lr_meta = LogisticRegression(degree=1, gamma=0.1)

model_ensembler = ModelEnsembler([lr_1, lr_2], lr_meta)

In [10]:
# Cross-validate lr_1 on its own (the ensemble is not evaluated here).
# The recorded output shows five steps, matching the third argument.
accuracy = cv.cross_validation(y_train, x_train, 5, lr_1, seed=1)
accuracy

Step 1 / 5
0.81856
Step 2 / 5
0.81724
Step 3 / 5
0.81776
Step 4 / 5
0.81712
Step 5 / 5
0.81704


[0.81855999999999995,
 0.81723999999999997,
 0.81776000000000004,
 0.81711999999999996,
 0.81703999999999999]

In [86]:
# Fit the ensemble on the whole training set.
model_ensembler.train(y_train, x_train)

In [87]:
# Predict the test labels and map label 0 back to -1 before writing the file.
preds = model_ensembler.predict_labels(x_test)
preds[preds == 0] = -1
# NOTE(review): the file name says "global_preprocess" and "4_3_1", while the
# model cell configures degrees 5, 3 and 1 — confirm the name matches the run
# that actually produced the file.
helper.create_csv_submission(ids_test, preds, 'model_ensembling_global_preprocess_logistic_regression_4_3_1.csv')

In [88]:
# Sanity check: number of test predictions greater than 0.
preds[preds > 0].size

175715

In [104]:
# Mean of the fold scores stored in `accuracy`.
# NOTE(review): the recorded value (0.8121) is not the mean of the five fold
# accuracies displayed for lr_1 (about 0.8175), so this output is stale and
# comes from a different run.
sum(accuracy) / len(accuracy)

0.81210799999999994

In [99]:
# Duplicate of the previous averaging cell (same expression, same recorded
# value); candidate for removal.
sum(accuracy) / len(accuracy)

0.81210799999999994

#### Preprocess option 2

### Logistic regression: per-category cross-validation

In [3]:
def remove_outliers(y, x, threshold=0.1):
    """Drop the samples for which no feature lies close to its column mean.

    Every feature's absolute deviation from the column mean is expressed in
    units of that column's standard deviation. A row is kept as soon as at
    least one of its features deviates by less than ``threshold``.

    NOTE(review): because of ``np.any`` a single near-average feature is enough
    to keep a row; confirm this is intended rather than ``np.all``.

    Args:
        y: labels with one entry per row of ``x``; shape ``(n,)`` or ``(n, 1)``.
        x: feature matrix of shape ``(n, d)``. It is not modified.
        threshold: maximum deviation, in standard deviations, for a feature
            to count as close to its mean.

    Returns:
        Tuple ``(y_kept, x_kept)`` restricted to the retained rows. A column
        vector ``y`` stays a column vector.
    """
    y = np.asarray(y)
    x = np.asarray(x)

    col_means = np.mean(x, axis=0)
    col_stds = np.std(x, axis=0)

    # Constant columns have a zero standard deviation, so the ratio is 0/0 = NaN.
    # NaN < threshold is False, hence such columns never qualify a row; the
    # errstate block only silences the resulting floating-point warnings.
    with np.errstate(divide='ignore', invalid='ignore'):
        deviation = np.absolute(x - col_means) / col_stds
        keep = np.any(deviation < threshold, axis=1)

    # `keep` is a 1-D row mask. Using it directly (instead of reshaping it to
    # y.shape) also works when y has shape (n, 1), where a 2-D mask could not
    # index the rows of x.
    return y[keep], x[keep]

In [4]:
# One logistic-regression model and one outlier-removal setting per category.
# NOTE(review): the remove_outliers_* lambdas are only used by a commented-out
# line in the loop, so outlier removal is currently disabled.
logistic_regression_0 = LogisticRegression(degree=3, gamma=0.1)
remove_outliers_0 = lambda y, x: remove_outliers(y, x, 0.05)

logistic_regression_1 = LogisticRegression(degree=6, gamma=0.1)
remove_outliers_1 = lambda y, x: remove_outliers(y, x, 0.2)

# Models 2 and 3 use the same settings (degree=6, gamma=0.1, threshold 0.1).
logistic_regression_2 = LogisticRegression(degree=6, gamma=0.1)
remove_outliers_2 = lambda y, x: remove_outliers(y, x, 0.1)

logistic_regression_3 = LogisticRegression(degree=6, gamma=0.1)
remove_outliers_3 = lambda y, x: remove_outliers(y, x, 0.1)


# `i` indexes `models` in step with the items yielded by run.category_iter;
# the lists hold four entries, so a fifth category would raise IndexError.
i = 0
models = [logistic_regression_0, logistic_regression_1, logistic_regression_2, logistic_regression_3]
outliers_remover = [remove_outliers_0, remove_outliers_1, remove_outliers_2, remove_outliers_3]
#models = [logistic_regression_0, model_ensembler_1, model_ensembler_2, model_ensembler_3]
accuracies = []

# 22 is forwarded to run.category_iter
# (presumably the index of the feature column defining the categories — TODO confirm).
for cat_data in run.category_iter(y_train, x_train, 22, x_test):
    #if i != 3:
     #   i = i + 1
      #  continue
    # The fourth element (test indices of the category) is not used in this cell.
    y_train_cat, x_train_cat, x_test_cat, cat_indicies_te = cat_data
    # Each category is preprocessed independently of the others.
    x_train_cat, x_test_cat = preprocessing.preprocess(x_train_cat, x_test_cat)
    #y_train_cat, x_train_cat = outliers_remover[i](y_train_cat, x_train_cat)
    #print(y_train_test.shape)
    print('Cat number:', i, 'with:', y_train_cat.size, 'elements')

    # Store the fold scores of this category as one array.
    accuracies.append(np.array(cv.cross_validation(y_train_cat, x_train_cat, 5, models[i])))
    i = i + 1
    print('')

Cat number: 0 with: 99913 elements
Step 1 / 5
0.838955059554
Step 2 / 5
0.836002402162
Step 3 / 5
0.838154338905
Step 4 / 5
0.842207987188
Step 5 / 5
0.847112401161

Cat number: 1 with: 77544 elements
Step 1 / 5
0.801134898117
Step 2 / 5
0.803004900696
Step 3 / 5
0.804359040495
Step 4 / 5
0.803649729172
Step 5 / 5
0.803907660562

Cat number: 2 with: 50379 elements
Step 1 / 5
0.832853598015
Step 2 / 5
0.828089330025
Step 3 / 5
0.839602977667
Step 4 / 5
0.837717121588
Step 5 / 5
0.827791563275

Cat number: 3 with: 22164 elements
Step 1 / 5
0.824007220217
Step 2 / 5
0.840703971119
Step 3 / 5
0.827842960289
Step 4 / 5
0.825135379061
Step 5 / 5
0.834837545126



In [5]:
# Flatten the per-category fold scores and take their plain mean.
# Every fold counts equally, so categories are NOT weighted by their number
# of samples. Note that `accuracies` is rebound to the flattened array.
accuracies = np.ravel(np.array(accuracies))
np.sum(accuracies) / accuracies.size

0.82685350421980708

In [48]:
# Same unweighted mean over all per-category fold scores as in the previous
# cell; the recorded value belongs to a different run (execution count 48).
accuracies = np.ravel(np.array(accuracies))
np.sum(accuracies) / accuracies.size

0.82686525456742788

In [58]:
# Same unweighted mean over all per-category fold scores as in the two
# previous cells; the recorded value belongs to a different run (execution count 58).
accuracies = np.ravel(np.array(accuracies))
np.sum(accuracies) / accuracies.size

0.82942238267148016

## Logistic regression: per-category training and test predictions

In [14]:
# One logistic-regression model per category (same settings as in the
# per-category cross-validation cell).
logistic_regression_0 = LogisticRegression(degree=3, gamma=0.1)
logistic_regression_1 = LogisticRegression(degree=6, gamma=0.1)
logistic_regression_2 = LogisticRegression(degree=6, gamma=0.1)
logistic_regression_3 = LogisticRegression(degree=6, gamma=0.1)

# `i` indexes `models` in step with the items yielded by run.category_iter.
i = 0
models = [logistic_regression_0, logistic_regression_1, logistic_regression_2, logistic_regression_3]
# NOTE(review): `accuracies` is reset here but never filled in this cell.
accuracies = []
# One prediction slot per test row, stored as a column vector.
predictions = np.zeros((y_test.shape[0], 1))

for cat_data in run.category_iter(y_train, x_train, 22, x_test):
    #if i != 3:
        #i = i + 1
        #continue
    y_train_cat, x_train_cat, x_test_cat, cat_indices_te = cat_data
    # Each category is preprocessed independently of the others.
    x_train_cat, x_test_cat = preprocessing.preprocess(x_train_cat, x_test_cat)
    print('Cat number:', i, 'with:', y_train_cat.size, 'elements')

    # Fit on the category's training rows, then label its test rows.
    models[i].train(y_train_cat, x_train_cat)
    labels = models[i].predict_labels(x_test_cat)
    # Map label 0 back to -1.
    labels[labels == 0] = -1

    # Write the labels at the positions given by cat_indices_te
    # (assumes `labels` is shaped compatibly with the selected rows — TODO confirm).
    predictions[cat_indices_te] = labels
    i = i + 1
    print('')

Cat number: 0 with: 99913 elements

Cat number: 1 with: 77544 elements

Cat number: 2 with: 50379 elements

Cat number: 3 with: 22164 elements



In [15]:
# Write the per-category predictions to a submission file.
helper.create_csv_submission(ids_test, predictions, 'logistic_cat_3_6_6_6_test.csv')

## Least squares cross validation

In [160]:
# One least-squares model per category, each with its own `degree`.
least_squares_0 = LeastSquares(degree=7)
least_squares_1 = LeastSquares(degree=12)
least_squares_2 = LeastSquares(degree=11)
least_squares_3 = LeastSquares(degree=11)

# `i` indexes `models` in step with the items yielded by run.category_iter.
i = 0
models = [least_squares_0, least_squares_1, least_squares_2, least_squares_3]
accuracies = []

# NOTE(review): the recorded run of this cell printed values around 0.15-0.24
# and was stopped by a KeyboardInterrupt during category 1, so `accuracies`
# was left incomplete by that run.
for cat_data in run.category_iter(y_train, x_train, 22, x_test):
    #if i != 3:
        #i = i + 1
        #continue
    # The fourth element (test indices of the category) is not used in this cell.
    y_train_cat, x_train_cat, x_test_cat, cat_indicies_te = cat_data
    x_train_cat, x_test_cat = preprocessing.preprocess(x_train_cat, x_test_cat)
    print('Cat number:', i, 'with:', y_train_cat.size, 'elements')

    # Unlike the logistic cross-validation cell, the fold scores are appended
    # without wrapping them in np.array.
    accuracies.append(cv.cross_validation(y_train_cat, x_train_cat, 5, models[i]))
    i = i + 1
    print('')

Cat number: 0 with: 99913 elements
Step 1 / 5
0.14833350015
Step 2 / 5
0.149684716245
Step 3 / 5
0.147682914623
Step 4 / 5
0.149534581123
Step 5 / 5
0.152737463717

Cat number: 1 with: 77544 elements
Step 1 / 5
0.243229301006
Step 2 / 5
0.242262058292
Step 3 / 5


KeyboardInterrupt: 

In [121]:
# Unweighted mean over all per-category fold scores.
# NOTE(review): the recorded value was produced at execution count 121,
# i.e. before the recorded (interrupted) run of the cross-validation cell
# above (execution count 160), so it does not describe that run.
accuracies = np.ravel(np.array(accuracies))
np.sum(accuracies) / accuracies.size

0.82526539088158013

### Test

In [None]:
# Single least-squares model evaluated on the whole data set (no per-category split).
least_square_model_1 = LeastSquares(degree=8)

In [None]:
# NOTE(review): `imp` is not defined — the `import implementations as imp`
# line in the import cell is commented out — so this cell raises NameError
# on a fresh kernel.
accuracy = cv.cross_validation(y_train, x_train, 5, least_square_model_1, seed=1, compute_loss=imp.rmse)
accuracy

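### Deprecated experiments (kept just in case)

The cells below were not executed in the saved session and reference names that are not defined in this notebook (`Preprocessor`, `Model_Ensembler`, `half_index`, `imp`).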

In [None]:
def preprocess_train(x_tr, deg=4):
    """Expand the training features with ``run.polynomial_enhancement``.

    Args:
        x_tr: training feature array; a copy of it is handed to the helper.
        deg: forwarded as the second argument of ``run.polynomial_enhancement``.

    Returns:
        Tuple ``(features, None)``; the second slot is always ``None``.
    """
    enhanced = run.polynomial_enhancement(x_tr.copy(), deg)
    return enhanced, None

def preprocess_test(x_te, dependency, deg=4):
    """Expand the test features with ``run.polynomial_enhancement``.

    Args:
        x_te: test feature array; a copy of it is handed to the helper.
        dependency: accepted for signature compatibility but not used.
        deg: forwarded as the second argument of ``run.polynomial_enhancement``.

    Returns:
        The value returned by ``run.polynomial_enhancement``.
    """
    return run.polynomial_enhancement(x_te.copy(), deg)

# Aliases that pin deg=4. Since 4 is also the default of preprocess_train and
# preprocess_test, they behave like those functions called without `deg`.
preprocess_train_model_1 = lambda x_tr: preprocess_train(x_tr, deg=4)
preprocess_test_model_1 = lambda x_te, dependency: preprocess_test(x_te, dependency, deg=4)

def _polynomial_powers(x, deg):
    """Return ``[x**1, x**2, ..., x**deg]`` concatenated column-wise.

    Column block ``k`` (0-based) of the result holds ``x ** (k + 1)``, so the
    output has ``deg * x.shape[1]`` columns. ``x`` itself is not modified.
    """
    # Exponent of every output column: 1 repeated d times, then 2, ..., deg.
    exponents = np.repeat(np.arange(1, deg + 1), x.shape[1])
    return np.tile(x, deg) ** exponents


def preprocess_train_meta_model(x_tr, deg=4):
    """Build the element-wise power features for the meta model's training data.

    Args:
        x_tr: 2-D array of shape ``(n, d)``.
        deg: highest power to generate (expected to be at least 1).

    Returns:
        Tuple ``(features, None)`` where ``features`` has shape ``(n, d * deg)``;
        the second slot is always ``None``.
    """
    return _polynomial_powers(x_tr, deg), None


def preprocess_test_meta_model(x_te, dependency, deg=4):
    """Build the element-wise power features for the meta model's test data.

    Args:
        x_te: 2-D array of shape ``(n, d)``.
        dependency: accepted for signature compatibility but not used.
        deg: highest power to generate (expected to be at least 1).

    Returns:
        Array of shape ``(n, d * deg)``.
    """
    return _polynomial_powers(x_te, deg)
    

#preprocess_train_meta_model = lambda x_tr: run.polynomial_enhancement(x_tr, 4), None
#preprocess_test_meta_model = lambda x_te: run.polynomial_enhancement(x_te, 4)

In [None]:
# Legacy cell: least-squares model configured with degree 4.
least_square_model_1 = LeastSquares(degree=4)

In [None]:
# NOTE(review): `Preprocessor` is not defined — the
# `from preprocessor import Preprocessor` line in the import cell is
# commented out — so this cell raises NameError on a fresh kernel.
preprocessor_1 = Preprocessor(preprocess_train, preprocess_test)
preprocessor_2 = Preprocessor(preprocess_train_model_1, preprocess_test_model_1)
preprocessor_meta = Preprocessor(preprocess_train_meta_model, preprocess_test_meta_model)

In [None]:
# NOTE(review): here LeastSquares receives a preprocessor as its first
# positional argument, whereas the executed cells construct it with
# `degree=...`; confirm this call style is still supported.
least_square_model_1 = LeastSquares(preprocessor_1)
least_square_model_2 = LeastSquares(preprocessor_2)

least_square_meta_model = LeastSquares(preprocessor_meta)

In [None]:
# NOTE(review): `Model_Ensembler` is not defined in this notebook; the import
# cell provides `ModelEnsembler`, so this line raises NameError as written.
model_ensembler = Model_Ensembler([least_square_model_1, least_square_model_2], least_square_meta_model)

In [None]:
# Manual two-stage experiment: fit both models on the rows before
# `half_index`, then predict the remaining rows with each of them.
# NOTE(review): `half_index` is never assigned in this notebook, so this cell
# raises NameError on a fresh kernel.
least_square_model_1 = LeastSquares(preprocessor_1)
least_square_model_1.train(y_train[:half_index], x_train[:half_index])

least_square_model_2 = LeastSquares(preprocessor_2)
least_square_model_2.train(y_train[:half_index], x_train[:half_index])

models = [least_square_model_1, least_square_model_2]

# Predictions of all models on the held-out rows, joined with np.hstack.
stage_0_results = np.hstack([model.predict(x_train[half_index:]) for model in models])



In [None]:
# NOTE(review): LogisticRegression receives a preprocessor as its first
# positional argument here, whereas the executed cells construct it with
# `degree=...`; confirm this call style is still supported.
logistic_regression = LogisticRegression(preprocessor_1, gamma=0.1)

In [None]:
# NOTE(review): `imp` is not defined — the `import implementations as imp`
# line in the import cell is commented out — so this cell raises NameError
# on a fresh kernel.
accuracy = cv.cross_validation(y_train, x_train, 5, least_square_model_1, seed=1, compute_loss=imp.rmse)
accuracy

In [None]:
# Mean of the fold scores stored in `accuracy`.
sum(accuracy) / len(accuracy)