In [1]:
import scripts.proj1_helpers as helper

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time

#import implementations as imp
import cross_validation as cv
#from preprocessor import Preprocessor
from model_ensembler import ModelEnsembler
import preprocessing

import run
from least_squares import LeastSquares
from logistic import LogisticRegression
%matplotlib inline
%load_ext autoreload
%autoreload 2

## Import data

In [3]:
y_train, x_train, ids_train = helper.load_csv_data('train.csv')
y_test, x_test, ids_test = helper.load_csv_data('test.csv')
y_train_logi = y_train.copy()
y_train_logi[y_train_logi < 0] = 0
x_train_preprocessed = preprocessing.preprocess(x_train)

## Models

In [29]:
def get_mean_and_variance(accuracies):
    mean = sum(accuracies) / len(accuracies)
    variances = np.std(np.array(accuracies))
    return mean, variances

In [33]:
def display_result_basic(accuracy):
    mean, variance = get_mean_and_variance(accuracy)
    print('Accuracies', accuracy)
    print('Mean:', mean)
    print('Variance:', variance)

In [23]:
def print_result(accuracy):
    print('Accuracy per cat:', list(map(lambda list_: sum(list_) / len(list_), accuracy)))
    variances = list(map(lambda list_: np.std(np.array(list_)), accuracy))
    print('Variances per cat:', variances)
    accuracies = np.ravel(np.array(accuracy))
    print('Total mean:', np.sum(accuracies) / accuracies.size)
    

In [24]:
def cat_cross_validation(models, y_train, x_train):
    accuracies = []
    variances = []
    i=0

    for cat_data in run.category_iter(y_train, x_train, 22, x_test):
        y_train_cat, x_train_cat, x_test_cat, cat_indicies_te = cat_data
        x_train_cat, x_test_cat = preprocessing.preprocess(x_train_cat, x_test_cat)
        #print('Cat number:', i, 'with:', y_train_cat.size, 'elements')

        accuracy = np.array(cv.cross_validation(y_train_cat, x_train_cat, 5, models[i]))

        accuracies.append(accuracy)
        variances.append(np.std(np.array(accuracies[i])))
        #print('Variance:', variances[i])
        i = i + 1
        #print('')
    
    return accuracies, variances

### Basic least squares

In [36]:
basic_least_squares = LeastSquares()

accuracy = cv.cross_validation(y_train, x_train_preprocessed, 5, basic_least_squares)

Step 1 / 5
0.74328
Step 2 / 5
0.742
Step 3 / 5
0.74784
Step 4 / 5
0.74468
Step 5 / 5
0.74414


In [37]:
display_result_basic(accuracy)

Accuracies [0.74328000000000005, 0.74199999999999999, 0.74783999999999995, 0.74468000000000001, 0.74414000000000002]
Mean: 0.744388
Variance: 0.00194901410975


### Least squares degree 8

In [41]:
least_squares_deg_8 = LeastSquares(degree=8)

accuracy = cv.cross_validation(y_train, x_train_preprocessed, 5, least_squares_deg_8)

Step 1 / 5
0.80738
Step 2 / 5
0.806
Step 3 / 5
0.8121
Step 4 / 5
0.80544
Step 5 / 5
0.80742


In [42]:
display_result_basic(accuracy)

Accuracies [0.80737999999999999, 0.80600000000000005, 0.81210000000000004, 0.80544000000000004, 0.80742000000000003]
Mean: 0.807668
Variance: 0.00234662651481


### Logistic degree 4

In [43]:
logistic_deg_4 = LogisticRegression(degree=4, gamma=0.1)

accuracy = cv.cross_validation(y_train_logi, x_train_preprocessed, 5, logistic_deg_4)

Step 1 / 5
0.81762
Step 2 / 5
0.81664
Step 3 / 5
0.81512
Step 4 / 5
0.81618
Step 5 / 5
0.81604


In [44]:
display_result_basic(accuracy)

Accuracies [0.81762000000000001, 0.81664000000000003, 0.81511999999999996, 0.81618000000000002, 0.81603999999999999]
Mean: 0.81632
Variance: 0.00081613724336


### Least squares cat

In [45]:
least_squares_0 = LeastSquares(degree=7)
least_squares_1 = LeastSquares(degree=12)
least_squares_2 = LeastSquares(degree=11)
least_squares_3 = LeastSquares(degree=11)

models = [least_squares_0, least_squares_1, least_squares_2, least_squares_3]

accuracies, variances = cat_cross_validation(models, y_train, x_train)

Step 1 / 5
0.837503753378
Step 2 / 5
0.835251726554
Step 3 / 5
0.839955960364
Step 4 / 5
0.843809428486
Step 5 / 5
0.847212491242
Step 1 / 5
0.800683518184
Step 2 / 5
0.803585246324
Step 3 / 5
0.806229043075
Step 4 / 5
0.801973175135
Step 5 / 5
0.804681454733
Step 1 / 5
0.830669975186
Step 2 / 5
0.825012406948
Step 3 / 5
0.837419354839
Step 4 / 5
0.836129032258
Step 5 / 5
0.826302729529
Step 1 / 5
0.82536101083
Step 2 / 5
0.835288808664
Step 3 / 5
0.820397111913
Step 4 / 5
0.816110108303
Step 5 / 5
0.830099277978


In [46]:
print_result(accuracies)

Accuracy per cat: [0.84074667200480424, 0.80343048749032753, 0.83110669975186102, 0.8254512635379061]
Variances per cat: [0.0043017082024964801, 0.0019539898856351166, 0.0050096447195751691, 0.0067998692291048801]
Total mean: 0.825183780696


### Logistic cat 

In [47]:
logistic_regression_0 = LogisticRegression(degree=3, gamma=0.1)
logistic_regression_1 = LogisticRegression(degree=6, gamma=0.1)
logistic_regression_2 = LogisticRegression(degree=6, gamma=0.1)
logistic_regression_3 = LogisticRegression(degree=6, gamma=0.1)

models = [logistic_regression_0, logistic_regression_1, logistic_regression_2, logistic_regression_3]

accuracies, variances = cat_cross_validation(models, y_train_logi, x_train)

Step 1 / 5
0.838955059554
Step 2 / 5
0.836002402162
Step 3 / 5
0.838154338905
Step 4 / 5
0.842207987188
Step 5 / 5
0.847112401161
Step 1 / 5
0.801134898117
Step 2 / 5
0.803004900696
Step 3 / 5
0.804359040495
Step 4 / 5
0.803649729172
Step 5 / 5
0.803907660562
Step 1 / 5
0.832853598015
Step 2 / 5
0.828089330025
Step 3 / 5
0.839602977667
Step 4 / 5
0.837717121588
Step 5 / 5
0.827791563275
Step 1 / 5
0.824007220217
Step 2 / 5
0.840703971119
Step 3 / 5
0.827842960289
Step 4 / 5
0.825135379061
Step 5 / 5
0.834837545126


In [48]:
print_result(accuracies)

Accuracy per cat: [0.84048643779401466, 0.8032112458086148, 0.83321091811414405, 0.83050541516245491]
Variances per cat: [0.0038667170300329726, 0.0011268089326181561, 0.0048351224216169426, 0.006339248928082933]
Total mean: 0.82685350422
