In [1]:
import numpy as np
import pickle as pk
import pandas as pd
import timeit as tm
import csv
import sys

# Loading data
## Loading training data

In [2]:
# Open training data to pandas
train_dat_pandas = pd.read_csv('../data/clean_data/train_vectors.csv', index_col=0, encoding='utf-8')
del train_dat_pandas['TYPE']

# Open training labels to pandas
train_lbl_pandas = pd.read_csv('../data/clean_data/train_labels.csv', index_col=0, encoding='utf-8')
del train_lbl_pandas['YEAR']

# Save headers
headers = [list(train_dat_pandas)]

# Convert pandas to numpy matrix
train_dat = train_dat_pandas.as_matrix()
print 'training data dimensions:', train_dat.shape

# Convert pandas to numpy matrix
train_lbl = train_lbl_pandas.as_matrix()
print 'training label dimensions:', train_lbl.shape

training data dimensions: (295169L, 64L)
training label dimensions: (295169L, 6L)


## Loading test data

In [3]:
# Open test data
test_dat_pandas = pd.read_csv('../data/clean_data/test_vectors.csv', index_col=0, encoding='utf-8')
del test_dat_pandas['TYPE']

# Open test labels
test_lbl_pandas = pd.read_csv('../data/clean_data/test_labels.csv', index_col=0, encoding='utf-8')
del test_lbl_pandas['YEAR']

# Convert pandas to numpy matrix
test_dat = test_dat_pandas.as_matrix()
print 'testing data dimensions:', test_dat.shape

# Convert pandas to numpy matrix
test_lbl = test_lbl_pandas.as_matrix()
print 'testing label dimensions:', test_lbl.shape

testing data dimensions: (34142L, 64L)
testing label dimensions: (34142L, 6L)


## Converting one hot labels to numeric

In [4]:
def onehot_2_numeric(onehot):
    """Convert a one-hot encoded matrix into an array of numeric class indices.

    Every row is reduced to ``sum(i * row[i])`` over its column positions
    ``i``; for a proper one-hot row this is the index of the single 1.

    Parameters
    ----------
    onehot : array-like of shape (n_samples, n_classes)
        One row per sample, one column per class.

    Returns
    -------
    numpy.ndarray of shape (n_samples,)
        The numeric value computed for each row.

    Raises
    ------
    ValueError
        If ``onehot`` is non-empty and not two-dimensional.
    """
    onehot = np.asarray(onehot)

    # An empty sequence has no rows to convert; return an empty 1-D array.
    if onehot.ndim == 1 and onehot.size == 0:
        return np.asarray([])

    if onehot.ndim != 2:
        raise ValueError('expected a 2-D one-hot matrix, got %d dimension(s)' % onehot.ndim)

    # A single matrix-vector product with [0, 1, ..., n_classes - 1] computes
    # sum(i * row[i]) for all rows at once, replacing the per-element loop.
    return onehot.dot(np.arange(onehot.shape[1]))


# Collapse the one-hot label matrices into 1-D arrays of numeric class
# values; these *_lbl_txt arrays are what the classifiers further down in the
# notebook are fitted and scored on.
train_lbl_txt = onehot_2_numeric(train_lbl)
test_lbl_txt = onehot_2_numeric(test_lbl)
print train_lbl_txt, test_lbl_txt

[4 4 4 ..., 4 2 3] [4 2 2 ..., 4 2 2]


## Scaling data

In [5]:
# Feature vector scaling
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply that
# same fitted transform to both the training and the test vectors.
scaler = StandardScaler().fit(train_dat)
train_dat, test_dat = scaler.transform(train_dat), scaler.transform(test_dat)

# Dimensionality Reduction

# Linear Regression

In [6]:
from sklearn.linear_model import LinearRegression

# Fit Linear Regression
# The target is the label matrix train_lbl (not the numeric train_lbl_txt),
# so the model regresses every label column at once.
lin_reg = LinearRegression(n_jobs=-1, normalize=True)
# Left as the last expression so the notebook displays the fitted estimator.
lin_reg.fit(train_dat, train_lbl)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=-1, normalize=True)

In [7]:
# Generate predictions
# One predicted row per test sample, with one value per label column
# (the printed shape matches test_lbl's shape in the recorded output).
predictions = lin_reg.predict(test_dat)
print predictions.shape

(34142L, 6L)


In [8]:
# Compute RMSE

import math

# Squared Euclidean distance between every predicted row and its true label
# row, computed for all samples at once instead of with nested Python loops.
errors = ((predictions - test_lbl) ** 2).sum(axis=1)

# Root of the mean per-sample squared distance. Note that the squared errors
# are summed across the label columns of a row before averaging over rows,
# so this is a per-sample (not per-element) RMSE.
rmse = math.sqrt(errors.mean())
print 'Root mean squared error:', rmse

Root mean squared error: 0.841279191841


In [9]:
# Save model
# Persist the fitted linear regression to ../models. joblib.dump is the last
# expression of the cell, so the list of files it wrote is shown as the
# cell output.
from sklearn.externals import joblib
joblib.dump(lin_reg, '../models/linear_regression_model.p')

['../models/linear_regression_model.p',
 '../models/linear_regression_model.p_01.npy',
 '../models/linear_regression_model.p_02.npy',
 '../models/linear_regression_model.p_03.npy',
 '../models/linear_regression_model.p_04.npy']

# Logistic Regression

In [10]:
# from sklearn.linear_model import LogisticRegression

# clf = LogisticRegression(n_jobs=-1)
# clf.fit(train_dat, train_lbl_txt)

In [11]:
# predictions = clf.predict(test_dat)
# p_predictions = clf.predict_proba(test_dat)

# print 'predictions dimensions:', predictions.shape
# print 'probabilities per class:', p_predictions.shape

In [12]:
# # Table of probabilities for each class
# for i in range(6):
#     print str(i)+'\t',

# print ''

# for i in xrange(len(p_predictions)):
    
#     for j in xrange(len(p_predictions[i])):
#         print("%.2f" % (p_predictions[i][j]*100))+'%\t',
    
#     print ''

In [13]:
# from sklearn.metrics import accuracy_score
# score = accuracy_score(test_lbl_txt, predictions)
# print score

# Logistic Regression Cross Validation

In [14]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.cross_validation import StratifiedKFold

# Five stratified folds over the numeric training labels, taken in the
# existing row order (shuffle=False).
folder = StratifiedKFold(train_lbl_txt, n_folds=5, shuffle=False)

# Cross-validated logistic regression using the folds above. This cell only
# constructs the estimator and prints its configuration; fitting happens in
# the next cell.
clf = LogisticRegressionCV(n_jobs=-1, solver='liblinear', cv=folder, verbose=5)
print clf

LogisticRegressionCV(Cs=10, class_weight=None,
           cv=sklearn.cross_validation.StratifiedKFold(labels=[4 4 ..., 2 3], n_folds=5, shuffle=False, random_state=None),
           dual=False, fit_intercept=True, intercept_scaling=1.0,
           max_iter=100, multi_class='ovr', n_jobs=-1, penalty='l2',
           random_state=None, refit=True, scoring=None, solver='liblinear',
           tol=0.0001, verbose=5)


In [15]:
# Run the cross-validated fit on the scaled training vectors and numeric
# labels, then print the fitted model's score on the held-out test split.
# NOTE(review): the recorded output shows this fit taking roughly 28 minutes.
clf = clf.fit(train_dat, train_lbl_txt)
print clf.score(test_dat, test_lbl_txt)

[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed: 27.8min finished


[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]0.425692695214


In [16]:
# Save model
# `clf` is expected to be the LogisticRegressionCV fitted in the previous
# cell (assumes the cells were run in order). joblib.dump is the last
# expression, so the list of written files is shown as the cell output.
from sklearn.externals import joblib
joblib.dump(clf, '../models/logistic_regression_model.p')

['../models/logistic_regression_model.p',
 '../models/logistic_regression_model.p_01.npy',
 '../models/logistic_regression_model.p_02.npy',
 '../models/logistic_regression_model.p_03.npy',
 '../models/logistic_regression_model.p_04.npy',
 '../models/logistic_regression_model.p_05.npy',
 '../models/logistic_regression_model.p_06.npy',
 '../models/logistic_regression_model.p_07.npy',
 '../models/logistic_regression_model.p_08.npy',
 '../models/logistic_regression_model.p_09.npy',
 '../models/logistic_regression_model.p_10.npy',
 '../models/logistic_regression_model.p_11.npy',
 '../models/logistic_regression_model.p_12.npy',
 '../models/logistic_regression_model.p_13.npy',
 '../models/logistic_regression_model.p_14.npy',
 '../models/logistic_regression_model.p_15.npy',
 '../models/logistic_regression_model.p_16.npy',
 '../models/logistic_regression_model.p_17.npy',
 '../models/logistic_regression_model.p_18.npy',
 '../models/logistic_regression_model.p_19.npy',
 '../models/logistic_regres

# Decision Tree Classifier

In [17]:
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier()
clf.fit(train_dat, train_lbl_txt)
predictions = clf.predict(test_dat)

from sklearn.metrics import accuracy_score
score = accuracy_score(test_lbl_txt, predictions)
print score

0.387089215629


# Decision Tree Cross Validation

In [18]:
from sklearn.grid_search import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.cross_validation import StratifiedKFold

# Five stratified folds in the existing row order (shuffle=False).
# NOTE(review): the manual search in the cell further below uses shuffled
# folds and, in the recorded outputs, prefers a different depth (16 vs 2
# here) - worth confirming whether row order matters for this data.
folder = StratifiedKFold(train_lbl_txt, n_folds=5, shuffle=False)
# Candidate tree depths; None leaves the depth unlimited.
parameters = {'max_depth':[None, 2, 4, 8, 16, 32, 64]}
dtc_clf = DecisionTreeClassifier()

# Exhaustive search over `parameters`; refit=True retrains the best setting
# on the whole training split so `clf` can be scored directly afterwards.
clf = GridSearchCV(dtc_clf, parameters, n_jobs=-1, pre_dispatch='n_jobs', cv=folder, refit=True, verbose=5)
clf.fit(train_dat, train_lbl_txt)

print 'Score on test data:', clf.score(test_dat, test_lbl_txt)

print 'best params:', clf.best_params_

[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    8.4s
[Parallel(n_jobs=-1)]: Done  35 out of  35 | elapsed:   27.6s finished


Fitting 5 folds for each of 7 candidates, totalling 35 fits
Score on test data: 0.440044519946
best params: {'max_depth': 2}


In [19]:
# Save model
# `clf` is expected to be the fitted GridSearchCV object from the previous
# cell (assumes the cells were run in order), so the whole search wrapper is
# persisted. joblib.dump is the last expression, so the list of written files
# is shown as the cell output.
from sklearn.externals import joblib
joblib.dump(clf, '../models/decision_tree_model.p')

['../models/decision_tree_model.p',
 '../models/decision_tree_model.p_01.npy',
 '../models/decision_tree_model.p_02.npy',
 '../models/decision_tree_model.p_03.npy',
 '../models/decision_tree_model.p_04.npy',
 '../models/decision_tree_model.p_05.npy',
 '../models/decision_tree_model.p_06.npy',
 '../models/decision_tree_model.p_07.npy',
 '../models/decision_tree_model.p_08.npy',
 '../models/decision_tree_model.p_09.npy',
 '../models/decision_tree_model.p_10.npy',
 '../models/decision_tree_model.p_11.npy',
 '../models/decision_tree_model.p_12.npy',
 '../models/decision_tree_model.p_13.npy']

In [20]:
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import accuracy_score

# Generate k-fold split in the training data
kf = StratifiedKFold(train_lbl_txt, n_folds=5, shuffle=True)

# Do multiple runs and save'em to runs list
runs = []
depths = [None, 2, 4, 8, 16, 32, 64]

print 'this will take a while...',
for d in depths:
    clf = DecisionTreeClassifier(max_depth=d)
    for t,v in kf:
        trn = train_dat[t]
        val = train_dat[v]
        trn_lbl = train_lbl_txt[t]
        val_lbl = train_lbl_txt[v]
        clf.fit(trn, trn_lbl)
        predictions = clf.predict(val)
        #score = accuracy_score(val_lbl, predictions)
        score = clf.score(val, val_lbl)
        runs.append(tuple([d, score]))
        print d, score
print 'done!'
    

this will take a while... None 0.406795738266
None 0.404532827427
None 0.406941879965
None 0.404966797669
None 0.405498805712
2 0.387706014872
2 0.386086455722
2 0.389307675368
2 0.388314812305
2 0.388135047687
4 0.401917441604
4 0.399806897486
4 0.403367607948
4 0.403154221439
4 0.403703139029
8 0.433744939614
8 0.434887187479
8 0.431690749242
8 0.430359804852
8 0.430231573241
16 0.455019733388
16 0.455332339589
16 0.455626513984
16 0.455329312915
16 0.455557249581
32 0.408472652743
32 0.406074259774
32 0.408432571612
32 0.409303428649
32 0.408548051024
64 0.406287582364
64 0.404786909682
64 0.408330933546
64 0.40632199485
64 0.404770374888
done!


In [21]:
best_result = max(runs, key=lambda run: run[1])
print 'Best result:', best_result
best_d = best_result[0]

clf = DecisionTreeClassifier(max_depth=best_d)
clf.fit(train_dat, train_lbl_txt)
print 'Score on test data:', clf.score(test_dat, test_lbl_txt)

Best result: (16, 0.45562651398370402)
Score on test data: 0.468513853904


# Random Forests

In [22]:
from sklearn.ensemble import RandomForestClassifier

# Baseline forest with 50 trees; every other hyperparameter is left at the
# library default.
clf = RandomForestClassifier(n_estimators=50, n_jobs=-1)
clf.fit(train_dat, train_lbl_txt)
# Last expression of the cell, so the test-split score is displayed as the
# cell output.
clf.score(test_dat, test_lbl_txt)

0.49935563235897135

## Cross validation on random forests

In [23]:
from sklearn.cross_validation import StratifiedKFold

# Generate k-fold split in the training data
kf = StratifiedKFold(train_lbl_txt, n_folds=5, shuffle=True)

# Do multiple runs and save'em to runs list
runs = []
params = []

depths = [None, 2, 4, 8, 16, 32, 64]
max_features = ['auto', 'log2', None]
criterions = ['gini', 'entropy']
for d in depths:
    for mf in max_features:
        for c in criterions:
            params.append([d, mf, c])


print 'this will take a while...'
for d, mf, c in params:
    clf = RandomForestClassifier(n_jobs=-1, max_depth=d, max_features=mf, criterion=c)
    print 'run:', d, mf, c,
    for t,v in kf:
        trn = train_dat[t]
        val = train_dat[v]
        trn_lbl = train_lbl_txt[t]
        val_lbl = train_lbl_txt[v]
        clf.fit(trn, trn_lbl)
        score = clf.score(val, val_lbl)
        runs.append([score, d, mf, c])
    print 'done!'
print 'All done!'

this will take a while...
run: None auto gini done!
run: None auto entropy done!
run: None log2 gini done!
run: None log2 entropy done!
run: None None gini done!
run: None None entropy done!
run: 2 auto gini done!
run: 2 auto entropy done!
run: 2 log2 gini done!
run: 2 log2 entropy done!
run: 2 None gini done!
run: 2 None entropy done!
run: 4 auto gini done!
run: 4 auto entropy done!
run: 4 log2 gini done!
run: 4 log2 entropy done!
run: 4 None gini done!
run: 4 None entropy done!
run: 8 auto gini done!
run: 8 auto entropy done!
run: 8 log2 gini done!
run: 8 log2 entropy done!
run: 8 None gini done!
run: 8 None entropy done!
run: 16 auto gini done!
run: 16 auto entropy done!
run: 16 log2 gini done!
run: 16 log2 entropy done!
run: 16 None gini done!
run: 16 None entropy done!
run: 32 auto gini done!
run: 32 auto entropy done!
run: 32 log2 gini done!
run: 32 log2 entropy done!
run: 32 None gini done!
run: 32 None entropy done!
run: 64 auto gini done!
run: 64 auto entropy done!
run: 64 log

In [24]:
champion = max(runs, key=lambda run: run[0])
score, d, mf, c = champion
clf = RandomForestClassifier(n_jobs=-1, max_depth=d, max_features=mf, criterion=c)
clf.fit(train_dat, train_lbl_txt)
clf.score(test_dat, test_lbl_txt)

# save model
from sklearn.externals import joblib
joblib.dump(clf, '../models/random_forest_model.p')

['../models/random_forest_model.p',
 '../models/random_forest_model.p_01.npy',
 '../models/random_forest_model.p_02.npy',
 '../models/random_forest_model.p_03.npy',
 '../models/random_forest_model.p_04.npy',
 '../models/random_forest_model.p_05.npy',
 '../models/random_forest_model.p_06.npy',
 '../models/random_forest_model.p_07.npy',
 '../models/random_forest_model.p_08.npy',
 '../models/random_forest_model.p_09.npy',
 '../models/random_forest_model.p_10.npy',
 '../models/random_forest_model.p_11.npy',
 '../models/random_forest_model.p_12.npy',
 '../models/random_forest_model.p_13.npy',
 '../models/random_forest_model.p_14.npy',
 '../models/random_forest_model.p_15.npy',
 '../models/random_forest_model.p_16.npy',
 '../models/random_forest_model.p_17.npy',
 '../models/random_forest_model.p_18.npy',
 '../models/random_forest_model.p_19.npy',
 '../models/random_forest_model.p_20.npy',
 '../models/random_forest_model.p_21.npy',
 '../models/random_forest_model.p_22.npy',
 '../models/random

# Support Vector Machine Cross Validation

In [25]:
# from sklearn.svm import SVC
# from sklearn.grid_search import GridSearchCV
# from sklearn.cross_validation import StratifiedKFold

# folder = StratifiedKFold(train_lbl_txt, n_folds=5, shuffle=False)

# parameters = {'kernel':['linear', 'poly', 'rbf'], 'C':[64, 32, 16, 8], 'probability':[False], 'max_iter':[1000]}
# svm_clf = SVC()

# clf = GridSearchCV(svm_clf, parameters, n_jobs=-1, pre_dispatch='n_jobs', cv=folder, refit=True, verbose=5)
# clf.fit(train_dat_scaled, train_lbl_txt)
# clf.score(test_dat_scaled, test_lbl_txt)

In [26]:
# print 'best score:', clf.best_score_
# print 'best params:', clf.best_params_

In [27]:
# Reload the random forest that was saved to ../models earlier in this
# notebook; this replaces whatever `clf` currently refers to.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code - only load model files from a trusted source.
from sklearn.externals import joblib
clf = joblib.load('../models/random_forest_model.p')