In [1]:
import numpy as np
import pickle as pk
import pandas as pd
import timeit as tm
import csv
import sys

# Loading data
## Loading training data

In [2]:
# Open training data to pandas
train_dat_pandas = pd.read_csv('../data/clean_data/train_vectors.csv', index_col=0, encoding='utf-8')
del train_dat_pandas['TYPE']

# Open training labels to pandas
train_lbl_pandas = pd.read_csv('../data/clean_data/train_labels.csv', index_col=0, encoding='utf-8')
del train_lbl_pandas['YEAR']

# Save headers
headers = [list(train_dat_pandas)]

# Convert pandas to numpy matrix
train_dat = train_dat_pandas.as_matrix()
print 'training data dimensions:', train_dat.shape

# Convert pandas to numpy matrix
train_lbl = train_lbl_pandas.as_matrix()
print 'training label dimensions:', train_lbl.shape

training data dimensions: (295169L, 64L)
training label dimensions: (295169L, 6L)


## Loading test data

In [3]:
# Open test data
test_dat_pandas = pd.read_csv('../data/clean_data/test_vectors.csv', index_col=0, encoding='utf-8')
del test_dat_pandas['TYPE']

# Open test labels
test_lbl_pandas = pd.read_csv('../data/clean_data/test_labels.csv', index_col=0, encoding='utf-8')
del test_lbl_pandas['YEAR']

# Convert pandas to numpy matrix
test_dat = test_dat_pandas.as_matrix()
print 'testing data dimensions:', test_dat.shape

# Convert pandas to numpy matrix
test_lbl = test_lbl_pandas.as_matrix()
print 'testing label dimensions:', test_lbl.shape

testing data dimensions: (34142L, 64L)
testing label dimensions: (34142L, 6L)


# Linear Regression

In [4]:
from sklearn.linear_model import LinearRegression

# Least-squares fit of the label matrix on the training features.
# fit() returns the estimator itself, so the assignment keeps the fitted
# model; n_jobs=-1 is scikit-learn's setting for "use all available cores".
lin_reg = LinearRegression(n_jobs=-1).fit(train_dat, train_lbl)
# Bare last expression so the notebook still displays the estimator repr
lin_reg

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=-1, normalize=False)

In [5]:
# Generate predictions
# Run the fitted linear model on the test feature matrix and print the shape
# of the returned array as a quick sanity check.
predictions = lin_reg.predict(test_dat)
print predictions.shape

(34142L, 6L)


In [6]:
# Compute RMSE

import math

# Per-row squared Euclidean distance between the predicted and the true label
# vector.  One vectorized numpy expression replaces the nested Python loops
# over every row and column; `errors` now holds a numpy array with one entry
# per test row (it used to be a Python list with the same values).
errors = np.sum((predictions - test_lbl) ** 2, axis=1)

# Root of the mean per-row squared distance.  Note the squared error is
# summed over the label columns of a row before averaging over rows, so this
# is not a per-element RMSE.  math.sqrt keeps `rmse` a plain Python float.
rmse = math.sqrt(errors.mean())
print 'Root mean squared error:', rmse

Root mean squared error: 0.841330416117


# Convert one-hot labels to numeric

In [7]:
# Collapse each row of a one-hot encoded array into a single number
def onehot_2_numeric(onehot):
    """Convert a one-hot encoded array into a numeric array.

    Each row is reduced to the sum of ``position * value`` over its entries.
    For a row containing a single 1 and zeros elsewhere, that sum is the
    position of the 1.

    Parameters
    ----------
    onehot : iterable of iterables of numbers
        The rows to convert (e.g. a 2-D numpy array or a list of lists).

    Returns
    -------
    numpy.ndarray
        One value per input row, in the same order.
    """
    return np.asarray([
        sum(position * value for position, value in enumerate(row))
        for row in onehot
    ])


train_lbl_txt = onehot_2_numeric(train_lbl)
test_lbl_txt = onehot_2_numeric(test_lbl)
print train_lbl_txt, test_lbl_txt

[4 4 4 ..., 4 2 3] [4 2 2 ..., 4 2 2]


# Logistic regression

In [8]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier on the numeric class ids.
# fit() returns the estimator itself, so `clf` is the fitted model.
clf = LogisticRegression(n_jobs=-1).fit(train_dat, train_lbl_txt)
# Bare last expression so the notebook still displays the estimator repr
clf

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [9]:
predictions = clf.predict(test_dat)
p_predictions = clf.predict_proba(test_dat)

print 'predictions dimensions:', predictions.shape
print 'probabilities per class:', p_predictions.shape

predictions dimensions: (34142L,)
probabilities per class: (34142L, 6L)


In [10]:
# Table of probabilities for each class
for i in range(6):
    print str(i)+'\t',

print ''

for i in xrange(len(p_predictions)):
    
    for j in xrange(len(p_predictions[i])):
        print("%.2f" % (p_predictions[i][j]*100))+'%\t',
    
    print ''

0	1	2	3	4	5	
8.32%	7.43%	12.30%	33.14%	35.15%	3.65%	
3.22%	0.00%	5.84%	81.20%	9.51%	0.23%	
3.23%	0.00%	5.88%	81.27%	9.40%	0.22%	
3.23%	0.00%	5.89%	81.27%	9.38%	0.22%	
5.98%	10.72%	12.22%	35.41%	29.24%	6.42%	
5.65%	13.55%	13.73%	27.56%	32.74%	6.78%	
5.79%	14.81%	13.68%	24.61%	34.01%	7.10%	
7.63%	11.16%	12.76%	28.83%	34.79%	4.83%	
5.38%	19.09%	14.51%	18.83%	34.29%	7.90%	
5.62%	16.20%	15.72%	25.62%	28.24%	8.60%	
5.66%	11.96%	13.08%	32.82%	28.89%	7.58%	
4.96%	25.97%	16.86%	15.65%	29.16%	7.39%	
6.60%	1.42%	12.27%	48.60%	28.42%	2.69%	
6.07%	11.00%	12.41%	33.96%	29.95%	6.61%	
7.29%	9.80%	12.41%	33.44%	32.47%	4.59%	
5.67%	13.38%	15.32%	30.54%	27.02%	8.07%	
7.52%	10.32%	12.71%	30.22%	34.34%	4.88%	
6.47%	11.78%	13.39%	30.05%	32.73%	5.59%	
6.34%	8.97%	12.69%	35.18%	32.41%	4.42%	
6.03%	10.74%	13.32%	30.33%	33.37%	6.21%	
6.12%	11.00%	13.50%	28.88%	34.12%	6.38%	
7.28%	11.47%	12.55%	31.33%	32.50%	4.86%	
6.08%	12.52%	13.31%	29.31%	32.34%	6.44%	
6.33%	13.26%	17.26%	27.47%	27.75%	7.93%	
6.36%	10.76%	13.

In [11]:
from sklearn.metrics import accuracy_score
# Compare the classifier's predicted class ids with the true test class ids
# and print the resulting accuracy.
score = accuracy_score(test_lbl_txt, predictions)
print score

0.405629430028


# Logistic regression with cross-validation

In [12]:
from sklearn.linear_model import LogisticRegressionCV

cvclf = LogisticRegressionCV(n_jobs=-1, solver='liblinear')
cvclf = cvclf.fit(train_dat, train_lbl_txt)
print cvclf.score(test_dat, test_lbl_txt)

0.405775877219


# Decision Tree Classifier

In [13]:
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier()
clf.fit(train_dat, train_lbl_txt)
predictions = clf.predict(test_dat)

from sklearn.metrics import accuracy_score
score = accuracy_score(test_lbl_txt, predictions)
print score

0.385595454279


In [14]:
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import accuracy_score

# Generate k-fold split in the training data
kf = StratifiedKFold(train_lbl_txt, n_folds=5, shuffle=True)

# Do multiple runs and save'em to runs list
runs = []
depths = [None, 2, 4, 8, 16, 32, 64]

print 'this will take a while...',
for d in depths:
    clf = DecisionTreeClassifier(max_depth=d)
    for t,v in kf:
        trn = train_dat[t]
        val = train_dat[v]
        trn_lbl = train_lbl_txt[t]
        val_lbl = train_lbl_txt[v]
        clf.fit(trn, trn_lbl)
        predictions = clf.predict(val)
        score = accuracy_score(val_lbl, predictions)
        runs.append(tuple([d, score]))
print 'done!'
    

this will take a while... done!


In [15]:
print max(runs, key=lambda run: run[1])

(16, 0.45568805474625651)


# Random Forests

In [16]:
from sklearn.ensemble import RandomForestClassifier

# Forest of 50 trees trained on the full training set.
# fit() returns the estimator itself, so `clf` is the fitted model.
clf = RandomForestClassifier(n_estimators=50, n_jobs=-1).fit(train_dat, train_lbl_txt)
# Bare last expression so the notebook displays the test-set score
clf.score(test_dat, test_lbl_txt)

0.49730537168297112

## Cross validation on random forests

In [17]:
from sklearn.cross_validation import StratifiedKFold

# Generate k-fold split in the training data
kf = StratifiedKFold(train_lbl_txt, n_folds=5, shuffle=True)

# Do multiple runs and save'em to runs list
runs = []
params = []

depths = [None, 2, 4, 8, 16, 32, 64]
max_features = ['auto', 'log2', None]
criterions = ['gini', 'entropy']
for d in depths:
    for mf in max_features:
        for c in criterions:
            params.append([d, mf, c])


print 'this will take a while...'
for d, mf, c in params:
    clf = RandomForestClassifier(n_jobs=-1, max_depth=d, max_features=mf, criterion=c)
    print 'run:', d, mf, c,
    for t,v in kf:
        trn = train_dat[t]
        val = train_dat[v]
        trn_lbl = train_lbl_txt[t]
        val_lbl = train_lbl_txt[v]
        clf.fit(trn, trn_lbl)
        score = clf.score(val, val_lbl)
        runs.append([score, d, mf, c])
    print 'done!'
print 'All done!'

this will take a while...
run: None auto gini done!
run: None auto entropy done!
run: None log2 gini done!
run: None log2 entropy done!
run: None None gini done!
run: None None entropy done!
run: 2 auto gini done!
run: 2 auto entropy done!
run: 2 log2 gini done!
run: 2 log2 entropy done!
run: 2 None gini done!
run: 2 None entropy done!
run: 4 auto gini done!
run: 4 auto entropy done!
run: 4 log2 gini done!
run: 4 log2 entropy done!
run: 4 None gini done!
run: 4 None entropy done!
run: 8 auto gini done!
run: 8 auto entropy done!
run: 8 log2 gini done!
run: 8 log2 entropy done!
run: 8 None gini done!
run: 8 None entropy done!
run: 16 auto gini done!
run: 16 auto entropy done!
run: 16 log2 gini done!
run: 16 log2 entropy done!
run: 16 None gini done!
run: 16 None entropy done!
run: 32 auto gini done!
run: 32 auto entropy done!
run: 32 log2 gini done!
run: 32 log2 entropy done!
run: 32 None gini done!
run: 32 None entropy done!
run: 64 auto gini done!
run: 64 auto entropy done!
run: 64 log

In [18]:
# Choose the champion by MEAN validation accuracy over the folds.
# `runs` holds one [score, depth, max_features, criterion] entry per
# parameter combination and fold; taking the max over those raw entries
# would reward a single lucky fold instead of the combination that performs
# best on average.
fold_scores = {}
combo_order = []  # first-seen order, so ties resolve deterministically
for run_score, run_d, run_mf, run_c in runs:
    combo = (run_d, run_mf, run_c)
    if combo not in fold_scores:
        fold_scores[combo] = []
        combo_order.append(combo)
    fold_scores[combo].append(run_score)

# Same [score, d, mf, c] layout as the entries of `runs`, with score = mean
mean_runs = [[np.mean(fold_scores[combo])] + list(combo) for combo in combo_order]
champion = max(mean_runs, key=lambda run: run[0])
score, d, mf, c = champion

# Retrain the winning configuration on the full training set and evaluate it
# once on the held-out test set.
clf = RandomForestClassifier(n_jobs=-1, max_depth=d, max_features=mf, criterion=c)
clf.fit(train_dat, train_lbl_txt)
clf.score(test_dat, test_lbl_txt)

0.50448128404897197