In [1]:
import matplotlib.pyplot as plt

import numpy as np

from sklearn.datasets import make_blobs
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score
from sklearn import tree
from sklearn import cross_validation

import pandas as pd

In [2]:
# Load the encoded training table, then remove three columns by position.
# Per the original note these are the row index, VisitNumber and Sunday
# (positions 0, 2 and 6) -- TODO confirm against the CSV header.
data = pd.read_csv(
    'train_enc.csv',
    quotechar='"',
    skipinitialspace=True,
)
unused_cols = data.columns[[0, 2, 6]]
data = data.drop(unused_cols, axis=1)

In [3]:
# Target is the 'TripType' column; every remaining column is a feature.
data_y = data.loc[:, 'TripType']
data_x = data.drop(['TripType'], axis=1)

In [4]:
# Load the encoded test table, then remove three columns by position.
# Per the original note these are the row index, VisitNumber and Sunday
# (positions 0, 1 and 5 here) -- TODO confirm against the CSV header.
data_test_cache = pd.read_csv(
    'test_enc.csv',
    quotechar='"',
    skipinitialspace=True,
)
unused_test_cols = data_test_cache.columns[[0, 1, 5]]
data_test_cache = data_test_cache.drop(unused_test_cols, axis=1)

In [6]:
# Empty frame whose columns mirror the training feature columns, in order.
data_test = pd.DataFrame(columns=data_x.columns.tolist())

In [7]:
# Align the test features with the training feature layout held in
# data_test.columns: keep that column order, take every column the test file
# provides, and create any column it lacks as a constant 0.
#
# reindex() does this in one step. The previous per-column loop relied on a
# bare `except:` (which hid any error, not just a missing column), and a
# scalar 0 assigned while the frame still had no rows produced an empty
# column that turned into NaN once the first real column set the index.
data_test = data_test_cache.reindex(columns=data_test.columns, fill_value=0)

In [8]:
# partition train and test set: shuffle the row positions with a fixed seed,
# then use the first 90% for training and the remaining 10% as hold-out.
n_row = data_x.shape[0]
n_train = int(n_row * 0.9)  # split point, computed once

np.random.seed(0)

# An integer array (rather than range()) can be shuffled in place on both
# Python 2 and Python 3.
row_ind = np.arange(n_row)
np.random.shuffle(row_ind)

train_ind = row_ind[:n_train]
test_ind = row_ind[n_train:]

# row_ind holds row *positions*, so features and labels are both selected
# positionally with .iloc; this keeps them paired even if the index is not
# the default 0..n-1 range.
train_x = data_x.iloc[train_ind, :]
train_y = data_y.iloc[train_ind]

test_x = data_x.iloc[test_ind, :]
test_y = data_y.iloc[test_ind]

In [9]:
# Cross-validation splitter over the training rows: 10 shuffled splits,
# each holding out 10% of the rows, with a fixed seed.
cv = cross_validation.ShuffleSplit(
    len(train_ind),
    n_iter=10,
    test_size=0.1,
    random_state=0,
)

In [11]:
# Classification tree classifier, constructed with the library defaults.
# NOTE(review): no random_state is passed here, unlike the random forest
# defined later in this notebook -- confirm whether the tree needs its own seed.
clf = tree.DecisionTreeClassifier()

In [16]:

# Score the classifier on each split produced by `cv`, then show the mean.
scores = cross_validation.cross_val_score(
    estimator=clf,
    X=train_x,
    y=train_y,
    cv=cv,
)
scores.mean()

0.56941121820926721

In [12]:
# Fit the classifier on the training partition; the fitted estimator is the
# cell's displayed value.
clf.fit(X=train_x, y=train_y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [15]:
# Training-set metrics for the fitted classifier: log loss from the predicted
# class probabilities, accuracy from the hard class predictions.
clf_probs = clf.predict_proba(train_x)
score = log_loss(train_y, clf_probs)

y_pred = clf.predict(train_x)
acc = accuracy_score(train_y, y_pred)

# A single pre-formatted string keeps this line valid under both Python 2
# and Python 3 (a bare `print "..."` statement is a SyntaxError on Python 3).
print("Training accuracy = %s , log loss = %s" % (acc, score))

Training accuracy = 0.924732306692 , log loss = 0.173743447522


In [13]:
# Hold-out metrics for the fitted classifier: log loss from the predicted
# class probabilities, accuracy from the hard class predictions.
clf_probs = clf.predict_proba(test_x)
score = log_loss(test_y, clf_probs)

y_pred = clf.predict(test_x)
acc = accuracy_score(test_y, y_pred)

# A single pre-formatted string keeps this line valid under both Python 2
# and Python 3 (a bare `print "..."` statement is a SyntaxError on Python 3).
print("Test accuracy = %s , log loss = %s" % (acc, score))

Test accuracy = 0.572742474916 , log loss = 12.5532542408


Random Forest

In [11]:
# Random forest classifier: 150 trees, "sqrt" feature sub-sampling,
# fixed seed.
clf = RandomForestClassifier(
    n_estimators=150,
    max_features="sqrt",
    random_state=0,
)

In [8]:

# Score the random forest on each split produced by `cv`, then show the mean.
scores = cross_validation.cross_val_score(
    estimator=clf,
    X=train_x,
    y=train_y,
    cv=cv,
)
scores.mean()

0.63424689350830343

In [10]:
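# 10-fold cross-validation alternative (left commented out; the score shown below this cell is presumably from an earlier run)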
# scores = cross_validation.cross_val_score(clf, train_x, train_y, cv=10)
# np.mean(scores)



0.63461431163435245

In [10]:
# Fit the random forest on the training partition; the fitted estimator is
# the cell's displayed value.
clf.fit(X=train_x, y=train_y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='sqrt', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=150, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [9]:
# Training-set metrics for the fitted random forest: log loss from the
# predicted class probabilities, accuracy from the hard class predictions.
clf_probs = clf.predict_proba(train_x)
score = log_loss(train_y, clf_probs)

y_pred = clf.predict(train_x)
acc = accuracy_score(train_y, y_pred)

# A single pre-formatted string keeps this line valid under both Python 2
# and Python 3 (a bare `print "..."` statement is a SyntaxError on Python 3).
print("Training accuracy = %s , log loss = %s" % (acc, score))

Training accuracy = 0.924720693099 , log loss = 0.331545336077


In [11]:
# Hold-out metrics for the fitted random forest: log loss from the predicted
# class probabilities, accuracy from the hard class predictions.
clf_probs = clf.predict_proba(test_x)
score = log_loss(test_y, clf_probs)

y_pred = clf.predict(test_x)
acc = accuracy_score(test_y, y_pred)

# A single pre-formatted string keeps this line valid under both Python 2
# and Python 3 (a bare `print "..."` statement is a SyntaxError on Python 3).
print("Test accuracy = %s , log loss = %s" % (acc, score))

Test accuracy = 0.639318561873 , log loss = 1.65552999486


Prediction on the actual data set

In [12]:
# Refit the classifier on the full labelled data set (all rows of data_x /
# data_y) before predicting on the test file.
clf.fit(X=data_x, y=data_y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='sqrt', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=150, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [None]:
# Predicted class probabilities for the aligned test features, wrapped in a
# frame whose columns are labelled with the classifier's class labels.
pred_probs = clf.predict_proba(data_test)
class_labels = list(clf.classes_)
submission_probs = pd.DataFrame(pred_probs, columns=class_labels)

In [None]:
# Load the sample submission file; it serves as the template that the
# predicted probabilities are written into.
submission = pd.read_csv(
    'sample_submission.csv',
    quotechar='"',
    skipinitialspace=True,
)

In [None]:
# Copy the probability columns into the submission template by position:
# probability column k is written to template column k + 1, leaving the
# template's first column untouched.
# NOTE(review): this assumes the template's class columns appear in the same
# order as submission_probs' columns -- confirm against the template header.
n_class_cols = len(submission_probs.columns)
for src_pos in range(n_class_cols):
    dst_pos = src_pos + 1
    submission.iloc[:, [dst_pos]] = submission_probs.iloc[:, src_pos]

In [None]:
# Write the filled-in submission table to disk as comma-separated text,
# without the row index.
submission.to_csv(
    'submission_rdf.csv',
    index=False,
    sep=',',
)