In [None]:
#!/usr/bin/python
import cv2, os
import numpy as np
from PIL import Image
import csv
import pandas as pd
import pdb as pdb
from sklearn.cross_validation import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import svm
import time

# Detector-style settings; nothing in the visible notebook reads this dict.
# NOTE(review): looks like a leftover from an earlier experiment - confirm before removing.
settings = {
    'minNeighbors': 2,
    'minSize': (40,40)
}

# Tunables for the split below.
SEED = 42                # fixes the random subsample so reported scores are reproducible
SAMPLE_FRACTION = 0.25   # share of each split that is kept
N_TRAIN_SUBJECTS = 9     # number of subjects assigned to the training split

image_list = pd.read_csv('driver_imgs_list.csv')

# value_counts() orders subjects by descending image count, so the training
# split is the N_TRAIN_SUBJECTS subjects with the FEWEST images and the test
# split is every other subject. Splitting by subject keeps the same person
# from appearing on both sides.
# NOTE(review): training on the smaller group looks inverted - confirm intent.
subject_counts = image_list.subject.value_counts()
train_subject_subset = subject_counts[-N_TRAIN_SUBJECTS:].index.values
test_subject_subset = subject_counts[:-N_TRAIN_SUBJECTS].index.values
train = image_list[image_list.subject.isin(train_subject_subset)].reset_index(drop=True)
test = image_list[image_list.subject.isin(test_subject_subset)].reset_index(drop=True)

# Keep roughly SAMPLE_FRACTION of each split. A dedicated RandomState is used
# so the draw is repeatable without touching numpy's global RNG state.
rng = np.random.RandomState(SEED)
keep_probabilities = [1 - SAMPLE_FRACTION, SAMPLE_FRACTION]
train_mask = rng.choice([False, True], len(train), p=keep_probabilities)
train = train[train_mask]
test_mask = rng.choice([False, True], len(test), p=keep_probabilities)
test = test[test_mask]
print('done getting train/test sets')
print('getting images from paths')
print 'getting images from paths'

In [None]:
# Timer start; a later cell reports the elapsed time since this point.
t0 = time.time()
scaled_size = (200,150)


def _load_flat_greyscale(frame, size):
    """Load every image listed in ``frame`` as a flattened greyscale vector.

    Parameters
    ----------
    frame : DataFrame with ``classname`` and ``img`` columns; each row names
        the file ``./train/<classname>/<img>``.
    size : (width, height) bound passed to ``Image.thumbnail``.

    Returns
    -------
    (features, classnames) : two lists of equal length. ``features`` holds one
        1-D ``uint8`` array per row, ``classnames`` the matching ``classname``.
    """
    features = []
    classnames = []
    for _, row in frame.iterrows():
        image_path = './train/' + row.classname + '/' + row.img
        pil_image = Image.open(image_path).convert('L')  # greyscale
        # thumbnail() shrinks in place within `size`.
        # NOTE(review): vectors only share a length if all source images
        # share an aspect ratio - confirm for this dataset.
        pil_image.thumbnail(size, Image.ANTIALIAS)
        features.append(np.array(pil_image, 'uint8').flatten())
        classnames.append(row.classname)
    return features, classnames


# Both splits come from the same labelled image list, so both read from ./train/.
images, labels = _load_flat_greyscale(train, scaled_size)
predict_images, predict_labels = _load_flat_greyscale(test, scaled_size)
print('done')

In [None]:
# Extract relevant data
# List comprehensions (rather than map) so the results are real lists on both
# Python 2 and Python 3; later cells call len() on them and compare them
# elementwise with prediction arrays.

# Class names are turned into integers by stripping the letter "c" from both
# ends (presumably "c0".."c9" -> 0..9; verify against the CSV).
training_labels = [int(name.strip("c")) for name in labels]
prediction_labels = [int(name.strip("c")) for name in predict_labels]

# Each image vector is flattened (copied) again here so the data lists are
# independent of `images` / `predict_images`.
training_data = [pixels.flatten() for pixels in images]
prediction_data = [pixels.flatten() for pixels in predict_images]

# t0 was set when image loading started, so this covers loading + preparation.
print("done in {}s".format(time.time() - t0))
print ("training classifier")

In [None]:
# import xgboost as xgb
# import numpy
# from xgboost import XGBClassifier

# t0 = time.time()
# model = xgb.XGBClassifier()
# X = numpy.array(training_data)
# model.fit(X, training_labels)


In [None]:
# X = numpy.array(prediction_data)
# pred = model.predict(X)
# correct = np.sum(pred == prediction_labels)
# print 'accuracy using XGBoost: {} / {} %'.format(correct, len(prediction_labels))
# print("done in {}s".format(time.time() - t0))


In [None]:
class XGBoostClassifier():
    """Small scikit-learn style wrapper around ``xgb.train`` for multi-class data.

    The objective is always forced to ``'multi:softprob'``; every other keyword
    argument is forwarded untouched to ``xgb.train`` as a booster parameter.
    ``xgb`` must be importable as ``import xgboost as xgb`` in the enclosing
    namespace by the time ``fit`` / ``predict_proba`` are called.

    Attributes
    ----------
    clf : the trained booster returned by ``xgb.train`` (``None`` before ``fit``).
    label2num : dict mapping each original label to its 0-based class index
        (``None`` before ``fit``).
    num_boost_round : default number of boosting rounds used by ``fit``.
    params : dict of booster parameters passed to ``xgb.train``.
    """

    def __init__(self, num_boost_round=10, **params):
        self.clf = None
        self.label2num = None
        self.num_boost_round = num_boost_round
        self.params = params
        self.params.update({'objective': 'multi:softprob'})

    def fit(self, X, y, num_boost_round=None):
        """Train a booster on ``X`` / ``y`` and return ``self``.

        ``num_boost_round`` overrides the instance default for this call only.
        Labels are re-encoded to 0..k-1 in sorted order before training.
        """
        num_boost_round = num_boost_round or self.num_boost_round
        self.label2num = {label: i for i, label in enumerate(sorted(set(y)))}
        dtrain = xgb.DMatrix(X, label=[self.label2num[label] for label in y])
        self.clf = xgb.train(params=self.params, dtrain=dtrain, num_boost_round=num_boost_round)
        # Returning self follows the scikit-learn estimator convention and
        # allows call chaining.
        return self

    def predict(self, X):
        """Return the most probable original label for each row of ``X``."""
        num2label = {i: label for label, i in self.label2num.items()}
        Y = self.predict_proba(X)
        y = np.argmax(Y, axis=1)
        return np.array([num2label[i] for i in y])

    def predict_proba(self, X):
        """Return the booster's raw prediction for ``X`` (class probabilities)."""
        dtest = xgb.DMatrix(X)
        return self.clf.predict(dtest)

    def score(self, X, y):
        """Return the inverse log-loss on ``X`` / ``y`` (higher is better).

        NOTE: ``logloss`` derives its class indices from ``y`` alone; if ``y``
        lacks a class that was present during ``fit`` the probability columns
        will not line up with the indices used here.
        """
        Y = self.predict_proba(X)
        loss = logloss(y, Y)
        if loss == 0:
            # A perfect fit has zero loss; report it as the best possible
            # score instead of raising ZeroDivisionError.
            return float('inf')
        return 1.0 / loss

    def get_params(self, deep=True):
        """Return a copy of all constructor arguments.

        ``num_boost_round`` is included so that rebuilding the estimator from
        these parameters (as ``sklearn.base.clone`` does) keeps the configured
        number of rounds. A copy is returned so callers cannot mutate the
        estimator's internal state.
        """
        params = dict(self.params)
        params['num_boost_round'] = self.num_boost_round
        return params

    def set_params(self, **params):
        """Update parameters in place and return ``self``.

        ``num_boost_round`` is stored on the instance rather than in the
        booster parameters; any attempt to change ``objective`` is ignored.
        """
        if 'num_boost_round' in params:
            self.num_boost_round = params.pop('num_boost_round')
        if 'objective' in params:
            del params['objective']
        self.params.update(params)
        return self
    
    
def logloss(y_true, Y_pred):
    """Mean negative log-likelihood of the true labels.

    Parameters
    ----------
    y_true : sequence of labels. Class indices are assigned from the sorted
        distinct values of this sequence, so column ``i`` of ``Y_pred`` is
        taken to belong to the i-th smallest label that occurs in ``y_true``.
    Y_pred : sequence of per-row probability vectors, one row per label.

    Returns
    -------
    float : the average of ``-log(p_true)`` over all rows; ``inf`` as soon as
        any true class was given a probability that is not positive.

    Raises
    ------
    ZeroDivisionError : if ``Y_pred`` is empty.
    """
    # Imported here because the module never imports `math` at the top level;
    # without this the function failed with NameError on first use.
    import math

    label2num = {name: i for i, name in enumerate(sorted(set(y_true)))}
    total = 0.0
    for probs, label in zip(Y_pred, y_true):
        p_true = probs[label2num[label]]
        # log(0) is undefined; treat a non-positive probability as -inf.
        total += math.log(p_true) if p_true > 0 else -np.inf
    return -1 * total / len(Y_pred)

In [None]:
import xgboost as xgb
import numpy
from sklearn.grid_search import GridSearchCV

# Base estimator. Every hyper-parameter that also appears in `parameters`
# below is overridden for each grid point; the values here only matter for
# keys that are not searched.
# NOTE(review): 'auc' is set as eval_metric on a 10-class objective - confirm
# the installed xgboost accepts it (no eval set is passed to train here).
xgb_estimator = XGBoostClassifier(
        eval_metric = 'auc',
        num_class = 10,
        eta = 0.1,
        num_boost_round = 80,
        max_depth = 10,
        subsample = 0.5,
        colsample_bytree = 1.0,
        )
# 3 * 3 * 3 * 2 * 3 = 162 combinations, each fitted on 2 folds.
parameters = {
    'num_boost_round': [100, 250, 500],
    'eta': [0.05, 0.1, 0.3],
    'max_depth': [3, 6, 9],
    'subsample': [0.9, 1.0],
    'colsample_bytree': [0.6,0.9, 1.0],
}
t0 = time.time()
clf = GridSearchCV(xgb_estimator, parameters, n_jobs=-1, cv=2)

clf.fit(training_data, training_labels)

# best_params_ / best_score_ describe the grid point with the highest mean
# validation score, i.e. the same entry the max over grid_scores_ selected.
best_parameters = clf.best_params_
score = clf.best_score_
print(score)
for param_name in sorted(best_parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))

# predict() uses the estimator refitted on the full training data.
pred = clf.predict(prediction_data)
# Explicit array conversion so the comparison is always elementwise.
correct = np.sum(pred == np.asarray(prediction_labels))
# The message prints "<correct> / <total>", a raw count rather than a percentage.
print('accuracy using XGBoost w/ grid search: {} / {} %'.format(correct, len(prediction_labels)))
print("done in {}s".format(time.time() - t0))



In [45]:
import xgboost as xgb
import numpy

# Experiment: native xgb.train API with max_depth=15, eta=0.2, 5 boosting rounds.
# (The unused XGBClassifier instance was dropped: it was never fitted and only
# overwrote whatever `clf` an earlier cell had produced.)
t0 = time.time()

param = {'max_depth':15, 'eta':0.2, 'objective':'multi:softmax', 'num_class':10}
num_round = 5

xg_train = xgb.DMatrix(training_data, label=training_labels)
xg_test = xgb.DMatrix(prediction_data, label=prediction_labels)

bst = xgb.train(param, xg_train, num_round)
# With the multi:softmax objective predict() yields one class value per row,
# which is compared directly against the integer labels below.
preds = bst.predict(xg_test)

# Explicit array conversion so the comparison is always elementwise.
correct = np.sum(preds == np.asarray(prediction_labels))
# The message prints "<correct> / <total>", a raw count rather than a percentage.
print('accuracy using XGBoost: {} / {} %'.format(correct, len(prediction_labels)))
print("done in {} sec".format(time.time() - t0))




accuracy using XGBoost: 916 / 4067 %
done in 201.191013098 sec


In [48]:
import xgboost as xgb
import numpy

# Experiment: native xgb.train API with depth-1 trees (stumps), eta=0.2,
# 20 boosting rounds.
# (The unused XGBClassifier instance was dropped: it was never fitted and only
# overwrote whatever `clf` an earlier cell had produced.)
t0 = time.time()

param = {'max_depth':1, 'eta':0.2, 'objective':'multi:softmax', 'num_class':10}
num_round = 20

xg_train = xgb.DMatrix(training_data, label=training_labels)
xg_test = xgb.DMatrix(prediction_data, label=prediction_labels)

bst = xgb.train(param, xg_train, num_round)
# With the multi:softmax objective predict() yields one class value per row,
# which is compared directly against the integer labels below.
preds = bst.predict(xg_test)

# Explicit array conversion so the comparison is always elementwise.
correct = np.sum(preds == np.asarray(prediction_labels))
# The message prints "<correct> / <total>", a raw count rather than a percentage.
print('accuracy using XGBoost: {} / {} %'.format(correct, len(prediction_labels)))
print("done in {} sec".format(time.time() - t0))





accuracy using XGBoost: 991 / 4067 %
done in 215.162302971 sec


In [42]:
import xgboost as xgb
import numpy

# Experiment: native xgb.train API with max_depth=10, eta=0.3, 5 boosting rounds.
# (The unused XGBClassifier instance was dropped: it was never fitted and only
# overwrote whatever `clf` an earlier cell had produced.)
t0 = time.time()

param = {'max_depth':10, 'eta':0.3, 'objective':'multi:softmax', 'num_class':10}
num_round = 5

xg_train = xgb.DMatrix(training_data, label=training_labels)
xg_test = xgb.DMatrix(prediction_data, label=prediction_labels)

bst = xgb.train(param, xg_train, num_round)
# With the multi:softmax objective predict() yields one class value per row,
# which is compared directly against the integer labels below.
preds = bst.predict(xg_test)

# Explicit array conversion so the comparison is always elementwise.
correct = np.sum(preds == np.asarray(prediction_labels))
# The message prints "<correct> / <total>", a raw count rather than a percentage.
print('accuracy using XGBoost: {} / {} %'.format(correct, len(prediction_labels)))
print("done in {} sec".format(time.time() - t0))

accuracy using XGBoost: 944 / 4067 %
done in 199.12429285 sec


In [44]:
import xgboost as xgb
import numpy

# Experiment: native xgb.train API with max_depth=10, eta=0.1, 5 boosting rounds.
# (The unused XGBClassifier instance was dropped: it was never fitted and only
# overwrote whatever `clf` an earlier cell had produced.)
t0 = time.time()

param = {'max_depth':10, 'eta':0.1, 'objective':'multi:softmax', 'num_class':10}
num_round = 5

xg_train = xgb.DMatrix(training_data, label=training_labels)
xg_test = xgb.DMatrix(prediction_data, label=prediction_labels)

bst = xgb.train(param, xg_train, num_round)
# With the multi:softmax objective predict() yields one class value per row,
# which is compared directly against the integer labels below.
preds = bst.predict(xg_test)

# Explicit array conversion so the comparison is always elementwise.
correct = np.sum(preds == np.asarray(prediction_labels))
# The message prints "<correct> / <total>", a raw count rather than a percentage.
print('accuracy using XGBoost: {} / {} %'.format(correct, len(prediction_labels)))
print("done in {} sec".format(time.time() - t0))

accuracy using XGBoost: 807 / 4067 %
done in 229.05040288 sec
