In [65]:
#!/usr/bin/env python2
#
# Example to classify faces.
# Brandon Amos
# 2015/10/11

import time

start = time.time()

import argparse
#import cv2
import os
import pickle
import sys

from operator import itemgetter

import numpy as np
np.set_printoptions(precision=2)
import pandas as pd

#import openface

from sklearn.pipeline import Pipeline
from sklearn.lda import LDA
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV
from sklearn.mixture import GMM
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split,  KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
#fileDir = os.path.dirname(os.path.realpath(__file__))
#modelDir = os.path.join(fileDir, '..', 'models')
#dlibModelDir = os.path.join(modelDir, 'dlib')
#openfaceModelDir = os.path.join(modelDir, 'openface')


def _build_classifier(name):
    """Return an unfitted estimator for the given classifier name.

    Raises:
        ValueError: If ``name`` is not one of the supported classifier names.
    """
    if name == 'LinearSvm':
        return SVC(C=1, kernel='linear', probability=True)
    elif name == 'GridSearchSvm':
        # Warning: In our experiences, using a grid search over SVM
        # hyper-parameters only gives marginally better performance than a
        # linear SVM with C=1 and is not worth the extra computations of
        # performing a grid search.
        param_grid = [
            {'C': [1, 10, 100, 1000],
             'kernel': ['linear']},
            {'C': [1, 10, 100, 1000],
             'gamma': [0.001, 0.0001],
             'kernel': ['rbf']}
        ]
        return GridSearchCV(SVC(C=1, probability=True), param_grid, cv=5)
    # ref:
    # http://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html#example-classification-plot-classifier-comparison-py
    elif name == 'DecisionTree':  # Doesn't work best
        return DecisionTreeClassifier(max_depth=20)
    elif name == 'GridSearchDT':
        param_grid = [
            {"max_depth": [20, 40, 60, 80, 100, 120, 140]}
        ]
        return GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5)
    elif name == 'KNN':
        return KNeighborsClassifier(n_neighbors=1)
    elif name == 'GridSearchKNN':
        param_grid = [
            {'n_neighbors': [1, 3, 5, 7, 9]}
        ]
        return GridSearchCV(KNeighborsClassifier(), param_grid, cv=5)
    elif name == 'AdaBoost':
        return AdaBoostClassifier(DecisionTreeClassifier(max_depth=20), n_estimators=100)
    elif name == 'GridSearchAB':
        param_grid = [
            {'n_estimators': [20, 60, 100]}
        ]
        return GridSearchCV(AdaBoostClassifier(DecisionTreeClassifier(max_depth=80)), param_grid, cv=5)
    elif name == 'RandomForest':
        return RandomForestClassifier(n_estimators=10)
    elif name == 'GridSearchRF':
        param_grid = [
            {'n_estimators': [20, 60, 100]}
        ]
        return GridSearchCV(RandomForestClassifier(max_depth=80), param_grid, cv=5)
    raise ValueError("Unknown classifier name: {!r}".format(name))


def train(classfier, data, labelsNum, nClasses,):
    """Build the classifier selected by name and fit it on ``data``.

    Args:
        classfier: Name of the classifier to build: 'LinearSvm', 'DecisionTree',
            'KNN', 'AdaBoost', 'RandomForest', or one of the grid-search
            variants 'GridSearchSvm', 'GridSearchDT', 'GridSearchKNN',
            'GridSearchAB', 'GridSearchRF'. The parameter keeps its original
            (misspelled) name so existing callers are unaffected.
        data: 2-D array whose column 0 holds the labels and whose remaining
            columns hold the feature vectors.
        labelsNum: Ignored. The training labels are always taken from column 0
            of ``data``; the parameter is kept for call compatibility.
        nClasses: Number of classes; only used in the progress message.

    Returns:
        Tuple ``(clf, seconds)``: the fitted estimator and the wall-clock time
        spent inside ``fit``.

    Raises:
        ValueError: If ``classfier`` is not a supported classifier name.
    """
    targets = data[:, 0].tolist()
    features = data[:, 1:]
    print("Training for {} classes.".format(nClasses))

    # Use the argument itself; previously the body read a notebook-level global
    # named `classifier`, so the value passed in here was silently ignored.
    clf = _build_classifier(classfier)

    # Only the fit itself is timed, not the estimator construction.
    fit_started = time.time()
    clf.fit(features, targets)
    return clf, (time.time() - fit_started)


def infer(clf, X, Y, multiple=False, verbose=True):
    """Predict on ``X`` with a fitted classifier and measure the error rate.

    Args:
        clf: Fitted estimator exposing ``predict``.
        X: Feature matrix passed unchanged to ``clf.predict``.
        Y: 2-D array whose column 0 holds the true labels.
        multiple: Unused; kept for call compatibility.
        verbose: When true (the default), print the testing error.

    Returns:
        Tuple ``(error, seconds)``: the fraction of rows whose prediction
        differs from ``Y[:, 0]``, and the wall-clock time spent in this call.
    """
    # TODO Store testing representations in folder
    began = time.time()
    predicted = clf.predict(X)
    error = np.sum(Y[:, 0] != predicted) / float(len(Y))

    if verbose:
        # Single-argument call form prints the same text on Python 2 and 3.
        print("\tTesting error is {}".format(error))
    return error, (time.time() - began)


def print_params(clf):
    """Print the per-candidate validation scores of a fitted grid search.

    Reads ``clf.grid_scores_`` when the estimator has it; otherwise falls back
    to ``clf.cv_results_`` (the ``'params'`` and ``'mean_test_score'`` entries),
    which is what newer scikit-learn grid-search objects expose instead.

    Args:
        clf: Fitted grid-search estimator providing ``best_params_``,
            ``best_score_`` and either ``grid_scores_`` or ``cv_results_``.
    """
    if hasattr(clf, 'grid_scores_'):
        # Each entry is (params, mean validation score, per-fold scores).
        candidates = [(params, mean_score)
                      for (params, mean_score, _fold_scores) in clf.grid_scores_]
    else:
        results = clf.cv_results_
        candidates = list(zip(results['params'], results['mean_test_score']))

    print("Cross Validation results:")
    for params, mean_score in candidates:
        print("\tParameters: {} with validation score of {}"
              .format(params, round(mean_score, 3)))
    print("\t********************")
    # Note: the summary line rounds to 2 decimals, the per-candidate lines to 3.
    print("\tBest validation score with params {} and validation score of {}"
          .format(clf.best_params_, round(clf.best_score_, 2)))

In [66]:
# Load the label and embedding tables from the working directory.
workDir = "./training-embeddings"
print("Loading embeddings.")
fname = "{}/labels.csv".format(workDir)
# Keep only the first CSV column, as an (n, 1) column so it can be stacked
# next to the embedding matrix below. `.values` replaces the deprecated
# `.as_matrix()` and returns the same ndarray.
labels = pd.read_csv(fname, header=None).values[:, 0:1]
fname = "{}/reps.csv".format(workDir)
embeddings = pd.read_csv(fname, header=None).values
# LabelEncoder expects a 1-D array; flatten the label column explicitly.
le = LabelEncoder().fit(labels.ravel())
labelsNum = le.transform(labels.ravel())
nClasses = len(le.classes_)

print("{}".format(embeddings.shape))
print("{} {}".format(labels.shape, embeddings.shape))
# Column 0 = label, columns 1.. = embedding features.
data = np.append(labels, embeddings, axis=1)
Y = data[:, 0:1]
X = data[:, 1:]

# Split dataset
# Train on generated embeddings
splits = [.20, .40, .50, .60, .80]  # percentage of test set
gridsearch_list = ['GridSearchSvm', 'GridSearchDT', 'GridSearchKNN', 'GridSearchAB', 'GridSearchRF']
clf_list = ['LinearSvm', 'DecisionTree', 'KNN', 'AdaBoost', 'RandomForest']

clf_name = gridsearch_list[1]
for split in splits:
    print("----------------------------------------------------")
    print("Using classifier {}".format(clf_name))
    print("[{},{}] [Train,Test] Split".format(int(100 - (split * 100)),
                                              int((split * 100))))
    # Fixed random_state so every run produces the same partition per split.
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=split, random_state=42)
    # Kept as a notebook-level name; other code in this notebook may read it.
    classifier = clf_name
    # Use a separate name so the full dataset in `data` is not overwritten.
    train_data = np.append(Y_train, X_train, axis=1)
    clf, train_time = train(classifier, train_data, labelsNum, nClasses)
    print("Took {} seconds".format(train_time))
    test_error, infer_time = infer(clf, X_test, Y_test)
    # Only grid-search estimators carry cross-validation results to report.
    if clf_name in gridsearch_list:
        print_params(clf)
    print("Took {} seconds".format(infer_time))

    print("----------------------------------------------------")
    print("")



Loading embeddings.
(997L, 128L)
(997L, 1L) (997L, 128L)
----------------------------------------------------
Using classifier GridSearchDT
[80,20] [Train,Test] Split
Training for 50 classes.
Took 8.99500012398 seconds
	Testing error is 0.44
Cross Validation results:
	Parameters: {'max_depth': 20} with validation score of 0.527
	Parameters: {'max_depth': 40} with validation score of 0.566
	Parameters: {'max_depth': 60} with validation score of 0.552
	Parameters: {'max_depth': 80} with validation score of 0.575
	Parameters: {'max_depth': 100} with validation score of 0.571
	Parameters: {'max_depth': 120} with validation score of 0.573
	Parameters: {'max_depth': 140} with validation score of 0.561
	********************
	Best validation score with params {'max_depth': 80} and validation score of 0.57
Took 0.0019998550415 seconds
----------------------------------------------------

----------------------------------------------------
Using classifier GridSearchDT
[60,40] [Train,Test] Spli