In [1]:
import cv2
import numpy as np
import dlib
import math
import os
from keras.preprocessing import image
from collections import OrderedDict


import B2_lab2landmarks_rewrite as B2_lab2


Using TensorFlow backend.


In [35]:
import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt

from sklearn import datasets, metrics, model_selection, svm


from sklearn.utils import shuffle
from sklearn.datasets import load_iris

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier


from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report,accuracy_score

In [37]:

# Shared configuration for the grid-search experiments in this notebook.
split_ratio = 0.75  # handed to split_data, which forwards it as train_size
cvfold = 5          # handed to GridSearchCV as cv by every *_PredictionCV helper

# SVM: one sub-grid per kernel; each dict is searched independently.
SVM_tuned_parameters = [
    dict(kernel=['rbf'], gamma=[1e-2], C=[1, 10]),
    dict(kernel=['linear'], C=[1, 10]),
    dict(kernel=['poly'], gamma=[1e-2], degree=[3]),
]
# Reduced alternative kept for quick runs:
#   SVM_tuned_parameters = [{'kernel': ['linear'], 'C': [1, 10]}]

# Logistic regression: only the lbfgs solver is searched.
# Earlier variant that also tried an L1 penalty:
#   [{'solver': ['lbfgs']}, {'solver': ['liblinear'], 'penalty': ['l1']}]
LogReg_tuned_parameters = [dict(solver=['lbfgs'])]

# Random forest: forest size x maximum tree depth.
# Earlier variant:
#   [{'n_estimators': [10, 50], 'max_depth': [1, 5], 'min_samples_split': [500, 1000]}]
RF_tuned_parameters = [dict(n_estimators=[10, 50], max_depth=[50, 100])]

# k-nearest neighbours: neighbourhood sizes to try.
KNN_tuned_parameters = [dict(n_neighbors=[25, 50, 100])]

In [2]:
# Index ranges of dlib's 68-point facial landmark detector, keyed by face region.
# Source: https://github.com/jrosebr1/imutils/blob/master/imutils/face_utils/helpers.py
#
# Every value is a half-open (start, end) pair: the region covers landmark
# indices start .. end - 1 (e.g. "jaw" (0, 17) is points 0-16, "mouth" (48, 68)
# is points 48-67). The eye ranges previously stopped one index short
# ((36, 41) / (42, 47)), which dropped landmarks 41 and 47; they now follow the
# same convention as the other entries and the referenced table.
# NOTE(review): presumably consumed as shape[start:end] — confirm against any
# code that reads this table.
FACIAL_LANDMARKS_68_IDXS = OrderedDict([
    ("mouth", (48, 68)),
    ("inner_mouth", (60, 68)),
    ("right_eyebrow", (17, 22)),
    ("left_eyebrow", (22, 27)),
    ("right_eye", (36, 42)),
    ("left_eye", (42, 48)),
    ("nose", (27, 36)),
    ("jaw", (0, 17))
])



# n_colors is the number of dominant colours to be detected in the image
# (per the original note, used for the OpenCV k-means call).
n_colors = 5
# Margin in pixels; passed to the feature extractor together with n_colors.
roi_margin = 20

In [3]:
# Run the B2_lab2 extractor; "cartoon_set" and "eye_color" are presumably the
# image folder and the label column — confirm against B2_lab2.
(
    landmark_features,
    label_contents,
    no_features_sets,
    identified_file_sets,
    dominant_rgbs,
) = B2_lab2.extract_features_labels_and_dominant_colour(
    "cartoon_set", "eye_color", roi_margin, n_colors
)



In [32]:
def split_data(X, y, split_ratio, random_state=None):
    """Randomly split features and labels into training and test subsets.

    Parameters
    ----------
    X : array-like
        Feature rows, one per sample. Passed to ``train_test_split`` unchanged.
    y : array-like
        One label per sample. Converted with ``np.asarray`` so plain lists
        work as well as arrays.
    split_ratio : float or int
        Forwarded to ``train_test_split`` as ``train_size``.
    random_state : int, RandomState instance or None, optional
        Forwarded to ``train_test_split``. Pass an int to make the split
        reproducible; the default ``None`` keeps the split random.

    Returns
    -------
    tr_X, te_X : the training and test portions of ``X``.
    tr_Y, te_Y : tuple
        The matching labels, each returned as a tuple.
    """
    labels = np.asarray(y)

    # train_test_split shuffles by default, so no separate shuffle pass is
    # needed. The former two-column [y, -(y - 1)] encoding was built only to
    # have its first column read back, so the labels are split directly.
    tr_X, te_X, train_labels, test_labels = train_test_split(
        X, labels, train_size=split_ratio, random_state=random_state
    )

    # Labels are returned as tuples, matching what callers received before.
    tr_Y = tuple(train_labels)
    te_Y = tuple(test_labels)

    return tr_X, te_X, tr_Y, te_Y


def bulk_runtime_estimation(classifier, xtest):
    """Predict on ``xtest`` in a single call and measure how long it takes.

    Parameters
    ----------
    classifier : object
        Anything exposing ``predict(xtest)``.
    xtest : array-like with a ``shape`` attribute
        ``xtest.shape[0]`` is used as the number of test instances.

    Returns
    -------
    ypred
        Whatever ``classifier.predict(xtest)`` returned.
    bulk_runtime : float
        Elapsed seconds for the whole ``predict`` call.
    avg_runtime : float
        ``bulk_runtime`` divided by the number of instances, or ``nan`` when
        ``xtest`` has no rows (previously a ``ZeroDivisionError``).
    """
    nsamp = xtest.shape[0]

    # perf_counter is monotonic and high-resolution; time.time() follows the
    # wall clock, which can be coarse or jump, and per-instance timings here
    # are in the tens of microseconds.
    start = time.perf_counter()
    ypred = classifier.predict(xtest)
    bulk_runtime = time.perf_counter() - start

    # Average runtime per instance; undefined for an empty test set.
    avg_runtime = bulk_runtime / nsamp if nsamp else float('nan')

    return ypred, bulk_runtime, avg_runtime

In [26]:

# Prediction with grid-search cross-validation for logistic regression.
# Which solvers/penalties are compared (e.g. L1 vs. L2) is decided by the tuning grid passed in.
def LogReg_GridSearch_PredictionCV(xtrain, ytrain, xtest, ytest, tuning_parameters, cvfold):
    """Grid-search a logistic regression model and report on the test split.

    Fits ``GridSearchCV`` around ``LogisticRegression(max_iter=5000)`` on the
    training data, prints the best parameters and the per-candidate
    cross-validation scores, then predicts on ``xtest`` through
    ``bulk_runtime_estimation`` and prints the classification report, the
    accuracy and the average prediction time per test instance.

    Returns the fitted ``GridSearchCV`` object.
    """
    search = GridSearchCV(LogisticRegression(max_iter=5000), tuning_parameters, cv=cvfold)
    search.fit(xtrain, ytrain)

    # Banner and winning parameter combination.
    print()
    print('-------------------------------------------------')
    print("Prediction with Grid search Logistic Regression Cross validation:")
    print()
    print("Best parameters set found on development set:")
    print()
    print(search.best_params_)
    print()

    # One line per candidate: mean CV score, two standard deviations, parameters.
    print("Grid scores on development set:")
    print()
    results = search.cv_results_
    candidates = zip(results['mean_test_score'], results['std_test_score'], results['params'])
    for score_mean, score_std, candidate in candidates:
        print("%0.3f (+/-%0.03f) for %r" % (score_mean, score_std * 2, candidate))
    print()

    # Timed prediction on the held-out data; the bulk runtime itself is not reported.
    predictions, _, seconds_per_instance = bulk_runtime_estimation(search, xtest)

    print(classification_report(ytest, predictions))
    print("Accuracy:", accuracy_score(ytest, predictions))
    print()
    print('Average runtime per test instance:', seconds_per_instance)

    return search
    

In [27]:

#Prediction with Grid search SVC Cross validation
def SVC_GridSearch_PredictionCV(xtrain, ytrain, xtest, ytest, tuning_parameters, cvfold):
    """Grid-search a support vector classifier and report on the test split.

    Fits ``GridSearchCV`` around ``SVC()`` on the training data, prints the
    best parameters and the per-candidate cross-validation scores, then
    predicts on ``xtest`` through ``bulk_runtime_estimation`` and prints the
    classification report, the accuracy and the average prediction time per
    test instance.

    Returns the fitted ``GridSearchCV`` object.
    """
    search = GridSearchCV(SVC(), tuning_parameters, cv=cvfold)
    search.fit(xtrain, ytrain)

    # Banner and winning parameter combination.
    print()
    print('-------------------------------------------------')
    print("Prediction with Grid search SVC Cross validation:")
    print()
    print("Best parameters set found on development set:")
    print()
    print(search.best_params_)
    print()

    # One line per candidate: mean CV score, two standard deviations, parameters.
    print("Grid scores on development set:")
    print()
    results = search.cv_results_
    candidates = zip(results['mean_test_score'], results['std_test_score'], results['params'])
    for score_mean, score_std, candidate in candidates:
        print("%0.3f (+/-%0.03f) for %r" % (score_mean, score_std * 2, candidate))
    print()

    # Timed prediction on the held-out data; the bulk runtime itself is not reported.
    predictions, _, seconds_per_instance = bulk_runtime_estimation(search, xtest)

    print(classification_report(ytest, predictions))
    print("Accuracy:", accuracy_score(ytest, predictions))
    print()
    print('Average runtime per test instance:', seconds_per_instance)

    return search
    
    
    

In [28]:

def RF_GridSearch_PredictionCV(xtrain, ytrain, xtest, ytest, tuning_parameters, cvfold):
    """Grid-search a random forest classifier and report on the test split.

    Fits ``GridSearchCV`` around ``RandomForestClassifier()`` on the training
    data, prints the best parameters and the per-candidate cross-validation
    scores, then predicts on ``xtest`` through ``bulk_runtime_estimation`` and
    prints the classification report, the accuracy and the average prediction
    time per test instance.

    Returns the fitted ``GridSearchCV`` object.
    """
    search = GridSearchCV(RandomForestClassifier(), tuning_parameters, cv=cvfold)
    search.fit(xtrain, ytrain)

    # Banner and winning parameter combination.
    print()
    print('-------------------------------------------------')
    print("Prediction with Grid search Random Forest Cross validation:")
    print()
    print("Best parameters set found on development set:")
    print()
    print(search.best_params_)
    print()

    # One line per candidate: mean CV score, two standard deviations, parameters.
    print("Grid scores on development set:")
    print()
    results = search.cv_results_
    candidates = zip(results['mean_test_score'], results['std_test_score'], results['params'])
    for score_mean, score_std, candidate in candidates:
        print("%0.3f (+/-%0.03f) for %r" % (score_mean, score_std * 2, candidate))
    print()

    # Timed prediction on the held-out data; the bulk runtime itself is not reported.
    predictions, _, seconds_per_instance = bulk_runtime_estimation(search, xtest)

    print(classification_report(ytest, predictions))
    print("Accuracy:", accuracy_score(ytest, predictions))
    print()
    print('Average runtime per test instance:', seconds_per_instance)

    return search
    

In [29]:

def KNN_Grid_search_Prediction_CV(xtrain, ytrain, xtest, ytest, tuning_parameters, cvfold):
    """Grid-search a k-nearest-neighbours classifier and report on the test split.

    Fits ``GridSearchCV`` around ``KNeighborsClassifier()`` on the training
    data, prints the best parameters and the per-candidate cross-validation
    scores, then predicts on ``xtest`` through ``bulk_runtime_estimation`` and
    prints the classification report, the accuracy and the average prediction
    time per test instance.

    Returns the fitted ``GridSearchCV`` object.
    """
    search = GridSearchCV(KNeighborsClassifier(), tuning_parameters, cv=cvfold)
    search.fit(xtrain, ytrain)

    # Banner and winning parameter combination.
    print()
    print('-------------------------------------------------')
    print("Prediction with Grid search KNN Cross validation:")
    print()
    print("Best parameters set found on development set:")
    print()
    print(search.best_params_)
    print()

    # One line per candidate: mean CV score, two standard deviations, parameters.
    print("Grid scores on development set:")
    print()
    results = search.cv_results_
    candidates = zip(results['mean_test_score'], results['std_test_score'], results['params'])
    for score_mean, score_std, candidate in candidates:
        print("%0.3f (+/-%0.03f) for %r" % (score_mean, score_std * 2, candidate))
    print()

    # Timed prediction on the held-out data; the bulk runtime itself is not reported.
    predictions, _, seconds_per_instance = bulk_runtime_estimation(search, xtest)

    print(classification_report(ytest, predictions))
    print("Accuracy:", accuracy_score(ytest, predictions))
    print()
    print('Average runtime per test instance:', seconds_per_instance)

    return search
    

In [30]:

def plot_ROC_curve(ytrue, ypred):
    """Draw the ROC curve for ``ypred`` against ``ytrue`` and show it.

    The curve comes from ``metrics.roc_curve`` and its area from
    ``metrics.auc``; the area is shown in the legend. A dashed red diagonal
    from (0, 0) to (1, 1) is drawn alongside, and both axes are limited to
    [0, 1].
    """
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(ytrue, ypred)
    area_under_curve = metrics.auc(false_pos_rate, true_pos_rate)
    curve_label = 'AUC = %0.2f' % area_under_curve

    plt.title('Receiver Operating Characteristic')
    plt.plot(false_pos_rate, true_pos_rate, 'b', label=curve_label)
    plt.legend(loc='lower right')
    # Diagonal reference line.
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.show()

    

In [33]:
# Split the dominant-colour features and their labels into train/test subsets.
tr_X, te_X, tr_Y, te_Y = split_data(
    X=dominant_rgbs,
    y=label_contents,
    split_ratio=split_ratio,
)

In [34]:
# SVM: grid-search on the training split, then report on the test split.
SVMclf = SVC_GridSearch_PredictionCV(
    xtrain=tr_X,
    ytrain=tr_Y,
    xtest=te_X,
    ytest=te_Y,
    tuning_parameters=SVM_tuned_parameters,
    cvfold=cvfold,
)



-------------------------------------------------
Prediction with Grid search SVC Cross validation:

Best parameters set found on development set:

{'C': 1, 'kernel': 'linear'}

Grid scores on development set:

0.843 (+/-0.015) for {'C': 1, 'kernel': 'linear'}
0.842 (+/-0.016) for {'C': 10, 'kernel': 'linear'}



NameError: name 'time' is not defined

In [38]:

# Random forest: grid-search on the training split, then report on the test split.
RFclf = RF_GridSearch_PredictionCV(
    xtrain=tr_X,
    ytrain=tr_Y,
    xtest=te_X,
    ytest=te_Y,
    tuning_parameters=RF_tuned_parameters,
    cvfold=cvfold,
)




-------------------------------------------------
Prediction with Grid search Random Forest Cross validation:

Best parameters set found on development set:

{'max_depth': 50, 'n_estimators': 10}

Grid scores on development set:

0.867 (+/-0.012) for {'max_depth': 50, 'n_estimators': 10}
0.863 (+/-0.012) for {'max_depth': 50, 'n_estimators': 50}
0.865 (+/-0.009) for {'max_depth': 100, 'n_estimators': 10}
0.867 (+/-0.014) for {'max_depth': 100, 'n_estimators': 50}

              precision    recall  f1-score   support

           0       0.89      0.86      0.88       392
           1       0.90      0.87      0.88       405
           2       0.87      0.84      0.86       410
           3       0.91      0.86      0.88       421
           4       0.78      0.89      0.83       395

    accuracy                           0.86      2023
   macro avg       0.87      0.86      0.86      2023
weighted avg       0.87      0.86      0.87      2023

Accuracy: 0.8645575877409788

Average run

In [39]:

# KNN: grid-search on the training split, then report on the test split.
KNNclf = KNN_Grid_search_Prediction_CV(
    xtrain=tr_X,
    ytrain=tr_Y,
    xtest=te_X,
    ytest=te_Y,
    tuning_parameters=KNN_tuned_parameters,
    cvfold=cvfold,
)


-------------------------------------------------
Prediction with Grid search KNN Cross validation:

Best parameters set found on development set:

{'n_neighbors': 25}

Grid scores on development set:

0.845 (+/-0.020) for {'n_neighbors': 25}
0.837 (+/-0.017) for {'n_neighbors': 50}
0.837 (+/-0.024) for {'n_neighbors': 100}

              precision    recall  f1-score   support

           0       0.82      0.85      0.84       392
           1       0.84      0.83      0.84       405
           2       0.96      0.80      0.87       410
           3       0.85      0.86      0.85       421
           4       0.76      0.87      0.81       395

    accuracy                           0.84      2023
   macro avg       0.85      0.84      0.84      2023
weighted avg       0.85      0.84      0.84      2023

Accuracy: 0.8423133959466139

Average runtime per test instance: 3.173807382701472e-05
