# Homework #3 - Labeled Faces in the Wild

In [1]:
from time import time
import numpy as np

from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.multiclass import OneVsOneClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

In [2]:
#  Images are single-channel with pixel intensity represented by a float value from 0 to 255,
#  the single-channel intensity is the mean value of the original RGB channel intensities.

# X_train.npy: training data numpy array.
# Each row in the array corresponds to an image unrolled to a vector (50 x 37 = 1850 dimension)
X_train = np.load('X_train.npy')
# y_train.npy: labels (0-6) of each data corresponding to the image in the same row in X_train.npy
y_train = np.load('y_train.npy')
# X_test.npy: test data numpy array.
X_test = np.load('X_test.npy')
print("data loaded")

data loaded


In [3]:
# Generate testResults.csv
# Run this with y_pred

def saveTestResults(y_pred, filename='testResult.csv'):
    fo = open(filename, 'w')
    fo.write("ImageId,PredictedClass\n")
    for i in range(len(y_pred)):
        if i == len(y_pred) - 1:
            fo.write(str(i) + "," + str(y_pred[i]))
        else:
            fo.write(str(i) + "," + str(y_pred[i]) + "\n")
    fo.close()
    
# to calculate number of classes
def calculateClass(label):
    n_classes = []
    for n_class in label:
        if n_class not in n_classes:
            n_classes.append(n_class)
        else:
            continue
    return len(n_classes)

In [4]:
# Split data (75% train, 25% test) to generate Ein and Eout.
X_train_split, X_test_split, y_train_split, y_test_split = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

print("Train data size:")
print("n_samples: %d" % len(X_train_split))
print("n_features: %d" % len(X_train_split[0]))
print("n_classes: %d" % calculateClass(y_train_split))

print("Test data size:")
print("n_samples: %d" % len(X_test_split))
print("n_features: %d" % len(X_test_split[0]))
print("n_classes: %d" % calculateClass(y_test_split))


Train data size:
n_samples: 724
n_features: 1850
n_classes: 7
Test data size:
n_samples: 242
n_features: 1850
n_classes: 7


In [5]:
# getting eigenfaces from input data
n_components = 150
h = 50
w = 37
print("Extracting the top %d eigenfaces from %d faces"
      % (n_components, X_train_split.shape[0]))
t0 = time()
pca = PCA(n_components=n_components, svd_solver='arpack',
          whiten=True).fit(X_train_split)
print("done in %0.3fs" % (time() - t0))

eigenfaces = pca.components_.reshape((n_components, h, w))
print("Projecting the input data on the eigenfaces orthonormal basis")

t0 = time()
X_train_split_pca = pca.transform(X_train_split)
X_test_split_pca = pca.transform(X_test_split)

print("done in %0.3fs" % (time() - t0))


Extracting the top 150 eigenfaces from 724 faces
done in 0.378s
Projecting the input data on the eigenfaces orthonormal basis
done in 0.014s


In [6]:
# GridSearchCV with SVC using OneVsRest classifier (best kaggle result)
print("Fitting the classifier to the training set")
print("performing with SVC using OneVsRest Classifier")

model_to_set = OneVsRestClassifier(SVC(kernel="rbf"))
parameters = {
    "estimator__C": [1, 3, 5, 1e1, 5e1, 1e2, 1e3],
    "estimator__kernel": ["rbf"],
    "estimator__gamma":[0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1]
}

clf = GridSearchCV(model_to_set, param_grid=parameters, cv=10)
clf = clf.fit(X_train_split_pca, y_train_split)

print("Ein")
internal_pred = clf.predict(X_train_split_pca)
print(classification_report(y_train_split, internal_pred))

print("Eout")
y_pred = clf.predict(X_test_split_pca)
print(classification_report(y_test_split, y_pred))
print("done in %0.3fs" % (time() - t0))


Fitting the classifier to the training set
performing with SVC using OneVsRest Classifier
Ein
             precision    recall  f1-score   support

          0       1.00      1.00      1.00        38
          1       1.00      1.00      1.00       143
          2       1.00      1.00      1.00        59
          3       1.00      1.00      1.00       303
          4       1.00      1.00      1.00        53
          5       1.00      1.00      1.00        49
          6       1.00      1.00      1.00        79

avg / total       1.00      1.00      1.00       724

Eout
             precision    recall  f1-score   support

          0       1.00      0.38      0.56        13
          1       0.86      0.93      0.89        40
          2       0.89      0.62      0.73        26
          3       0.77      0.99      0.87        92
          4       0.95      0.64      0.76        33
          5       1.00      0.73      0.84        11
          6       0.82      0.85      0.84       

In [7]:
# using neural network (best local result)
print("Fitting the classifier to the training set for Neural Network")
t0 = time()

# Runs with Neural nets.
clf = MLPClassifier(solver='lbfgs', alpha=2e-4,
                     hidden_layer_sizes=(1000,), random_state=1)
clf = clf.fit(X_train_split_pca, y_train_split)
print("done in %0.3fs" % (time() - t0))

print("Ein")
internal_pred = clf.predict(X_train_split_pca)
print(classification_report(y_train_split, internal_pred))

print("Eout")
t0 = time()
y_pred = clf.predict(X_test_split_pca)
print(classification_report(y_test_split, y_pred))
print("done in %0.3fs" % (time() - t0))


Fitting the classifier to the training set for Neural Network
done in 0.661s
Ein
             precision    recall  f1-score   support

          0       1.00      1.00      1.00        38
          1       1.00      1.00      1.00       143
          2       1.00      1.00      1.00        59
          3       1.00      1.00      1.00       303
          4       1.00      1.00      1.00        53
          5       1.00      1.00      1.00        49
          6       1.00      1.00      1.00        79

avg / total       1.00      1.00      1.00       724

Eout
             precision    recall  f1-score   support

          0       1.00      0.62      0.76        13
          1       0.95      0.93      0.94        40
          2       0.95      0.69      0.80        26
          3       0.81      0.96      0.88        92
          4       0.79      0.82      0.81        33
          5       0.88      0.64      0.74        11
          6       0.88      0.85      0.87        27

avg / to

In [8]:
# GridSearchCV with SVC using OneVsRest classifier (first model we tried)
print("Fitting the classifier to the training set")
print("performing with SVC using OneVsOne Classifier")


param_grid = {'C': [1, 3, 5, 1e1, 5e1, 1e2, 1e3],
              'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], }

clf = GridSearchCV(estimator=SVC(kernel='rbf', class_weight='balanced'), param_grid=param_grid, cv=10)
clf = clf.fit(X_train_split_pca, y_train_split)
print("done in %0.3fs" % (time() - t0))
print("Best estimator found by grid search:")
print(clf.best_estimator_)

print("Ein")
internal_pred = clf.predict(X_train_split_pca)
print(classification_report(y_train_split, internal_pred))

print("Eout")
t0 = time()
y_pred = clf.predict(X_test_split_pca)
print(classification_report(y_test_split, y_pred))
print("done in %0.3fs" % (time() - t0))


Fitting the classifier to the training set
performing with SVC using OneVsOne Classifier
done in 100.900s
Best estimator found by grid search:
SVC(C=5, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
Ein
             precision    recall  f1-score   support

          0       0.95      1.00      0.97        38
          1       0.96      0.99      0.98       143
          2       0.97      1.00      0.98        59
          3       1.00      0.96      0.98       303
          4       1.00      1.00      1.00        53
          5       1.00      1.00      1.00        49
          6       0.98      1.00      0.99        79

avg / total       0.98      0.98      0.98       724

Eout
             precision    recall  f1-score   support

          0       1.00      0.62      0.76        13
          1       0.80      0.88  

In [9]:
### DecisionTree with Adaboost
print("Fitting the classifier to the training set")
t0 = time()

print("Performing with Adaboost using DecisionTree Classifier")
clf = AdaBoostClassifier(DecisionTreeClassifier(class_weight='balanced'), n_estimators=300)
clf = clf.fit(X_train_split_pca, y_train_split)

print("Ein")
internal_pred = clf.predict(X_train_split_pca)
print(classification_report(y_train_split, internal_pred))

print("Eout")
t0 = time()
y_pred = clf.predict(X_test_split_pca)
print(classification_report(y_test_split, y_pred))
print("done in %0.3fs" % (time() - t0))


Fitting the classifier to the training set
Performing with Adaboost using DecisionTree Classifier
Ein
             precision    recall  f1-score   support

          0       1.00      1.00      1.00        38
          1       1.00      1.00      1.00       143
          2       1.00      1.00      1.00        59
          3       1.00      1.00      1.00       303
          4       1.00      1.00      1.00        53
          5       1.00      1.00      1.00        49
          6       1.00      1.00      1.00        79

avg / total       1.00      1.00      1.00       724

Eout
             precision    recall  f1-score   support

          0       0.50      0.23      0.32        13
          1       0.22      0.28      0.24        40
          2       0.40      0.31      0.35        26
          3       0.47      0.54      0.51        92
          4       0.50      0.27      0.35        33
          5       0.21      0.27      0.24        11
          6       0.21      0.22      0.2

In [10]:
# Linear SVC with Adaboost
print("Fitting the classifier to the training set")
t0 = time()

print("Performing with Adaboost using SVC")
clf = AdaBoostClassifier(SVC(kernel='linear', class_weight='balanced', C=1, gamma=0.005, probability=True), n_estimators=300, algorithm='SAMME')
clf = clf.fit(X_train_split_pca, y_train_split)

print("Ein")
internal_pred = clf.predict(X_train_split_pca)
print(classification_report(y_train_split, internal_pred))

print("Eout")
t0 = time()
y_pred = clf.predict(X_test_split_pca)
print(classification_report(y_test_split, y_pred))
print("done in %0.3fs" % (time() - t0))


Fitting the classifier to the training set
Performing with Adaboost using SVC
Ein
             precision    recall  f1-score   support

          0       0.95      1.00      0.97        38
          1       0.82      0.97      0.89       143
          2       0.92      0.98      0.95        59
          3       0.99      0.90      0.94       303
          4       0.96      0.98      0.97        53
          5       0.96      1.00      0.98        49
          6       1.00      0.91      0.95        79

avg / total       0.95      0.94      0.94       724

Eout
             precision    recall  f1-score   support

          0       0.90      0.69      0.78        13
          1       0.72      0.85      0.78        40
          2       0.85      0.65      0.74        26
          3       0.81      0.83      0.82        92
          4       0.76      0.76      0.76        33
          5       0.62      0.73      0.67        11
          6       0.84      0.78      0.81        27

avg / t

In [13]:
# Gradient Boosting
print("Fitting the classifier to the training set")
t0 = time()

print("Performing with GradientBoosting")
clf = GradientBoostingClassifier(n_estimators=150)
clf = clf.fit(X_train_split_pca, y_train_split)

print("Ein")
internal_pred = clf.predict(X_train_split_pca)
print(classification_report(y_train_split, internal_pred))

print("Eout")
t0 = time()
y_pred = clf.predict(X_test_split_pca)
print(classification_report(y_test_split, y_pred))
print("done in %0.3fs" % (time() - t0))

Fitting the classifier to the training set
Performing with GradientBoosting
Ein
             precision    recall  f1-score   support

          0       1.00      1.00      1.00        38
          1       1.00      1.00      1.00       143
          2       1.00      1.00      1.00        59
          3       1.00      1.00      1.00       303
          4       1.00      1.00      1.00        53
          5       1.00      1.00      1.00        49
          6       1.00      1.00      1.00        79

avg / total       1.00      1.00      1.00       724

Eout
             precision    recall  f1-score   support

          0       1.00      0.08      0.14        13
          1       0.74      0.80      0.77        40
          2       0.54      0.27      0.36        26
          3       0.58      0.96      0.72        92
          4       0.80      0.24      0.37        33
          5       0.67      0.36      0.47        11
          6       0.50      0.30      0.37        27

avg / tot

In [12]:
# Model chosen for submission on kaggle
print("training with the whole dataset on best model")
# getting eigenfaces from input data
n_components = 150
h = 50
w = 37
print("Extracting the top %d eigenfaces from %d faces"
      % (n_components, X_train_split.shape[0]))
t0 = time()
pca = PCA(n_components=n_components, svd_solver='randomized',
          whiten=True).fit(X_train)
print("done in %0.3fs" % (time() - t0))

eigenfaces = pca.components_.reshape((n_components, h, w))
print("Projecting the input data on the eigenfaces orthonormal basis")

t0 = time()
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
print("done in %0.3fs" % (time() - t0))

print("Fitting the classifier to the training set")
print("performing with SVC using OneVsRest Classifier")

model_to_set = OneVsRestClassifier(SVC(kernel="rbf"))
parameters = {
    "estimator__C": [1, 3, 5, 1e1, 5e1, 1e2, 1e3],
    "estimator__kernel": ["rbf"],
    "estimator__gamma":[0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1]
}

clf = GridSearchCV(model_to_set, param_grid=parameters, cv=10)
clf = clf.fit(X_train_pca, y_train)

print("Ein")
internal_pred = clf.predict(X_train_pca)
print(classification_report(y_train, internal_pred))

print("Eout")
y_pred = clf.predict(X_test_pca)
saveTestResults(y_pred)
print("data saved in testResult.csv")
print("done in %0.3fs" % (time() - t0))



training with the whole dataset on best model
Extracting the top 150 eigenfaces from 724 faces
done in 0.243s
Projecting the input data on the eigenfaces orthonormal basis
done in 0.020s
Fitting the classifier to the training set
performing with SVC using OneVsRest Classifier
Ein
             precision    recall  f1-score   support

          0       1.00      1.00      1.00        51
          1       1.00      1.00      1.00       183
          2       1.00      1.00      1.00        85
          3       1.00      1.00      1.00       395
          4       1.00      1.00      1.00        86
          5       1.00      1.00      1.00        60
          6       1.00      1.00      1.00       106

avg / total       1.00      1.00      1.00       966

Eout
data saved in testResult.csv
done in 473.694s
