In [39]:
#Practice code from Raul Garreta's "Learning scikit-learn: Machine Learning in Python" 2013 book
#Code adapted for Python 3.6.1 5/9/17 by L Kahn
import sklearn as sk
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_olivetti_faces
faces = fetch_olivetti_faces()
print (faces.DESCR)

Modified Olivetti faces dataset.

The original database was available from

    http://www.cl.cam.ac.uk/research/dtg/attarchive/facedatabase.html

The version retrieved here comes in MATLAB format from the personal
web page of Sam Roweis:

    http://www.cs.nyu.edu/~roweis/

There are ten different images of each of 40 distinct subjects. For some
subjects, the images were taken at different times, varying the lighting,
facial expressions (open / closed eyes, smiling / not smiling) and facial
details (glasses / no glasses). All the images were taken against a dark
homogeneous background with the subjects in an upright, frontal position (with
tolerance for some side movement).

The original dataset consisted of 92 x 112, while the Roweis version
consists of 64x64 images.



In [40]:
print (faces.keys())

dict_keys(['data', 'images', 'target', 'DESCR'])


In [41]:
print(faces.images.shape)

(400, 64, 64)


In [42]:
print(faces.target.shape)

(400,)


In [43]:
#Let's normalize the data if needed to obtain good ML classifier results
print(np.max(faces.data))

1.0


In [44]:
print(np.min(faces.data))

0.0


In [45]:
print(np.mean(faces.data))

0.547043


In [46]:
#In our case, we do not need to normalize the data since the values are in the 0 to 1 range
#Before applying the ML classifier, let's plot some face images.
#Define a helper function
def print_faces(images, target, top_n):
    #set up the figure size in inches
    fig = plt.figure(figsize=(12,12))
    fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05)
    for i in range(top_n):
            #plot the images in a matrix of 20x20
            p = fig.add_subplot(20, 20, i + 1, xticks=[],
            yticks=[])
            p.imshow(images[i], cmap=plt.cm.bone)
            
            #label the image with the target value
            p.text(0, 14, str(target[i]))
            p.text(0, 60, str(i))

In [49]:
#Now let's print the first 20 images
print_faces(faces.images, faces.target, 20)

In [30]:
from sklearn.svm import SVC
svc_1 = SVC(kernel='linear')

In [50]:
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(faces.data, faces.target, test_size=0.25, random_state=0)

In [51]:
#Add a function to evaluate K-fold cross validation
from sklearn.cross_validation import cross_val_score, KFold
from scipy.stats import sem

In [58]:
def evaluate_cross_validation(clf, X, y, K):
    #create a k-fold cross validation iterator
    cv = KFold(len(y), K, shuffle=True, random_state=0)
    #by default the score used is the one returned by score method of the estimator (accuracy)
    scores = cross_val_score(clf, X, y, cv=cv)
    print (scores)
    print ("Mean score: {0:.3f} (+/-{1:.3f})").format(
        np.mean(scores), sem(scores))

In [59]:
evaluate_cross_validation(svc_1, X_train, y_train, 5)
#SVC with a cross validation of 5 folds has an accuracy of 0.933

[ 0.93333333  0.86666667  0.91666667  0.93333333  0.91666667]
Mean score: {0:.3f} (+/-{1:.3f})


AttributeError: 'NoneType' object has no attribute 'format'

In [62]:
#Next, we'll define a function to perform training on training set and evaluate the test set performance
from sklearn import metrics

def train_and_evaluate(clf, X_train, X_test, y_train, y_test):
    clf.fit(X_train, y_train)
    
print ("Accuracy on training set:")
print (clf.score(X_train, y_train))
print ("Accuracy on testing set:")
print (clf.score(X_test, y_test))

y_pred = clf.predict(X_test)
    
print ("Classification Report:")
print (metrics.classification_report(y_test, y_pred))
print ("Confusion Matrix:")
print (metrics.confusion_matrix(y_test, y_pred))

In [63]:
train_and_evaluate(svc_1, X_train, X_test, y_train, y_test)

Accuracy on training set:
1.0
Accuracy on testing set:
0.99
Classification Report:
             precision    recall  f1-score   support

          0       0.86      1.00      0.92         6
          1       1.00      1.00      1.00         4
          2       1.00      1.00      1.00         2
          3       1.00      1.00      1.00         1
          4       1.00      1.00      1.00         1
          5       1.00      1.00      1.00         5
          6       1.00      1.00      1.00         4
          7       1.00      0.67      0.80         3
          9       1.00      1.00      1.00         1
         10       1.00      1.00      1.00         4
         11       1.00      1.00      1.00         1
         12       1.00      1.00      1.00         2
         13       1.00      1.00      1.00         3
         14       1.00      1.00      1.00         5
         15       1.00      1.00      1.00         3
         17       1.00      1.00      1.00         6
         19    

In [67]:
#Next let's try to classify the images of people with and without glasses
#First, define the range of the images that show faces wearing glasses:
glasses = [(10, 19), (30, 32), (37, 38), (50, 59), (63, 64),
   (69, 69), (120, 121), (124, 129), (130, 139), (160, 161),
   (164, 169), (180, 182), (185, 185), (189, 189), (190, 192),
   (194, 194), (196, 199), (260, 269), (270, 279), (300, 309),
   (330, 339), (358, 359), (360, 369)
]

In [64]:
print_faces

<function __main__.print_faces>

In [68]:
#Define a function that returns a new target array that marks with 1 for the faces with glasses and 0 for 
#the faces without glasses

def create_target(segments):
    #Create a new y array of target size initialzed with zeros
    
    y = np.zeros(faces.target.shape[0])
    # put 1 in the specified segments
    for (start, end) in segments:
        y[start:end +1] = 1
        return y
target_glasses = create_target(glasses)

In [69]:
#Perform the train/test split again
X_train, X_test, y_train, y_test = train_test_split(faces.data, target_glasses, test_size=0.25, random_state=0)

In [70]:
#Now let's create a new SVC classifier and train it with the new target vector
svc_2 = SVC(kernel='linear')

In [71]:
evaluate_cross_validation(svc_2, X_train, y_train, 5)

[ 1.  1.  1.  1.  1.]
Mean score: {0:.3f} (+/-{1:.3f})


AttributeError: 'NoneType' object has no attribute 'format'

In [72]:
evaluate_cross_validation(svc_2, X_train, y_train, 5)

[ 1.  1.  1.  1.  1.]
Mean score: {0:.3f} (+/-{1:.3f})


AttributeError: 'NoneType' object has no attribute 'format'

In [73]:
train_and_evaluate(svc_2, X_train, X_test, y_train, y_test)

Accuracy on training set:
1.0
Accuracy on testing set:
1.0
Classification Report:
             precision    recall  f1-score   support

        0.0       1.00      1.00      1.00        96
        1.0       1.00      1.00      1.00         4

avg / total       1.00      1.00      1.00       100

Confusion Matrix:
[[96  0]
 [ 0  4]]


In [74]:
X_test = faces.data[30:40]
y_test = target_glasses[30:40]
print (y_test.shape[0])

10


In [75]:
select = np.ones(target_glasses.shape[0])

In [76]:
select[30:40] = 0

In [77]:
X_train = faces.data[select ==1]
y_train = target_glasses[select ==1]
print (y_train.shape[0])

390


In [78]:
svc_3 = SVC(kernel='linear')

In [79]:
train_and_evaluate(svc_3, X_train, X_test, y_train, y_test)

Accuracy on training set:
1.0
Accuracy on testing set:
1.0
Classification Report:
             precision    recall  f1-score   support

        0.0       1.00      1.00      1.00        10

avg / total       1.00      1.00      1.00        10

Confusion Matrix:
[[10]]


In [84]:
y_pred = svc_3.predict(X_test)
eval_faces = [np.reshape(a, (64,64)) for a in X_test]

In [85]:
print_faces(eval_faces, y_pred, 10)

In [87]:
print_faces

<function __main__.print_faces>