## 基于 sklearn 提供的人脸数据集，利用 kNN 识别新输入的人脸并返回其姓名

### 引申：可将训练数据集更换为明星照片，输入路人照片后匹配与之最相近的明星脸

In [7]:
import numpy as np
import matplotlib.pyplot as plt

In [8]:
from sklearn.datasets import fetch_lfw_people  #加载数据

In [10]:
# Load the "Labeled Faces in the Wild" people dataset, keeping only people
# who have at least 60 pictures so every class has enough samples.
faces = fetch_lfw_people(min_faces_per_person=60)

In [11]:
# List the fields available on the returned dataset object.
faces.keys()

dict_keys(['data', 'images', 'target', 'target_names', 'DESCR'])

In [12]:
# Flattened feature matrix: one row per image, one column per pixel.
faces.data.shape

(1348, 2914)

In [16]:
# The same samples kept as 2-D images (the product of the last two
# dimensions equals the column count of faces.data).
faces.images.shape

(1348, 62, 47)

In [28]:
# Print the person name behind each of the first 20 labels.
for label in faces.target[:20]:
    print(faces.target_names[label])

Colin Powell
George W Bush
George W Bush
George W Bush
Hugo Chavez
George W Bush
Junichiro Koizumi
George W Bush
Tony Blair
Ariel Sharon
George W Bush
Donald Rumsfeld
George W Bush
George W Bush
George W Bush
Ariel Sharon
Colin Powell
Colin Powell
George W Bush
Tony Blair


## 使用kNN进行分类

In [29]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline k-NN classifier with scikit-learn's default hyper-parameters.
knn_clf = KNeighborsClassifier()

In [30]:
from sklearn.model_selection import train_test_split
# Split features and labels into train and test sets. test_size is left at
# its default; random_state is pinned so every run produces the same split.
X_train, X_test, y_train, y_test = train_test_split(faces.data, faces.target, random_state=666)

In [31]:
# Size of the training split (rows, pixel features).
X_train.shape

(1011, 2914)

In [32]:
# Size of the held-out test split (rows, pixel features).
X_test.shape

(337, 2914)

In [33]:
# Fit the baseline classifier on the raw (unreduced) pixel features.
knn_clf.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [34]:
# Accuracy of the baseline on the held-out test split.
knn_clf.score(X_test, y_test)

0.6350148367952523

In [47]:
from sklearn.decomposition import PCA
# A float n_components between 0 and 1 asks PCA to keep just enough
# components to explain that fraction of the variance (90% here).
pca = PCA(0.9)

In [48]:
# Fit the projection on the training split only, so the test data does not
# influence it. %time is an IPython magic that reports how long the call took.
%time pca.fit(X_train)

Wall time: 328 ms


PCA(copy=True, iterated_power='auto', n_components=0.9, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

In [49]:
# Project both splits with the PCA that was fitted on the training data.
X_train_reduction = pca.transform(X_train)
X_test_reduction = pca.transform(X_test)

In [50]:
# The column count is the number of principal components that were kept.
X_test_reduction.shape

(337, 82)

In [53]:
# Same default k-NN, trained and scored on the PCA-reduced features so it can
# be compared with the raw-pixel baseline.
knn_clf2 = KNeighborsClassifier()
knn_clf2.fit(X_train_reduction, y_train)
knn_clf2.score(X_test_reduction, y_test)

0.6350148367952523

### Grid Search

In [56]:
# Search space for the k-NN grid search, given as two sub-grids because the
# Minkowski power `p` is only varied together with distance weighting:
#   * uniform weights:  n_neighbors 1..10
#   * distance weights: n_neighbors 1..10 crossed with p 1..5
param_grid = [
    {
        'weights': ['uniform'],
        'n_neighbors': list(range(1, 11)),
    },
    {
        'weights': ['distance'],
        'n_neighbors': list(range(1, 11)),
        'p': list(range(1, 6)),
    },
]

In [57]:
from sklearn.model_selection import GridSearchCV

# Cross-validated exhaustive search over param_grid, starting from the
# baseline classifier.
# NOTE(review): n_jobs is left at its default, so the search runs in a single
# process; passing n_jobs=-1 would spread the fits across all cores.
grid_search = GridSearchCV(knn_clf, param_grid)

In [58]:
# Run the search on the raw (unreduced) pixel features of the training split.
%time grid_search.fit(X_train, y_train)

Wall time: 47min 55s


GridSearchCV(cv=None, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=None,
             param_grid=[{'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                          'weights': ['uniform']},
                         {'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                          'p': [1, 2, 3, 4, 5], 'weights': ['distance']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [59]:
# Hyper-parameter combination with the best cross-validated score.
grid_search.best_params_

{'n_neighbors': 5, 'p': 1, 'weights': 'distance'}

In [60]:
# Estimator configured with the winning hyper-parameters.
grid_search.best_estimator_

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=1,
                     weights='distance')

In [61]:
# Mean cross-validated score of the best combination (measured on the
# training split only, not on X_test).
grid_search.best_score_

0.6132663512656685

In [62]:
# Rebind knn_clf to the tuned model. The search was created with refit left
# enabled, so this estimator is already refit on the whole training split.
knn_clf = grid_search.best_estimator_

In [63]:
# Accuracy of the tuned model on the held-out test split.
knn_clf.score(X_test, y_test)

0.7240356083086054

## 将网格搜索得到的最佳参数用于 PCA 降维后的数据；准确度其实仍然有点低，才约 0.72

In [73]:
# Rebuild k-NN with the hyper-parameters the grid search selected (found on
# raw pixels) and evaluate it on the PCA-reduced features instead.
knn_clf2 = KNeighborsClassifier(
    n_neighbors=5,
    weights='distance',
    metric='minkowski',
    p=1,
    n_jobs=-1,
)
knn_clf2.fit(X_train_reduction, y_train)
knn_clf2.score(X_test_reduction, y_test)

0.7270029673590505

In [83]:
# Size of the PCA-reduced training split (rows, kept components).
X_train_reduction.shape

(1011, 82)

In [86]:
# Pick one random sample from the reduced test split and print the name the
# tuned classifier predicts for it.
index = np.random.randint(len(X_test_reduction))
# predict() takes a 2-D (n_samples, n_features) input, so the single row is
# reshaped to (1, n_features).
face = X_test_reduction[index,:].reshape(1, -1)
face_id = knn_clf2.predict(face)
# NOTE(review): face_id is a length-1 array, so the lookup below yields an
# array and prints like ['Name']; index with face_id[0] to print the bare name.
print(" this face belongs to" , faces.target_names[face_id])

 this face belongs to ['Tony Blair']
