In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline 

from sklearn.datasets import fetch_lfw_people

# Keep every identity that has at least this many images; a value of 1
# applies no filtering at all.
# NOTE(review): the original inline comment described this call as selecting
# people with more than 70 photos, which does not match the value actually
# passed -- confirm which one is intended.
min_faces = 1
lfw_people = fetch_lfw_people(min_faces_per_person=min_faces)
print(dir(lfw_people))  # list the attributes exposed by the returned dataset object

['DESCR', 'data', 'images', 'target', 'target_names']


In [2]:
# Print the shape of each array attribute of the dataset, one per line.
for attr_name in ("data", "images", "target", "target_names"):
    print(getattr(lfw_people, attr_name).shape)

# Shapes recorded from a previous run, in the order printed above:
# (13233, 2914)
# (13233, 62, 47)
# (13233,)
# (5749,)

(13233, 2914)
(13233, 62, 47)
(13233,)
(5749,)


In [3]:
# Count how many samples carry each label value.
target = pd.DataFrame(lfw_people.target)
label_counts = target.value_counts()
label_counts

1871    530
1047    236
5458    144
1404    121
1892    109
       ... 
2231      1
2230      1
2229      1
2228      1
5748      1
Length: 5749, dtype: int64

## 划分数据集

In [4]:
# Split into training and test sets with an 80/20 ratio.
from sklearn.model_selection import train_test_split

X, y = lfw_people.data, lfw_people.target
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

## PCA对数据进行降维，提取特征

In [5]:
from sklearn.decomposition import PCA

n_components = 150  # number of principal components to extract
print("Extracting the top %d eigenfaces from %d faces"
      % (n_components, X_train.shape[0]))

# The randomized SVD solver is stochastic. Pin random_state so the fitted
# components (and everything trained on the projected features) are the same
# on every Restart & Run All. 42 matches the seed used for the train/test split.
pca = PCA(
    n_components=n_components,
    svd_solver='randomized',
    whiten=True,
    random_state=42,
).fit(X_train)

# Reshape each row of components_ back to the image height/width so the
# components can be handled as images ("eigenfaces").
height, width = lfw_people.images.shape[1:3]
eigenfaces = pca.components_.reshape((n_components, height, width))

print("Projecting the input data on the eigenfaces orthonormal basis")
# Project both splits with the PCA fitted on the training split only.
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)

print("eigenfaces:\n ", eigenfaces.shape)
print("X_train_pca:\n ", X_train_pca.shape)
print("X_test_pca:\n ", X_test_pca.shape)

Extracting the top 150 eigenfaces from 10586 faces
Projecting the input data on the eigenfaces orthonormal basis
eigenfaces:
  (150, 62, 47)
X_train_pca:
  (10586, 150)
X_test_pca:
  (2647, 150)


## 用SVM模型训练，网格搜索调参

In [2]:
import torch

# Environment sanity check only. Query the device only when CUDA is actually
# usable: current_device()/get_device_name() raise on a machine without a
# working CUDA setup, which would abort a Restart & Run All.
if torch.cuda.is_available():
    print(torch.cuda.current_device())
    print(torch.cuda.get_device_name())
else:
    print("CUDA is not available")

0
NVIDIA GeForce RTX 2080 Ti


In [6]:
# Train an SVM classifier, tuning C and gamma with a grid search.
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

print("Fitting the classifier to the training set")

# Candidate values tried for each hyper-parameter.
param_grid = {
    'C': [1e3, 5e3, 1e4, 5e4, 1e5],
    'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
}

base_svc = SVC(kernel='rbf', class_weight='balanced')
clf = GridSearchCV(base_svc, param_grid, n_jobs=-1)
clf.fit(X_train_pca, y_train)  # fit() returns the estimator itself

print("Best estimator found by grid search:")
print(clf.best_estimator_)

Fitting the classifier to the training set




## 测试集，评估模型的效果

In [None]:
# Evaluate the fitted model on the test set.
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

print("Predicting people's names on the test set")

y_pred = clf.predict(X_test_pca)

# When `labels` is omitted, classification_report derives the label set from
# y_test and y_pred and raises ValueError if its size differs from
# len(target_names). Not every identity is guaranteed to appear in the test
# split or the predictions, so report only on the labels that do appear and
# pass the matching subset of names.
present_labels = np.union1d(y_test, y_pred)
print(classification_report(
    y_test,
    y_pred,
    labels=present_labels,
    target_names=lfw_people.target_names[present_labels],
))

# The confusion matrix keeps one row/column per identity in the dataset,
# whether or not that identity appears in the test split.
print(confusion_matrix(y_test, y_pred, labels=range(lfw_people.target_names.shape[0])))