In [35]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import cv2
import re

# Use the SimHei font for matplotlib's sans-serif family
# (presumably so that Chinese text in figure labels renders correctly -- confirm).
mpl.rcParams['font.sans-serif'] = ['SimHei']
# IPython magic (not Python syntax): show matplotlib figures inline in the notebook output.
%matplotlib inline

# Report the version of every core dependency, with the library name
# right-aligned in a 10-character column so the lines form a table.
libraries = (
    ('numpy', np),
    ('pandas', pd),
    ('matplotlib', mpl),
    ('opencv', cv2),
)
for lib_name, lib_module in libraries:
    print(f'{lib_name:>10} version is {lib_module.__version__}')

     numpy version is 1.17.2
    pandas version is 0.25.1
matplotlib version is 3.1.1
    opencv version is 3.4.1


In [41]:
def test_train_file_names(k, d):
    """
    将指定目录下的文件分组，每组随机提取 k 比率的文件作为训练集，其余的作为测试集。
    注意：该函数只对文件名操作，并非实际对文件操作
    
    Paramaters
    -----------
    k -- 训练集所占比率 0<k<=1
    d -- 存放文件的目录
    
    Returns
    -------
    ndarray 1-D 训练集
    ndarray 1-D 测试集
    """
    # 构建的DataFrame的字段名称
    FILE_NAME = 'file_name'
    FILE_GROUP_NAME = 'file_group_name'
    #
    fn_list = os.listdir(d)
    df = pd.DataFrame(fn_list, columns=[FILE_NAME])
    df[FILE_GROUP_NAME] = df.file_name.str.extract(r'(.*)(_)')[0]
    grouped = df.groupby([FILE_GROUP_NAME])
    train_set = np.array([])
    test_set = np.array([])
    for v in grouped.groups.values():
        data = df[FILE_NAME][v]
        train = data.sample(frac=k)
        test = data.drop(train.index)
        train_set = np.concatenate((train_set, train.values))
        test_set = np.concatenate((test_set, test.values))
    return train_set, test_set

def img2vector(img_file_path_name):
    """
    Load an image file as grayscale and return it as a single row vector.

    Parameters
    ----------
    img_file_path_name : str
        Path of the image file to read.

    Returns
    -------
    ndarray
        Array of shape (1, m*n) holding the pixels of the (m, n) grayscale
        image in row-major order.

    Raises
    ------
    OSError
        If OpenCV could not read or decode the file.
    """
    img = cv2.imread(img_file_path_name, cv2.IMREAD_GRAYSCALE)
    # cv2.imread does not raise for a missing or undecodable file; it returns
    # None, which would otherwise surface as an opaque AttributeError below.
    if img is None:
        raise OSError(f'cannot read image file: {img_file_path_name!r}')
    return img.reshape(1, -1)

def _load_image_rows(d, file_names):
    """Vectorize every named file in directory d; return (matrix or None, labels)."""
    rows = [img2vector(os.path.join(d, f)) for f in file_names]
    # The label is the file name with everything from the first underscore removed.
    labels = np.array([re.sub('_.*', '', f) for f in file_names])
    # Stack all row vectors in one call instead of growing the matrix per image.
    matrix = np.concatenate(rows) if rows else None
    return matrix, labels


def load_orl(k, d):
    """
    Load the images in a directory as train and test matrices with labels.

    The file names in ``d`` are split by ``test_train_file_names(k, d)``.
    Every selected file is read with ``img2vector`` and becomes one row of the
    corresponding matrix; its label is the part of the file name that precedes
    the first underscore.

    Parameters
    ----------
    k : float
        Fraction of each group of files used for training, 0 < k <= 1.
    d : str
        Directory that contains the image files.

    Returns
    -------
    train_set : ndarray, shape (n_train, n_pixels)
    train_label : ndarray, shape (n_train,)
    test_set : ndarray, shape (n_test, n_pixels)
    test_label : ndarray, shape (n_test,)
        A split with no files (for example the test split when k == 1) yields
        a matrix with zero rows and an empty label array instead of an error.
    """
    train_file_names, test_file_names = test_train_file_names(k, d)
    train_set, train_label = _load_image_rows(d, train_file_names)
    test_set, test_label = _load_image_rows(d, test_file_names)

    # Give an empty split a (0, n_pixels) matrix that matches the other split,
    # so callers can rely on 2-D arrays with a consistent column count.
    reference = train_set if train_set is not None else test_set
    if reference is None:
        empty = np.empty((0, 0), dtype=np.uint8)
    else:
        empty = np.empty((0, reference.shape[1]), dtype=reference.dtype)
    if train_set is None:
        train_set = empty
    if test_set is None:
        test_set = empty
    return train_set, train_label, test_set, test_label

In [42]:
# Seed NumPy's global generator so the random train/test split is identical on
# every "Restart Kernel -> Run All": the split is drawn with pandas' sample(),
# which falls back to this global generator when no random_state is passed.
np.random.seed(0)

# Naming note: X / y hold the TRAIN / TEST image matrices (one flattened image
# per row); the matching labels are in X_label / y_label.
X, X_label, y, y_label = load_orl(0.9, os.path.join('..', 'datas', 'ORL'))
print(f'X.shape is {X.shape}; y.shape is {y.shape};'
      f'X_label.shape is {X_label.shape}; y_label.shape is {y_label.shape}')


X.shape is (360, 10304); y.shape is (40, 10304);X_label.shape is (360,); y_label.shape is (40,)
