In [1]:
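# CIFAR-10 is a 10-class dataset of 32 x 32 RGB (3-channel) images: 50,000 for training and 10,000 for testing.
# Classes: airplane, automobile, bird, cat, deer, dog, frog, horse, ship, truck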
import numpy as np
import pickle
import math
from sklearn.decomposition import PCA

In [2]:
# Hyperparameters.
#
# matrixLambda is the loss matrix of the minimum-risk decision rule:
# matrixLambda[i, j] is the loss incurred by assigning a sample whose true
# class is j to class i.
#
# Two alternative matrices were tried earlier and are kept here for reference
# only. The figure noted beside each variant is presumably the test accuracy
# obtained with it (the notebook does not say so explicitly).
#
# Variant noted as 0.3692:
#     [0, 1, 1, 2, 2, 2, 2, 2, 1, 1],
#     [1, 0, 2, 2, 2, 2, 2, 2, 1, 1],
#     [1, 2, 0, 1, 1, 1, 1, 1, 2, 2],
#     [2, 2, 1, 0, 1, 1, 1, 1, 2, 2],
#     [2, 2, 1, 1, 0, 1, 1, 1, 2, 2],
#     [2, 2, 1, 1, 1, 0, 1, 1, 2, 2],
#     [2, 2, 1, 1, 1, 1, 0, 1, 2, 2],
#     [2, 2, 1, 1, 1, 1, 1, 0, 2, 2],
#     [1, 1, 2, 2, 2, 2, 2, 2, 0, 1],
#     [1, 1, 2, 2, 2, 2, 2, 2, 1, 0]
#
# Variant noted as 0.3014:
#     [0, 2, 2, 1, 1, 1, 1, 1, 2, 2],
#     [2, 0, 1, 1, 1, 1, 1, 1, 2, 2],
#     [2, 1, 0, 2, 2, 2, 2, 2, 1, 1],
#     [1, 1, 2, 0, 2, 2, 2, 2, 1, 1],
#     [1, 1, 2, 2, 0, 2, 2, 2, 1, 1],
#     [1, 1, 2, 2, 2, 0, 2, 2, 1, 1],
#     [1, 1, 2, 2, 2, 2, 0, 2, 1, 1],
#     [1, 1, 2, 2, 2, 2, 2, 0, 1, 1],
#     [2, 2, 1, 1, 1, 1, 1, 1, 0, 2],
#     [2, 2, 1, 1, 1, 1, 1, 1, 2, 0]
#
# Active choice, noted as 0.3716: zero-one loss, i.e. 0 on the diagonal
# (correct decision) and 1 everywhere else (any misclassification).
matrixLambda = 1 - np.eye(10, dtype = int)

# Number of features kept after dimensionality reduction.
feature_count = 25

In [3]:
# 使用CIFAR-10官方给出的使用方法加载数据集
def unpickle(file):
    """Load and return the object stored in a pickled batch file.

    Parameters
    ----------
    file : path of the pickle file; it is opened in binary read mode.

    Returns
    -------
    Whatever object the pickle contains. Strings pickled by Python 2 are
    decoded with the 'iso-8859-1' encoding.

    NOTE(review): pickle can execute arbitrary code while loading, so this
    must only be pointed at trusted files.
    """
    with open(file, 'rb') as batch_file:
        return pickle.load(batch_file, encoding = 'iso-8859-1')

def loadData(train_dir = './cifar-10-batches-py/data_batch_', test_dir = './cifar-10-batches-py/test_batch', n_batches = 5):
    """Load the training batches and the test batch from disk.

    Parameters
    ----------
    train_dir : path prefix of the training batch files; the 1-based batch
        number is appended to it (``data_batch_1`` ... ``data_batch_<n>``).
    test_dir : path of the single test batch file.
    n_batches : number of training batch files to read (default 5).

    Returns
    -------
    x_train : float64 array with 3072 columns holding all training rows in
        batch order.
    y_train : 1-D float64 array with the matching training labels.
    x_test, y_test : the 'data' and 'labels' entries of the test batch,
        returned exactly as stored in the file.

    Each batch file is expected to unpickle to a mapping with 'data' and
    'labels' entries.
    """
    # Collect every batch first and concatenate once: growing an array with
    # np.append inside the loop re-copies all previously loaded rows on each
    # iteration. The empty float64 seeds keep the result dtype and shape the
    # same as before (and keep n_batches == 0 valid).
    image_batches = [np.empty(shape = [0, 3072])]
    label_batches = [np.empty(0)]
    for batch_no in range(1, n_batches + 1):
        batch = unpickle(train_dir + str(batch_no))
        image_batches.append(batch['data'])
        label_batches.append(batch['labels'])
    x_train = np.concatenate(image_batches, axis = 0)
    y_train = np.concatenate(label_batches)

    # The test set lives in a single file and is passed through untouched.
    test_batch = unpickle(test_dir)
    x_test = test_batch['data']
    y_test = test_batch['labels']

    return x_train, y_train, x_test, y_test

In [4]:
# 提取第c类图片
def extractClass(label, data, c):
    """Return the rows of ``data`` whose label equals ``c``.

    Parameters
    ----------
    label : sequence of class labels; ``label[i]`` belongs to row ``i``.
    data : 2-D array indexed as ``data[i, :]``.
    c : class label to select.

    Returns
    -------
    A list with one entry per matching sample, in original order; each entry
    is the sample's row converted to a list.
    """
    return [list(data[idx, : ]) for idx, tag in enumerate(label) if tag == c]

# 计算类内均值 类内离散度 比例 新的训练数据 主成分分析模型
def calculateEC(x_train, y_train, feature_count, n_classes = 10):
    """Fit PCA on the training data and estimate per-class Gaussian statistics.

    Parameters
    ----------
    x_train : training samples, one row per sample.
    y_train : class label of each training row.
    feature_count : number of principal components to keep.
    n_classes : number of classes; labels are taken to be 0 .. n_classes - 1
        (default 10).

    Returns
    -------
    matrixE : (n_classes, feature_count) array; matrixE[i][j] is the mean of
        feature j over the samples of class i.
    matrixC : (n_classes, feature_count) array; matrixC[i][j] is the variance
        (normalised by N - 1) of feature j over the samples of class i.
    pc : list of class priors, i.e. the fraction of training samples that
        belong to each class.
    new_train_data : the PCA-transformed training data.
    pca : the fitted PCA model, to be reused on the test data.
    """
    pca = PCA(n_components = feature_count)
    new_train_data = pca.fit_transform(x_train)

    labels = np.asarray(y_train)
    matrixE = np.empty((n_classes, feature_count))
    matrixC = np.empty((n_classes, feature_count))
    pc = []  # class priors
    for c in range(n_classes):
        # A boolean mask selects the rows of class c, so the statistics of
        # all features are computed in one vectorized pass per class.
        members = new_train_data[labels == c]
        pc.append(len(members) / len(labels))
        matrixE[c] = members.mean(axis = 0)
        # ddof = 1 matches the N - 1 normalisation np.cov applies by default.
        matrixC[c] = members.var(axis = 0, ddof = 1)
    return matrixE, matrixC, pc, new_train_data, pca

In [5]:
# 根据最小风险贝叶斯模型进行分类
def calculatePostrior(matrixE, matrixC, pc, test_data, feature_count, loss_matrix = None):
    """Classify one sample with the minimum-risk Bayes rule.

    Each class j is modelled as a product of independent 1-D Gaussians with
    means matrixE[j] and variances matrixC[j]. The conditional risk of
    deciding class i is

        R(i | x) = sum_j loss[i][j] * pc[j] * p(x | class j)

    up to a positive factor shared by all i, and the class with the smallest
    risk is returned.

    Parameters
    ----------
    matrixE : per-class feature means, indexed [class][feature].
    matrixC : per-class feature variances, indexed [class][feature];
        assumed strictly positive.
    pc : prior probability of each class.
    test_data : feature vector of the sample to classify.
    feature_count : number of leading features to use.
    loss_matrix : optional square loss matrix; loss_matrix[i][j] is the loss
        of deciding class i when the true class is j. Defaults to the
        notebook-level ``matrixLambda``.

    Returns
    -------
    int : index of the class with the smallest conditional risk (the first
    one on ties).
    """
    if loss_matrix is None:
        # Existing callers rely on the loss matrix configured in the notebook.
        loss_matrix = matrixLambda
    loss = np.asarray(loss_matrix, dtype = float)
    means = np.asarray(matrixE, dtype = float)[:, :feature_count]
    variances = np.asarray(matrixC, dtype = float)[:, :feature_count]
    priors = np.asarray(pc, dtype = float)
    x = np.asarray(test_data, dtype = float)[:feature_count]

    # Work in log space: the product of many small densities underflows to
    # 0.0 for every class, which would make all risks equal.
    log_likelihood = -0.5 * np.sum(
        (x - means) ** 2 / variances + np.log(2 * np.pi * variances), axis = 1)
    with np.errstate(divide = 'ignore'):  # a zero prior simply becomes -inf
        log_joint = np.log(priors) + log_likelihood

    # Rescaling by the largest term keeps the weights representable and does
    # not change which risk is smallest.
    weights = np.exp(log_joint - np.max(log_joint))

    # The prior and likelihood of the TRUE class j weight loss[i][j].
    risk = loss @ weights
    return int(np.argmin(risk))

In [6]:
if __name__ == '__main__':
    # Load the data, fit PCA plus the per-class statistics on the training
    # set, then project the test set with the same fitted PCA model.
    x_train, y_train, x_test, y_test = loadData()
    matrixE, matrixC, pc, new_train_data, pca = calculateEC(x_train, y_train, feature_count)
    new_test_data = pca.transform(x_test)

    # Predict a class for every test sample.
    pre_label = [
        calculatePostrior(matrixE, matrixC, pc, new_test_data[idx, : ], feature_count)
        for idx in range(len(y_test))
    ]

    # Count the predictions that disagree with the ground-truth labels.
    error_count = sum(
        1 for predicted, actual in zip(pre_label, y_test) if predicted != actual
    )

    print('Total test number is %d' % len(y_test))
    print('Test accuracy is %f' % (1 - error_count / len(y_test)))

Total test number is 10000
Test accuracy is 0.370600


In [7]:
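### Raw RGB features: 0.2647
### HOG features: (no result recorded)

### Bayes classifier, by number of PCA features
### PCA feature count  20: 0.3651
### PCA feature count  25: 0.3716
### PCA feature count  30: 0.3684
### PCA feature count  35: 0.3668
### PCA feature count  40: 0.3669
### PCA feature count  50: 0.3485
### PCA feature count  60: 0.3526
### PCA feature count  80: 0.3438
### PCA feature count 100: 0.3368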