In [1]:
# 导包

In [2]:
import random
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay
from sklearn.preprocessing import LabelBinarizer
import numpy as np
import scipy.io as sio
from sklearn.datasets import load_svmlight_files
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold

In [3]:
# 读取数据的接口

In [4]:
def readPIE():
    data = sio.loadmat('D:\pytorch\统计学习方法实践\datasets\PIE.mat')
    X,y = data['fea'].astype(np.float64),data['gnd'].ravel()
    y = LabelEncoder().fit(y).transform(y).astype(np.float64)
    return X,y


def readBooks():
    file = 'D:\pytorch\统计学习方法实践\datasets\\books.svmlight'
    X, y = load_svmlight_files([file])
    X = X.toarray()
    y = LabelEncoder().fit(y).transform(y).astype(np.float64)
    #X = scale(X)
    return X,y


    '''
    X: 数据
    y: 标签
    '''

In [5]:
# 激活函数和其导数

In [6]:
def logistic(x):
    return 1 / (1 + np.exp(-x))


def logistic_derivative(x):
    return logistic(x) * (1 - logistic(x))


    '''
    @x: 要激活的数值
    fun: sigmoid
    '''

In [7]:
# ANN算法模块

In [8]:
class ANN:
    def predict(self, x):
        for w, b in zip(self.weights, self.biases):
            z = np.dot(x, w) + b
            # 计算权重相加再加上偏置后的结果
            x = self.activation(z)
            # 计算输出值
        return self.classes_[np.argmax(x, axis=1)]
        # 返回行最大值的索引的类别

In [9]:
# BP算法

In [10]:
class BP(ANN):
    def __init__(self, layers, batch):
        self.layers = layers
        self.num_layers = len(layers)
        self.batch = batch
        self.activation = logistic
        self.activation_deactivation = logistic_derivative
        self.biases = [np.random.rand(x) for x in layers[1:]]
        self.weights = [np.random.rand(x, y) for x, y in zip(layers[: -1], layers[1:])]
        # 返回一组服从标准正态分布的随机权重和偏置。
    '''
    参数初始化
    :@layers: 层数
    :@num_layers: 层数数量
    :@batch: 批数
    :@activation: 激活函数
    :@activation_deactivation: 激活函数的导数
    :@biases: 偏置
    :@weights: 权重
    '''

    def fit(self, x, y, lr, epochs):
        label_bin = LabelBinarizer()
        # 将标签以一对多的方式二值化，效果与one-hot类似
        y = label_bin.fit_transform(y)
        self.classes_ = label_bin.classes_
        # 保留每个类别的标签
        train_data = [(x, y) for x, y in zip(x, y)]
        n = len(train_data)
        # 获取数据长度
        for i in range(epochs):
            random.shuffle(train_data)
            # 打乱数据
            batches = [train_data[k: k + self.batch] for k in range(0, n, self.batch)]
            # 批量梯度下降
            for sub_batch in batches:
                sub_x = []
                sub_y = []
                for tmp_x, tmp_y in sub_batch:
                    sub_x.append(tmp_x)
                    sub_y.append(tmp_y)
                activations = [np.array(sub_x)]
                # 前向传播
                for w, b in zip(self.weights, self.biases):
                    # 计算激活函数的参数
                    '''计算公式: wx + b (权重乘输入加上偏置)'''
                    res = np.dot(activations[-1], w) + b
                    output = self.activation(res)
                    # 将本次输入放入列表，以便方向传播更新权重
                    activations.append(output)
                # 计算误差值
                err = activations[-1] - np.array(sub_y)
                # 计算输出层的误差率
                details = [err * self.activation_deactivation(activations[-1])]
                # 反向循环计算隐层的误差率，从倒数第二层开始
                for i in range(self.num_layers - 2, 0, -1):
                    '''计算公式: 激活函数得到的值进行求导并乘上 每上一个的错误率再乘当前权重的转置的结果'''
                    details.append(self.activation_deactivation(activations[i]) *
                                    np.dot(details[-1], self.weights[i].T))
                # 将各层的误差颠倒，准备逐层更新权重和偏置
                details.reverse()
                for j in range(self.num_layers - 1):
                    # 权重的增量
                    '''计算公式: 学习率 * (dot(错误率, 输出值)) / 批数'''
                    details = lr * ((np.atleast_2d(activations[j].sum(axis=0)).T).dot(np.atleast_2d(
                        details[j].sum(axis=0) / self.batch
                    )))
                    # 更新权重
                    self.weights[j] -= details
                    # 偏置的增量
                    '''计算公式: 学习率 * 错误率'''
                    details = lr * details[j].sum(axis=0) / self.batch
                    # 更新偏置
                    self.biases[j] -= details
        return self
    '''
    :@x: 训练数据
    :@y: 训练标签
    :@lr: 学习率
    "@epochs: 轮数
    '''

In [11]:
# 主函数部分，读取数据并进行切分

In [12]:
X, y = readBooks()
print(X, y)
# 切割数据集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2022)

[[6. 8. 8. ... 0. 0. 0.]
 [1. 1. 0. ... 0. 0. 0.]
 [2. 1. 1. ... 0. 0. 0.]
 ...
 [1. 0. 1. ... 0. 0. 0.]
 [3. 0. 0. ... 0. 0. 0.]
 [6. 2. 4. ... 0. 0. 0.]] [1. 1. 0. ... 0. 1. 1.]


In [13]:
# 训练并得出预测标签

In [16]:
clf = BP([X_train.shape[1], 68], 100).fit(x=X_train, y=y_train, lr=1e-1, epochs=3000)
predict_label = clf.predict(X_test)

  return 1 / (1 + np.exp(-x))


IndexError: index 26 is out of bounds for axis 0 with size 2

In [None]:
# 画出混淆矩阵，并且可视化保存

In [None]:
confusion_mat = confusion_matrix(y_test, predict_label)
print(confusion_mat)
classes = [i for i in range(68)]
disp = ConfusionMatrixDisplay(confusion_matrix=confusion_mat, display_labels=classes)
disp.plot(include_values=False)  # 混淆矩阵每个单元格上显示具体数值)

plt.savefig("a", dpi=1000, bbox_inches="tight")
plt.show()

In [None]:
# 最会利用包中函数来查看预测的准确率，召回率和f1-score

In [None]:
print(classification_report(y_test, predict_label))
a = classification_report(y_test, predict_label)

In [None]:
# knn算法部分

In [None]:
def KNN(test_data1,train_data_pca,train_label,k,p):
    sub_mat = train_data_pca - test_data1
    sub_mat = np.abs(sub_mat)
    distance = sub_mat ** p
    distance = np.sum(distance, axis=1)
    distance = distance ** (1.0 / p)
    distance_index = np.argsort(distance)
    class_count = [0 for i in range(68)]
    for i in range(k):
        label = train_label[distance_index[i]]
        class_count[label] += 1
    return np.argmax(class_count)

# 测试算法
def test(k, p):
    print("testing with K = %d and p = %d" % (k, p))
    m, n = np.shape(X_test_pca)
    correctCount = 0
    for i in range(m):
        test_data1 = X_test_pca[i, :]
        predict_label = KNN(test_data1, X_train_pca, y_train, k, p)
        true_label = y_test[i]
        if true_label == predict_label:
            correctCount += 1
    print("The accuracy is: %f" % (float(correctCount) / m))
    return float(correctCount) / m

X,y = readPIE()

pred, preds = [], []
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

kf = KFold(n_splits=5, shuffle=True, random_state=2022)

for trn_idx, tt_idx in kf.split(X):
    X_train, y_train, X_test, y_test = X[trn_idx], y[trn_idx], X[tt_idx], y[tt_idx]

    pca = PCA(n_components=68)
    pca.fit(X_train)
    X_train_pca = pca.transform(X_train)
    X_test_pca = pca.transform(X_test)

    pred.append(test(1, 1))

print("每个分割的准确率为: {}".format(pred))
print("平均准确率为: {}".format(sum(pred) / len(pred))
      )