In [210]:
#导入用到的库以及函数
from Mnist_import import load_images,load_labels
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [211]:
# Visualization helper
def plot_embedding(T, labels, title):
    """Draw a 2-D embedding with every sample rendered as its class label.

    Each column of ``T`` is min-max scaled to [0, 1] and the first two
    columns are used as the x/y position of a text marker showing the label.

    Parameters
    ----------
    T : array-like, one row per sample with at least two columns.
    labels : sequence of integer class ids; each id indexes the 10-entry
        colour table below, so it is expected to be in 0..9.
    title : figure title; pass None to leave the axes untitled.
    """
    T = np.asarray(T)
    x_min = np.min(T, 0)
    x_max = np.max(T, 0)
    span = x_max - x_min
    # A constant column would give 0/0 = NaN coordinates; scale it by 1 instead.
    span = np.where(span == 0, 1, span)
    data = (T - x_min) / span
    plt.figure(figsize = (6, 6))
    ax = plt.subplot(111)
    # One distinguishable colour per digit class.
    colors = ['black','green','yellow','blue','purple','brown','deepskyblue','olive','gold','silver']
    for i in range(data.shape[0]):  # one row per sample
        # The explicit color keyword takes precedence over a fontdict colour,
        # so only the font size is kept in fontdict.
        ax.text(data[i, 0], data[i, 1], str(labels[i]), color = colors[labels[i]], fontdict = {'size' : 10})
    if title is not None:
        ax.set_title(title)

In [212]:
# First way to build the adjacency graph: keep pairs whose squared distance is below a threshold
def E_neighborhoods(DM, scale = 1.2):
    """Build a symmetric epsilon-neighbourhood graph from a data matrix.

    The threshold is derived from the data: the mean of all N*N squared
    pairwise distances (the zero diagonal included) multiplied by ``scale``.
    Pairs farther apart than the threshold are disconnected.

    Parameters
    ----------
    DM : array-like of shape (N, D), one sample per row.
    scale : multiplier applied to the mean squared distance. The default 1.2
        is slightly above the mean so that fewer points end up without any
        neighbour.

    Returns
    -------
    G : (N, N) float array. ``G[i][j]`` is the squared Euclidean distance
        between samples i and j when it does not exceed the threshold, and
        0 otherwise (0 therefore also means "no edge").
    """
    # Work in float64: subtracting unsigned integer pixel rows wraps around
    # and silently produces wrong distances.
    X = np.asarray(DM, dtype = np.float64)
    N, _ = X.shape
    G = np.zeros((N, N))
    if N == 0:
        return G
    # One vectorised row at a time keeps memory at O(N * D) while avoiding
    # the element-by-element Python loops.
    for i in range(N):
        diff = X - X[i]
        G[i] = np.sum(diff * diff, axis = 1)
    threshold = G.sum() / (N * N) * scale
    G[G > threshold] = 0
    return G
    

In [213]:
# Second way to build the adjacency graph: k nearest neighbours
def K_neighbors(DM, k):
    """Build a directed k-nearest-neighbour graph from a data matrix.

    Parameters
    ----------
    DM : array-like of shape (N, D), one sample per row.
    k : number of neighbours kept per sample. Values larger than N - 1 are
        clamped to N - 1; values <= 0 give an all-zero graph.

    Returns
    -------
    G : (N, N) float array. ``G[i][j]`` holds the squared Euclidean distance
        from sample i to sample j when j is one of the k nearest neighbours
        of i, and 0 otherwise. The matrix is generally NOT symmetric, and a
        neighbour at distance exactly 0 is indistinguishable from "no edge".
    """
    # float64 avoids integer overflow in the squares and the Gram matrix.
    X = np.asarray(DM, dtype = np.float64)
    N, _ = X.shape
    G = np.zeros((N, N))
    k = min(k, N - 1)
    if k <= 0:
        return G
    sq_norms = np.sum(X ** 2, axis = 1, keepdims = True)  # (N, 1)
    # ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b, evaluated for all pairs at once.
    dist = sq_norms + sq_norms.T - 2 * np.dot(X, X.T)
    # Rounding can push tiny distances slightly below zero.
    np.maximum(dist, 0, out = dist)
    # Exclude each sample from its own neighbour list explicitly rather than
    # relying on it being sorted into position 0.
    candidates = dist.copy()
    np.fill_diagonal(candidates, np.inf)
    nearest = np.argsort(candidates, axis = 1)[:, :k]
    rows = np.arange(N)[:, None]
    G[rows, nearest] = dist[rows, nearest]
    return G

In [214]:
# First way to compute the weight matrix: heat kernel
def get_weight(G, t):
    """Turn an adjacency graph of squared distances into heat-kernel weights.

    Parameters
    ----------
    G : (N, N) array-like; a non-zero entry is the squared distance of an
        edge, zero means "no edge".
    t : heat-kernel parameter; must be non-zero.

    Returns
    -------
    W : float array with the same shape as ``G`` where
        ``W[i][j] = exp(-G[i][j] / t)`` on edges and 0 elsewhere.
    """
    G = np.asarray(G, dtype = np.float64)
    # Evaluate the kernel for the whole matrix at once and mask out non-edges
    # (exp(0) would otherwise give weight 1 to missing edges).
    return np.where(G != 0, np.exp(-G / t), 0.0)

In [215]:
# Second way to compute the weight matrix: simply set every edge weight to 1
def get_weight_1(G, t):
    """Build a symmetric 0/1 weight matrix from an adjacency graph.

    Samples i and j (i != j) are connected with weight 1 when an edge exists
    in either direction, i.e. ``G[i][j] != 0`` or ``G[j][i] != 0``. Checking
    both directions matters for directed graphs, where an edge may be stored
    only below the diagonal.

    Parameters
    ----------
    G : square (N, N) array-like; non-zero entries mark edges.
    t : unused; kept so the signature matches ``get_weight``.

    Returns
    -------
    W : (N, N) float array of zeros and ones with a zero diagonal.
    """
    connected = np.asarray(G) != 0
    connected = connected | connected.T
    W = connected.astype(np.float64)
    # Self-loops are never given a weight.
    np.fill_diagonal(W, 0)
    return W

In [216]:
# Compute the eigenvectors of the graph Laplacian
def get_embedding(W, m):
    """Return the Laplacian-eigenmap embedding for a weight matrix.

    Solves the eigenproblem of ``D^-1 (D - W)``, where ``D`` is the diagonal
    matrix of row sums of ``W``, discards the eigenvectors whose eigenvalue is
    numerically zero (real part below 1e-6) and keeps the next ``m``.

    Parameters
    ----------
    W : square (N, N) weight matrix.
    m : target dimensionality (number of eigenvectors to keep).

    Returns
    -------
    (N, m') array whose columns are the selected eigenvectors, ordered by
    ascending eigenvalue; m' < m when fewer non-zero eigenvalues exist. The
    result can be complex when ``W`` is not symmetric.

    Raises
    ------
    numpy.linalg.LinAlgError
        If a row of ``W`` sums to zero, so that ``D`` is not invertible.
    """
    W = np.asarray(W)
    degrees = np.sum(W, axis = 1)
    if np.any(degrees == 0):
        raise np.linalg.LinAlgError('Singular matrix: a sample has zero total edge weight')
    L = np.diag(degrees) - W
    # D is diagonal, so D^-1 L is a row scaling; no dense inverse is needed.
    normalized_L = L / degrees[:, None]
    eigenvals, eigenvecs = np.linalg.eig(normalized_L)
    order = np.argsort(eigenvals)  # ascending eigenvalue order
    # Number of (near-)zero eigenvalues to skip. Counting on the real part
    # cannot run past the end of the array when every eigenvalue is ~0.
    skip = int(np.count_nonzero(eigenvals.real < 1e-6))
    return eigenvecs[:, order[skip:skip + m]]

In [217]:
# Evaluate an embedding with a 1-nearest-neighbour classifier
def knn_test(data, label, random_state = None):
    """Score an embedding by 1-NN classification accuracy on a held-out half.

    The samples are split 50/50 into train and test sets, stratified by
    label; a 1-nearest-neighbour classifier is fitted on the training half
    and evaluated on the test half.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features).
    label : sequence of n_samples class labels.
    random_state : optional seed forwarded to ``train_test_split`` to make
        the split reproducible. The default None keeps the split random.

    Returns
    -------
    float : fraction of test samples classified correctly.
    """
    knn = KNeighborsClassifier(n_neighbors = 1)
    data_train, data_test, label_train, label_test = train_test_split(
        data, label, train_size = 0.5, stratify = label, random_state = random_state)
    knn.fit(data_train, label_train)
    label_pred = knn.predict(data_test)
    # Divide by the actual test-set size; a fixed divisor is only right for
    # one particular sample count.
    return float(np.mean(label_pred == np.asarray(label_test)))

In [226]:

# MNIST test set
# (To use the training set instead, load train-images.idx3-ubyte and
# train-labels.idx1-ubyte from the same directory.)
MNIST_DIR = 'C:/Users/yinziniu/mnist_data'
data = load_images(MNIST_DIR + '/t10k-images.idx3-ubyte')
label = load_labels(MNIST_DIR + '/t10k-labels.idx1-ubyte')

# Build a class-balanced subset: the first SAMPLES_PER_DIGIT images of every digit.
SAMPLES_PER_DIGIT = 100
new_data = []
new_label = []
for digit in range(10):
    taken = 0
    # Bound the scan by the number of labels actually loaded.
    # NOTE(review): assumes `label` supports len() - confirm against Mnist_import.
    for idx in range(len(label)):
        if label[idx] == digit:
            new_data.append(data[idx])
            new_label.append(label[idx])
            taken += 1
            if taken == SAMPLES_PER_DIGIT:
                break
# Flatten every image to a 784-vector and scale grey values from 0..255 to 0..1.
new_data = np.reshape(new_data, (len(new_label), 784))
new_data = new_data/255.0

k = [2,3,5,9,13,18,25,30]  # neighbourhood sizes to try
d = [2,3,4,5,6,7]          # target dimensionalities to try
for n_neighbors in k:
    # The graph and the weights depend only on the neighbourhood size, so
    # they are built once per value instead of once per (k, d) pair.
    G = K_neighbors(new_data, n_neighbors)
    W = get_weight(G, 28)  # heat-kernel parameter t
    # get_embedding returns the first m eigenvectors in eigenvalue order, so
    # the embedding for a smaller m is a column prefix of the largest one.
    T_all = np.real(get_embedding(W, max(d)))
    for n_dims in d:
        T = T_all[:, :n_dims]
        acc = knn_test(T, new_label)
        print('neighbors = ', n_neighbors, ', dims =' , n_dims, ', accuracy = ', acc)

neighbors =  2 , dims = 2 , accuracy =  0.532
neighbors =  2 , dims = 3 , accuracy =  0.588
neighbors =  2 , dims = 4 , accuracy =  0.624
neighbors =  2 , dims = 5 , accuracy =  0.598
neighbors =  2 , dims = 6 , accuracy =  0.628
neighbors =  2 , dims = 7 , accuracy =  0.642
neighbors =  3 , dims = 2 , accuracy =  0.598
neighbors =  3 , dims = 3 , accuracy =  0.644
neighbors =  3 , dims = 4 , accuracy =  0.692
neighbors =  3 , dims = 5 , accuracy =  0.68
neighbors =  3 , dims = 6 , accuracy =  0.682
neighbors =  3 , dims = 7 , accuracy =  0.666
neighbors =  5 , dims = 2 , accuracy =  0.628
neighbors =  5 , dims = 3 , accuracy =  0.702
neighbors =  5 , dims = 4 , accuracy =  0.706
neighbors =  5 , dims = 5 , accuracy =  0.686
neighbors =  5 , dims = 6 , accuracy =  0.686
neighbors =  5 , dims = 7 , accuracy =  0.72
neighbors =  9 , dims = 2 , accuracy =  0.602
neighbors =  9 , dims = 3 , accuracy =  0.676
neighbors =  9 , dims = 4 , accuracy =  0.69
neighbors =  9 , dims = 5 , accuracy 