# 实验 最近邻分类器

## 读取MNIST数据集，并将其划分为train/val/test数据集

### MNIST数据集读取

In [2]:
import gzip
import os
import struct
import numpy as np

def load_mnist(path, kind='train'):
    """加载MNIST数据集"""
    labels_path = os.path.join(path, f'{kind}-labels-idx1-ubyte.gz')
    images_path = os.path.join(path, f'{kind}-images-idx3-ubyte.gz')

    with gzip.open(labels_path, 'rb') as lbpath:
        struct.unpack('>II', lbpath.read(8))
        labels = np.frombuffer(lbpath.read(), dtype=np.uint8)

    with gzip.open(images_path, 'rb') as imgpath:
        struct.unpack('>IIII', imgpath.read(16))
        images = np.frombuffer(imgpath.read(), dtype=np.uint8).reshape(len(labels), 784)

    return images, labels



# 数据集划分
def data_split(images, labels, ratio):
    
    total_len = images.shape[0]
    offset = int(total_len * ratio)
    
    val_img = images[:offset][:]
    val_lb = labels[:offset]
    
    train_img = images[offset:][:]
    train_lb = labels[offset:]
    
    return train_img, train_lb, val_img, val_lb    

### 按照 5:1:1 划分为训练集，验证集，测试集


In [3]:
# 读取训练集和测试集数据
[images, labels] = load_mnist('./MNIST', kind='train')
[test_img, test_lb] = load_mnist('./MNIST',kind='test')
train_img, train_lb, val_img, val_lb = data_split(images, labels, 1/6)


# 打印查看数据集格式
print('训练集图像格式为:', train_img.shape, '训练集标签格式为:', train_lb.shape)
print('验证集图像格式为:', val_img.shape, '验证集标签格式为:', val_lb.shape)
print('测试集图像格式为:', test_img.shape, '测试集标签格式为:', test_lb.shape)

训练集图像格式为: (50000, 784) 训练集标签格式为: (50000,)
验证集图像格式为: (10000, 784) 验证集标签格式为: (10000,)
测试集图像格式为: (10000, 784) 测试集标签格式为: (10000,)


## 用Random Search + Cross Entroy Loss来训练Linear Classifier

### 用向量化计算Cross Entroy Loss

In [4]:
def vectorized_loss_softmax(X, y, W):
    """
    Inputs have dimension D=784, there are C=10 classes, and we operate on N=50000 examples.
    
    Inputs:
    - W: Indexs of linear classifier, a numpy array of shape (D, C) containing weights.
    - X: Training images, a numpy array of shape (N, D) containing a minibatch of data.
    - y: Training labels, a numpy array of shape (N,) containing training labels; y[i] = c means
         that X[i] has label c, where 0 <= c < C.
    
    Returns a tuple of:
    - Softmax loss as single float.
    """
    
    # set the value of delta, lamda
    delta = 1.0
    lamda = 1.0
    num_train = X.shape[0]
    num_class = W.shape[1]
    
    # scores: class x examples
    scores = W.T.dot(X.T).reshape(num_class, num_train)
    
    # scores_max: 1 x examples, get the max value from each column
    scores_max = np.reshape(np.max(scores, axis=0), (1, num_train))

    # prob: class x examples, calculate the log probability
    # use scores_max to limit the boundary of exp indexes
    prob = np.exp(scores-scores_max) / np.sum(np.exp(scores-scores_max), axis=0)

    # set value 1 in true label positions, 0 for false labels
    y_true = np.zeros(prob.shape)
    y_true[y, np.arange(num_train)] = 1.0
    
    # calculate the average data loss
    loss = -np.sum(y_true * np.log(prob))/num_train
    
    return loss

### 定义训练函数

In [30]:
def random_search(X, y):
    """
    Inputs have dimension D=784, there are C=10 classes, and we operate on N=50000 examples.
    
    Inputs:
    - X: Training images, a numpy array of shape (N, D) containing a minibatch of data.
    - y: Training labels, a numpy array of shape (N,) containing training labels; y[i] = c means
         that X[i] has label c, where 0 <= c < C.
    
    Returns:
    - Best indexs W
    """
    
    # sign the highest float value to best loss
    best_loss = float('inf')
    
    # randomly choose different indexs W to calculate the loss
    for num in range(100):
        # W: dimension x class
        W = np.random.randn(X.shape[1], 10) * 0.0001
        loss = vectorized_loss_softmax(X, y, W)
            
        # scores: class x examples
        scores =  W.T.dot(X.T)
        
        # get the predicted labels
        # y_pred: examples
        y_pred = np.argmax(scores,axis=0)
        
        # calculate the accuracy
        accuracy = np.mean(y_pred==y)*100
        
        # update the best loss
        if loss < best_loss:
            best_loss = loss
            best_W = W
            best_acc = accuracy
        
        # print the result for every iteration
        print("Epoch: %d  Loss: %.3f  Acc: %.3f%%  Best Loss: %.3f  Best Acc: %.3f%%" % (num+1, loss, accuracy, best_loss, best_acc))
    
    # print the final result
    print("\nBest Loss: %.3f  Best Acc: %.3f%%" % (best_loss, best_acc))
    
    return best_W

### 在训练集上进行训练

In [29]:
# train the linear classifier with random search strategy

best_W = random_search(train_img, train_lb)

Epoch: 1  Loss: 2.341  Acc: 9.832%  Best Loss: 2.341  Best Acc: 9.832%
Epoch: 2  Loss: 2.352  Acc: 12.408%  Best Loss: 2.341  Best Acc: 9.832%
Epoch: 3  Loss: 2.341  Acc: 6.072%  Best Loss: 2.341  Best Acc: 9.832%
Epoch: 4  Loss: 2.323  Acc: 7.890%  Best Loss: 2.323  Best Acc: 7.890%
Epoch: 5  Loss: 2.309  Acc: 11.808%  Best Loss: 2.309  Best Acc: 11.808%
Epoch: 6  Loss: 2.325  Acc: 11.418%  Best Loss: 2.309  Best Acc: 11.808%
Epoch: 7  Loss: 2.289  Acc: 14.074%  Best Loss: 2.289  Best Acc: 14.074%
Epoch: 8  Loss: 2.320  Acc: 13.520%  Best Loss: 2.289  Best Acc: 14.074%
Epoch: 9  Loss: 2.339  Acc: 10.670%  Best Loss: 2.289  Best Acc: 14.074%
Epoch: 10  Loss: 2.311  Acc: 11.234%  Best Loss: 2.289  Best Acc: 14.074%
Epoch: 11  Loss: 2.326  Acc: 11.776%  Best Loss: 2.289  Best Acc: 14.074%
Epoch: 12  Loss: 2.328  Acc: 9.422%  Best Loss: 2.289  Best Acc: 14.074%
Epoch: 13  Loss: 2.300  Acc: 10.276%  Best Loss: 2.289  Best Acc: 14.074%
Epoch: 14  Loss: 2.356  Acc: 6.970%  Best Loss: 2.289  

### 在验证集上测试分类的效果

In [27]:
# test the classification accuracy on validation dataset
X=test_img[0:10000]
Y=test_lb[0:10000]
# scores: class x examples
scores =  best_W.T.dot(X.T)

# get the predicted labels
# y_pred: examples
y_pred = np.argmax(scores,axis=0)
# calculate the accuracy
accuracy = np.mean(y_pred==Y)*100

# print the accuracy
print("Random Search在验证集上的分类精度为: %.3f%%" % accuracy)

Random Search在验证集上的分类精度为: 16.740%
