# 实验一 最近邻分类器

将下面的代码补充完整，实现使用最近邻分类器功能

## 读取MNIST数据集，并将其划分为train/val/test数据集

### MNIST数据集读取

In [9]:
import gzip
import os
import struct
import numpy as np

def load_mnist(path, kind='./MNIST/train'):
    """加载MNIST数据集"""
    labels_path = os.path.join(path, f'{kind}-labels-idx1-ubyte.gz')
    images_path = os.path.join(path, f'{kind}-images-idx3-ubyte.gz')

    with gzip.open(labels_path, 'rb') as lbpath:
        struct.unpack('>II', lbpath.read(8))
        labels = np.frombuffer(lbpath.read(), dtype=np.uint8)

    with gzip.open(images_path, 'rb') as imgpath:
        struct.unpack('>IIII', imgpath.read(16))
        images = np.frombuffer(imgpath.read(), dtype=np.uint8).reshape(len(labels), 784)

    return images, labels



# 数据集划分
def data_split(images, labels, ratio):
    
    total_len = images.shape[0]
    offset = int(total_len * ratio)
    
    val_img = images[:offset][:]
    val_lb = labels[:offset]
    
    train_img = images[offset:][:]
    train_lb = labels[offset:]
    
    return train_img, train_lb, val_img, val_lb   


### 按照 5:1:1 划分为训练集，验证集，测试集


In [10]:
# 读取训练集和测试集数据
[images, labels] = load_mnist('./MNIST', kind='train')
[test_img, test_lb] = load_mnist('./MNIST',kind='test')
train_img, train_lb, val_img, val_lb = data_split(images, labels, 1/6)


# 打印查看数据集格式
print('训练集图像格式为:', train_img.shape, '训练集标签格式为:', train_lb.shape)
print('验证集图像格式为:', val_img.shape, '验证集标签格式为:', val_lb.shape)
print('测试集图像格式为:', test_img.shape, '测试集标签格式为:', test_lb.shape)

训练集图像格式为: (50000, 784) 训练集标签格式为: (50000,)
验证集图像格式为: (10000, 784) 验证集标签格式为: (10000,)
测试集图像格式为: (10000, 784) 测试集标签格式为: (10000,)


## Nearest Neighbor Classifier

### 定义 Nearest Neighbor Classifier

In [11]:
import numpy as np

class NearestNeighbor(object):
    def __init__(self):
        pass
    
    # 读入图像和标签数据并保存
    def train(self, X, y):
        # X is N x D and Y is 1-dimension of size N
        self.Xtrain = X
        self.ytrain = y
    
    # 按照最近邻原则分类
    def predict(self, X):
        # X is N x D where each row is an example we wish to predict label for
        num_test = X.shape[0]
        Ypred = np.zeros((num_test, 1), dtype=self.ytrain.dtype)
        
        for i in range(num_test):
            distances=np.sum(np.abs(self.Xtrain-X[i,:]),axis=1)
            min_index=np.argmin(distances)
            Ypred[i]=self.ytrain[min_index]







        return Ypred



### 测试 Nearest Neighbor Classifier

In [14]:
# 实例化Nearest Neighbor Classifier
nn = NearestNeighbor()
nn.train(train_img[0:1000],train_lb[0:1000])#选1000个数据，进行测试
Yval_predict = nn.predict(val_img)
cnt=0
for i in range(len(Yval_predict)):
    if Yval_predict[i]==val_lb[i]:    
        cnt+=1
print(cnt)
# 在验证集上测试精确度
validation_accuracy = cnt/len(Yval_predict)
print ('accuracy: %f' % (validation_accuracy ,))

2448
accuracy: 0.244800
