In [3]:
import os
import math
import pickle
import struct
import numpy as np
import matplotlib.pyplot as plt

In [4]:
def decode_labels(file):
    with open(file, 'rb') as f:
        binary_data = f.read()
    _, num_items = struct.unpack_from('>II', binary_data, 0)
    labels       = struct.unpack_from('B'*num_items, binary_data, 8)
    return np.array(labels).reshape(-1, 1).astype(np.int)

def decode_images(file):
    with open(file, 'rb') as f:
        binary_data = f.read()
    _,num_images, rows, cols = struct.unpack_from('>IIII', binary_data, 0)
    images                   = struct.unpack_from('B'*(num_images*rows*cols), binary_data, 16)
    return np.array(images).reshape(-1, rows*cols)

def one_hot(y, num_classes):
    rows   = y.shape[0]
    output = np.zeros((rows, num_classes), np.uint8)
    for i in range(rows):
        output[i, y[i]] = 1 # y的值是几，就把1更新到第几列，行号取决于第几个出现
    return output

def norm_image(image):  # 将图像进行标准化
    return (image / 255 - 0.5).astype(np.float32)

y = np.array([1,3,6,7,2,5,8])[...,None]
# one_hot(y, 10)

In [5]:
class DataSet:
    def __init__(self, image_file, label_file, num_classes=10):
        self.images      = decode_images(image_file)
        self.labels      = decode_labels(label_file)
        self.onehot      = one_hot(self.labels, num_classes)
        self.num_classes = num_classes
    
    def __getitem__(self, index):
        '''重载索引器'''
        return self.images[index], self.labels[index], self.onehot[index]
    
    def __len__(self):
        return len(self.images)
    
class DataLoader:
    def __init__(self, dataset, batch_size, shuffle=True):
        self.dataset    = dataset
        self.shuffle    = shuffle
        self.count      = len(dataset)
        self.batch_size =  batch_size
    
    def __iter__(self):
        return DataLoaderIterator(self)
    
    def __len__(self):
        '''batch count'''
        return math.ceil(len(self.dataset) / self.batch_size)
    
class DataLoaderIterator:
    def __init__(self, dataloader):
        self.dataloader = dataloader
        self.cursor     = 0
        self.indexs     = list(range(self.dataloader.count))
        if self.dataloader.shuffle:
            np.random.shuffle(self.indexs)
        
    def __next__(self):
        if self.cursor >= self.dataloader.count:
            raise StopIteration()
        
        output    = []
        one_batch = min(self.dataloader.batch_size, self.dataloader.count - self.cursor)
        for _ in range(one_batch):
            index = self.indexs[self.cursor]
            data  = self.dataloader.dataset[index]
            output.append(data)
            self.cursor += 1
#         output = np.split(output, [1,2], 1)# list(range(1, len(data)-1)), 1)
        output = list(zip(*output))
        for i in range(len(output)):
            output[i] = np.vstack(output[i])
        return output

In [6]:
# 测试DataLoader
dataset = DataSet("../stage_1/data/mnist/t10k-images-idx3-ubyte", "../stage_1/data/mnist/t10k-labels-idx1-ubyte")
loader = DataLoader(dataset, 256)
print('len(loader) = ', len(loader))
# for ibatch, (images, labels, targets) in enumerate(loader):
#     print(ibatch, images.shape, labels.shape, targets.shape)

len(loader) =  40


In [7]:
# Train

batch_size  = 256
num_hidden  = 256   # 做一个隐层，取256个特征值
num_classes = 10    # 分类的数量
# num_feature = 784   
num_feature = 676   # 3x3 kernel (28-2)^2

train_dataset = DataSet("../stage_1/data/mnist/train-images-idx3-ubyte", "../stage_1/data/mnist/train-labels-idx1-ubyte", num_classes)
train_loader = DataLoader(train_dataset, batch_size, shuffle=True)
test_dataset = DataSet("../stage_1/data/mnist/t10k-images-idx3-ubyte", "../stage_1/data/mnist/t10k-labels-idx1-ubyte", num_classes)
test_loader = DataLoader(test_dataset, 1024, shuffle=True)

In [8]:

def gemm_im2col(tensor, kernel):
#     e.g. tensor:(1024, 28, 28), kernel:(1, 9)
#     print(f'tensor:{tensor.shape}, kernel:{kernel.shape}')
#     _, kh, kw = kernel.shape
    kh, kw  = 3, 3
    c, h, w = tensor.shape
    ksize = kw * kh
    s = (w - kw + 1) * (h - kh + 1)  # s:单图列数， ksize: 行数， c:图片张数  s*c=总列数
#     column = np.zeros((1, kh * kw * c, s))
    column = np.zeros((ksize, s * c))
    col_kernel = kernel.reshape(1, -1) 
    half_kx = kw // 2
    half_ky = kh // 2

    for ic in range(c):
        for iy in range(half_ky, h - half_ky):
            for ix in range(half_kx, w - half_kx):
                for iky in range(kh):
                    col_y = 0
                    for ikx in range(kw):
                        pixel_value = tensor[ic, iy - half_ky + iky, ix - half_kx + ikx]
                        col_x = ic * ksize + (ix-half_kx) + (iy-half_ky)
                        col_y = ikx + iky * kw
                        column[col_y, col_x] = pixel_value
#   e.g. output = (1, 9) x (9, 173056)
#   print(f'output = {col_kernel.shape} x {column.shape}')
    output = col_kernel @ column
    return output, column

def gemm_col2im_index(row, col, ctr, k_size=3):
    '''
        row: 行索引（其实就是在kernel形状内移动几次）
        col: 列索引
        ctr: kernel窗口在图片内移动多少次换行
        k_size: kernel核大小，3x3即传3

        左上角=(col//ctr, col%ctr)
        行偏移：row // ksize
        列偏移：row % ksize
    
    '''
    start = (col // ctr, col % ctr)
    rowindex = start[0] + row // k_size
    colindex = start[1] + row % k_size
    return (rowindex, colindex)

def gemm_col2im(column, imgcols, ctr, ker_size, outimage):
    '''
        imgcols: column化的图片的列数，如784, 676
        ctr: kernel能在一张图片里横向滑动的次数
        ker_size: 比如3
    '''
    height, width = column.shape
    for row in range(height):     # kernel元素个数决定行数
        for col in range(width):  # 28*28 -> 26*26 -> 决定列数
            batch     = col // imgcols
            c         = col % imgcols  # 即每26*26换了一张图，列索引归零
            row_index, col_index = gemm_col2im_index(row, c, ctr, ker_size)
            outimage[batch, row_index, col_index] += column[row, col]
    return outimage

def lr_cosine_schedule(lr_min, lr_max, epochs):
    '''
        :param epochs: total epochs are performed before a new restart.
        :param epoch: How many epochs have been performed since the last restart.
        :return: a function to compute a value within a period.
    '''

    def compute(epoch):
        return lr_min + 0.5 * (lr_max - lr_min) * (1 + np.cos(epoch / epochs * np.pi))

    return compute

- Xavier初始化公式xavier normal为$(0, std^2)$：$std = gain \times \sqrt{\frac{2}{fan\_in + fan\_out}}$
    - fan_in为输入的数据量，fan_out为输出的数据量
    - Linear中，就是输入通道数和输出通道数
    - Conv中，则需要计算通道与宽高的乘积
- Xavier考虑的非线性激活函数为：TanH

In [9]:
class Module:
    def __init__(self):
        self.train_mode = True
        
    def __call__(self, *args):
        return self.forward(*args)
    
    def train(self):
        self.train_mode = True
        for m in self.modules():
            m.train()
    
    def eval(self):
        self.train_mode = False
        for m in self.modules():
            m.eval()
    
    def modules(self):
        ms = []
        # 反射
        for attr in self.__dict__:
            m = self.__dict__[attr]
            if isinstance(m, Module):
                ms.append(m)
        return ms
    
    def params(self):
        ps = []
        for attr in self.__dict__:
            p = self.__dict__[attr]
            if isinstance(p, Parameter):
                ps.append(p)
            
        ms = self.modules()
        for m in ms:
            ps.extend(m.params())
        return ps
    
    def info(self, n):
        ms = self.modules()
        name = self.__class__.__name__
        output = f"{name}\n"
        for m in ms:
            output += ('  '*(n+1)) + f"{m.info(n+1)}\n"
        return output[:-1]
    
    def __repr__(self):
        return self.info(0)
    
class Parameter:
    def __init__(self, data):
        self.data = data
        self.grad = np.zeros(data.shape)
        
    def zero_grad(self):
        self.grad[...] = 0
        
class Linear(Module):
    def __init__(self, in_number, out_number):
        super().__init__()
        self.weight   = Parameter(np.random.normal(0, np.sqrt(2/(in_number+out_number)), size=(in_number, out_number)))
        self.bias     = Parameter(np.zeros(out_number))
        
    def forward(self, x):
        self.x = x
        return x @ self.weight.data + self.bias.data
    
    def backward(self, g):
        '''
        g是上层的gradient
        这里有x, weight, bias三个变量，所以要分别提供对三者的求导
        其中weight, bias是参数，求出来后直接消费掉
        对x变量的求导要传给下一层
        '''
        # 对g补齐对weight的链式求导
        self.weight.grad += self.x.T @ g
        # 对g补齐对bias的链式求导
        self.bias.grad   += np.sum(g, axis=0)
        # 用修正后的参数继续对下层返对x的导数
        return (g @ self.weight.data.T).reshape(g.shape[0], -1)
    
class Sigmoid(Module):
    def __init__(self):
        super().__init__()
    
    def sigmoid_impl(self, x):
        return 1 / (1 + np.exp(-x))
    
    def forward(self, x):
        self.x = x
        return self.sigmoid_impl(x)
    
    def backward(self, g):
        return g * self.sigmoid_impl(self.x) * (1 - self.sigmoid_impl(self.x))

class ReLU(Module):
    def __init__(self, inplace=True):
        super().__init__()
        self.inplace = inplace

    def forward(self, x):
        self.x_negative = x < 0
        if not self.inplace:
            x = x.copy()

        x[self.x_negative] = 0  # boolean indexing
        return x

    def backward(self, G):
        if not self.inplace:
            G = G.copy()

        G[self.x_negative] = 0
        return G

class Dropout(Module):
    '''
    通过对输入的参数值乘以采用指定概率的伯努利分布随机数，
    实现随机将部分值设置为0，同时其导数也会为0，称之为失活
    '''
    def __init__(self, prob_keep=0.5, inplace=True):
        super().__init__()
        self.prob_keep = prob_keep
        self.inplace   = inplace
    
    def forward(self, x):
        if not self.train_mode:
            return x
        if not self.inplace:
            x          = x.copy()
        self.mask      = np.random.binomial(1, 1-self.prob_keep, x.shape)
        x[self.mask]   = 0         # fancing boolean index
        x *= 1 / self.prob_keep    # 一部分x置零了，剩下的部分就把值扩大相应倍数
        return x
        
    def backward(self, g):
        if not self.inplace:
            g = g.copy()
        g[self.mask] = 0
        g *= 1 / self.prob_keep
        return g

class Conv2d(Module):
    def __init__(self):
        super().__init__()
        
    def forward(self, x):
        self.k_size = 3
        # 一张图片一个输出就一行参数，用这一行参数训练一组（比如256张）图片
        self.weight = Parameter(np.random.randn(1, self.k_size * self.k_size)) # 3x3
        self.bias   = Parameter(np.zeros((1, 1)))
        
        # 直接传图片进来就不费这些事了，先不改
        rows, cols = x.shape  # e.g. (256, 764)
        w = h  = int(np.sqrt(cols))
        # 28*28 -> (1, 28, 28), -> (256, 28, 28)
        tensor = np.array([])
        for row in x:
            image   = row.reshape((w, h))[None] # 再外包一层，以便于竖向拼接
            if len(tensor) == 0:
                tensor = image
            else:
                tensor = np.vstack([tensor, image])
        kernel   = self.weight.data
        conv_img, col_img = gemm_im2col(tensor, kernel)   # img2col  <- col_kernel @ col_image
        self.x   = col_img
        return (conv_img + self.bias.data).reshape(rows, -1)

    def backward(self, g):
            '''
                tensor:(256, 28, 28), kernel:(1, 9)
                output = w @ x = (1, 9) x (9, 173056) .reshape(256, -1) = (256, 676)

                d(w @ x)/dw => g @ x.T
                g = (256, 676) -> reshape(1, -1) -> (1, 173056)
                x = (9, 173056) -> transpose
                -> (1, 173056) @ (173056, 9) => (1, 9)

                d(w @ x)/dx  => w.T @ g
                g = (256, 676) -> （1, 173056）
                w = (1, 9) transpose
                -> (9, 1) @ (1, 173056) => (9, 17...) => col2im => (256, 28, 28)
                -> reshape(256, -1) => (256, 784)
            '''
            # d(w @ x) / dw
            self.weight.grad += g.reshape(1, -1) @ self.x.T
            self.bias.grad   += np.sum(g)

            # d(w @ x) / dx
            column     = self.weight.data.T @ g.reshape(1, -1)
            batchs, imgcols = g.shape  # (256, 676)

            ctr = 26 # <= 28 - 3 + 1 # image shape - kernel shape + 1
            ker = 3  # <= kernel size
            g_out = np.zeros((batchs, 28, 28))
            g_out = gemm_col2im(column, imgcols, ctr, ker, g_out)
            return g_out.reshape(256, -1)
    
# 包含了softmax操作和loss计算，返回的是loss
class SoftmaxCrossEntropyLoss(Module):  # 定义损失函数
    def __init__(self):
        super().__init__()
    
    def softmax(self, predict):  # 定义softmax的计算方法
        exp_predict = np.exp(predict)
        total = np.sum(exp_predict, axis=1, keepdims=True)
        return exp_predict / total
    
    def forward(self, x, y):  # 前向传播
        '''
        比二元交叉熵少半截，即(1-y) * log(1-y)的部分
        '''
        self.batch_size  = len(x)
        self.probability = self.softmax(x)
        self.y           = y
        return -np.sum(y * np.log(self.probability)) / self.batch_size
    
    def backward(self, g=1):  # 反向传播
        '''自己推推试试'''
        g = g * self.probability
        return (g - self.y) / self.batch_size
     
class Sequencial(Module):
    def __init__(self, *args):
        super().__init__()
        self.items = list(args)
        
    def modules(self):
        return self.items
    
    def forward(self, x):
        for m in self.items:
            x = m(x)
        return x
    
    def backward(self, G):
        for item in self.items[::-1]:
            G = item.backward(G)
        return G
    
class Network(Module):
    def __init__(self, num_feature, num_hidden, num_classes):
        super().__init__()
        self.layers = Sequencial(
            Conv2d(), # 先写死，一个3x3kernel，一个输出256x(26x26)，不padding
#             Sigmoid(),
            ReLU(),
            Linear(num_feature, num_classes)
        )
        self.lossfn = SoftmaxCrossEntropyLoss()
        
#     def __init__(self, num_feature, num_hidden, num_classes):
#         super().__init__()
#         num_feature = 784
#         self.layers = Sequencial(
#             Linear(num_feature, num_hidden),
#             Sigmoid(),
#             Dropout(0.75),
#             Linear(num_hidden, num_classes)
#         )
#         self.lossfn = SoftmaxCrossEntropyLoss()
    
    def forward(self, x):
        return self.layers(x)
    
    def loss(self, x, y):
        return self.lossfn(self(x), y)
    
    def backward(self):
        g = self.lossfn.backward()
        self.layers.backward(g)

In [10]:

class Optimizer:
    def __init__(self, params, lr):
        self.lr = lr
        self.params = params

    def zero_grad(self):
        for param in self.params:
            param.zero_grad()
            
    def set_lr(self, lr):
        self.lr = lr
        
        
class SGD(Optimizer):
    def __init__(self, params, lr=1e-3):
        super().__init__(params, lr)
    
    def step(self):
        for param in self.params:
            param.data -= self.lr * param.grad
            
            
class SGDMomentum(Optimizer):
    def __init__(self, params, lr=1e-3, momentum=0.9):
        super().__init__(params, lr)
        self.momentum = momentum
        
        for param in self.params:
            param.v = 0
    
    # 移动平均
    def step(self):
        for param in self.params:
            param.v = self.momentum * param.v - self.lr * param.grad
            param.data += param.v
            
            
class AdamW(Optimizer):
    def __init__(self, params, lr=1e-3, beta1=0.9, beta2=0.999, l2_regularization = 0):
        super().__init__(params, lr)
        self.beta1 = beta1
        self.beta2 = beta2
        self.l2_regularization = l2_regularization
        self.t = 0
        
        for param in self.params:
            param.m = 0
            param.v = 0
            
    # 指数移动平均
    def step(self):
        eps = 1e-8
        self.t += 1
        for param in self.params:
            g = param.grad + self.l2_regularization * param.data
            param.m = self.beta1 * param.m + (1 - self.beta1) * g
            param.v = self.beta2 * param.v + (1 - self.beta2) * g ** 2
            mt_ = param.m / (1 - self.beta1 ** self.t)
            vt_ = param.v / (1 - self.beta2 ** self.t)
            param.data -= self.lr * mt_ / (np.sqrt(vt_) + eps)
            
            
class Adam(Optimizer):
    def __init__(self, params, lr=1e-3, beta1=0.9, beta2=0.999, l2_regularization = 0):
        super().__init__(params, lr)
        self.beta1 = beta1
        self.beta2 = beta2
        self.l2_regularization = l2_regularization
        self.t = 0
        
        for param in self.params:
            param.m = 0
            param.v = 0
            
    # 指数移动平均
    def step(self):
        eps = 1e-8
        self.t += 1
        for param in self.params:
            g = param.grad
            param.m = self.beta1 * param.m + (1 - self.beta1) * g
            param.v = self.beta2 * param.v + (1 - self.beta2) * g ** 2
            mt_ = param.m / (1 - self.beta1 ** self.t)
            vt_ = param.v / (1 - self.beta2 ** self.t)
            param.data -= self.lr * mt_ / (np.sqrt(vt_) + eps) + self.l2_regularization * param.data

def save_model(file, model):
    
    # 改成只保存parameter
    with open(file, "wb") as f:
        f.write(pickle.dumps(model))
        
def load_model(file):
    
    # 改成只保存parameter
    with open(file, "rb") as f:
        return pickle.loads(f.read())

In [None]:
train_round    = len(train_loader)  # 每一次训练要几轮才做完
train_epochs   = 10                 # 共做几次训练
# lr_schedule    = {0:1e-4, 9:1e-4}   # 学习率策略
lr_schedule    = lr_cosine_schedule(1e-4, 1e-2, train_epochs)
network        = Network(num_feature, num_hidden, num_classes)
optim          = AdamW(network.params(), 1e-3, l2_regularization=1e-4)

print(network)
for epoch_index in range(train_epochs):
    optim.set_lr(lr_schedule(epoch_index))
    
    # dataloader已经实现了按batch_size输出
    for round_index, (t_imgs, t_lbls, t_onehots) in enumerate(train_loader):
        niter   = epoch_index * train_round + round_index
        n_imgs  = norm_image(t_imgs)
        loss    = network.loss(n_imgs, t_onehots)
        
        optim.zero_grad()
        network.backward()
        optim.step()
        
        # 每训练100张图片计算一次精度
        if niter % 100 == 0 and niter > 0:
            progress    = epoch_index + round_index / train_round
            correct     = 0
            for ts_imgs, ts_lbls, _ in test_loader:
                p       = network(norm_image(ts_imgs))
                labels  = np.argmax(p, axis=1)
                correct = correct + (labels == ts_lbls[:,0]).sum()
            accuracy    = correct / len(test_loader.dataset)
            print(f'Iter: {niter:04d}, Epoch: {progress:.2f}/{train_epochs}, LR: {optim.lr:.6f}, Loss: {loss:.6f}, Test Accuracy: {accuracy}')
            
#     save_model('model_0508', network)        

Network
  Sequencial
    Conv2d
    ReLU
    Linear
  SoftmaxCrossEntropyLoss
Iter: 0100, Epoch: 0.43/10, LR: 0.010000, Loss: 2.427119, Test Accuracy: 0.1137
Iter: 0200, Epoch: 0.85/10, LR: 0.010000, Loss: 2.458883, Test Accuracy: 0.1135
Iter: 0300, Epoch: 1.28/10, LR: 0.009758, Loss: 2.414146, Test Accuracy: 0.1139
Iter: 0400, Epoch: 1.70/10, LR: 0.009758, Loss: 2.365527, Test Accuracy: 0.1136
Iter: 0500, Epoch: 2.13/10, LR: 0.009055, Loss: 2.395529, Test Accuracy: 0.113
Iter: 0600, Epoch: 2.55/10, LR: 0.009055, Loss: 2.306278, Test Accuracy: 0.1128
Iter: 0700, Epoch: 2.98/10, LR: 0.009055, Loss: 2.428323, Test Accuracy: 0.1135
Iter: 0800, Epoch: 3.40/10, LR: 0.007960, Loss: 2.300434, Test Accuracy: 0.113
Iter: 0900, Epoch: 3.83/10, LR: 0.007960, Loss: 2.301428, Test Accuracy: 0.1133
Iter: 1000, Epoch: 4.26/10, LR: 0.006580, Loss: 2.318766, Test Accuracy: 0.1133
Iter: 1100, Epoch: 4.68/10, LR: 0.006580, Loss: 2.332414, Test Accuracy: 0.1134
Iter: 1200, Epoch: 5.11/10, LR: 0.005050, Lo

In [None]:
# 预测

# network = load_model("model_0508")
show_rows = 5 # 五行
show_cols = 5 # 五列
select_count = show_rows * show_cols
select_iter = iter(test_loader)
test_batch_images, test_batch_labels, test_batch_onehots = next(select_iter)

predict        = network(norm_image(test_batch_images))  # 进行网络预测
predict_labels = predict.argmax(axis=1)  # 获取预测的标签值

plt.figure(figsize=(show_rows * 5, show_cols * 5))  # 定义画板的大小
for i in range(select_count):  # 显示图片和标题

    plt.subplot(show_rows, show_cols, i+1)

    predict_label      = predict_labels[i]
    ground_truth_label = test_batch_labels[i, 0]
    image              = test_batch_images[i].reshape(28, 28)

    plt.title(f"Predict: {predict_label}, GroundTruth: {ground_truth_label}")
    plt.imshow(image)