![20221124165432](https://cdn.jsdelivr.net/gh/xihuishawpy/PicBad@main/blogs/pictures/20221124165432.png)

In [7]:
import paddle
from paddle.nn import Linear
import paddle.nn.functional as F

import numpy as np
import os
import random
import gzip
import json

In [8]:
class MINIST(paddle.nn.Layer):
    def __init__(self):
        super(MINIST, self).__init__()
        self.fc = paddle.nn.Linear(784, 1)
        
    def forward(self,input):
        return self.fc(input)

In [9]:
# 加载json数据 
data = json.load(gzip.open('mnist.json.gz'))
train_set, val_set, eval_set = data

## 数据读取与模型训练串行

In [10]:

def load_data(mode='train'):
    # 加载json数据
    data = json.load(gzip.open('mnist.json.gz'))
   
    # 数据读取
    train_set, val_set, eval_set = data
    if mode=='train':
        imgs, labels = train_set[0], train_set[1]
    elif mode=='valid':
        imgs, labels = val_set[0], val_set[1]
    elif mode=='eval':
        imgs, labels = eval_set[0], eval_set[1]
    else:
        raise Exception("只能有这3种读取模式：['train', 'valid', 'eval']")
    print("数据集数量: ", len(imgs))
    
    # 校验数据
    imgs_length = len(imgs)

    assert len(imgs) == len(labels), \
          "样本长度({})=label长度({})".format(len(imgs), len(labels))
    
    # 定义数据集每个数据的序号，根据序号读取数据
    index_list = list(range(imgs_length))
    BATCHSIZE = 100
    
    # 数据生成器
    def data_generator():
        if mode == 'train':
            np.random.shuffle(index_list)
        imgs_list = []
        labels_list = []
        for i in index_list:
            img = np.array(imgs[i]).astype('float32')
            label = np.array(labels[i]).astype('float32')
            imgs_list.append(img) 
            labels_list.append(label)
            if len(imgs_list) == BATCHSIZE:
                # 获得一个batchsize数据，并返回
                # 生成模式，减少内存占用
                yield np.array(imgs_list), np.array(labels_list)
                # 清空数据读取列表
                imgs_list = []
                labels_list = []
    
        # 剩余小于一个batch的数据，单独组成一个batch
        if len(imgs_list) > 0:
            yield np.array(imgs_list), np.array(labels_list)
    return data_generator

In [11]:
def train():
    # 实例化网络，开启训练
    model = MINIST()
    model.train()
    
    # 数据读取
    train_loader = load_data(mode='train')
    opt = paddle.optimizer.SGD(learning_rate=0.001, parameters=model.parameters())
    
    EPOCH = 10 
    for epoch_id in range(EPOCH):
        for batch_id, data in enumerate(train_loader()):
            images ,  labels = data
            images = paddle.to_tensor(images,dtype='float32')
            labels = paddle.to_tensor(labels,dtype='float32')
            
            # 正向传播
            predicts = model.forward(images)
            loss = F.square_error_cost(predicts,label=labels)
            avg_loss = paddle.mean(loss)
            
            if batch_id % 100 == 0:
                print("epoch: {}, batch: {}, loss is: {}".format(epoch_id, batch_id, avg_loss.numpy()))
            
            # 反向传播，更新参数
            avg_loss.backward()
            opt.step()
            opt.clear_grad()
        
    paddle.save(model.state_dict(),'mnist.pdparams')



train()

数据集数量:  50000
epoch: 0, batch: 0, loss is: [24.82596]
epoch: 0, batch: 100, loss is: [8.885684]
epoch: 0, batch: 200, loss is: [8.884466]
epoch: 0, batch: 300, loss is: [10.054378]
epoch: 0, batch: 400, loss is: [10.07171]
epoch: 1, batch: 0, loss is: [7.0039916]
epoch: 1, batch: 100, loss is: [9.610771]
epoch: 1, batch: 200, loss is: [9.0718]
epoch: 1, batch: 300, loss is: [9.122907]
epoch: 1, batch: 400, loss is: [9.951382]
epoch: 2, batch: 0, loss is: [9.5425205]
epoch: 2, batch: 100, loss is: [9.823632]
epoch: 2, batch: 200, loss is: [9.732441]
epoch: 2, batch: 300, loss is: [8.533162]
epoch: 2, batch: 400, loss is: [9.047721]
epoch: 3, batch: 0, loss is: [9.799807]
epoch: 3, batch: 100, loss is: [8.877254]
epoch: 3, batch: 200, loss is: [8.487012]
epoch: 3, batch: 300, loss is: [8.902817]
epoch: 3, batch: 400, loss is: [8.861171]
epoch: 4, batch: 0, loss is: [9.992799]
epoch: 4, batch: 100, loss is: [9.144516]
epoch: 4, batch: 200, loss is: [8.174755]
epoch: 4, batch: 300, loss is

![20221125101603](https://cdn.jsdelivr.net/gh/xihuishawpy/PicBad@main/blogs/pictures/20221125101603.png)

## 异步数据读取

数据读取和模型训练**并行**。读取到的数据不断的放入缓存区，无需等待模型训练就可以启动下一轮数据读取。当模型训练完一个批次后，不用等待数据读取过程，直接从缓存区获得下一批次数据进行训练，从而加快了数据读取速度。

异步队列：数据读取和模型训练交互的仓库，二者均可以从仓库中读取数据，它的存在使得两者的工作节奏可以解耦。

In [13]:

# 继承paddle.io.Dataset类的数据读取器
class MnistDataset(paddle.io.Dataset):

    # 实现初始化方法，在初始化的时候将数据读载入
    def __init__(self,mode):
        data = json.load(gzip.open('mnist.json.gz'))
        # 数据读取
        train_set, val_set, eval_set = data
        if mode=='train':
            imgs, labels = train_set[0], train_set[1]
        elif mode=='valid':
            imgs, labels = val_set[0], val_set[1]
        elif mode=='eval':
            imgs, labels = eval_set[0], eval_set[1]
        else:
            raise Exception("只能有这3种读取模式：['train', 'valid', 'eval']")
        print("数据集数量: ", len(imgs))
        
        # 校验数据
        imgs_length = len(imgs)
        assert len(imgs) == len(labels), \
            "样本长度({})=label长度({})".format(len(imgs), len(labels))
            
        self.imgs = imgs
        self.labels = labels

    # __getitem__() 该方法定义用索引(0 到 len(self))获取一条数据或一个样本
    def __getitem__(self,idx):
        img = np.array(self.imgs[idx]).astype('float32')
        label = np.array(self.labels[idx]).astype('float32')
        return img ,label
     
    # 返回数据集的总长度
    def __len__(self):
        return len(self.imgs)



In [14]:
# 实例化一个对象访问 定义好的数据集
train_dataset = MnistDataset(mode='train')

# 用paddle.io.DataLoader 定义DataLoader对象用于加载Python生成器产生的数据
# DataLoader 返回的是一个批次数据迭代器，并且是异步的
data_loader = paddle.io.DataLoader(train_dataset , batch_size=100, shuffle= True)

数据集数量:  50000


In [18]:
def train():
    model = MINIST()
    model.train
    opt = paddle.optimizer.SGD(learning_rate=0.001, parameters=model.parameters())

    EPOCH = 10
    for epoch_id in range(EPOCH):
        for batch_id, data in enumerate(data_loader()):     
            images , labels = data
            images = paddle.to_tensor(images,dtype ='float32')
            labels = paddle.to_tensor(labels, dtype='float32') 
            
            predicts = model(images)
            loss = F.square_error_cost(predicts, labels)
            avg_loss = paddle.mean(loss)
            
            if batch_id % 100 == 0:
                print("epoch id = %d, batch id = %d, loss = %f" % (epoch_id, batch_id, avg_loss.numpy()))
            
            avg_loss.backward()
            opt.step()
            opt.clear_grad()
            
    paddle.save(model.state_dict() , 'minist.pdparams')
    
                        
train()

epoch id = 0, batch id = 0, loss = 31.080965
epoch id = 0, batch id = 100, loss = 11.579112
epoch id = 0, batch id = 200, loss = 9.516181
epoch id = 0, batch id = 300, loss = 8.596170
epoch id = 0, batch id = 400, loss = 10.533883
epoch id = 1, batch id = 0, loss = 10.032254
epoch id = 1, batch id = 100, loss = 9.104719
epoch id = 1, batch id = 200, loss = 9.645576
epoch id = 1, batch id = 300, loss = 8.424115
epoch id = 1, batch id = 400, loss = 10.812142
epoch id = 2, batch id = 0, loss = 8.437902
epoch id = 2, batch id = 100, loss = 9.696856
epoch id = 2, batch id = 200, loss = 9.246710
epoch id = 2, batch id = 300, loss = 8.111835
epoch id = 2, batch id = 400, loss = 9.740843
epoch id = 3, batch id = 0, loss = 9.318897
epoch id = 3, batch id = 100, loss = 9.657000
epoch id = 3, batch id = 200, loss = 8.939503
epoch id = 3, batch id = 300, loss = 10.104265
epoch id = 3, batch id = 400, loss = 9.695880
epoch id = 4, batch id = 0, loss = 8.546838
epoch id = 4, batch id = 100, loss = 9