## 1. 线性回归demo

In [2]:
import paddle
import numpy as np
paddle.__version__

'2.0.2'

In [3]:
# 定义训练和测试数据
# 我们使用numpy定义一组数据，这组数据的每一条数据有13个
# 这组数据符合y = 2 * x + 1，但是程序是不知道的，我们之后使用这组数据进行训练，看看强大的神经网络是否能够训练出一个拟合这个函数的模型
x_data = np.array([[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], 
                   [2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], 
                   [3.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], 
                   [4.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], 
                   [5.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]).astype('float32')
y_data = np.array([[3.0], [5.0], [7.0], [9.0], [11.0]]).astype('float32')
test_data = np.array([[6.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]).astype('float32')

In [4]:
# 定义一个简单的线性网络
net = paddle.nn.Sequential(
    paddle.nn.Linear(13, 100),
    paddle.nn.ReLU(),
    paddle.nn.Linear(100, 1)
)

In [5]:
# 定义优化方法
optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=net.parameters())

In [6]:
# 将numpy类型数据转换成tensor之后才能用于模型训练
inputs = paddle.to_tensor(x_data)
labels = paddle.to_tensor(y_data)

# 开始训练100个pass
for pass_id in range(10):
    out = net(inputs)
    loss = paddle.mean(paddle.nn.functional.square_error_cost(out, labels))

    loss.backward()
    optimizer.step()
    optimizer.clear_grad()

    print("Pass:%d, Cost:%0.5f" % (pass_id, loss))

Pass:0, Cost:59.02511
Pass:1, Cost:24.60351
Pass:2, Cost:6.71588
Pass:3, Cost:0.43965
Pass:4, Cost:0.02129
Pass:5, Cost:0.02036
Pass:6, Cost:0.01993
Pass:7, Cost:0.01952
Pass:8, Cost:0.01911
Pass:9, Cost:0.01871


In [7]:
# 开始预测
predict_inputs = paddle.to_tensor(test_data)
result = net(predict_inputs)

print("当x为6.0时，y为：%0.5f" % result)

当x为6.0时，y为：13.20984


## 2. 图像分类之美食分类demo

In [17]:
"""
官方URL: https://aistudio.baidu.com/aistudio/projectdetail/1952220
"""
import os
import zipfile
import random
import json
import paddle
import paddle.vision.transforms as T
import sys
import numpy as np
from PIL import Image
from PIL import ImageEnhance
import paddle
import matplotlib.pyplot as plt

In [13]:
# 参数配置
train_parameters = {
    "input_size": [3, 64, 64],                                          #输入图片的shape
    "class_dim": -1,                                                      #分类数
    "src_path":"data/data42610/foods.zip",                    #原始数据集路径
    "target_path":"/home/aistudio/data/",                     #要解压的路径
    "train_list_path": "/home/aistudio/data/train.txt",      #train.txt路径
    "eval_list_path": "/home/aistudio/data/eval.txt",        #eval.txt路径
    "readme_path": "/home/aistudio/data/readme.json",  #readme.json路径
    "label_dict":{},                                                       #标签字典
    "num_epochs": 2,                                                  #训练轮数
    "train_batch_size": 64,                                           #训练时每个批次的大小
    "learning_strategy": {                                             #优化函数相关的配置
        "lr": 0.01                                                          #超参数学习率
    } 
}
print(paddle.__version__)

2.0.2


### 2.1 数据准备

In [14]:
def unzip_data(src_path,target_path):
    '''
    解压原始数据集，将src_path路径下的zip包解压至target_path目录下
    '''
    if(not os.path.isdir(target_path + "foods")):     
        z = zipfile.ZipFile(src_path, 'r')
        z.extractall(path=target_path)
        z.close()

In [15]:
def get_data_list(target_path,train_list_path,eval_list_path):
    '''
    生成数据列表
    '''
    #存放所有类别的信息
    class_detail = []
    #获取所有类别保存的文件夹名称
    data_list_path=target_path+"foods/"
    class_dirs = os.listdir(data_list_path)  
    #总的图像数量
    all_class_images = 0
    #存放类别标签
    class_label=0
    #存放类别数目
    class_dim = 0
    #存储要写进eval.txt和train.txt中的内容
    trainer_list=[]
    eval_list=[]
    #读取每个类别
    for class_dir in class_dirs:
        if class_dir != ".DS_Store":
            class_dim += 1
            #每个类别的信息
            class_detail_list = {}
            eval_sum = 0
            trainer_sum = 0
            #统计每个类别有多少张图片
            class_sum = 0
            #获取类别路径 
            path = data_list_path  + class_dir
            # 获取所有图片
            img_paths = os.listdir(path)
            for img_path in img_paths:                                  # 遍历文件夹下的每个图片
                name_path = path + '/' + img_path                       # 每张图片的路径
                if class_sum % 8 == 0:                                  # 每8张图片取一个做验证数据
                    eval_sum += 1                                       # test_sum为测试数据的数目
                    eval_list.append(name_path + "\t%d" % class_label + "\n")
                else:
                    trainer_sum += 1 
                    trainer_list.append(name_path + "\t%d" % class_label + "\n")#trainer_sum测试数据的数目
                class_sum += 1                                          #每类图片的数目
                all_class_images += 1                                   #所有类图片的数目
             
            # 说明的json文件的class_detail数据
            class_detail_list['class_name'] = class_dir             #类别名称
            class_detail_list['class_label'] = class_label          #类别标签
            class_detail_list['class_eval_images'] = eval_sum       #该类数据的测试集数目
            class_detail_list['class_trainer_images'] = trainer_sum #该类数据的训练集数目
            class_detail.append(class_detail_list)  
            #初始化标签列表
            train_parameters['label_dict'][str(class_label)] = class_dir
            class_label += 1 
            
    #初始化分类数
    train_parameters['class_dim'] = class_dim
    
    #乱序  
    random.shuffle(eval_list)
    with open(eval_list_path, 'a') as f:
        for eval_image in eval_list:
            f.write(eval_image) 
            
    random.shuffle(trainer_list)
    with open(train_list_path, 'a') as f2:
        for train_image in trainer_list:
            f2.write(train_image) 

    # 说明的json文件信息
    readjson = {}
    readjson['all_class_name'] = data_list_path                  #文件父目录
    readjson['all_class_images'] = all_class_images
    readjson['class_detail'] = class_detail
    jsons = json.dumps(readjson, sort_keys=True, indent=4, separators=(',', ': '))
    with open(train_parameters['readme_path'],'w') as f:
        f.write(jsons)
    print ('生成数据列表完成！')

In [16]:
'''
参数初始化
'''
src_path=train_parameters['src_path']
target_path=train_parameters['target_path']
train_list_path=train_parameters['train_list_path']
eval_list_path=train_parameters['eval_list_path']
batch_size=train_parameters['train_batch_size']

'''
解压原始数据到指定路径
'''
unzip_data(src_path,target_path)

'''
划分训练集与验证集，乱序，生成数据列表
'''
#每次生成数据列表前，首先清空train.txt和eval.txt
with open(train_list_path, 'w') as f: 
    f.seek(0)
    f.truncate() 
with open(eval_list_path, 'w') as f: 
    f.seek(0)
    f.truncate() 
    
#生成数据列表   
get_data_list(target_path,train_list_path,eval_list_path)

FileNotFoundError: [Errno 2] No such file or directory: 'data/data42610/foods.zip'

In [18]:
class FoodDataset(paddle.io.Dataset):
    """
    5类food数据集类的定义
    """
    def __init__(self, mode='train'):
        """
        初始化函数
        """
        self.data = []
        with open('data/{}.txt'.format(mode)) as f:
            for line in f.readlines():
                info = line.strip().split('\t')
                if len(info) > 0:
                    self.data.append([info[0].strip(), info[1].strip()])
        self.transforms = T.Compose([
            T.Resize((64, 64)),    # 图片缩放
            T.ToTensor(),                       # 数据的格式转换和标准化、 HWC => CHW            
            T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
            ])        
    def __getitem__(self, index):
        """
        根据索引获取单个样本
        """
        image_file, label = self.data[index]
        image = Image.open(image_file)
        if image.mode != 'RGB':
            image = image.convert('RGB')
            image = self.transforms(image)
        return image, np.array(label, dtype='int64')
    def __len__(self):
        """
        获取样本总数
        """
        return len(self.data)

In [19]:
'''
构造数据提供器
'''
train_dataset = FoodDataset(mode='train')
eval_dataset = FoodDataset(mode='eval')

FileNotFoundError: [Errno 2] No such file or directory: 'data/train.txt'

In [20]:
train_dataset[11][0]

NameError: name 'train_dataset' is not defined

In [21]:
print(train_dataset.__len__())
print(eval_dataset.__len__())

NameError: name 'train_dataset' is not defined

### 2.2 模型配置

In [22]:
#定义卷积网络
import paddle.nn as nn
import paddle.nn.functional as F

class MyCNN(nn.Layer):
    def __init__(self):
        super(MyCNN,self).__init__()
        self.hidden1 = nn.Conv2D(in_channels=3,       #通道数
                                            out_channels=64,       #卷积核个数
                                            kernel_size =3,        #卷积核大小
                                            stride=1)             #步长
        self.hidden2 = nn.Conv2D(in_channels=64,
                                            out_channels = 128,
                                            kernel_size =3,
                                            stride=1)
        self.hidden3 = nn.MaxPool2D(kernel_size=2,          #池化核大小
                                            stride=2)        #池化步长
        self.hidden4 = nn.Linear(in_features=128*30*30,out_features=5)
    #网络的前向计算过程
    def forward(self,input):
        x = self.hidden1(input)
        # x=F.relu(x)
        # print(x.shape)
        x = self.hidden2(x)
        # x=F.relu(x)
        # print(x.shape)
        x = self.hidden3(x)
        # x=F.relu(x)
        # print(x.shape)
        #卷积层的输出特征图如何当作全连接层的输入使用呢？
        #卷积层的输出数据格式是[N,C,H,W],在输入全连接层的时候，会自动将数据拉平.
        #也就是对每个样本，自动将其转化为长度为K的向量，其中K=C×H×W，一个mini-batch的数据维度变成了N×K的二维向量。
        x = paddle.reshape(x, shape=[-1, 128*30*30])
        x = self.hidden4(x)
        out = F.softmax(x)        
        return out_channels

### 2.3 模型训练&评估

In [25]:
import paddle
from paddle import Model
myCNN=MyCNN()
model= Model(myCNN)
model.summary((1, 3, 64, 64))

NameError: name 'out_channels' is not defined

In [26]:
# 模型训练配置
model.prepare(optimizer=paddle.optimizer.Adam(learning_rate=0.0001,parameters=model.parameters()),# 优化器
              loss=paddle.nn.CrossEntropyLoss(),           # 损失函数
              metrics=paddle.metric.Accuracy()) # 评估指标

# 训练可视化VisualDL工具的回调函数
visualdl = paddle.callbacks.VisualDL(log_dir='visualdl_log')

In [27]:
# 启动模型全流程训练
model.fit(train_dataset,            # 训练数据集
        #   eval_dataset,            # 评估数据集
          epochs=10,            # 总的训练轮次
          batch_size=256,    # 批次计算的样本量大小
          shuffle=True,             # 是否打乱样本集
          verbose=1,                # 日志展示格式
          save_dir='./chk_points/', # 分阶段的训练模型存储路径
          callbacks=[visualdl])     # 回调函数使用

NameError: name 'train_dataset' is not defined

### 2.4 模型储存&inference

In [28]:
model.save('model_save_dir')

In [None]:
print('测试数据集样本量：{}'.format(len(eval_dataset)))

In [29]:
# 执行预测
result = model.predict(eval_dataset)

NameError: name 'eval_dataset' is not defined

In [30]:
# 样本映射
LABEL_MAP = ['beef_tartare','baklava','beef_carpaccio','apple_pie','baby_back_ribs']

# 随机取样本展示
indexs = [2, 38, 56, 92, 100, 101]

for idx in indexs:
    predict_label = np.argmax(result[0][idx])
    real_label = eval_dataset.__getitem__(idx)[1]
    print('样本ID：{}, 真实标签：{}, 预测值：{}'.format(idx, LABEL_MAP[real_label], LABEL_MAP[predict_label]))

IndexError: The starting index 2 of slice is out of bounds in tensor 0-th axis, it shound be in the range of [-1, 1)