# 使用GPU进行模型训练

In [1]:
# 使用gpu训练的第一种方式
# 调用：
#     网络模型
#     数据
#     损失函数
# 的 .to(device)；注意：模型和损失函数（nn.Module）的 .to() 是原地修改，而张量的 .to() 会返回新张量，必须重新赋值，例如 imgs = imgs.to(device)
# device可以设置
# 例如：
    # device = torch.device("cpu")
    # device0 = torch.device("cuda") 与 device0 = torch.device("cuda:0") 在默认情况下等价（"cuda" 指当前 CUDA 设备，默认是 0 号卡）
    # device1 = torch.device("cuda:1")
    # ...多卡设备以此类推

In [2]:
import os
from collections import OrderedDict

import torch
import torchvision
from torch import nn
from torch.utils.tensorboard import SummaryWriter

In [3]:
# Device on which the model, the loss function and the data batches are placed.
DEVICE_SPEC = "cpu"
device = torch.device(DEVICE_SPEC)

In [4]:
class LeNet_5(nn.Module):
    """Small CNN classifier: three (5x5 conv, 2x2 max-pool) stages followed by two linear layers.

    The first linear layer takes 1024 features, i.e. 64 channels * 4 * 4, which
    corresponds to a 3-channel 32x32 input after three 2x poolings. The final
    layer produces 10 values per sample.
    """

    def __init__(self):
        super().__init__()
        # Layers are registered in the same order and under the same names as
        # before, so state_dict keys (model1.conv1.weight, ...) are unchanged.
        layers = OrderedDict()
        layers['conv1'] = nn.Conv2d(3, 32, kernel_size=5, padding=2)
        layers['pool1'] = nn.MaxPool2d(2)
        layers['conv2'] = nn.Conv2d(32, 32, kernel_size=5, padding=2)
        layers['pool2'] = nn.MaxPool2d(2)
        layers['conv3'] = nn.Conv2d(32, 64, kernel_size=5, padding=2)
        layers['pool3'] = nn.MaxPool2d(2)
        layers['flatten'] = nn.Flatten()
        layers['fc1'] = nn.Linear(in_features=1024, out_features=64)  # 1024 = 64 * 4 * 4 for 32x32 inputs
        layers['fc2'] = nn.Linear(in_features=64, out_features=10)
        self.model1 = nn.Sequential(layers)

    def forward(self, x):
        """Pass ``x`` through the sequential stack and return its output."""
        logits = self.model1(x)
        return logits

In [5]:
# Prepare the CIFAR-10 training and held-out splits; images are converted to tensors.
cifar_root = "./data/CIFAR10/"
to_tensor = torchvision.transforms.ToTensor()
train_data = torchvision.datasets.CIFAR10(root=cifar_root, train=True,
                                          transform=to_tensor, download=True)
val_data = torchvision.datasets.CIFAR10(root=cifar_root, train=False,
                                        transform=to_tensor, download=True)

# Report how many samples each split contains.
train_data_size, val_data_size = len(train_data), len(val_data)
print("训练数据集长度为：{}".format(train_data_size))
print("测试数据集长度为：{}".format(val_data_size))

# Wrap both splits in batch loaders (64 samples per batch, default ordering).
batch_size = 64
train_dataloader = torch.utils.data.DataLoader(dataset=train_data, batch_size=batch_size)
val_dataloader = torch.utils.data.DataLoader(dataset=val_data, batch_size=batch_size)

Files already downloaded and verified
Files already downloaded and verified
训练数据集长度为：50000
测试数据集长度为：10000


In [6]:
# Instantiate the network and place it on `device`.
# nn.Module.to() moves the parameters in place and returns the module itself.
# Optional shape check: model(torch.ones((64, 3, 32, 32))).shape
model = LeNet_5().to(device)
model

LeNet_5(
  (model1): Sequential(
    (conv1): Conv2d(3, 32, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (pool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (conv2): Conv2d(32, 32, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (pool2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (conv3): Conv2d(32, 64, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (pool3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (flatten): Flatten(start_dim=1, end_dim=-1)
    (fc1): Linear(in_features=1024, out_features=64, bias=True)
    (fc2): Linear(in_features=64, out_features=10, bias=True)
  )
)

In [7]:
# Cross-entropy loss, placed on the same `device` as the model.
loss_fn = nn.CrossEntropyLoss().to(device)
loss_fn

CrossEntropyLoss()

In [8]:
# Plain SGD over all model parameters; the learning rate 0.01 is the same value as 1e-2.
learning_rate = 0.01
optimizer = torch.optim.SGD(params=model.parameters(), lr=learning_rate)

In [9]:
# Bookkeeping for the training run.
epoch = 10             # number of passes (epochs) over the training set
total_train_step = 0   # number of training steps (batches) processed so far
total_test_step = 0    # number of evaluation passes completed so far
# TensorBoard writer used to plot the training / evaluation curves.
writer = SummaryWriter(log_dir="./logs/17_Model_training")

In [10]:
# Train for `epoch` epochs. After each epoch: evaluate on the validation set,
# log loss/accuracy to TensorBoard and save a checkpoint of the weights.
checkpoint_dir = "./models/LeNet_5"
os.makedirs(checkpoint_dir, exist_ok=True)  # torch.save does not create missing directories
val_sample_count = len(val_dataloader.dataset)  # denominator for accuracy (was hard-coded 10000)

for i in range(epoch):
    print("-----------------第 {} 轮训练开始-----------------".format(i + 1))

    # ---- training phase ----
    model.train()
    for data in train_dataloader:
        imgs, targets = data
        # Tensor.to() returns a new tensor instead of moving in place, so the
        # result must be re-bound; otherwise the batch stays on its original
        # device and any non-CPU `device` fails with a device mismatch.
        imgs = imgs.to(device)
        targets = targets.to(device)
        outputs = model(imgs)
        loss = loss_fn(outputs, targets)

        # One optimisation step.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Report the training loss every 100 steps.
        total_train_step += 1
        if total_train_step % 100 == 0:
            print("训练次数:{}, loss:{}".format(total_train_step, loss.item()))
            writer.add_scalar("train_loss", loss.item(), total_train_step)

    # ---- evaluation phase (no parameter updates, no gradient tracking) ----
    model.eval()
    total_test_loss = 0  # sum of the per-batch losses over the validation set
    right_sum = 0        # number of correctly classified validation samples
    with torch.no_grad():
        for data in val_dataloader:
            imgs, targets = data
            imgs = imgs.to(device)
            targets = targets.to(device)
            outputs = model(imgs)
            loss = loss_fn(outputs, targets)
            total_test_loss += loss.item()
            preds = torch.argmax(outputs, dim=1)
            # .item() keeps the counter a plain Python int, so the accuracy
            # below is an exact float rather than a float32 tensor.
            right_sum += (preds == targets).sum().item()

    test_accuracy = right_sum / val_sample_count
    print("整体测试集上的loss:{}".format(total_test_loss))
    writer.add_scalar("test_loss", total_test_loss, total_test_step)
    print("整体测试集上的正确率:{}".format(test_accuracy))
    writer.add_scalar("test_accuracy", test_accuracy, total_test_step)
    total_test_step += 1

    # Save the weights after every epoch (state_dict is the recommended format).
    # The "Loss" field of the file name now carries the validation loss; it
    # previously carried the evaluation step counter.
    checkpoint_name = "LeNet_5_No_{}_Loss_{:.4f}.pth".format(i, total_test_loss)
    torch.save(model.state_dict(), os.path.join(checkpoint_dir, checkpoint_name))
    print("模型已保存")


writer.close()

-----------------第 1 轮训练开始-----------------
训练次数:100, loss:2.2847371101379395
训练次数:200, loss:2.281127452850342
训练次数:300, loss:2.2628941535949707
训练次数:400, loss:2.18520450592041
训练次数:500, loss:2.0906147956848145
训练次数:600, loss:2.076401710510254
训练次数:700, loss:2.021336793899536
整体测试集上的loss:311.99304604530334
整体测试集上的正确率:0.2930999994277954
模型已保存
-----------------第 2 轮训练开始-----------------
训练次数:800, loss:1.8649178743362427
训练次数:900, loss:1.8240022659301758
训练次数:1000, loss:1.9312106370925903
训练次数:1100, loss:1.9625526666641235
训练次数:1200, loss:1.6884448528289795
训练次数:1300, loss:1.6447392702102661
训练次数:1400, loss:1.711478590965271
训练次数:1500, loss:1.7955427169799805
整体测试集上的loss:297.28282403945923
整体测试集上的正确率:0.32089999318122864
模型已保存
-----------------第 3 轮训练开始-----------------
训练次数:1600, loss:1.7118862867355347
训练次数:1700, loss:1.634330153465271
训练次数:1800, loss:1.9542057514190674
训练次数:1900, loss:1.6894093751907349
训练次数:2000, loss:1.9117482900619507
训练次数:2100, loss:1.513730764389038
训练次数:2200, loss