# 1. 利用GPU训练(一)
能进行GPU训练部分: 三部分可添加 .cuda(): 网络模型, 数据（imgs，targets）, 损失函数

在上一节视频中 train.py 的基础上,增加.cuda()和增加测试时间的计算


In [5]:
# 准备数据集
import torch
import torchvision
from torch.utils.tensorboard import SummaryWriter

from model import *
from torch.utils.data import DataLoader
import time

train_data = torchvision.datasets.CIFAR10(root="../../dataset",train=True,transform=torchvision.transforms.ToTensor(),
                                          download=True)
test_data = torchvision.datasets.CIFAR10(root="../../dataset",train=False,transform=torchvision.transforms.ToTensor(),
                                          download=True)

# 查看数据集大小 length 长度
train_data_size = len(train_data)
test_data_size  = len(test_data)
print("训练数据集长度为:{}",format(train_data_size))
print("测试数据集长度为:{}",format(test_data_size))

# 利用 DataLoader 来加载数据集
train_dataloader = DataLoader(train_data,batch_size=64)
test_dataloader = DataLoader(test_data,batch_size=64)

# 创建网络模型
tudui = Tudui()

# 定义损失函数 用交叉熵
loss_fn = nn.CrossEntropyLoss()

# 判断是否可用cuda
if torch.cuda.is_available():
    print("torch.cuda.is_available")
    tudui = tudui.cuda() # 网络模型转移到cuda
    loss_fn = loss_fn.cuda()



# 定义优化器 选择的是随机梯度下降
# 1e-2 = 1*(10)^(-2) = 1/100 =0.01
learning_rate = 1e-2
optimizer = torch.optim.SGD(tudui.parameters(),lr=learning_rate)

# 设置训练网络的一些参数
    # 记录训练的次数
total_train_step = 0
    # 记录测试的次数
total_test_step = 0
    # 训练的轮数
epoch = 10


# 添加tensorboard
writer = SummaryWriter("../net_logs")



for i in range(epoch):
    print("-----------第{}轮训练开始-----------".format(i+1))
    start_time = time.time()
    tudui.train()
    # -- 训练步骤开始
    for data in train_dataloader:
        imgs,targets = data
        if torch.cuda.is_available():
            imgs ,targets = imgs.cuda(), targets.cuda()
        outputs = tudui(imgs)
        loss = loss_fn(outputs,targets)

        # 优化器优化模型
        # 利用优化器将梯度清零
        optimizer.zero_grad()
        # 利用反向传播得到每个梯度的结点
        loss.backward()
        # 调用优化器
        optimizer.step()

        # 训练次数+1
        total_train_step += 1
        if total_train_step % 100 == 0:
            print("训练次数:{},Loss:{}".format(total_train_step,loss))
            writer.add_scalar("train_loss",loss.item(), total_train_step)

    # -- 测试步骤开始
    # 计算整个数据集上的loss
    total_test_loss = 0
    # 计算整体的正确率 整体正确的个数 初始为0
    total_true_num = 0
    tudui.eval()
    with torch.no_grad():
        for data in test_dataloader:
            imgs,targets = data
            if torch.cuda.is_available():
                imgs ,targets = imgs.cuda(), targets.cuda()
            outputs = tudui(imgs)
            # 计算数据的损失
            loss = loss_fn(outputs,targets)
            # 计算整体损失
            total_test_loss = total_test_loss + loss

            # 计算准确率
            batch_true_num = (outputs.argmax(1)==targets).sum()
            total_true_num += batch_true_num         
    test_accuracy = total_true_num/test_data_size

    print("整体测试集上的loss:{}".format(total_test_loss))
    print(f"整体测试集上的正确率:{test_accuracy}")

    writer.add_scalar("test_2_loss",total_test_loss, total_test_step)
    writer.add_scalar("test_2_accuracy",test_accuracy,total_test_step)

    total_test_step += 1
    print(f"-----------第{total_test_step}轮测试结束-----------")    

    # 保存训练时每一轮的模型
    torch.save(tudui, "./model_pth/gpu_tudui_{}.pth".format(i))
    print('模型已保存')
    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"第{i}轮 Elapsed time: {elapsed_time}")
writer.close()

Files already downloaded and verified
Files already downloaded and verified
训练数据集长度为:{} 50000
测试数据集长度为:{} 10000
torch.cuda.is_available
-----------第1轮训练开始-----------
训练次数:100,Loss:2.2954437732696533
训练次数:200,Loss:2.2895901203155518
训练次数:300,Loss:2.278843879699707
训练次数:400,Loss:2.2362146377563477
训练次数:500,Loss:2.1171886920928955
训练次数:600,Loss:2.069493055343628
训练次数:700,Loss:2.0062482357025146
整体测试集上的loss:314.2009582519531
整体测试集上的正确率:0.27559998631477356
-----------第1轮测试结束-----------
模型已保存
第0轮 Elapsed time: 17.186070442199707
-----------第2轮训练开始-----------
训练次数:800,Loss:1.9224035739898682
训练次数:900,Loss:1.858345866203308
训练次数:1000,Loss:1.9357198476791382
训练次数:1100,Loss:1.9857430458068848
训练次数:1200,Loss:1.7319587469100952
训练次数:1300,Loss:1.6944974660873413
训练次数:1400,Loss:1.7629321813583374
训练次数:1500,Loss:1.8012839555740356
整体测试集上的loss:310.1408996582031
整体测试集上的正确率:0.30300000309944153
-----------第2轮测试结束-----------
模型已保存
第1轮 Elapsed time: 17.169941186904907
-----------第3轮训练开始-----------
训练次数:160

# 2. 利用GPU训练(二)

.to(device) # 这种方式更常用~

`device = torch.device("cpu)`

`device = torch.device(“cuda”)`

当电脑上有多张显卡时: 可以指定显卡

`device = torch.device(“cuda:0”)` # 制定第一个显卡

`device = torch.device(“cuda:1”)` # 制定第二个显卡


**train_gpu_2.py**

首先把train_gpu_1.py 里的代码复制过来 , 开改~

改动部分如下:

定义训练的设备: `device = torch.device('cuda')`

定义模型 :

* 常见写法1 `tudui.to('device')`
* 常见写法2 `device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')`
* 修改损失函数: `loss_fn = loss_fn.to(device)`

修改训练和测试的cuda部分

​ `imgs = imgs.to(device)`
​ `targets = targets.to(device)`

In [6]:
import torch

# Check the number of GPUs available
gpu_count = torch.cuda.device_count()
gpu_count

1

In [None]:
# 准备数据集
import torch
import torchvision
from torch.utils.tensorboard import SummaryWriter

from model import *
from torch.utils.data import DataLoader
import time

train_data = torchvision.datasets.CIFAR10(root="../../dataset",train=True,transform=torchvision.transforms.ToTensor(),
                                          download=True)
test_data = torchvision.datasets.CIFAR10(root="../../dataset",train=False,transform=torchvision.transforms.ToTensor(),
                                          download=True)

# 查看数据集大小 length 长度
train_data_size = len(train_data)
test_data_size  = len(test_data)
print("训练数据集长度为:{}",format(train_data_size))
print("测试数据集长度为:{}",format(test_data_size))

# 利用 DataLoader 来加载数据集
train_dataloader = DataLoader(train_data,batch_size=64)
test_dataloader = DataLoader(test_data,batch_size=64)

#######################
# 判断是否可用cuda
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#######################

# 创建网络模型
tudui = Tudui().to(device)

# 定义损失函数 用交叉熵
loss_fn = nn.CrossEntropyLoss().to(device)

# 定义优化器 选择的是随机梯度下降
# 1e-2 = 1*(10)^(-2) = 1/100 =0.01
learning_rate = 1e-2
optimizer = torch.optim.SGD(tudui.parameters(),lr=learning_rate)

# 设置训练网络的一些参数
    # 记录训练的次数
total_train_step = 0
    # 记录测试的次数
total_test_step = 0
    # 训练的轮数
epoch = 10


# 添加tensorboard
writer = SummaryWriter("../net_logs")



for i in range(epoch):
    print("-----------第{}轮训练开始-----------".format(i+1))
    start_time = time.time()
    tudui.train()
    # -- 训练步骤开始
    for data in train_dataloader:
        imgs,targets = data
        if torch.cuda.is_available():
            imgs ,targets = imgs.to(device), targets.to(device)
        outputs = tudui(imgs)
        loss = loss_fn(outputs,targets)

        # 优化器优化模型
        # 利用优化器将梯度清零
        optimizer.zero_grad()
        # 利用反向传播得到每个梯度的结点
        loss.backward()
        # 调用优化器
        optimizer.step()

        # 训练次数+1
        total_train_step += 1
        if total_train_step % 100 == 0:
            print("训练次数:{},Loss:{}".format(total_train_step,loss))
            writer.add_scalar("gpu_2_train_loss",loss.item(), total_train_step)

    # -- 测试步骤开始
    # 计算整个数据集上的loss
    total_test_loss = 0
    # 计算整体的正确率 整体正确的个数 初始为0
    total_true_num = 0
    tudui.eval()
    with torch.no_grad():
        for data in test_dataloader:
            imgs,targets = data
            if torch.cuda.is_available():
                imgs ,targets = imgs.to(device), targets.to(device)
            outputs = tudui(imgs)
            # 计算数据的损失
            loss = loss_fn(outputs,targets)
            # 计算整体损失
            total_test_loss = total_test_loss + loss

            # 计算准确率
            batch_true_num = (outputs.argmax(1)==targets).sum()
            total_true_num += batch_true_num         
    test_accuracy = total_true_num/test_data_size

    print("整体测试集上的loss:{}".format(total_test_loss))
    print(f"整体测试集上的正确率:{test_accuracy}")

    writer.add_scalar("test_gpu_2_loss",total_test_loss, total_test_step)
    writer.add_scalar("test_gpu_2_accuracy",test_accuracy,total_test_step)

    total_test_step += 1
    print(f"-----------第{total_test_step}轮测试结束-----------")    

    # 保存训练时每一轮的模型
    torch.save(tudui, "./model_pth/gpu_2_tudui_{}.pth".format(i))
    print('模型已保存')
    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"第{i}轮 Elapsed time: {elapsed_time}")
writer.close()

Files already downloaded and verified
Files already downloaded and verified
训练数据集长度为:{} 50000
测试数据集长度为:{} 10000
-----------第1轮训练开始-----------
训练次数:100,Loss:2.288994550704956
训练次数:200,Loss:2.287259340286255
训练次数:300,Loss:2.277855396270752
训练次数:400,Loss:2.230238199234009
训练次数:500,Loss:2.1387710571289062
训练次数:600,Loss:2.040024518966675
训练次数:700,Loss:2.0002012252807617
整体测试集上的loss:315.3457336425781
整体测试集上的正确率:0.2766999900341034
-----------第1轮测试结束-----------
模型已保存
第0轮 Elapsed time: 17.261840343475342
-----------第2轮训练开始-----------
训练次数:800,Loss:1.8995734453201294
训练次数:900,Loss:1.8302090167999268
训练次数:1000,Loss:1.9018371105194092
训练次数:1100,Loss:2.0305349826812744
训练次数:1200,Loss:1.666668176651001
训练次数:1300,Loss:1.6292837858200073
训练次数:1400,Loss:1.7061986923217773
训练次数:1500,Loss:1.799957275390625
整体测试集上的loss:298.57867431640625
整体测试集上的正确率:0.3231000006198883
-----------第2轮测试结束-----------
模型已保存
第1轮 Elapsed time: 17.24276041984558
-----------第3轮训练开始-----------
训练次数:1600,Loss:1.7362592220306396
训练次数

# 3. 加载模型，继续训练
test代码测试出来识别不了airplane，所以再训练10轮再试试

In [10]:
# test代码测试出来识别不了airplane，所以再训练10轮再试试

# 前面省略，需要先运行第一块代码
model = torch.load("/storage/pt/AW_STUDY/pytorch-tutorial-tudui/study_self/P25-_train_all/model_pth/gpu_3_tudui_9.pth")
print(model)

# 定义损失函数 用交叉熵
loss_fn = nn.CrossEntropyLoss().to(device)

# 定义优化器 选择的是随机梯度下降
# 1e-2 = 1*(10)^(-2) = 1/100 =0.01
learning_rate = 1e-2
optimizer = torch.optim.SGD(model.parameters(),lr=learning_rate)

# 设置训练网络的一些参数
    # 记录训练的次数
total_train_step = 0
    # 记录测试的次数
total_test_step = 0
    # 训练的轮数
epoch = 10

# 添加tensorboard
writer = SummaryWriter("../net_logs")

for i in range(epoch):
    print("-----------第{}轮训练开始-----------".format(i+1))
    start_time = time.time()
    model.train()
    # -- 训练步骤开始
    for data in train_dataloader:
        imgs,targets = data
        if torch.cuda.is_available():
            imgs ,targets = imgs.to(device), targets.to(device)
        outputs = model(imgs)
        loss = loss_fn(outputs,targets)

        # 优化器优化模型
        # 利用优化器将梯度清零
        optimizer.zero_grad()
        # 利用反向传播得到每个梯度的结点
        loss.backward()
        # 调用优化器
        optimizer.step()

        # 训练次数+1
        total_train_step += 1
        if total_train_step % 100 == 0:
            print("训练次数:{},Loss:{}".format(total_train_step,loss))
            writer.add_scalar("gpu_2_train_loss",loss.item(), total_train_step)

    # -- 测试步骤开始
    # 计算整个数据集上的loss
    total_test_loss = 0
    # 计算整体的正确率 整体正确的个数 初始为0
    total_true_num = 0
    model.eval()
    with torch.no_grad():
        for data in test_dataloader:
            imgs,targets = data
            if torch.cuda.is_available():
                imgs ,targets = imgs.to(device), targets.to(device)
            outputs = model(imgs)
            # 计算数据的损失
            loss = loss_fn(outputs,targets)
            # 计算整体损失
            total_test_loss = total_test_loss + loss

            # 计算准确率
            batch_true_num = (outputs.argmax(1)==targets).sum()
            total_true_num += batch_true_num         
    test_accuracy = total_true_num/test_data_size

    print("整体测试集上的loss:{}".format(total_test_loss))
    print(f"整体测试集上的正确率:{test_accuracy}")

    writer.add_scalar("test_gpu_2_loss",total_test_loss, total_test_step)
    writer.add_scalar("test_gpu_2_accuracy",test_accuracy,total_test_step)

    total_test_step += 1
    print(f"-----------第{total_test_step}轮测试结束-----------")    

    # 保存训练时每一轮的模型
    torch.save(model, "./model_pth/gpu_4_tudui_{}.pth".format(i))
    print('模型已保存')
    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"第{i}轮 Elapsed time: {elapsed_time}")
writer.close()

Tudui(
  (model): Sequential(
    (0): Conv2d(3, 32, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (2): Conv2d(32, 32, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (4): Conv2d(32, 64, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Flatten(start_dim=1, end_dim=-1)
    (7): Linear(in_features=1024, out_features=64, bias=True)
    (8): Linear(in_features=64, out_features=10, bias=True)
  )
)
-----------第1轮训练开始-----------
训练次数:100,Loss:0.5573392510414124
训练次数:200,Loss:0.618193507194519
训练次数:300,Loss:0.6381914615631104
训练次数:400,Loss:0.663777768611908
训练次数:500,Loss:0.5712800621986389
训练次数:600,Loss:0.5486220121383667
训练次数:700,Loss:0.6988285183906555
整体测试集上的loss:177.7161407470703
整体测试集上的正确率:0.6408999562263489
-----