In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
import torch.optim as optim
import matplotlib.pyplot as plt
import numpy as np
import argparse
import os
import time
!pip install torchsummary
from torchsummary import summary

/var/lib/oar/.batch_job_bashrc: line 5: /home/ziwang/.bashrc: No such file or directory


In [3]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cuda:0


In [4]:
batch_size = 128
transform = transforms.Compose( # 创建了一个转换组合，将一系列的数据预处理操作组合在一起。这里使用了两个预处理操作
    [transforms.ToTensor(), # 将图像转换为 PyTorch 张量，并将像素值缩放到 [0, 1] 的范围
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]) # 对图像进行标准化处理，减去均值（0.5）并除以标准差（0.5）
# 创建了 CIFAR-10 数据集的训练集对象。root 参数指定了数据集存储的根目录，train=True 表示加载训练集，
# download=True 表示如果数据集不存在则自动下载，transform=transform 表示应用之前定义的数据预处理操作。
trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)

Files already downloaded and verified


In [5]:
# 创建了一个训练集数据加载器。trainloader 负责从训练集中加载数据，shuffle=True 表示每个 epoch 都会对数据进行洗牌，
# num_workers=2 表示使用两个子进程来加载数据以加快速度。
trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=2)

In [6]:
# 这里分别创建了 CIFAR-10 数据集的测试集对象和测试集数据加载器。
testset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=2)

# 数据类别
classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

Files already downloaded and verified


In [7]:
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = torch.flatten(x, 1) # flatten all dimensions except batch
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

    def count_maxpool_operations(self, input, output, pool_layer, out_channels):
        # input output 的维度均为 N,C,H,W
        kernel_maxpooling = pool_layer.kernel_size
        stride = pool_layer.stride
        padding = pool_layer.padding
        output_height = output.shape[2]
        output_width = output.shape[3]
        out_channels =  output.shape[1]
        num_max = output_height * output_width * (kernel_maxpooling**2 -1) * out_channels
        return num_max

    def count_conv_operations(self, input, output, output_pooled, conv_layer, pool_layer):
        # batch_size = input.size(0)
        out_channels, in_channels = output.size(1), conv_layer.in_channels
        output_height, output_width = output.size(2), output.size(3)
        filter_size = conv_layer.kernel_size[0]
        stride = conv_layer.stride[0]
        padding = conv_layer.padding[0]
        # Compute number of operations for convolution
        # print(str(output_height) + "*" +  str(output_width) + "*" + str(in_channels) + "*" + str(filter_size ** 2) + "*" + str(out_channels))
        num_mults = output_height * output_width * in_channels * filter_size ** 2 * out_channels
        num_adds = output_height * output_width * in_channels * filter_size ** 2 * out_channels
        # num_maxs = output_height * output_width * out_channels
        # print("num_mults" + str(num_mults))
        num_maxs = self.count_maxpool_operations(output, output_pooled, pool_layer, out_channels)
        # print("num_maxs" + str(num_maxs))
        total_ops = num_mults + num_adds + num_maxs
        return num_mults, num_adds, num_maxs, total_ops

    def count_operations(self, x):
        conv1_out = self.conv1(x)
        conv1_out_pooled = self.pool(F.relu(conv1_out))  # Apply max pooling after the first convolution
        conv2_out = self.conv2(conv1_out_pooled)
        conv2_out_pooled = self.pool(F.relu(conv2_out))  # Apply max pooling after the first convolution
        # Count operations for convolutional layer 1
        conv1_ops = self.count_conv_operations(x, conv1_out, conv1_out_pooled, self.conv1, self.pool)
        # Count operations for convolutional layer 2
        conv2_ops = self.count_conv_operations(conv1_out_pooled, conv2_out, conv2_out_pooled, self.conv2, self.pool)
        return conv1_ops, conv2_ops        

    def count_fc_operations(self, input, fc_layer):
        # Get the number of input features for the fully connected layer
        in_features = fc_layer.in_features
        # Get the number of output features for the fully connected layer
        out_features = fc_layer.out_features    
        # Compute number of operations for fully connected layer
        # print(str(out_features) + " * " + str(in_features))
        num_mults = out_features * in_features
        num_adds = out_features * in_features
        num_maxs = 0    
        total_ops = num_mults + num_adds
        return num_mults, num_adds, num_maxs, total_ops

    def count_total_operations(self, x):
        conv1_ops, conv2_ops = self.count_operations(x)
        fc1_ops = self.count_fc_operations(x, self.fc1)
        fc2_ops = self.count_fc_operations(x, self.fc2)
        fc3_ops = self.count_fc_operations(x, self.fc3)
        total_ops = sum(op[3] for op in [conv1_ops, conv2_ops, fc1_ops, fc2_ops, fc3_ops])
        return total_ops
# 4-2 实例化网络
net = Net().to(device)

In [8]:
## 5 定义h损失函数
# 这行代码定义了损失函数，即交叉熵损失函数。交叉熵损失函数通常用于多类别分类问题
criterion = nn.CrossEntropyLoss()
# 这行代码定义了优化器，即随机梯度下降（SGD）优化器。SGD是一种常用的优化算法，用于更新神经网络的权重以最小化损失函数。
# lr=0.001指定了学习率，即每次更新时的步长。momentum=0.9是SGD的一个超参数，用于加速SGD在相关方向上前进，并减小波动。
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)    

In [9]:
## 6 定义其他训练所使用的超参数
epoch_nums = 2   #为便于测试可定义为1，实际训练时根据需要改变

In [10]:
## 8 训练模型
# 8-1 记录总时间_起始时间
if torch.cuda.is_available(): torch.cuda.synchronize()
total_time_start = time.time()
# 8-2 记录总操作数、单个epoch操作数、单个batch操作数
total_ops = 0
epoch_train_ops = 0
epoch_evaluate_ops = 0
batch_train_ops = 0
batch_evaluate_ops = 0
# 8-3 遍历epoch执行训练
for epoch in range(epoch_nums):
    print('\nEpoch: %d' % (epoch + 1))
    # 8-3-1 定义统计参数
    net.train()
    sum_loss = 0.0
    correct = 0.0
    total = 0.0
    iteration_num = 0
    # 8-3-2 遍历训练数据集，每次取batch_num个数据
    for i, data in enumerate(trainloader, 0):
        # 8-3-3 准备数据
        length = len(trainloader)
        inputs, labels = data
        inputs = inputs.to(device)
        labels = labels.to(device)        
        optimizer.zero_grad()
        # 8-3-4 统计训练时的操作数，仅需统计一次
        # images.shape[0] 是batch_size,count_total_operations在实际计算时未考虑batch_size
        if(epoch == 0 & i == 0):
            batch_train_ops = net.count_total_operations(inputs)
            batch_train_ops = inputs.shape[0] * batch_train_ops
        # 8-3-5 在数据输入网络前记录时间
        if torch.cuda.is_available(): torch.cuda.synchronize()
        batch_train_start = time.time()
        # 8-3-6 网络计算
        outputs = net(inputs)
        # 8-3-7 在网络处理完成后记录时间
        if torch.cuda.is_available(): torch.cuda.synchronize()
        batch_train_time = time.time() - batch_train_start
        # 8-3-8 计算loss及反向传播
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        # 8-3-9 每训练1个batch打印一次loss和准确率等参数
        sum_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += predicted.eq(labels.data).cpu().sum()
        # 每秒运算次数
        batch_train_ops_per_second = batch_train_ops / batch_train_time
        # print('[Epoch:%d, Iterations:%d] Loss: %.03f | Acc: %.3f%% | Ops: %d| Time: %.6fs | Ops/Sec : %d' % (
        #     epoch + 1, 
        #     (i + 1 + epoch * length), 
        #     sum_loss / (i + 1), 
        #     100. * correct / total,
        #     batch_train_ops,
        #     batch_train_time,
        #     batch_train_ops_per_second))
        iteration_num += 1
    if(epoch == 0):
        epoch_train_ops = batch_train_ops * iteration_num

    # 8-3-10 每训练完一个epoch测试一下准确率
    # test的时间相对于train较短，因此采用每个epoch统计一次时间
    print("Waiting Test!")
    with torch.no_grad():
        correct = 0
        total = 0
        iteration_num = 0
        # 在数据输入网络前记录时间
        if torch.cuda.is_available(): torch.cuda.synchronize()
        epoch_evaluate_start = time.time()
        for data in testloader:
            net.eval()
            images, labels = data
            images, labels = images.to(device), labels.to(device)
            # 统计训练时的操作数，仅需统计一次
            # images.shape[0] 是batch_size,count_total_operations在实际计算时未考虑batch_size
            if(epoch == 0 & i == 0):
                batch_evaluate_ops = net.count_total_operations(images)
                batch_evaluate_ops = images.shape[0] * batch_evaluate_ops
            outputs = net(images)
            # 取得分最高的那个类 (outputs.data的索引号)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum()
            iteration_num += 1
        acc = 100. * correct / total
        if(epoch == 0):
            epoch_evaluate_ops = batch_evaluate_ops * iteration_num 
        # 在网络处理完成后记录时间
        if torch.cuda.is_available(): torch.cuda.synchronize()
        epoch_evaluate_time = time.time() - epoch_evaluate_start
        # 每秒运算次数
        epoch_evaluate_ops_per_second = epoch_evaluate_time / epoch_evaluate_time
        # print('[Epoch:%d] Validation Acc: %.3f%% | Ops: %d| Time: %.6fs | Ops/Sec : %d' % (
        #     epoch + 1, 
        #     acc,
        #     epoch_evaluate_ops,
        #     epoch_evaluate_time,
        #     epoch_evaluate_ops_per_second))


Epoch: 1
Waiting Test!

Epoch: 2
Waiting Test!


In [11]:
## 8-4 记录总时间_结束时间
if torch.cuda.is_available(): torch.cuda.synchronize()
total_time_end = time.time() - total_time_start

In [12]:
## 8-5 打印统计结果
total_ops = epoch_nums * (epoch_evaluate_ops + epoch_train_ops)
ops_per_second = total_ops / total_time_end
print('Time elapsed: %.3fs' % total_time_end)
print('total ops: %d' % total_ops)
print('ops/second : %d' % ops_per_second)
print('Finished Training')

Time elapsed: 14.359s
total ops: 85146038784
ops/second : 5929719446
Finished Training


In [13]:
input_size = (3, 32, 32)
net = net.to(device)
summary(net, input_size = input_size)
# Number of Parameters CONVOL1 = out_channels × (in_channels × kernel_size ** 2 + 1) = 6 * (3 * 25 + 1) = 6 * 76 = 456
# Taille de sortie après 1 layer convolution: ((size_input - kernel_size) + 1 // stride) =  (32 - 5) + 1 = 28
# Maxpooling (2 * 2) = Taille de sortie // 2 = 28 // 2 = 14
# MaxPool2d 层不包含可学习的参数 = 0
# Taille de sortie après 2 layer convolution: ((14 - 5) + 1) // 1 = 10
# Number of Parameters CONVOL2 = out_channels × (in_channels × kernel_size ** 2 + 1) = 16 * (6 * 25 + 1) = 2416
# Maxpooling (2 * 2) = Taille de sortie // 2 = 10 // 2 = 5
# Number of Parameters FC1 =(in_channels + 1) × out_channels = ((16 * 5 * 5) + 1) * 120 = 48120
# Number of Parameters FC2 =(in_channels + 1) × out_channels = ((120) + 1) * 84 = 10164
# Number of Parameters FC3 =(in_channels + 1) × out_channels = ((84) + 1) * 10 = 850


----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1            [-1, 6, 28, 28]             456
         MaxPool2d-2            [-1, 6, 14, 14]               0
            Conv2d-3           [-1, 16, 10, 10]           2,416
         MaxPool2d-4             [-1, 16, 5, 5]               0
            Linear-5                  [-1, 120]          48,120
            Linear-6                   [-1, 84]          10,164
            Linear-7                   [-1, 10]             850
Total params: 62,006
Trainable params: 62,006
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.01
Forward/backward pass size (MB): 0.06
Params size (MB): 0.24
Estimated Total Size (MB): 0.31
----------------------------------------------------------------


=================================================================================================

In [14]:
class ResidualBlock(nn.Module):
    def __init__(self, inchannel, outchannel, stride=1):
        super(ResidualBlock, self).__init__()
        self.left = nn.Sequential(
            nn.Conv2d(inchannel, outchannel, kernel_size=3, stride=stride, padding=1, bias=False),
            nn.BatchNorm2d(outchannel),
            nn.ReLU(inplace=True),
            nn.Conv2d(outchannel, outchannel, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(outchannel)
        )
        self.shortcut = nn.Sequential()
        if stride != 1 or inchannel != outchannel:
            self.shortcut = nn.Sequential(
                nn.Conv2d(inchannel, outchannel, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(outchannel)
            )

    def forward(self, x):
        out = self.left(x)
        out += self.shortcut(x)
        out = F.relu(out)
        return out

In [49]:
class ResNet(nn.Module):
    def __init__(self, ResidualBlock, num_classes=10):
        super(ResNet, self).__init__()
        self.inchannel = 64
        self.conv1 = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(64),
            nn.ReLU(),
        )
        self.layer1 = self.make_layer(ResidualBlock, 64,  2, stride=1)
        #self.layer2 = self.make_layer(ResidualBlock, 128, 2, stride=2)
        #self.layer3 = self.make_layer(ResidualBlock, 256, 2, stride=2)
        #self.layer4 = self.make_layer(ResidualBlock, 512, 2, stride=2)
        #self.fc = nn.Linear(512, num_classes)
        #self.fc = nn.Linear(128, num_classes)
        self.fc = nn.Linear(1024, num_classes)

    def make_layer(self, block, channels, num_blocks, stride):
        strides = [stride] + [1] * (num_blocks - 1)   #strides=[1,1]
        layers = []
        for stride in strides:
            layers.append(block(self.inchannel, channels, stride))
            self.inchannel = channels
        return nn.Sequential(*layers)

    def forward(self, x):
        out = self.conv1(x)
        out = self.layer1(out)
        #out = self.layer2(out)
        #out = self.layer3(out)
        #out = self.layer4(out)
        #out = F.avg_pool2d(out, 4)
        #out = F.avg_pool2d(out, 16)
        out = F.avg_pool2d(out, 8)
        out = out.view(out.size(0), -1)
        out = self.fc(out)
        return out


In [50]:
def ResNet18():
    return ResNet(ResidualBlock)

In [69]:
class My_AlexNet(nn.Module):
    def __init__(self):
        super(My_AlexNet, self).__init__()
        # 特征提取
        self.features = nn.Sequential(
            nn.Conv2d(in_channels=3,out_channels=16,kernel_size=5,stride=1,padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=3,stride=2,padding=0),
            nn.Conv2d(in_channels=16,out_channels=48,kernel_size=3,padding=1),
            nn.ReLU(),
            nn.BatchNorm2d(48),
            nn.MaxPool2d(kernel_size=3,stride=2,padding=0),
            nn.Conv2d(in_channels=48,out_channels=64,kernel_size=3,padding=1),
            nn.ReLU(),
            nn.Conv2d(in_channels=64,out_channels=64,kernel_size=3,padding=1),
            nn.ReLU(),
            nn.Conv2d(in_channels=64,out_channels=48,kernel_size=3,padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=3,stride=2,padding=0),
        )
        # 全连接层
        self.classifier = nn.Sequential(
            nn.Linear(in_features=3*3*48,out_features=128),
            nn.ReLU(),
            nn.Dropout(p=0.5),
            nn.Linear(128,128),
            nn.ReLU(),
            nn.Linear(128,10)
        )

    # 前向算法
    def forward(self,x):
        x = self.features(x)
        x = torch.flatten(x, 1)
        result = self.classifier(x)
        return result


    def count_maxpool_operations(self, input, output, pool_layer):
        if pool_layer:
            kernel_size = pool_layer.kernel_size
            stride = pool_layer.stride
            padding = pool_layer.padding
            output_height = output.shape[2]
            output_width = output.shape[3]
            out_channels = output.shape[1]
            num_max = ((output_height + 2 * padding - kernel_size) // stride + 1) * ((output_width + 2 * padding - kernel_size) // stride + 1) * (kernel_size ** 2 - 1) * out_channels  
        else:
            num_max = 0
        return num_max
             
    def count_conv_operations(self, input, output, output_pooled, conv_layer, pool_layer):
        out_channels, in_channels = output.size(1), conv_layer.in_channels
        output_height, output_width = output.size(2), output.size(3)
        filter_size = conv_layer.kernel_size[0]
        stride = conv_layer.stride[0]
        padding = conv_layer.padding[0]
        num_mults = output_height * output_width * in_channels * filter_size ** 2 * out_channels
        num_adds = output_height * output_width * in_channels * filter_size ** 2 * out_channels
        num_maxs = self.count_maxpool_operations(output, output_pooled, pool_layer)
        if pool_layer:
            num_maxs = self.count_maxpool_operations(output, output_pooled, pool_layer)
        else:
            num_maxs = 0
        total_ops = num_mults + num_adds + num_maxs
        return num_mults, num_adds, num_maxs, total_ops

    def count_operations(self, x):
        conv1_out = self.features[0](x)
        conv1_out_pooled = self.features[2](F.relu(conv1_out))
        conv2_out = self.features[3](conv1_out_pooled)
        conv2_out_pooled = self.features[6](F.relu(conv2_out))
        
        conv3_out = self.features[7](conv2_out_pooled)
        conv4_out = self.features[9](F.relu(conv3_out))
        conv5_out = self.features[11](F.relu(conv4_out))
        conv5_out_pooled = self.features[13](F.relu(conv5_out))
        
        conv1_ops = self.count_conv_operations(x, conv1_out, conv1_out_pooled, self.features[0], self.features[2])
        conv2_ops = self.count_conv_operations(conv1_out_pooled, conv2_out, conv2_out_pooled, self.features[3], self.features[6])
        conv3_ops = self.count_conv_operations(conv2_out_pooled, conv3_out, conv3_out, self.features[7], None)
        conv4_ops = self.count_conv_operations(conv3_out, conv4_out, conv4_out, self.features[9], None)
        conv5_ops = self.count_conv_operations(conv4_out, conv5_out, conv5_out_pooled, self.features[11], self.features[13])
        return conv1_ops, conv2_ops, conv3_ops, conv4_ops, conv5_ops


    def count_fc_operations(self, input, fc_layer):
        in_features = fc_layer.in_features
        out_features = fc_layer.out_features    
        num_mults = out_features * in_features
        num_adds = out_features * in_features
        num_maxs = 0
        total_ops = num_mults + num_adds
        return num_mults, num_adds, num_maxs, total_ops
    
    def count_total_operations(self, x):
        conv1_ops, conv2_ops, conv3_ops, conv4_ops, conv5_ops  = self.count_operations(x)
        fc1_ops = self.count_fc_operations(x, self.classifier[0])
        fc2_ops = self.count_fc_operations(x, self.classifier[3])
        fc3_ops = self.count_fc_operations(x, self.classifier[5])
        total_ops = sum(op[3] for op in [conv1_ops, conv2_ops, conv3_ops, conv4_ops, conv5_ops, fc1_ops, fc2_ops, fc3_ops])
        return total_ops

In [70]:
# 定义是否使用GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 超参数设置
EPOCH = 60
pre_epoch = 0  # 定义已经遍历数据集的次数
BATCH_SIZE = 128      #批处理尺寸(batch_size)
LR = 0.01        #学习率

In [71]:
# 准备数据集并预处理
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),  #先四周填充0，在吧图像随机裁剪成32*32
    transforms.RandomHorizontalFlip(),  #图像一半的概率翻转，一半的概率不翻转
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), #R,G,B每层的归一化用到的均值和方差
])

In [72]:
transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

In [73]:
trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform_train) #训练数据集
trainloader = torch.utils.data.DataLoader(trainset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)   #生成一个个batch进行批训练，组成batch的时候顺序打乱取


Files already downloaded and verified


In [74]:
testset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform_test)
testloader = torch.utils.data.DataLoader(testset, batch_size=100, shuffle=False, num_workers=2)
# Cifar-10的标签
classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

Files already downloaded and verified


In [75]:
# 模型实例化，在这里更改实际使用到的各种模型
#net = ResNet18().to(device)
net = My_AlexNet().to(device)

# 定义损失函数和优化方式
criterion = nn.CrossEntropyLoss()  #损失函数为交叉熵，多用于多分类问题
optimizer = optim.SGD(net.parameters(), lr=LR, momentum=0.9, weight_decay=5e-4) #优化方式为mini-batch momentum-SGD，并采用L2正则化（权重衰减）


In [76]:
if not os.path.exists("./model/"):
    os.makedirs("./model/")
best_acc = 85
print("Start Training Resnet-18!")
with open("acc.txt", "w") as f:
    with open("log.txt", "w")as f2:
        if torch.cuda.is_available(): torch.cuda.synchronize()
        total_time_start = time.time()
        # 8-2 记录总操作数、单个epoch操作数、单个batch操作数
        total_ops = 0
        epoch_train_ops = 0
        epoch_evaluate_ops = 0
        batch_train_ops = 0
        batch_evaluate_ops = 0
        for epoch in range(pre_epoch, EPOCH):
            print('\nEpoch: %d' % (epoch + 1))
            net.train()
            sum_loss = 0.0
            correct = 0.0
            total = 0.0
            for i, data in enumerate(trainloader, 0):
                # 准备数据
                length = len(trainloader)
                inputs, labels = data
                inputs = inputs.to(device)
                labels = labels.to(device)        
                optimizer.zero_grad()
                
                # 8-3-4 统计训练时的操作数，仅需统计一次
                # images.shape[0] 是batch_size,count_total_operations在实际计算时未考虑batch_size
                if(epoch == 0 & i == 0):
                    batch_train_ops = net.count_total_operations(inputs)
                    batch_train_ops = inputs.shape[0] * batch_train_ops
                # 8-3-5 在数据输入网络前记录时间
                if torch.cuda.is_available(): torch.cuda.synchronize()
                batch_train_start = time.time()

                # forward + backward
                outputs = net(inputs)

                # 8-3-7 在网络处理完成后记录时间
                if torch.cuda.is_available(): torch.cuda.synchronize()
                batch_train_time = time.time() - batch_train_start
        
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

                # 每训练1个batch打印一次loss和准确率
                sum_loss += loss.item()
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += predicted.eq(labels.data).cpu().sum()
                # 每秒运算次数
                batch_train_ops_per_second = batch_train_ops / batch_train_time
                print('[epoch:%d, iter:%d] Loss: %.03f | Acc: %.3f%% | Ops: %d| Time: %.6fs | Ops/Sec : %d'
                      % (epoch + 1, (i + 1 + epoch * length), sum_loss / (i + 1), 100. * correct / total, batch_train_ops, batch_train_time, batch_train_ops_per_second))
                iteration_num += 1

            if(epoch == 0):
                epoch_train_ops = batch_train_ops * iteration_num

            # 每训练完一个epoch测试一下准确率
            print("Waiting Test!")
            with torch.no_grad():
                correct = 0
                total = 0
                iteration_num = 0
                if torch.cuda.is_available(): torch.cuda.synchronize()
                epoch_evaluate_start = time.time()
                for data in testloader:
                    net.eval()
                    images, labels = data
                    images, labels = images.to(device), labels.to(device)
                    if(epoch == 0 & i == 0):
                        batch_evaluate_ops = net.count_total_operations(images)
                        batch_evaluate_ops = images.shape[0] * batch_evaluate_ops
                    outputs = net(images)
                    # 取得分最高的那个类 (outputs.data的索引号)
                    _, predicted = torch.max(outputs.data, 1)
                    total += labels.size(0)
                    correct += (predicted == labels).sum()
                    iteration_num += 1
                    # 向tensorboard添加pr曲线
                    # writer.add_pr_curve("pr-curve", labels, predicted)

                print('测试分类准确率为：%.3f%%' % (100 * correct / total))
                acc = 100. * correct / total
                if(epoch == 0):
                    epoch_evaluate_ops = batch_evaluate_ops * iteration_num 
                # 在网络处理完成后记录时间
                if torch.cuda.is_available(): torch.cuda.synchronize()
                epoch_evaluate_time = time.time() - epoch_evaluate_start
                # 每秒运算次数
                epoch_evaluate_ops_per_second = epoch_evaluate_time / epoch_evaluate_time
                print('[Epoch:%d] Validation Acc: %.3f%% | Ops: %d| Time: %.6fs | Ops/Sec : %d' % (
                    epoch + 1, 
                    acc,
                    epoch_evaluate_ops,
                    epoch_evaluate_time,
                    epoch_evaluate_ops_per_second))

        if torch.cuda.is_available(): torch.cuda.synchronize()
        total_time_end = time.time() - total_time_start
        ## 8-5 打印统计结果
        total_ops = epoch_nums * (epoch_evaluate_ops + epoch_train_ops)
        ops_per_second = total_ops / total_time_end
        print('Time elapsed: %.3fs' % total_time_end)
        print('total ops: %d' % total_ops)
        print('ops/second : %d' % ops_per_second) 
        print("Training Finished, TotalEPOCH=%d" % EPOCH)

Start Training Resnet-18!

Epoch: 1
[epoch:1, iter:1] Loss: 2.310 | Acc: 10.156% | Ops: 1888731136| Time: 0.019913s | Ops/Sec : 94847078752
[epoch:1, iter:2] Loss: 2.309 | Acc: 10.156% | Ops: 1888731136| Time: 0.001251s | Ops/Sec : 1510374177054
[epoch:1, iter:3] Loss: 2.309 | Acc: 9.896% | Ops: 1888731136| Time: 0.001179s | Ops/Sec : 1602004561910
[epoch:1, iter:4] Loss: 2.305 | Acc: 11.133% | Ops: 1888731136| Time: 0.001134s | Ops/Sec : 1665667064476
[epoch:1, iter:5] Loss: 2.304 | Acc: 11.406% | Ops: 1888731136| Time: 0.001130s | Ops/Sec : 1671642236473
[epoch:1, iter:6] Loss: 2.305 | Acc: 10.938% | Ops: 1888731136| Time: 0.001286s | Ops/Sec : 1468380455727
[epoch:1, iter:7] Loss: 2.305 | Acc: 11.161% | Ops: 1888731136| Time: 0.001163s | Ops/Sec : 1624674437786
[epoch:1, iter:8] Loss: 2.304 | Acc: 11.523% | Ops: 1888731136| Time: 0.001094s | Ops/Sec : 1725906875522
[epoch:1, iter:9] Loss: 2.304 | Acc: 11.285% | Ops: 1888731136| Time: 0.001126s | Ops/Sec : 1676950160594
[epoch:1, ite