In [1]:
import os 
import time
import random
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
import torchvision.datasets as datasets

## 1 模型定义

In [2]:
"""
Pytorch中神经网络模块化接口nn的了解:

torch.nn是专门为神经网络设计的模块化接口。nn构建于autograd之上，可以用来定义和运行神经网络。
nn.Module是nn中十分重要的类,包含网络各层的定义及forward方法。
定义自已的网络：
    需要继承nn.Module类，并实现forward方法。
    一般把网络中具有可学习参数的层放在构造函数__init__()中，
    不具有可学习参数的层(如ReLU)可放在构造函数中，也可不放在构造函数中(而在forward中使用nn.functional来代替)
    
    只要在nn.Module的子类中定义了forward函数，backward函数就会被自动实现(利用Autograd)。
    在forward函数中可以使用任何Variable支持的函数，毕竟在整个pytorch构建的图中，是Variable在流动。还可以使用
    if,for,print,log等python语法.
    
    注：Pytorch基于nn.Module构建的模型中，只支持mini-batch的Variable输入方式，
    比如，只有一张输入图片，也需要变成 N x C x H x W 的形式：
    
    input_image = torch.FloatTensor(1, 28, 28)
    input_image = Variable(input_image)
    input_image = input_image.unsqueeze(0)   # 1 x 1 x 28 x 28
    
    二维卷积层, 输入的尺度是(N, C_in,H,W)，输出尺度（N,C_out,H_out,W_out）的计算方式
    torch.nn.Conv2d(in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True)
    in_channels(int) – 输入信号的通道
    out_channels(int) – 卷积产生的通道
    kerner_size(int or tuple) - 卷积核的尺寸
    stride(int or tuple, optional) - 卷积步长
    padding(int or tuple, optional) - 输入的每一条边补充0的层数
    dilation(int or tuple, optional) – 卷积核元素之间的间距
    groups(int, optional) – 从输入通道到输出通道的阻塞连接数
    bias(bool, optional) - 如果bias=True，添加偏置
    
    
"""
class LeNet(nn.Module):
    def __init__(self):
        # nn.Module的子类函数必须在构造函数中执行父类的构造函数
        super(LeNet, self).__init__()
        # nn.Conv2d返回的是一个Conv2d class的一个对象，该类中包含forward函数的实现
        # 当调用self.conv1(input)的时候，就会调用该类的forward函数
        # 第一层conv1卷积层，in_channel=1,output_channel=6,kernel_size=5*5,input_size=32*32,output_size=28*28
        self.conv1 = nn.Conv2d(1, 6, 5)
        # 第二层conv2，output_channel=6, kernel 5*5, output_size=10*10,input_size=14*14
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16*5*5, 120)
        self.fc2 = nn.Linear(120, 80)
        self.fc3 = nn.Linear(80, 10) # 不用增加softmax层，在cross_entropy的Loss中自动增加了Softmax
       
    # 定义了每次执行的 计算步骤。在所有的子类中都需要重写这个函数
    def forward(self,x):
        x = F.max_pool2d(F.relu(self.conv1(x)), 2)   # F.max_pool2d的返回值是一个Variable
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        x = x.view(x.shape[0], -1)  # 返回一个有相同数据但大小不同的tensor。 返回的tensor必须有与原tensor相同的数据和相同数目的元素，但可以有不同的大小。一个tensor必须是连续的contiguous()才能被查看
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

## 2 模型训练与评估类

In [3]:
def timer(func):
    def wrapper(*args, **kwargs):
        start = time.time()
        func(*args,**kwargs)
        end = time.time()
        cost = end - start
        print("Cost time: {} mins.".format(cost/60)) 
    return wrapper

class CNNModel(object):
    def __init__(self, model, train_data, test_data, model_dir, model_name,
                 best_valid_loss=float('inf'), n_split=0.9, batch_size=64, epochs=10):
        self.batch_size = batch_size
        self.epochs = epochs
        self.best_valid_loss = best_valid_loss
        self.model_dir = model_dir
        self.model_name = model_name
        self.n_split = n_split
        
        self.train_data =  train_data
        self.test_data = test_data
        
        self.device = self.get_device()
        self.init_data()
        self.init_iterator()
        self.init_model_path()
        
        self.model = self.init_model(model)
        self.optimizer = self.set_optimizer()
        self.criterion = self.set_criterion()
        
    def get_device(self):
        d = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        return d
    
    def init_data(self):
        n_train = int(len(self.train_data)*self.n_split)
        n_validation = len(self.train_data) - n_train
        self.train_data, self.valid_data = torch.utils.data.random_split(self.train_data, [n_train, n_validation])
    
    def init_iterator(self):
        self.train_iterator = torch.utils.data.DataLoader(self.train_data, shuffle=True, batch_size=self.batch_size)
        self.valid_iterator = torch.utils.data.DataLoader(self.valid_data, batch_size=self.batch_size)
        self.test_iterator = torch.utils.data.DataLoader(self.test_data, batch_size=self.batch_size)
        
    def set_optimizer(self):
        optimizer = optim.Adam(self.model.parameters()) 
        return optimizer
    
    def set_criterion(self):
        criterion = nn.CrossEntropyLoss()
        return criterion
    
    def init_model(self, model):
        if torch.cuda.device_count() > 1:
            model = nn.DataParallel(model)
        model = model.to(self.device)
        return model
        
    def init_model_path(self):
        if not os.path.isdir(self.model_dir):
            os.makedirs(self.model_dir)
        self.model_path = os.path.join(self.model_dir, self.model_name)
        
    # 定义评估函数
    def accu(self, fx, y):
        pred = fx.max(1, keepdim=True)[1]
        correct = pred.eq(y.view_as(pred)).sum()  # 得到该batch的准确度
        acc = correct.float()/pred.shape[0]
        return acc

    def train(self):
        epoch_loss = 0   # 积累变量
        epoch_acc = 0    # 积累变量
        self.model.train()    # 该函数表示PHASE=Train

        for (x,y) in self.train_iterator:  # 拿去每一个minibatch
            x = x.to(self.device)
            y = y.to(self.device)
            self.optimizer.zero_grad()
            fx = self.model(x)           # 进行forward
            loss = self.criterion(fx,y)  # 计算Loss,train_loss
            type(loss)
            acc = self.accu(fx,y)    # 计算精确度，train_accu
            loss.backward()          # 进行BP
            self.optimizer.step()    # 统一更新模型
            epoch_loss += loss.item()
            epoch_acc += acc.item()

        return epoch_loss/len(self.train_iterator),epoch_acc/len(self.train_iterator)

    def evaluate(self, iterator):
        epoch_loss = 0
        epoch_acc = 0
        self.model.eval()
        with torch.no_grad():
            for (x,y) in iterator:
                x = x.to(self.device)
                y = y.to(self.device)
                fx = self.model(x)
                loss = self.criterion(fx,y)
                acc = self.accu(fx,y)
                epoch_loss += loss.item()
                epoch_acc += acc.item()
        return epoch_loss/len(iterator),epoch_acc/len(iterator)
    
    @timer
    def train_fit(self):
        info = 'Epoch:{0} | Train Loss:{1} | Train Acc:{2} | Val Loss:{3} | Val Acc:{4}'
        for epoch in range(self.epochs):
            train_loss, train_acc = self.train()
            valid_loss, valid_acc = self.evaluate(self.valid_iterator)
            if valid_loss < self.best_valid_loss:  # 如果是最好的模型就保存到文件夹
                self.best_valid_loss = valid_loss
                torch.save(self.model.state_dict(), self.model_path)
            print(info.format(epoch+1, train_loss, train_acc, valid_loss, valid_acc))
    
    def get_acc(self):
        self.model.load_state_dict(torch.load(self.model_path))
        test_loss, test_acc = self.evaluate(self.test_iterator)
        print('| Test Loss: {0} | Test Acc: {1} |'.format(test_loss,test_acc))


## 3 数据集的准备

In [4]:
data_trans = transforms.Compose([
    transforms.Resize(32),
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))
])

In [5]:
train_data = datasets.MNIST('data', train=True, download=True, transform=data_trans)
test_data = datasets.MNIST('data', train=False, download=True, transform=data_trans)

In [25]:
len(train_data)

60000

In [26]:
len(test_data)

10000

## 4 模型训练

In [6]:
epochs = 1
n_split = 0.9
batch_size = 64
model_dir = 'models'
best_valid_loss = float('inf')

model_name = "lenet_mnist.pt"
model = LeNet()

obj = CNNModel(model=model, 
               train_data=train_data, 
               test_data=test_data, 
               model_dir=model_dir, 
               model_name=model_name,
               best_valid_loss=best_valid_loss, 
               n_split=n_split, 
               batch_size=batch_size, 
               epochs=epochs)

In [7]:
for name, parameters in model.named_parameters():   # 各层参数及具体数字
    print('name: {}, param: {}'.format(name, parameters))
# for n, c in model.named_children():    # 各层名称与具体定义
#     print("name:{}, children:{}".format(n,c))

name: conv1.weight, param: Parameter containing:
tensor([[[[ 0.0589, -0.0283,  0.1930,  0.1327, -0.0820],
          [-0.0741, -0.0690,  0.0288, -0.0728,  0.0379],
          [ 0.0830,  0.1200,  0.0167,  0.0064,  0.0377],
          [ 0.0174, -0.0979,  0.0465,  0.1862,  0.1280],
          [ 0.1560, -0.0336, -0.0894, -0.1760, -0.1037]]],


        [[[ 0.1907,  0.0524, -0.1414, -0.0559, -0.1851],
          [-0.1228,  0.1052,  0.0046,  0.0253, -0.0967],
          [-0.0429,  0.1239,  0.1373, -0.1639, -0.0494],
          [ 0.0752, -0.0897, -0.1574, -0.1168,  0.1470],
          [-0.0850, -0.1789, -0.0055,  0.0314, -0.1272]]],


        [[[ 0.1110, -0.1656,  0.1073,  0.0823, -0.0452],
          [ 0.1121, -0.1560, -0.0929, -0.1319, -0.0042],
          [ 0.0045,  0.0822,  0.1764,  0.1985, -0.1017],
          [-0.0646,  0.1559,  0.0980, -0.1105,  0.0501],
          [ 0.0624,  0.0212,  0.1089,  0.0460, -0.1877]]],


        [[[ 0.1006,  0.0958,  0.0620,  0.0621, -0.1434],
          [-0.1676,  0.0610

In [8]:
print(obj.device)

cuda


In [9]:
print(obj.model)

DataParallel(
  (module): LeNet(
    (conv1): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
    (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
    (fc1): Linear(in_features=400, out_features=120, bias=True)
    (fc2): Linear(in_features=120, out_features=80, bias=True)
    (fc3): Linear(in_features=80, out_features=10, bias=True)
  )
)


In [23]:
# print(len(list(obj.train_iterator)))   # 844 iterator；每次iterator 64个样本（batch_size);每个样本（32*32）
# for (x, y) in obj.train_iterator:
#     print(x, len(x))
#     print("***"*30)
#     print(x[0][0][0], len(x[0][0][0]))
#     print("***"*30)
#     print(y, len(y))
#     input()

def accu(fx, y):
    pred = fx.max(1, keepdim=True)[1]
    print("Pred:", pred, pred.shape)
    z = y.view_as(pred)   # 把y的size变成和pred的一样，后面好比较
    print("Z:", z, len(z))
    print("pred eq", pred.eq(z))
    correct = pred.eq(y.view_as(pred)).sum()  # 得到该batch的准确度
    print("Correct:", correct)
    acc = correct.float()/pred.shape[0]
    return acc

epoch_loss = 0   # 积累变量
epoch_acc = 0    # 积累变量
obj.model.train() 
for (x, y) in obj.train_iterator:  
    print("X:",x, x.shape)       # x  torch.Size([64, 1, 32, 32])
    print("Y:",y, y.shape)       # y torch.Size([64]
    x = x.to(obj.device)
    y = y.to(obj.device)
    print(x, x.shape)
    print(y, y.shape)
    
    obj.optimizer.zero_grad()   # 清空梯度计算损失
    fx = obj.model(x)           # 进行forward
    print("Fx:", fx, fx.shape)   # torch.Size([64, 10]  # 每个类的概率
    input() 
    loss = obj.criterion(fx,y)  # 计算Loss,train_loss
    print("Loss:", loss, type(loss))
    acc = accu(fx,y)     # 计算精确度，train_accu
    print("Acc:", acc)
    
    loss.backward()          # 进行BP
    obj.optimizer.step()    # 统一更新模型 #所有的optimizer都实现了step()方法，这个方法会更新所有的参数,这是大多数optimizer所支持的简化版本。一旦梯度被如backward()之类的函数计算好后，我们就可以调用这个函数。
    epoch_loss += loss.item()
    epoch_acc += acc.item()
    print("epoch Loss:", epoch_loss)
    print("epoch Acc:", epoch_acc)
    input("2:")
    

X: tensor([[[[-0.4242, -0.4242, -0.4242,  ..., -0.4242, -0.4242, -0.4242],
          [-0.4242, -0.4242, -0.4242,  ..., -0.4242, -0.4242, -0.4242],
          [-0.4242, -0.4242, -0.4242,  ..., -0.4242, -0.4242, -0.4242],
          ...,
          [-0.4242, -0.4242, -0.4242,  ..., -0.4242, -0.4242, -0.4242],
          [-0.4242, -0.4242, -0.4242,  ..., -0.4242, -0.4242, -0.4242],
          [-0.4242, -0.4242, -0.4242,  ..., -0.4242, -0.4242, -0.4242]]],


        [[[-0.4242, -0.4242, -0.4242,  ..., -0.4242, -0.4242, -0.4242],
          [-0.4242, -0.4242, -0.4242,  ..., -0.4242, -0.4242, -0.4242],
          [-0.4242, -0.4242, -0.4242,  ..., -0.4242, -0.4242, -0.4242],
          ...,
          [-0.4242, -0.4242, -0.4242,  ..., -0.4242, -0.4242, -0.4242],
          [-0.4242, -0.4242, -0.4242,  ..., -0.4242, -0.4242, -0.4242],
          [-0.4242, -0.4242, -0.4242,  ..., -0.4242, -0.4242, -0.4242]]],


        [[[-0.4242, -0.4242, -0.4242,  ..., -0.4242, -0.4242, -0.4242],
          [-0.4242, -0.


Loss: tensor(2.2915, device='cuda:0', grad_fn=<NllLossBackward>) <class 'torch.Tensor'>
Pred: tensor([[8],
        [8],
        [8],
        [8],
        [8],
        [8],
        [8],
        [8],
        [8],
        [8],
        [8],
        [8],
        [8],
        [8],
        [8],
        [8],
        [8],
        [5],
        [8],
        [8],
        [8],
        [8],
        [8],
        [8],
        [5],
        [8],
        [8],
        [8],
        [8],
        [5],
        [8],
        [8],
        [8],
        [8],
        [8],
        [8],
        [8],
        [8],
        [8],
        [8],
        [5],
        [8],
        [8],
        [8],
        [8],
        [5],
        [8],
        [8],
        [8],
        [8],
        [8],
        [8],
        [8],
        [8],
        [8],
        [5],
        [8],
        [8],
        [8],
        [8],
        [8],
        [8],
        [8],
        [8]], device='cuda:0') torch.Size([64, 1])
Z: tensor([[4],
        [7],
      

KeyboardInterrupt: 

In [12]:
obj.train_fit()

Epoch:1 | Train Loss:0.2314406042141725 | Train Acc:0.930002221563981 | Val Loss:0.09761913215860407 | Val Acc:0.9704122340425532
Cost time: 0.642307702700297 mins.


In [13]:
obj.get_acc()

| Test Loss: 0.07367058542029113 | Test Acc: 0.9765127388535032 |


In [None]:
# loss的内部是怎么计算的？