# GPU

In [1]:
# ! 表示执行bash代码
!nvidia-smi

Sat Apr 30 20:24:54 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.52       Driver Version: 511.79       CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:08:00.0  On |                  N/A |
|  0%   48C    P8    20W / 220W |   1077MiB /  8192MiB |     13%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
# 计算设备，所有框架默认CPU，需要指定
import torch
from torch import nn

In [3]:
torch.device('cpu'), torch.cuda.device('cuda'), torch.cuda.device('cuda:0')

(device(type='cpu'),
 <torch.cuda.device at 0x7f360dff0e50>,
 <torch.cuda.device at 0x7f360dfe6400>)

In [4]:
# 查询可用GPU数量
torch.cuda.device_count()

1

In [5]:
# 下面两个函数尝试获取GPU
def try_gpu(i=0):
    if torch.cuda.device_count() >= i + 1:
        return torch.device(f'cuda:{i}')
    return torch.device('cpu')

def try_all_gpus():
    devices = [
        torch.device(f'cuda:{i}') for i in range(torch.cuda.device_count())]
    return devices if devices else [torch.device('cpu')]

try_gpu(), try_gpu(2), try_all_gpus()

(device(type='cuda', index=0),
 device(type='cpu'),
 [device(type='cuda', index=0)])

In [6]:
# 查询张量所在设备
x = torch.tensor([1, 2, 3],device=try_gpu())
y = torch.tensor([1, 2, 3])
x.device, y.device

(device(type='cuda', index=0), device(type='cpu'))

In [7]:
# x = torch.zeros((2, 3), device=try_gpu())
x + y
# 需要保证x y 在相同设备上计算，同一cpu或者同一gpu才可以计算

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!

In [8]:
# cpu挪到gpu上
y = y.cuda(0)

In [9]:
x.device, y.device

(device(type='cuda', index=0), device(type='cuda', index=0))

In [10]:
x + y

tensor([2, 4, 6], device='cuda:0')

#### 不同设备间挪数据是很慢，尤其GPU到CPU，所以一般同一块cpu或者同一块GPU做运算

#### 神经网络与GPU

In [11]:
net = nn.Sequential(nn.Linear(3, 1))
net = net.to(device=try_gpu())

In [12]:
# 确认参数模型存储在同一个GPU上
net[0].weight.data.device

device(type='cuda', index=0)

In [13]:
x = torch.ones(2, 3, device=try_gpu())
net(x)

tensor([[-0.2222],
        [-0.2222]], device='cuda:0', grad_fn=<AddmmBackward0>)

# 购买建议
- 买最新的，最贵的
- 显存原则上越大越好，但是越大越贵

# QA
#### 显存不够？batch_size 变小占用率低？
- 只能变小模型

#### 长时间满负荷？
- 满负荷没问题，主要看稳定，自用的话不要超过80度，最好别90度（此时CPU也会降频），否则可能烧卡

#### GPU训练何时 data to gpu好？
- 一般在net前，因为一般很多data处理gpu上不一定好，
- 也有特殊情况，比如图片的一些处理依赖矩阵运算可能效果好，最好办法测试下cpu和gpu处理数据的快慢决定，此时gpu预处理也占用gpu性能
- 如果数据读取比gpu快的话，尽量cpu做，计算留给网络

#### tensor.cuda 和to(device )区别？
- module只能to device，也是这的唯一用法，cuda是tensor使用的

#### GPU加速不明显？占用60%？
- 可能设备不行，其实60也可以了，不一定GPU一定比CPU好
- 一般CNN能到90+

#### GPU上推理是不是可能提升性能上限？
- 推理内存不是关键，可以加内存，来提高batch size大小

# PyTroch 神经网络基础        
## 层和块

In [14]:
import torch
from torch import nn
from torch.nn import functional as F

net = nn.Sequential(nn.Linear(20, 256), nn.ReLU(), nn.Linear(256, 10))

X = torch.rand(2, 20)
net(X)

tensor([[ 0.0588,  0.2057, -0.0397,  0.1319, -0.2440, -0.0127,  0.1214,  0.1394,
         -0.0091, -0.0613],
        [ 0.1725,  0.2061, -0.0579,  0.0874, -0.1618,  0.1379,  0.2155,  0.0401,
          0.0130,  0.0270]], grad_fn=<AddmmBackward0>)

In [15]:
# nn.Sequential 定义了一种特殊的Module, 任何一个层和神经网络都是module的子类
# 自定义块
class MLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.hidden = nn.Linear(20, 256)
        self.out = nn.Linear(256, 10)
        
    def forward(self, x):
        return self.out(F.relu(self.hidden(X)))


In [16]:
# 实例化多层感知机的层，然后每次调用正向传播函数时调用这些层
net = MLP()
net(X)

tensor([[ 3.0427e-01, -1.0922e-01, -6.7308e-02,  5.1343e-02,  1.7009e-01,
          8.6855e-05,  6.1816e-02,  2.9546e-01,  1.0539e-01, -4.3151e-02],
        [ 2.4337e-01, -3.5482e-02, -7.5882e-02,  1.0736e-01,  1.0195e-01,
         -4.3157e-02,  5.3373e-02,  1.5682e-01,  9.6066e-02, -7.9189e-02]],
       grad_fn=<AddmmBackward0>)

In [17]:
# 顺序块
class MySequential(nn.Module):
    def __init__(self, *args):
        super().__init__()
        for block in args:
            self._modules[block] = block
    
    def forward(self, x):
        for block in self._modules.values():
            x = block(x)
        return x

net = MySequential(nn.Linear(20, 256), nn.ReLU(), nn.Linear(256, 10))
# X = torch.rand(2, 20)
net(X)
    

tensor([[ 0.2889, -0.1662, -0.1983,  0.0204, -0.0609,  0.1689,  0.0609,  0.0702,
          0.0625, -0.0872],
        [ 0.2288, -0.0725, -0.2015,  0.1356, -0.0269,  0.0200, -0.0056,  0.1747,
          0.1357, -0.1314]], grad_fn=<AddmmBackward0>)

In [18]:
class FixedHiddenMLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.rand_weight = torch.rand((20, 20), requires_grad=False)
        self.linear = nn.Linear(20, 20)
    
    def forward(self, X):
        X = self.linear(X)
        X = self.linear(X)
        X = F.relu(torch.mm(X, self.rand_weight) + 1)
        X = self.linear(X)
        while X.abs().sum() > 1:
            X /= 2
        return X.sum()
    
net = FixedHiddenMLP()
net(X)
    

tensor(-0.0472, grad_fn=<SumBackward0>)

In [19]:
x = torch.randn(2,20)
net(x)

tensor(-0.1504, grad_fn=<SumBackward0>)

In [19]:
# 混合搭配各种组合块的方法，嵌套
class NestMLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(20, 64), nn.ReLU(),
                                nn.Linear(64, 32), nn.ReLU())
        self.linear = nn.Linear(32, 16)
        
    def forward(self, X):
        return self.linear(self.net(X))
    
chimera = nn.Sequential(NestMLP(), nn.Linear(16, 20), FixedHiddenMLP())
chimera(x)

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat1 in method wrapper_addmm)

## 参数管理
首先关注具有参隐藏层的多层感知机

In [22]:
import torch
from torch import nn

net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 1))
X = torch.randn(size=(2, 4))
net(X)

tensor([[0.1431],
        [0.4060]], grad_fn=<AddmmBackward0>)

In [29]:
# 参数访问
print(net[2].state_dict())

OrderedDict([('weight', tensor([[ 0.3489,  0.3034,  0.2152,  0.0363, -0.1470, -0.0586,  0.3035, -0.1088]])), ('bias', tensor([0.0303]))])


In [30]:
# 目标参数
print(type(net[2].bias))
print(net[2].bias)
print(net[2].bias.data)

<class 'torch.nn.parameter.Parameter'>
Parameter containing:
tensor([0.0303], requires_grad=True)
tensor([0.0303])


In [31]:
# 此时还没开始反向传播，因此NONE
net[2].weight.grad == None

True

In [34]:
# 一次性访问所有参数, *表示解包的意思
print(*[(name, param.shape) for name, param in net[0].named_parameters()])
print(*[(name, param.shape) for name, param in net.named_parameters()])

('weight', torch.Size([8, 4])) ('bias', torch.Size([8]))
('0.weight', torch.Size([8, 4])) ('0.bias', torch.Size([8])) ('2.weight', torch.Size([1, 8])) ('2.bias', torch.Size([1]))


In [35]:
net.state_dict()['0.weight'].data

tensor([[ 0.0143,  0.1397, -0.4827, -0.4155],
        [-0.1635, -0.2532,  0.2536, -0.1404],
        [-0.0105,  0.0315, -0.3571, -0.1983],
        [-0.2115,  0.1083,  0.4309, -0.0385],
        [ 0.0081, -0.2564, -0.3492, -0.1623],
        [-0.0392, -0.3294,  0.0745, -0.0602],
        [ 0.2812, -0.2586,  0.0294, -0.0144],
        [-0.2938,  0.1303,  0.3845, -0.0615]])

In [36]:
# 从嵌套块中收集参数
def block1():
    return nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 4), nn.ReLU())

def block2():
    net = nn.Sequential()
    for i in range(4):
        # 此处跟直接名字括号里面写的区别在于没有了0,1，相当于显式给每次字符串命名
        net.add_module(f'block {i}', block1())
    
    return net

rgnet= nn.Sequential(block2(), nn.Linear(4, 1))
rgnet(X)

tensor([[0.1001],
        [0.0998]], grad_fn=<AddmmBackward0>)

In [37]:
rgnet(X)

tensor([[0.1001],
        [0.0998]], grad_fn=<AddmmBackward0>)

In [40]:
print(rgnet)

Sequential(
  (0): Sequential(
    (block 0): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 1): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 2): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 3): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
  )
  (1): Linear(in_features=4, out_features=1, bias=True)
)


In [42]:
# 内置初始化
def init_normal(m):
    if type(m) == nn.Linear: 
        # _表示inplace操作，会直接替换值而不返回
        nn.init.normal_(m.weight, mean=0, std=0.01)
        nn.init.zeros_(m.bias)
        
# apply就是遍历操作，对net中的layer进行for loop
net.apply(init_normal)

net[0].weight.data[0], net[0].bias.data[0]

(tensor([-0.0053,  0.0044, -0.0092,  0.0059]), tensor(0.))

In [45]:
def init_constant(m):
    if type(m) == nn.Linear:
        nn.init.constant_(m.weight, 1)
        nn.init.zeros_(m.bias)

# net.apply(init_constant)
# 实际使用不能初始化为常数，否则不能训练
net[0].weight.data[0], net[0].bias.data[0]

(tensor([1., 1., 1., 1.]), tensor(0.))

In [50]:
# 对某些不同的块使用不同的初始化方法
def xavier(m):
    if type(m) == nn.Linear:
        nn.init.xavier_uniform_(m.weight)
        
def init_42(m):
    if type(m) == nn.Linear:
        nn.init.constant_(m.weight, 42)

net[0].apply(xavier)
net[2].apply(init_42)
print(net[0].weight.data[0])
print(net[2].weight.data)

tensor([-0.4052,  0.6939,  0.5479,  0.4905])
tensor([[42., 42., 42., 42., 42., 42., 42., 42.]])


In [52]:
# 自定义初始化
def my_init(m):
    if type(m) == nn.Linear:
        print(
            "Init",
            *[(name, param.shape) for name, param in m.named_parameters()][0])
        nn.init.uniform_(m.weight, -10, 10)
        m.weight.data *= m.weight.data.abs() >= 5
        
net.apply(my_init)
net[0].weight[:2]

Init weight torch.Size([8, 4])
Init weight torch.Size([1, 8])


tensor([[-0.0000,  7.5553,  0.0000, -8.3238],
        [ 0.0000,  0.0000,  0.0000, -6.5870]], grad_fn=<SliceBackward0>)

In [56]:
net[0].weight.data[:] += 1
net[0].weight.data[0, 1] = 42
net[0].weight.data[0]

tensor([43.0000, 42.0000,  3.0000, -5.3238])

In [62]:
# 参数绑定
shared = nn.Linear(8, 8)
net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), shared, nn.ReLU(), shared,
                     nn.ReLU(), nn.Linear(8, 1))
net(X)
print(net[2].weight.data[0] == net[4].weight.data[0])
net[2].weight.data[0, 0] = 100
print(net[2].weight.data[0] == net[4].weight.data[0])
id(net[2]), id(net[4])

tensor([True, True, True, True, True, True, True, True])
tensor([True, True, True, True, True, True, True, True])


(139865434242832, 139865434242832)

## 自定义层
### 构造一个没有任何参数的自定义层

In [63]:
import torch
from torch import nn
import torch.nn.functional as F

In [64]:
class CenteredLayer(nn.Module):
    def __init__(self):
        super().__init__()
        
    def forward(self, X):
        return X - X.mean()
    
layer = CenteredLayer()
layer(torch.FloatTensor([1, 2, 3, 4, 5]))

tensor([-2., -1.,  0.,  1.,  2.])

In [65]:
# 将层作为组件合并到构建更复杂的模型中
net = nn.Sequential(nn.Linear(8, 128), CenteredLayer())
Y = net(torch.randn(4, 8))
Y.mean()

tensor(9.3132e-10, grad_fn=<MeanBackward0>)

In [67]:
# 带参数的图层
class MyLinear(nn.Module):
    def __init__(self, in_units, units):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(in_units, units))
        self.bias = nn.Parameter(torch.randn(units,))
        
    def forward(self, X):
        linear = torch.matmul(X, self.weight.data) + self.bias.data
        return F.relu(linear)
    
dense = MyLinear(5, 3)
dense.weight

Parameter containing:
tensor([[ 0.0340,  0.6473,  1.4968],
        [-0.1911,  1.4998,  0.4955],
        [-0.3826,  1.7768,  0.2501],
        [ 1.0632, -0.0716, -0.0891],
        [-0.3101, -0.5089, -0.6608]], requires_grad=True)

In [68]:
# 使用自定义层直接执行正向传播计算
dense(torch.rand(2, 5))

tensor([[1.9945, 1.2891, 1.5993],
        [1.6039, 1.4188, 0.5859]])

In [69]:
# 使用自定义层构建模型
net = nn.Sequential(MyLinear(64, 8), MyLinear(8, 1))
net(torch.randn(2, 64))

tensor([[ 0.0000],
        [25.6525]])

# 读写文件
加载和保存张量

In [1]:
import torch
from torch import nn
from torch.nn import functional as F

In [2]:
x = torch.arange(4)

In [4]:
torch.save(x, 'x-file')

In [6]:
!ls

1_basic_numpy_tensor.ipynb  7_overfiting_underfiting.ipynb
2_torch_tensor.ipynb	    8_数值稳定性.ipynb
3_matrix.ipynb		    9_gpu_pytorch_base.ipynb
4_linear_regression.ipynb   README.md
5_softmax.ipynb		    data
6_percpection.ipynb	    x-file


In [7]:
x2 = torch.load("x-file")

In [8]:
x2

tensor([0, 1, 2, 3])

In [9]:
# 存储一个张量，然后把他们读回内存
y = torch.zeros(4)
torch.save([x, y], 'x-files')
x2, y2 = torch.load('x-files')
(x2, y2)

(tensor([0, 1, 2, 3]), tensor([0., 0., 0., 0.]))

In [12]:
# 写入或读取从字符串映射到张量的字典
mydict = {'x': x, 'y': y}
torch.save(mydict, 'mydict')
mydict2 = torch.load('mydict')
mydict2

{'x': tensor([0, 1, 2, 3]), 'y': tensor([0., 0., 0., 0.])}

In [18]:
# 保存和加载模型
class MLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.hidden = nn.Linear(20, 256)
        self.output = nn.Linear(256, 2)
        
    def forward(self, x):
        return self.output(F.relu(self.hidden(x)))

In [19]:
net = MLP()
X = torch.randn(size = (2, 20))
Y = net(X)
Y

tensor([[ 0.0285, -0.6572],
        [ 0.0433, -0.0883]], grad_fn=<AddmmBackward0>)

In [20]:
# 将模型的参数保存为一个叫做 mlp.params的文件
torch.save(net.state_dict(), 'mlp.params')

### load时，需要先实例化多层感知机的一个备份，然后再读取

In [21]:
clone = MLP()
clone.load_state_dict(torch.load('mlp.params'))
clone.eval()

MLP(
  (hidden): Linear(in_features=20, out_features=256, bias=True)
  (output): Linear(in_features=256, out_features=2, bias=True)
)

In [22]:
Y_clone = clone(X)
Y_clone == Y

tensor([[True, True],
        [True, True]])

## QA
#### 做one-hot内存爆了怎么办，如100w个不同的string，类别变量变成伪变量时
- 可以存稀疏矩阵，
- 很多变量时，可以考虑别的办法，如bag for words，等

#### MLP层数有什么讲究嘛？变一般这种？
- 之前讲过 