## pytorch实现两层神经网络
- 文档：https://pytorch.org/docs/torch

In [1]:
import torch

In [2]:
torch.__version__

'1.0.1'

## 用numpy实现两层神经网络
一个全连接ReLU神经网络，一个隐藏层，没有bias。用来从x预测y，使用L2 Loss。
- $h = W_1X$
- $a = max(0, h)$
- $y_{hat} = W_2a$

这一实现完全使用numpy来计算前向神经网络，loss，和反向传播。
- forward pass
- loss
- backward pass

numpy ndarray是一个普通的n维array。它不知道任何关于深度学习或者梯度(gradient)的知识，也不知道计算图(computation graph)，只是一种用来计算数学运算的数据结构。

In [3]:
import numpy as np

In [18]:
# 样本个数，输入维度，hinton, 输出维度
N, D_in, H, D_out = 64, 1000, 100, 10

# 随机创建一些训练数据
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)
print("训练集维度：", len(x[1]))

w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for it in range(501):
    # 向前传播
    h = x.dot(w1)   # N * H
    # X 与 Y 逐位比较取其大者, 至少接收两个参数
    h_relu = np.maximum(h, 0)  # N * H
    y_pred = h_relu.dot(w2)  # N * D_out
    
    # 计算损失
    loss = np.square(y_pred - y).sum()
    # print(it, loss)
    
    # 反向传播
    # 计算梯度
    grad_y_pred = 2.0 * (y_pred - y)  # N * D_out
    grad_w2 = h_relu.T.dot(grad_y_pred)
    grad_h_relu = grad_y_pred.dot(w2.T)
    grad_h = grad_h_relu.copy()
    grad_h[h<0] = 0
    grad_w1 = x.T.dot(grad_h)
    
    # update weights of w1 and w2
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2
    if it % 50 == 0:
        print("itor: {} | loss:{}".format(it, loss))

训练集维度： 1000
itor: 0 | loss:36379916.30562085
itor: 50 | loss:11522.84048949815
itor: 100 | loss:312.54151381266536
itor: 150 | loss:13.699522200570192
itor: 200 | loss:0.7797040963869617
itor: 250 | loss:0.05274861775962181
itor: 300 | loss:0.004007803190526088
itor: 350 | loss:0.00032858051495920253
itor: 400 | loss:2.82897568570412e-05
itor: 450 | loss:2.5142220375801123e-06
itor: 500 | loss:2.2822002515655443e-07


## PyTorch: Tensors
- 使用PyTorch tensors来创建前向神经网络，计算损失，以及反向传播。
- 一个PyTorch Tensor很像一个numpy的ndarray。但是它和numpy ndarray最大的区别是，PyTorch Tensor可以在CPU或者GPU上运算。如果想要在GPU上运算，就需要把Tensor换成cuda类型。

In [35]:
N, D_in, H, D_out = 64, 1000, 100, 10

# 随机创建一些训练数据
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

w1 = torch.randn(D_in, H)
w2 = torch.randn(H, D_out)

learning_rate = 1e-6
for it in range(501):
    # Forward pass
    h = x.mm(w1) # N * H
    # clamp(min=x)小于x的等于x，＞x等于本身
    h_relu = h.clamp(min=0) # N * H
    y_pred = h_relu.mm(w2) # N * D_out
    
    # compute loss
    loss = (y_pred - y).pow(2).sum().item()
    
    # Backward pass
    # compute the gradient
    grad_y_pred = 2.0 * (y_pred - y)
    ## mm是点乘
    grad_w2 = h_relu.t().mm(grad_y_pred)
    grad_h_relu = grad_y_pred.mm(w2.t())
    grad_h = grad_h_relu.clone()
    grad_h[h<0] = 0
    grad_w1 = x.t().mm(grad_h)
    
    # update weights of w1 and w2
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2
    if it % 50 == 0:
        print("itor: {} | loss:{}".format(it, loss))

itor: 0 | loss:26421104.0
itor: 50 | loss:8897.6162109375
itor: 100 | loss:248.85116577148438
itor: 150 | loss:12.546610832214355
itor: 200 | loss:0.842868447303772
itor: 250 | loss:0.06756731867790222
itor: 300 | loss:0.006269404664635658
itor: 350 | loss:0.0008211490930989385
itor: 400 | loss:0.0001843837380874902
itor: 450 | loss:6.254202889977023e-05
itor: 500 | loss:2.910074545070529e-05


In [44]:
a = torch.tensor([[-1,2,3]])
a

tensor([[-1,  2,  3]])

In [45]:
h_relu = a.clamp(min=0)
h_relu

tensor([[0, 2, 3]])

## PyTorch: nn

- 这次我们使用PyTorch中nn这个库来构建网络。
- 用PyTorch autograd来构建计算图和计算gradients，
- 然后PyTorch会帮我们自动计算gradient。

In [48]:
import torch.nn as nn

N, D_in, H, D_out = 64, 1000, 100, 10

# 随机创建一些训练数据
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H, bias=False), # w_1 * x + b_1
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out, bias=False),
)

torch.nn.init.normal_(model[0].weight)
torch.nn.init.normal_(model[2].weight)

# model = model.cuda()

loss_fn = nn.MSELoss(reduction='sum')

learning_rate = 1e-6
for it in range(501):
    # Forward pass
    y_pred = model(x) # model.forward() 
    
    # compute loss
    loss = loss_fn(y_pred, y) # computation graph
#     print(it, loss.item())
    
    # Backward pass
    loss.backward()
    
    # update weights of w1 and w2
    with torch.no_grad():
        for param in model.parameters(): # param (tensor, grad)
            param -= learning_rate * param.grad
            
    model.zero_grad()
    if it % 50 == 0:
        print("itor: {} | loss:{}".format(it, loss))

itor: 0 | loss:34639348.0
itor: 50 | loss:14032.80859375
itor: 100 | loss:599.4075927734375
itor: 150 | loss:46.61559295654297
itor: 200 | loss:4.714935302734375
itor: 250 | loss:0.5426292419433594
itor: 300 | loss:0.06672129034996033
itor: 350 | loss:0.00874057225883007
itor: 400 | loss:0.0014095803489908576
itor: 450 | loss:0.00035330350510776043
itor: 500 | loss:0.0001314057590207085


## PyTorch: optim

- 这一次我们不再手动更新模型的weights,而是使用optim这个包来帮助我们更新参数。
- optim这个package提供了各种不同的模型优化方法，包括SGD+momentum, RMSProp, Adam等等。

In [50]:
import torch.nn as nn

N, D_in, H, D_out = 64, 1000, 100, 10

# 随机创建一些训练数据
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H, bias=False), # w_1 * x + b_1
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out, bias=False),
)

torch.nn.init.normal_(model[0].weight)
torch.nn.init.normal_(model[2].weight)

# model = model.cuda()

loss_fn = nn.MSELoss(reduction='sum')
# learning_rate = 1e-4
# optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

learning_rate = 1e-6
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

for it in range(500):
    # Forward pass
    y_pred = model(x) # model.forward() 
    
    # compute loss
    loss = loss_fn(y_pred, y) # computation graph
#     print(it, loss.item())

    optimizer.zero_grad()
    # Backward pass
    loss.backward()
    
    # update model parameters
    optimizer.step()
    if it % 50 == 0:
        print("itor: {} | loss:{}".format(it, loss))


itor: 0 | loss:25604472.0
itor: 50 | loss:13182.306640625
itor: 100 | loss:355.7388916015625
itor: 150 | loss:17.296062469482422
itor: 200 | loss:1.1180250644683838
itor: 250 | loss:0.08529026061296463
itor: 300 | loss:0.0075076608918607235
itor: 350 | loss:0.0009207671391777694
itor: 400 | loss:0.0002065193111775443
itor: 450 | loss:7.471397111658007e-05


PyTorch: 自定义 nn Modules
--------------------------

我们可以定义一个模型，这个模型继承自nn.Module类。如果需要定义一个比Sequential模型更加复杂的模型，就需要定义nn.Module模型。

In [51]:
import torch.nn as nn

N, D_in, H, D_out = 64, 1000, 100, 10

# 随机创建一些训练数据
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

class TwoLayerNet(torch.nn.Module):
    def __init__(self, D_in, H, D_out):
        super(TwoLayerNet, self).__init__()
        # define the model architecture
        self.linear1 = torch.nn.Linear(D_in, H, bias=False)
        self.linear2 = torch.nn.Linear(H, D_out, bias=False)
    
    def forward(self, x):
        y_pred = self.linear2(self.linear1(x).clamp(min=0))
        return y_pred

model = TwoLayerNet(D_in, H, D_out)
loss_fn = nn.MSELoss(reduction='sum')
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for it in range(500):
    # Forward pass
    y_pred = model(x) # model.forward() 
    
    # compute loss
    loss = loss_fn(y_pred, y) # computation graph
    print(it, loss.item())

    optimizer.zero_grad()
    # Backward pass
    loss.backward()
    
    # update model parameters
    optimizer.step()
    if it % 50 == 0:
        print("itor: {} | loss:{}".format(it, loss))

0 733.0379028320312
itor: 0 | loss:733.0379028320312
1 715.0055541992188
2 697.4888916015625
3 680.5151977539062
4 664.1006469726562
5 648.2057495117188
6 632.8078002929688
7 617.8178100585938
8 603.2796020507812
9 589.1598510742188
10 575.4701538085938
11 562.09765625
12 549.1077270507812
13 536.4368286132812
14 524.1356201171875
15 512.1729736328125
16 500.55474853515625
17 489.3145446777344
18 478.4518127441406
19 467.8425598144531
20 457.54962158203125
21 447.5525817871094
22 437.77862548828125
23 428.2828674316406
24 419.04010009765625
25 410.01165771484375
26 401.2219543457031
27 392.6513977050781
28 384.2953796386719
29 376.14251708984375
30 368.1478576660156
31 360.3475036621094
32 352.6905517578125
33 345.2397766113281
34 337.96453857421875
35 330.8605041503906
36 323.9277648925781
37 317.14654541015625
38 310.4918212890625
39 303.96148681640625
40 297.5679016113281
41 291.3141174316406
42 285.1827087402344
43 279.1679992675781
44 273.2625732421875
45 267.4571228027344
46 261.

365 0.0008798656053841114
366 0.0008372166194021702
367 0.0007965742261148989
368 0.0007578626973554492
369 0.0007209571776911616
370 0.0006857806001789868
371 0.000652278249617666
372 0.0006203744560480118
373 0.0005900075775571167
374 0.0005610770895145833
375 0.0005335356690920889
376 0.0005072933272458613
377 0.0004823018389288336
378 0.00045850162860006094
379 0.0004358526784926653
380 0.00041428091935813427
381 0.00039375151391141117
382 0.00037419970612972975
383 0.0003555973235052079
384 0.00033788266591727734
385 0.0003210420545656234
386 0.0003050037776120007
387 0.00028973701409995556
388 0.0002752211585175246
389 0.0002614006807561964
390 0.0002482580894138664
391 0.00023575864906888455
392 0.0002238633023807779
393 0.00021254907187540084
394 0.000201794522581622
395 0.0001915686298161745
396 0.00018184821237809956
397 0.00017260605818592012
398 0.0001638112444197759
399 0.00015545614587608725
400 0.0001475143217248842
itor: 400 | loss:0.0001475143217248842
401 0.0001399632