### 作業目標: 使用Pytorch進行微分與倒傳遞
這份作業我們會實作微分與倒傳遞以及使用Pytorch的Autograd。

### 使用Pytorch實作微分與倒傳遞

這裡我們很簡單的實作兩層的神經網路進行回歸問題，其中loss function為L2 loss

$$
L2\_loss = (y_{pred}-y)^2
$$

兩層經網路如下所示
$$
y_{pred} = ReLU(XW_1)W_2
$$

In [1]:
import torch
device = torch.device('cpu')

In [2]:
# N: batch size
# D_in: input dimension
# H: hidden dimension
# D_out: output dimension
N, D_in, H, D_out = 64, 1000, 100, 10

# 隨機生成x, y
x = torch.randn((N,D_in)).to(device)
y = torch.randn((N,D_out)).to(device)

# 初始化weight W1, W2
W1 = torch.randn((D_in,H)).to(device)
W2 = torch.randn((H,D_out)).to(device)

# 設置learning rate
learning_rate = 1e-6

# 訓練500個epoch
for t in range(500):  
    y_pred = torch.matmul(torch.relu(torch.matmul(x, W1)), W2)

    # 計算loss
    loss = ((y_pred - y)**2).sum()
    print(t, loss.item())

    # 倒傳遞: 計算W1與W2對loss的微分(梯度)
    L2_loss_grad = 2. * (y_pred - y)
    W2_grad = torch.relu(torch.matmul(x, W1)).T.matmul(L2_loss_grad)
    h_grad = L2_loss_grad.matmul(W2.T) * (torch.matmul(x, W1) > 0.)
    W1_grad = x.T.matmul(h_grad)

    # 參數更新
    W1.data -= learning_rate * W1_grad
    W2.data -= learning_rate * W2_grad

0 29573022.0
1 23184328.0
2 20624434.0
3 18734586.0
4 16096101.0
5 12752184.0
6 9217782.0
7 6251038.5
8 4074380.5
9 2651278.5
10 1759594.125
11 1214082.75
12 875682.25
13 659963.5
14 516459.125
15 416554.21875
16 343759.28125
17 288675.71875
18 245477.109375
19 210791.109375
20 182370.4375
21 158731.3125
22 138828.125
23 121921.3515625
24 107449.96875
25 94996.0234375
26 84236.9375
27 74901.5859375
28 66760.2578125
29 59638.046875
30 53386.91796875
31 47886.453125
32 43034.9609375
33 38740.41015625
34 34931.234375
35 31547.79296875
36 28536.154296875
37 25849.71484375
38 23446.892578125
39 21292.970703125
40 19360.87109375
41 17624.69921875
42 16062.65625
43 14654.6220703125
44 13383.599609375
45 12234.2421875
46 11194.0556640625
47 10251.1318359375
48 9395.69140625
49 8619.2880859375
50 7913.21240234375
51 7270.56640625
52 6684.9423828125
53 6150.68603515625
54 5663.13330078125
55 5218.0107421875
56 4810.6806640625
57 4438.47412109375
58 4097.91748046875
59 3785.6748046875
60 3499.423

470 6.110611866461113e-05
471 6.0285849031060934e-05
472 5.953609070274979e-05
473 5.8732053730636835e-05
474 5.797452831757255e-05
475 5.701140980818309e-05
476 5.639107621391304e-05
477 5.568465348915197e-05
478 5.48773241462186e-05
479 5.407694334280677e-05
480 5.3541891247732565e-05
481 5.3209387260721996e-05
482 5.2354811487020925e-05
483 5.18041561008431e-05
484 5.100627458887175e-05
485 5.0374233978800476e-05
486 4.993880065740086e-05
487 4.903002263745293e-05
488 4.875361992162652e-05
489 4.7962985263438895e-05
490 4.7450215788558125e-05
491 4.6704641135875136e-05
492 4.6305416617542505e-05
493 4.59054863313213e-05
494 4.5285007217898965e-05
495 4.47628008259926e-05
496 4.417887976160273e-05
497 4.372231705929153e-05
498 4.3216681660851464e-05
499 4.246036769472994e-05


### 使用Pytorch的Autograd

In [3]:
import torch
device = torch.device('cpu')

In [4]:
# N: batch size
# D_in: input dimension
# H: hidden dimension
# D_out: output dimension
N, D_in, H, D_out = 64, 1000, 100, 10

# 隨機生成x, y
x = torch.randn((N,D_in)).to(device)
y = torch.randn((N,D_out)).to(device)

# 初始化weight W1, W2
W1 = torch.randn((D_in,H), requires_grad=True).to(device)
W2 = torch.randn((H,D_out), requires_grad=True).to(device)

# 設置learning rate
learning_rate = 1e-6

# 訓練500個epoch
for t in range(500):
    # 向前傳遞: 計算y_pred
    y_pred = torch.matmul(torch.relu(torch.matmul(x, W1)), W2)
  
    # 計算loss
    loss = torch.square(y_pred - y).sum()
    print(t, loss.item())

    # 倒傳遞: 計算W1與W2對loss的微分(梯度)
    loss.backward()

    # 參數更新: 這裡再更新參數時，我們不希望更新參數的計算也被紀錄微分相關的資訊，因此使用torch.no_grad()
    with torch.no_grad():
        # 更新參數W1 W2
        W1.data -= learning_rate * W1_grad
        W2.data -= learning_rate * W2_grad

        # 將紀錄的gradient清空(因為已經更新參數)
        W1.grad.zero_()
        W2.grad.zero_()

0 30655236.0
1 30655236.0
2 30655236.0
3 30655236.0
4 30655236.0
5 30655240.0
6 30655240.0
7 30655240.0
8 30655236.0
9 30655240.0
10 30655240.0
11 30655240.0
12 30655240.0
13 30655240.0
14 30655240.0
15 30655244.0
16 30655244.0
17 30655244.0
18 30655244.0
19 30655244.0
20 30655244.0
21 30655244.0
22 30655244.0
23 30655244.0
24 30655244.0
25 30655244.0
26 30655246.0
27 30655246.0
28 30655246.0
29 30655246.0
30 30655246.0
31 30655250.0
32 30655246.0
33 30655250.0
34 30655250.0
35 30655250.0
36 30655250.0
37 30655250.0
38 30655250.0
39 30655250.0
40 30655250.0
41 30655250.0
42 30655252.0
43 30655252.0
44 30655252.0
45 30655256.0
46 30655256.0
47 30655252.0
48 30655252.0
49 30655256.0
50 30655256.0
51 30655256.0
52 30655256.0
53 30655256.0
54 30655256.0
55 30655256.0
56 30655256.0
57 30655258.0
58 30655258.0
59 30655260.0
60 30655260.0
61 30655260.0
62 30655260.0
63 30655260.0
64 30655260.0
65 30655260.0
66 30655260.0
67 30655262.0
68 30655262.0
69 30655262.0
70 30655262.0
71 30655264.0
72