<a href="https://colab.research.google.com/github/yanjiahui123/pytorch-handbook/blob/master/pytorchLearn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
import torch

device = torch.device('cpu')

N, D_input, H, D_out = 64, 1000, 100, 10

x = torch.randn(N, D_input, device=device)
y = torch.randn(N, D_out, device=device)

w1 = torch.randn(D_input, H, device=device, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
  y_pred = x.mm(w1).clamp(min=0).mm(w2)
  loss = (y - y_pred).pow(2).sum()
  print(t, loss.item())

  loss.backward()

  with torch.no_grad():
    w1 -= learning_rate * w1.grad
    w2 -= learning_rate * w2.grad

    w1.grad.zero_()
    w2.grad.zero_()

0 32647684.0
1 31363812.0
2 35953592.0
3 39730480.0
4 35992748.0
5 24654360.0
6 12802306.0
7 5771324.0
8 2725245.75
9 1532252.625
10 1031763.875
11 782167.6875
12 631543.625
13 526057.875
14 445456.9375
15 380966.8125
16 328211.3125
17 284310.90625
18 247503.328125
19 216383.71875
20 189894.828125
21 167232.234375
22 147783.0
23 131048.0546875
24 116540.90625
25 103920.359375
26 92892.2265625
27 83219.2109375
28 74713.671875
29 67209.1796875
30 60563.765625
31 54668.51171875
32 49424.3125
33 44749.453125
34 40577.28125
35 36858.61328125
36 33522.04296875
37 30525.73828125
38 27830.85546875
39 25401.08984375
40 23208.287109375
41 21226.615234375
42 19434.806640625
43 17809.54296875
44 16335.00390625
45 14995.33203125
46 13777.7236328125
47 12669.5
48 11659.9892578125
49 10739.193359375
50 9898.19140625
51 9129.6796875
52 8420.353515625
53 7771.9765625
54 7178.0859375
55 6634.32177734375
56 6135.5341796875
57 5677.7265625
58 5257.2109375
59 4870.9169921875
60 4515.74462890625
61 4189.377

In [19]:
import torch

class MyRelu(torch.autograd.Function):

  @staticmethod
  def forward(ctx, x):
    ctx.save_for_backward(x)
    return x.clamp(min=0)
  
  @staticmethod
  def backward(ctx, grad_out):
    x, = ctx.saved_tensors
    grad_x = grad_out.clone()
    grad_x[x < 0] = 0
    return grad_x
  
device = torch.device('cpu')

N, D_in, H, D_out = 64, 1000, 100, 10

x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

w1 = torch.randn(D_in, H, device=device, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, requires_grad=True)

learning_rate = 1e-6

for t in range(500):
  y_pred = MyRelu.apply(x.mm(w1)).mm(w2)
  loss = (y - y_pred).pow(2).sum()
  print(t, loss.item())

  loss.backward()

  with torch.no_grad():
    w1 -= learning_rate * w1.grad
    w2 -= learning_rate * w2.grad

    w1.grad.zero_()
    w2.grad.zero_()

0 39844444.0
1 39837460.0
2 41704984.0
3 36857388.0
4 25423020.0
5 13468573.0
6 6386385.5
7 3191213.75
8 1900571.875
9 1330822.625
10 1031887.0625
11 842004.5625
12 705472.0625
13 599917.0
14 515045.5625
15 445274.0
16 387162.0
17 338340.21875
18 297069.21875
19 262031.25
20 231987.484375
21 206076.640625
22 183666.84375
23 164215.75
24 147235.515625
25 132329.984375
26 119192.7265625
27 107581.3828125
28 97290.2421875
29 88139.8125
30 79981.8984375
31 72697.4765625
32 66179.3515625
33 60330.671875
34 55069.29296875
35 50324.4609375
36 46041.64453125
37 42167.234375
38 38657.953125
39 35478.921875
40 32591.76953125
41 29966.623046875
42 27575.125
43 25395.794921875
44 23406.26953125
45 21589.693359375
46 19927.837890625
47 18406.203125
48 17011.609375
49 15732.302734375
50 14557.7841796875
51 13478.9140625
52 12486.5478515625
53 11573.748046875
54 10733.1171875
55 9958.5146484375
56 9244.4638671875
57 8585.5341796875
58 7977.48779296875
59 7415.92724609375
60 6896.79736328125
61 6416.6

In [22]:
import torch

device = torch.device('cpu')

N, D_in, H, D_out = 64, 1000, 100, 10

x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
).to(device)

loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4

for t in range(500):
  y_pred = model(x)

  loss = loss_fn(y_pred, y)
  print(t, loss.item())

  model.zero_grad()

  loss.backward()

  with torch.no_grad():
    for param in model.parameters():
      param.data -= learning_rate * param.grad
      print(param.shape)

0 646.6494750976562
torch.Size([100, 1000])
torch.Size([100])
torch.Size([10, 100])
torch.Size([10])
1 596.8502197265625
torch.Size([100, 1000])
torch.Size([100])
torch.Size([10, 100])
torch.Size([10])
2 554.0298461914062
torch.Size([100, 1000])
torch.Size([100])
torch.Size([10, 100])
torch.Size([10])
3 516.1879272460938
torch.Size([100, 1000])
torch.Size([100])
torch.Size([10, 100])
torch.Size([10])
4 482.7661437988281
torch.Size([100, 1000])
torch.Size([100])
torch.Size([10, 100])
torch.Size([10])
5 452.6790771484375
torch.Size([100, 1000])
torch.Size([100])
torch.Size([10, 100])
torch.Size([10])
6 425.33758544921875
torch.Size([100, 1000])
torch.Size([100])
torch.Size([10, 100])
torch.Size([10])
7 400.26666259765625
torch.Size([100, 1000])
torch.Size([100])
torch.Size([10, 100])
torch.Size([10])
8 377.0878601074219
torch.Size([100, 1000])
torch.Size([100])
torch.Size([10, 100])
torch.Size([10])
9 355.5338439941406
torch.Size([100, 1000])
torch.Size([100])
torch.Size([10, 100])
torch

In [2]:
import torch

device = torch.device('cpu')

N, D_in, H, D_out = 64, 1000, 100, 10

x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
).to(device)


learning_rate = 1e-4

loss_fn = torch.nn.MSELoss(reduction='sum')

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)


for t in range(500):
  y_pred = model(x)

  loss = loss_fn(y_pred, y)
  print(t, loss.item())

  optimizer.zero_grad()

  loss.backward()

  optimizer.step()
  

0 686.0032348632812
1 669.3762817382812
2 653.21240234375
3 637.4785766601562
4 622.16796875
5 607.2908325195312
6 592.843994140625
7 578.7908325195312
8 565.209228515625
9 552.091552734375
10 539.3091430664062
11 526.8460083007812
12 514.744873046875
13 502.94964599609375
14 491.44244384765625
15 480.2275390625
16 469.284912109375
17 458.5810241699219
18 448.1910400390625
19 438.0715637207031
20 428.19476318359375
21 418.65509033203125
22 409.38153076171875
23 400.327880859375
24 391.5376892089844
25 382.9677429199219
26 374.6051025390625
27 366.436767578125
28 358.4083251953125
29 350.5547790527344
30 342.92034912109375
31 335.4521484375
32 328.1229553222656
33 320.92303466796875
34 313.8473815917969
35 306.8738708496094
36 300.0171203613281
37 293.281005859375
38 286.6653137207031
39 280.17413330078125
40 273.79522705078125
41 267.5245666503906
42 261.37127685546875
43 255.31625366210938
44 249.39154052734375
45 243.57159423828125
46 237.85855102539062
47 232.26632690429688
48 226.7

In [3]:
import torch

class TwoLayer(torch.nn.Module):
  def __init__(self, D_in, H, D_out):
    super(TwoLayer, self).__init__()
    self.linear1 = torch.nn.Linear(D_in, H)
    self.linear2 = torch.nn.Linear(H, D_out)
  
  def forward(self, x):
    h_relu = self.linear1(x).clamp(min=0)
    y_pred = self.linear2(h_relu)
    return y_pred
  
N, D_in, H, D_out = 64, 1000, 100, 10

x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

model = TwoLayer(D_in, H, D_out)

loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for t in range(500):

  y_pred = model(x)

  loss = loss_fn(y_pred, y)

  print(t, loss.item())

  optimizer.zero_grad()

  loss.backward()

  optimizer.step()
  

0 686.8893432617188
1 670.1851806640625
2 653.979736328125
3 638.2551879882812
4 623.0625
5 608.32763671875
6 593.9436645507812
7 579.9166259765625
8 566.330078125
9 553.170654296875
10 540.41259765625
11 528.0795288085938
12 516.0573120117188
13 504.3270568847656
14 492.8406982421875
15 481.7505798339844
16 470.988525390625
17 460.5028076171875
18 450.2523498535156
19 440.2481689453125
20 430.4924011230469
21 420.9295654296875
22 411.59808349609375
23 402.53515625
24 393.672607421875
25 384.98333740234375
26 376.46380615234375
27 368.13568115234375
28 360.0203857421875
29 352.08990478515625
30 344.3205261230469
31 336.7179870605469
32 329.2821044921875
33 322.0041809082031
34 314.83642578125
35 307.7794189453125
36 300.88507080078125
37 294.14959716796875
38 287.55682373046875
39 281.09942626953125
40 274.74432373046875
41 268.5069274902344
42 262.38043212890625
43 256.3664855957031
44 250.45516967773438
45 244.62086486816406
46 238.8939208984375
47 233.2767333984375
48 227.7677001953

In [6]:
import random
import torch

class DynamicNet(torch.nn.Module):
  def __init__(self, D_in, H, D_out):
    super(DynamicNet, self).__init__()
    self.input_linear = torch.nn.Linear(D_in, H)
    self.middle_linear = torch.nn.Linear(H, H)
    self.output_linear = torch.nn.Linear(H, D_out)

  def forward(self, x):
    h_relu = self.input_linear(x).clamp(min=0)

    for _ in range(random.randint(0,3)):
      h_relu = self.middle_linear(h_relu).clamp(min=0)
    y_pred =  self.output_linear(h_relu)
    return y_pred

N, D_in, H, D_out = 64, 1000, 100, 10

x = torch.randn(N, D_in)

y = torch.randn(N, D_out)

learning_rate = 1e-4

criterion = torch.nn.MSELoss(reduction='sum')

model = DynamicNet(D_in, H, D_out)

optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9)

for t in range(500):
  y_pred = model(x)

  loss = criterion(y_pred, y)

  print(t, loss.item())

  optimizer.zero_grad()

  loss.backward()

  optimizer.step()

0 679.5557861328125
1 727.0682373046875
2 672.5166625976562
3 673.1077880859375
4 669.641845703125
5 654.0753784179688
6 667.0050048828125
7 628.7530517578125
8 663.4086303710938
9 653.5300903320312
10 659.8729248046875
11 356.5311279296875
12 642.1958618164062
13 654.0440673828125
14 651.3298950195312
15 551.9733276367188
16 619.3939819335938
17 638.6834106445312
18 250.960205078125
19 496.4230651855469
20 579.551025390625
21 610.0798950195312
22 598.6616821289062
23 533.2574462890625
24 568.20849609375
25 384.4667663574219
26 361.55072021484375
27 150.3515167236328
28 305.4895324707031
29 276.76104736328125
30 381.39154052734375
31 353.17938232421875
32 320.45501708984375
33 197.3142852783203
34 353.8849792480469
35 311.2147521972656
36 208.40948486328125
37 276.19061279296875
38 183.0838165283203
39 171.82748413085938
40 147.14291381835938
41 332.02239990234375
42 277.27923583984375
43 230.27005004882812
44 398.7292785644531
45 157.37078857421875
46 300.68511962890625
47 145.4284667