#### Một mạng học với numpy
Một mạng nơ-ron (neural network) hai lớp: mỗi batch gồm 64 mẫu, vector đầu vào 1000 chiều, hidden layer 100 node với ReLU, output 10 chiều, hàm lỗi L2 (tổng bình phương sai số)

In [1]:
# -*- coding: utf-8 -*-
import numpy as np

# Problem sizes: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Synthetic inputs and regression targets, drawn from a standard normal.
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Weight matrices of the two linear layers (no bias terms), also drawn
# from a standard normal.
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear.
    h = np.dot(x, w1)
    h_relu = np.maximum(h, 0)
    y_pred = np.dot(h_relu, w2)

    # Sum-of-squares loss over the whole batch; report it every step.
    residual = y_pred - y
    loss = np.square(residual).sum()
    print(t, loss)

    # Backward pass: apply the chain rule by hand, from the loss back to
    # each weight matrix.
    grad_y_pred = 2.0 * residual
    grad_w2 = np.dot(h_relu.T, grad_y_pred)
    grad_h_relu = np.dot(grad_y_pred, w2.T)
    # ReLU blocks the gradient wherever its input was negative.
    grad_h = np.where(h < 0, 0.0, grad_h_relu)
    grad_w1 = np.dot(x.T, grad_h)

    # Plain gradient-descent step, applied in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 25376487.364383105
1 17199433.472263467
2 13663459.257381603
3 12016769.577510707
4 11134931.628481641
5 10397012.102562303
6 9513864.84220118
7 8353981.655970557
8 7012908.351336306
9 5618892.414014231
10 4345264.88597982
11 3265099.6750499103
12 2419414.4836796
13 1780804.6516404683
14 1316974.8044137228
15 983642.9362093029
16 746497.9271794932
17 576653.6966276998
18 454553.169248445
19 365316.12600330054
20 299177.3974917346
21 249136.37590312277
22 210576.3854993299
23 180268.5078471123
24 155977.13679089135
25 136183.0555122341
26 119794.3095539607
27 106025.30503570806
28 94332.47059155305
29 84285.38283383357
30 75591.57757235954
31 68008.80356028468
32 61352.91543983854
33 55480.17846080706
34 50276.682955947734
35 45652.00296899556
36 41525.0895256051
37 37835.27947735637
38 34524.41159450427
39 31546.211358674722
40 28861.0740711436
41 26435.845598049033
42 24241.67827204369
43 22253.245225395593
44 20449.01066106598
45 18809.967836064843
46 17319.709615439453
47 15961.34

448 1.3865920943561642e-05
449 1.3247699608532944e-05
450 1.2657202678670592e-05
451 1.2093200166366684e-05
452 1.1554491117033422e-05
453 1.1039958586069643e-05
454 1.0548455760282888e-05
455 1.0078975919435482e-05
456 9.630436051126872e-06
457 9.201993944159193e-06
458 8.79272430554099e-06
459 8.40189311009381e-06
460 8.028384893555977e-06
461 7.671541904731817e-06
462 7.330743745255149e-06
463 7.005147724306644e-06
464 6.694090394498027e-06
465 6.396934891018315e-06
466 6.11304337879112e-06
467 5.841796954613675e-06
468 5.582614141044094e-06
469 5.335081521721157e-06
470 5.098591130202075e-06
471 4.872588459769043e-06
472 4.656638624630036e-06
473 4.450351912889093e-06
474 4.2532999917152965e-06
475 4.064943020863135e-06
476 3.8849891963672865e-06
477 3.713081855007419e-06
478 3.548790428643377e-06
479 3.391810093459603e-06
480 3.241810829022296e-06
481 3.0985416190526896e-06
482 2.9615598116805653e-06
483 2.830678567669764e-06
484 2.705617884234075e-06
485 2.58613860223361e-06
486 

#### Tương tự ở trên nhưng là pytorch với autograd

##### 1. vẫn còn tương đối gần với code tự nhiên
Thêm device=device cho các inputs labels (x, y) và các weight (w1, w2)

In [2]:
# -*- coding: utf-8 -*-
import torch

dtype = torch.float
# All tensors below live on the CPU; use torch.device("cuda:0") instead to
# run on a GPU.
device = torch.device("cpu")

# Problem sizes: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and targets. requires_grad is left at its default (False),
# so no gradients are computed with respect to the data.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Random weights. requires_grad=True asks autograd to compute gradients with
# respect to these tensors during the backward pass.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU (clamp at zero) -> linear. Autograd
    # records these operations, so no backward pass is written by hand.
    hidden = torch.clamp(torch.mm(x, w1), min=0)
    y_pred = torch.mm(hidden, w2)

    # Sum-of-squares loss; .item() extracts it as a Python number for
    # printing.
    loss = torch.sum(torch.pow(y_pred - y, 2))
    print(t, loss.item())

    # Populate w1.grad and w2.grad with d(loss)/d(w1) and d(loss)/d(w2).
    loss.backward()

    # Gradient-descent step. The update is wrapped in torch.no_grad() so that
    # it is not itself recorded by autograd. Each gradient buffer is zeroed
    # after use because backward() accumulates into .grad rather than
    # overwriting it. torch.optim.SGD could perform the same update.
    with torch.no_grad():
        for weight in (w1, w2):
            weight.sub_(learning_rate * weight.grad)
            weight.grad.zero_()

0 29763322.0
1 24948024.0
2 23357938.0
3 21568012.0
4 18237736.0
5 13731391.0
6 9289232.0
7 5889218.5
8 3668704.25
9 2353200.25
10 1594990.75
11 1152038.0
12 880815.125
13 704225.5625
14 580987.25
15 489712.65625
16 418704.625
17 361482.09375
18 314249.9375
19 274666.25
20 241023.046875
21 212224.828125
22 187459.21875
23 166052.375
24 147523.109375
25 131384.9375
26 117283.921875
27 104909.46875
28 94029.96875
29 84437.9765625
30 75957.734375
31 68441.78125
32 61761.1953125
33 55818.8046875
34 50515.80859375
35 45779.33984375
36 41537.3828125
37 37735.3046875
38 34323.171875
39 31255.052734375
40 28493.091796875
41 26001.291015625
42 23751.384765625
43 21718.4765625
44 19877.9609375
45 18210.939453125
46 16698.51171875
47 15324.9990234375
48 14075.841796875
49 12938.06640625
50 11901.595703125
51 10955.974609375
52 10092.9853515625
53 9304.1005859375
54 8582.5078125
55 7922.25341796875
56 7317.25537109375
57 6762.7099609375
58 6253.95361328125
59 5787.00634765625
60 5357.7626953125
61

386 0.0004998681833967566
387 0.000484687858261168
388 0.0004714067908935249
389 0.000458129943581298
390 0.00044501773663796484
391 0.0004322260501794517
392 0.00042039304389618337
393 0.0004088736022822559
394 0.00039782619569450617
395 0.0003866181359626353
396 0.0003748924646060914
397 0.0003659101785160601
398 0.0003567873209249228
399 0.0003477884456515312
400 0.00033863988937810063
401 0.0003296062641311437
402 0.0003215925535187125
403 0.00031371659133583307
404 0.00030583416810259223
405 0.00029782054480165243
406 0.00029071333119645715
407 0.0002835628401953727
408 0.00027554493863135576
409 0.00026936986250802875
410 0.00026312534464523196
411 0.0002565843751654029
412 0.0002503615105524659
413 0.0002448514278512448
414 0.00023918820079416037
415 0.00023334138677455485
416 0.00022825313499197364
417 0.00022300888667814434
418 0.00021796562941744924
419 0.00021372761693783104
420 0.00020866244449280202
421 0.00020332704298198223
422 0.00019860619795508683
423 0.00019464566139

##### 2. Dùng torch.nn để khai báo luôn model, và dùng optim để tự update weights

In [3]:
# -*- coding: utf-8 -*-
import torch

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10
device = ('cuda:0' if torch.cuda.is_available() else 'cpu')

# Create random Tensors to hold inputs and outputs and move them to the
# target device once, up front. The data never changes during training, so
# there is no reason to repeat the transfer inside the training loop.
x = torch.randn(N, D_in).to(device)
y = torch.randn(N, D_out).to(device)

# Use the nn package to define our model and loss function.
# reduction='sum' makes the loss the sum (not the mean) of squared errors.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
).to(device)
loss_fn = torch.nn.MSELoss(reduction='sum')


# Use the optim package to define an Optimizer that will update the weights of
# the model for us. Here we will use Adam; the optim package contains many other
# optimization algorithms. The first argument to the Adam constructor tells the
# optimizer which Tensors it should update.
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
for t in range(500):
    # Forward pass: compute predicted y by passing x to the model.
    y_pred = model(x)

    # Compute and print loss.
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Before the backward pass, use the optimizer object to zero all of the
    # gradients for the variables it will update (which are the learnable
    # weights of the model). This is because by default, gradients are
    # accumulated in buffers (i.e. not overwritten) whenever .backward()
    # is called. Check the docs of torch.autograd.backward for more details.
    optimizer.zero_grad()

    # Backward pass: compute gradient of the loss with respect to model
    # parameters.
    loss.backward()

    # Calling the step function on an Optimizer makes an update to its
    # parameters.
    optimizer.step()

0 639.7642822265625
1 623.5816040039062
2 607.8521118164062
3 592.5883178710938
4 577.76708984375
5 563.3704833984375
6 549.4036254882812
7 535.7747192382812
8 522.50732421875
9 509.61761474609375
10 497.0679931640625
11 484.88507080078125
12 473.03289794921875
13 461.496337890625
14 450.27569580078125
15 439.3400573730469
16 428.6935729980469
17 418.35284423828125
18 408.2983093261719
19 398.51019287109375
20 388.95611572265625
21 379.6602783203125
22 370.68389892578125
23 361.9196472167969
24 353.36016845703125
25 345.0015563964844
26 336.82208251953125
27 328.8321533203125
28 321.0518493652344
29 313.4681396484375
30 306.06298828125
31 298.8232421875
32 291.7303161621094
33 284.80010986328125
34 278.0565490722656
35 271.461181640625
36 265.0228576660156
37 258.73394775390625
38 252.57754516601562
39 246.56126403808594
40 240.65255737304688
41 234.85488891601562
42 229.17515563964844
43 223.59170532226562
44 218.13546752929688
45 212.81443786621094
46 207.60543823242188
47 202.498168

451 8.274094653870634e-08
452 7.637758869805111e-08
453 7.063105300630923e-08
454 6.516469142070491e-08
455 6.021370779762947e-08
456 5.56410846286326e-08
457 5.140747205700791e-08
458 4.7414690840241747e-08
459 4.3805382432537954e-08
460 4.0444085414037545e-08
461 3.732212405793689e-08
462 3.4472144250230485e-08
463 3.181057905976559e-08
464 2.9391483025165144e-08
465 2.711746027728168e-08
466 2.5045496343523155e-08
467 2.3126171200260615e-08
468 2.1351917567358214e-08
469 1.970950869178978e-08
470 1.8187931161151027e-08
471 1.6794897916838636e-08
472 1.5538306641360577e-08
473 1.433162122310705e-08
474 1.326532839129868e-08
475 1.2227997281399894e-08
476 1.1342833339256231e-08
477 1.0474714429165033e-08
478 9.684045565450106e-09
479 8.950721941403117e-09
480 8.267740270184731e-09
481 7.647249944398027e-09
482 7.064761220476612e-09
483 6.567926202905028e-09
484 6.080855374790417e-09
485 5.622646348513172e-09
486 5.199823682744409e-09
487 4.819109999942839e-09
488 4.467223035931056e-09

##### 3. Tự custom model bằng cách override nn.Module
Sửa phần __init__: các thành phần có trong mạng học   
Sửa phần forward: cài đặt thuật toán lan truyền tới (nhân ma trận, activation function, ...)

In [4]:
# -*- coding: utf-8 -*-
import torch


class TwoLayerNet(torch.nn.Module):
    """Fully connected network computing Linear -> ReLU -> Linear."""

    def __init__(self, D_in, H, D_out):
        """
        Build the two nn.Linear layers and register them as submodules.

        D_in is the number of input features, H the number of hidden units
        and D_out the number of output features.
        """
        super().__init__()
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """
        Map an input Tensor x to the network's prediction.

        The first layer's output is clamped at zero (a ReLU) and then fed to
        the second layer, whose output is returned.
        """
        hidden = torch.clamp(self.linear1(x), min=0)
        return self.linear2(hidden)


# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10
device = ('cuda:0' if torch.cuda.is_available() else 'cpu')

# Create random Tensors to hold inputs and outputs and move them to the
# target device once, up front. The data never changes during training, so
# there is no reason to repeat the transfer inside the training loop.
x = torch.randn(N, D_in).to(device)
y = torch.randn(N, D_out).to(device)

# Construct our model by instantiating the class defined above
model = TwoLayerNet(D_in, H, D_out).to(device)

# Construct our loss function and an Optimizer. The call to model.parameters()
# in the SGD constructor will contain the learnable parameters of the two
# nn.Linear modules which are members of the model.
# reduction='sum' makes the loss the sum (not the mean) of squared errors.
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)
for t in range(500):
    # Forward pass: Compute predicted y by passing x to the model
    y_pred = model(x)

    # Compute and print loss
    loss = criterion(y_pred, y)
    print(t, loss.item())

    # Zero gradients, perform a backward pass, and update the weights.
    # Gradients are zeroed first because backward() accumulates into .grad.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

0 665.6723022460938
1 615.6961669921875
2 572.3592529296875
3 534.5738525390625
4 500.7029724121094
5 470.1873779296875
6 442.5574951171875
7 417.37603759765625
8 394.1014099121094
9 372.4454345703125
10 352.25006103515625
11 333.30999755859375
12 315.4559020996094
13 298.6021728515625
14 282.6286315917969
15 267.4856262207031
16 253.1005859375
17 239.3969268798828
18 226.30384826660156
19 213.82769775390625
20 201.97067260742188
21 190.6277313232422
22 179.7881622314453
23 169.496826171875
24 159.726318359375
25 150.4066619873047
26 141.5497283935547
27 133.13294982910156
28 125.15528106689453
29 117.60627746582031
30 110.47349548339844
31 103.72522735595703
32 97.37612915039062
33 91.39659881591797
34 85.77445983886719
35 80.47434997558594
36 75.50691223144531
37 70.84098815917969
38 66.45990753173828
39 62.3533935546875
40 58.50189208984375
41 54.899051666259766
42 51.527557373046875
43 48.367252349853516
44 45.40916061401367
45 42.640960693359375
46 40.05049133300781
47 37.63305664

##### 4. Dynamic Net, số hidden layer ở giữa sẽ là không cố định, mỗi lần forward thì số hidden là khác nhau (random trong khoảng 0-3)
Mô hình hơi lạ nên học khá chật vật, loss không thấp bằng các model trước

In [5]:
# -*- coding: utf-8 -*-
import random
import torch


class DynamicNet(torch.nn.Module):
    """Network whose number of hidden layers is re-drawn on every forward pass."""

    def __init__(self, D_in, H, D_out):
        """
        Build the three nn.Linear layers used by the forward pass.

        D_in is the number of input features, H the number of hidden units
        and D_out the number of output features. The middle layer maps H to H
        so that it can be applied any number of times in a row.
        """
        super().__init__()
        self.input_linear = torch.nn.Linear(D_in, H)
        self.middle_linear = torch.nn.Linear(H, H)
        self.output_linear = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """
        Map an input Tensor x to the network's prediction.

        A depth of 0, 1, 2 or 3 is drawn at random on each call, and the same
        middle_linear module (hence the same weights) is applied that many
        times, each application followed by a clamp at zero (ReLU).

        Because the computation graph is built afresh on every forward pass,
        ordinary Python control flow such as this loop can decide the shape of
        the network, and one module may appear several times in the graph.
        """
        hidden = torch.clamp(self.input_linear(x), min=0)
        extra_layers = random.randint(0, 3)
        for _ in range(extra_layers):
            hidden = torch.clamp(self.middle_linear(hidden), min=0)
        return self.output_linear(hidden)


device = ('cuda:0' if torch.cuda.is_available() else 'cpu')

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold inputs and outputs and move them to the
# target device once, up front. The data never changes during training, so
# there is no reason to repeat the transfer inside the training loop.
x = torch.randn(N, D_in).to(device)
y = torch.randn(N, D_out).to(device)

# Construct our model by instantiating the class defined above
model = DynamicNet(D_in, H, D_out).to(device)

# Construct our loss function and an Optimizer. Training this strange model with
# vanilla stochastic gradient descent is tough, so we use momentum.
# reduction='sum' makes the loss the sum (not the mean) of squared errors.
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9)
for t in range(500):
    # Forward pass: Compute predicted y by passing x to the model
    y_pred = model(x)

    # Compute and print loss
    loss = criterion(y_pred, y)
    print(t, loss.item())

    # Zero gradients, perform a backward pass, and update the weights.
    # Gradients are zeroed first because backward() accumulates into .grad.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

0 593.6741943359375
1 592.4642333984375
2 634.0089111328125
3 588.4356079101562
4 591.5281982421875
5 583.9599609375
6 597.8818969726562
7 579.201416015625
8 586.0654907226562
9 443.618408203125
10 567.7295532226562
11 573.017578125
12 571.3245239257812
13 568.6451416015625
14 588.27490234375
15 561.6085205078125
16 302.44500732421875
17 277.9593811035156
18 243.1953125
19 503.7231140136719
20 583.2324829101562
21 145.80174255371094
22 470.2858581542969
23 576.9555053710938
24 573.2031860351562
25 567.87060546875
26 399.6158142089844
27 80.72774505615234
28 77.83818054199219
29 68.96293640136719
30 56.47093200683594
31 319.34307861328125
32 523.5900268554688
33 286.15130615234375
34 261.9869079589844
35 39.10780334472656
36 406.4261474609375
37 39.74056625366211
38 449.02972412109375
39 37.03495788574219
40 411.3583068847656
41 386.56768798828125
42 359.30340576171875
43 332.6279602050781
44 303.91436767578125
45 276.487060546875
46 152.99517822265625
47 216.39329528808594
48 235.66336

416 0.3378777503967285
417 0.7126188278198242
418 0.4183761775493622
419 0.32966262102127075
420 0.32621216773986816
421 0.8958817720413208
422 0.7495238780975342
423 0.4049714207649231
424 0.1755066215991974
425 0.8913800716400146
426 0.603550136089325
427 0.38063037395477295
428 0.5048751831054688
429 0.835598349571228
430 0.16939230263233185
431 0.32921385765075684
432 0.3984529674053192
433 0.6797975897789001
434 0.6372608542442322
435 0.10016734153032303
436 0.5591944456100464
437 0.4569770097732544
438 0.4281143546104431
439 0.44734644889831543
440 0.449373722076416
441 0.10140588134527206
442 0.09843084216117859
443 0.6897181272506714
444 0.41510045528411865
445 0.5829455256462097
446 0.0575246624648571
447 0.33093732595443726
448 0.33952564001083374
449 0.4323686361312866
450 0.4148242175579071
451 0.2738761305809021
452 0.06326846033334732
453 0.4349290430545807
454 0.05365229398012161
455 0.4324166476726532
456 0.26484405994415283
457 0.03667404502630234
458 0.395539551973342