In [19]:
import torch
# Two-layer ReLU network trained with hand-written gradient descent, using
# autograd only to obtain the gradients of the loss w.r.t. the two weights.

# Seed the global RNG so that "Restart Kernel -> Run All" reproduces the same
# data, the same initial weights and therefore the same loss curve. The other
# training cells in this notebook seed their RNG as well.
torch.manual_seed(0)

# Batch size, input width, hidden width, output width.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and random regression targets.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Weight matrices; requires_grad=True makes autograd track them so that
# loss.backward() populates w1.grad / w2.grad.
w1 = torch.randn(D_in, H, requires_grad=True)
w2 = torch.randn(H, D_out, requires_grad=True)
lr = 1e-6

for it in range(500):
    # Forward pass: linear -> clamp at zero (ReLU) -> linear.
    y_pred = x.mm(w1).clamp(min=0).mm(w2)

    # Loss: sum of squared errors over the whole batch.
    loss = (y - y_pred).pow(2).sum()

    # Report the loss every 10 iterations.
    if it % 10 == 0:
        print(it, loss.item())

    # Backward pass: accumulate d(loss)/d(w1) and d(loss)/d(w2) into .grad.
    loss.backward()

    # Update the weights. The update itself must not be recorded by autograd,
    # hence the no_grad() context around the in-place operations.
    with torch.no_grad():
        w1 -= lr * w1.grad
        w2 -= lr * w2.grad

        # backward() accumulates into .grad, so reset the gradients before
        # the next iteration.
        w1.grad.zero_()
        w2.grad.zero_()
        

0 25879036.0
10 1958814.75
20 229992.21875
30 71788.4453125
40 26634.759765625
50 10894.345703125
60 4758.9560546875
70 2180.8173828125
80 1034.8173828125
90 504.6166687011719
100 252.1534423828125
110 128.49661254882812
120 66.53272247314453
130 34.911956787109375
140 18.526264190673828
150 9.924428939819336
160 5.359869003295898
170 2.9148449897766113
180 1.594876766204834
190 0.8767669200897217
200 0.4842936396598816
210 0.26847556233406067
220 0.14940953254699707
230 0.0833967924118042
240 0.04670237377285957
250 0.026254083961248398
260 0.014823523350059986
270 0.008420996367931366
280 0.004856535233557224
290 0.002866468159481883
300 0.0017428763676434755
310 0.0010969940340146422
320 0.0007193601340986788
330 0.0004887799732387066
340 0.00034424426849000156
350 0.0002498180838301778
360 0.0001871380809461698
370 0.00014319577894639224
380 0.00011258618178544566
390 9.023600432556123e-05
400 7.332509994739667e-05
410 6.065829438739456e-05
420 5.091541970614344e-05
430 4.297457417

In [30]:
import torch.nn as nn
# Fix the RNG so the data, the layer initialisation and the loss curve are
# reproducible.
torch.manual_seed(369)

# Batch size, input width, output width, hidden width.
N = 64
D_in = 1000
D_out = 10
H = 100

# Synthetic sample data: random inputs and random regression targets.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Model: bias-free linear -> ReLU6 -> bias-free linear.
# NOTE(review): ReLU6 clamps activations to [0, 6], whereas the hand-written
# networks in this notebook use clamp(min=0) (plain ReLU) -- confirm that the
# upper cap is intentional.
model = nn.Sequential(
    nn.Linear(in_features=D_in, out_features=H, bias=False),
    nn.ReLU6(),
    nn.Linear(in_features=H, out_features=D_out, bias=False),
)

# Overwrite the default weight initialisation with standard-normal samples.
nn.init.normal_(model[0].weight, mean=0.0, std=1.0)
nn.init.normal_(model[2].weight, mean=0.0, std=1.0)

# Loss: squared error summed over all elements.
loss_fn = nn.MSELoss(reduction='sum')

# Learning rate and optimizer (plain SGD over every model parameter).
lr = 1e-5
optimizer = torch.optim.SGD(params=model.parameters(), lr=lr)

# Training loop.
for t in range(5000):
    # Forward pass and loss evaluation.
    y_pred = model(x)
    loss = loss_fn(y_pred, y)

    # Report the loss every 50 iterations.
    if not t % 50:
        print(t, loss.item())

    # Clear gradients left over from the previous iteration.
    optimizer.zero_grad()

    # Backward pass: compute gradients of the loss w.r.t. the parameters.
    loss.backward()

    # Apply the SGD update.
    optimizer.step()

        



0 1227997.875
50 62189.02734375
100 23484.353515625
150 11969.5927734375
200 7079.94140625
250 4615.00390625
300 3217.106201171875
350 2355.58984375
400 1788.4459228515625
450 1395.3150634765625
500 1111.767578125
550 895.470703125
600 733.0950927734375
650 608.497314453125
700 510.5863952636719
750 432.3362121582031
800 368.9596252441406
850 317.1201171875
900 274.3964538574219
950 238.84898376464844
1000 208.9881591796875
1050 183.691162109375
1100 162.10781860351562
1150 143.57879638671875
1200 127.58416748046875
1250 113.70838928222656
1300 101.61593627929688
1350 91.03378295898438
1400 81.73772430419922
1450 73.54264831542969
1500 66.2945785522461
1550 59.864803314208984
1600 54.14491653442383
1650 49.04378890991211
1700 44.48529815673828
1750 40.40127182006836
1800 36.73427200317383
1850 33.435115814208984
1900 30.461681365966797
1950 27.77726173400879
2000 25.350141525268555
2050 23.143339157104492
2100 21.141538619995117
2150 19.32623863220215
2200 17.677274703979492
2250 16.17

In [33]:
import torch.nn as nn

# Problem dimensions: batch size, input width, hidden width, output width.
N = 64
D_in = 1000
H = 100
D_out = 10

# Fix the RNG so the sample data below is the same on every run.
torch.manual_seed(1111)

# Synthetic sample data: inputs first, then regression targets.
x, y = torch.randn(N, D_in), torch.randn(N, D_out)

class TwoLayerNet(nn.Module):
    """Two bias-free linear layers with a clamp-at-zero (ReLU) in between."""

    def __init__(self, D_in, H, D_out):
        """Create the two linear layers.

        Args:
            D_in: number of input features of the first layer.
            H: number of hidden units (output of layer 1, input of layer 2).
            D_out: number of output features of the second layer.
        """
        super().__init__()

        self.linear1 = nn.Linear(D_in, H, bias=False)
        self.linear2 = nn.Linear(H, D_out, bias=False)

    def forward(self, x):
        """Return linear2(clamp(linear1(x), min=0)) for the input ``x``."""
        hidden = self.linear1(x)
        # Negative pre-activations are clamped to zero.
        activated = hidden.clamp(min=0)
        return self.linear2(activated)
    
# Build the network from the dimensions defined earlier in this cell.
model = TwoLayerNet(D_in, H, D_out)

# Loss: squared error summed over all elements.
loss_fn = nn.MSELoss(reduction='sum')

# Learning rate and optimizer (Adam over every model parameter).
lr = 1e-4
optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)

# Training loop.
for t in range(500):
    # Forward pass and loss evaluation.
    y_pred = model(x)
    loss = loss_fn(y_pred, y)

    # Report the loss every 10 iterations.
    if not t % 10:
        print(t, loss.item())

    # Clear stale gradients, back-propagate, then apply the Adam update.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    

0 638.0828857421875
10 492.1266784667969
20 383.0085754394531
30 299.4586486816406
40 233.99899291992188
50 181.67398071289062
60 139.5641326904297
70 105.86624908447266
80 79.0312271118164
90 58.01652526855469
100 41.79880905151367
110 29.483972549438477
120 20.319873809814453
130 13.65184497833252
140 8.966757774353027
150 5.76162052154541
160 3.6332035064697266
170 2.256176471710205
180 1.3856301307678223
190 0.8450749516487122
200 0.5129745602607727
210 0.3101717233657837
220 0.18695853650569916
230 0.11240239441394806
240 0.06741150468587875
250 0.040293749421834946
260 0.02396821416914463
270 0.014244171790778637
280 0.008457311429083347
290 0.005008826497942209
300 0.0029593869112432003
310 0.001744803972542286
320 0.0010265009477734566
330 0.0006026564515195787
340 0.0003531278343871236
350 0.00020647705241572112
360 0.00012043593596899882
370 7.005003863014281e-05
380 4.058843114762567e-05
390 2.3414890165440738e-05
400 1.3435500477498863e-05
410 7.658702998014633e-06
420 4.33