In [2]:
import numpy as np

# N is batch size: D_in is input dimension;
# H is hidden dimension; D_out is output dimension

N, D_in, H, D_out = 64, 1000, 100, 10

# Create random input and output data
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Randomly initialize weights
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y
    h = np.matmul(x, w1)
    h_relu = np.maximum(h, 0)
    y_pred = np.matmul(h_relu, w2)

    # Compute and print loss
    loss = ((y_pred - y) ** 2).sum()
    print(t, loss)

    # Backprop to compute gradients of w1 and
    # w2 with respect to loss
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = np.matmul(h_relu.T, grad_y_pred)
    grad_h_relu = np.matmul(grad_y_pred, w2.T)
    grad_h = grad_h_relu.copy()
    grad_h[h < 0] = 0
    grad_w1 = np.matmul(x.T, grad_h)

    # Update weights
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2
    
    
    


0 33629486.18830399
1 34090685.34809681
2 35442541.087108545
3 31961138.417162538
4 23267759.776096202
5 13424700.989192188
6 6888556.715144397
7 3572722.9290045616
8 2112216.5400463613
9 1446813.2852093847
10 1105834.9363492343
11 899572.3993320896
12 755549.6409728327
13 645720.515965341
14 557731.7438204467
15 485309.71541270276
16 424562.7017725657
17 373158.30096347997
18 329317.9152280884
19 291722.5123226956
20 259242.71347523882
21 231048.5305469565
22 206499.20109404685
23 185045.15493889773
24 166241.42195669928
25 149685.6692522074
26 135062.35228503926
27 122105.62070933929
28 110586.66407249418
29 100321.59511926027
30 91144.82305034471
31 82962.94859489451
32 75640.95024709495
33 69062.35913532763
34 63144.86768232152
35 57810.48569896792
36 53005.58106764299
37 48658.3589101266
38 44717.77810544474
39 41139.469532441006
40 37885.607066011275
41 34921.32285206888
42 32219.55733886832
43 29753.311755387524
44 27500.555468014296
45 25438.803207185432
46 23549.970572661015
4

In [3]:
import torch

# N is batch size: D_in is input dimension;
# H is hidden dimension; D_out is output dimension

N, D_in, H, D_out = 64, 1000, 100, 10

# Create random input and output data
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Randomly initialize weights
w1 = torch.randn(D_in, H)
w2 = torch.randn(H, D_out)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y
    h = torch.matmul(x, w1)
    h_relu = h.clamp(min=0)
    y_pred = torch.matmul(h_relu, w2)

    # Compute and print loss
    loss = ((y_pred - y) ** 2).sum()
    print(t, loss)

    # Backprop to compute gradients of w1 and
    # w2 with respect to loss
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = torch.matmul(h_relu.t(), grad_y_pred)
    grad_h_relu = torch.matmul(grad_y_pred, w2.t())
    grad_h = grad_h_relu.clone()
    grad_h[h < 0] = 0
    grad_w1 = torch.matmul(x.t(), grad_h)

    # Update weights
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2
    
    
    
    
    

0 tensor(2.5782e+07)
1 tensor(2.0561e+07)
2 tensor(1.8688e+07)
3 tensor(1.7519e+07)
4 tensor(1.5776e+07)
5 tensor(1.3229e+07)
6 tensor(1.0173e+07)
7 tensor(1.00000e+06 *
       7.2882)
8 tensor(1.00000e+06 *
       4.9619)
9 tensor(3.3148e+06)
10 tensor(2.2255e+06)
11 tensor(1.00000e+06 *
       1.5339)
12 tensor(1.00000e+06 *
       1.0974)
13 tensor(1.00000e+05 *
       8.1870)
14 tensor(1.00000e+05 *
       6.3574)
15 tensor(1.00000e+05 *
       5.1101)
16 tensor(1.00000e+05 *
       4.2228)
17 tensor(1.00000e+05 *
       3.5645)
18 tensor(1.00000e+05 *
       3.0582)
19 tensor(1.00000e+05 *
       2.6553)
20 tensor(1.00000e+05 *
       2.3267)
21 tensor(1.00000e+05 *
       2.0529)
22 tensor(1.00000e+05 *
       1.8213)
23 tensor(1.00000e+05 *
       1.6229)
24 tensor(1.00000e+05 *
       1.4515)
25 tensor(1.00000e+05 *
       1.3023)
26 tensor(1.00000e+05 *
       1.1718)
27 tensor(1.00000e+05 *
       1.0571)
28 tensor(95603.7109)
29 tensor(86657.8516)
30 tensor(78710.0859)
31 te