## Warm-up: numpy

Before introducing PyTorch, we will first implement the network using numpy.

Numpy provides an n-dimensional array object, and many functions for manipulating
these arrays. Numpy is a generic framework for scientific computing; it does not
know anything about computation graphs, or deep learning, or gradients. However,
we can easily use numpy to fit a two-layer network to random data by manually
implementing the forward and backward passes through the network using numpy
operations:

## The original version with bias

In [8]:
# Code in file tensor/two_layer_net_numpy.py

import numpy as np

# Network dimensions: N samples per batch, D_in input features,
# H hidden units, D_out output values.
N, D_in, H, D_out = 64, 1000, 100, 10

# Inputs and regression targets are both standard-normal samples.
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Parameters also start as standard-normal samples. The biases are stored
# as row vectors so they broadcast across the batch dimension.
w1 = np.random.randn(D_in, H)
b1 = np.random.randn(1, H)
w2 = np.random.randn(H, D_out)
b2 = np.random.randn(1, D_out)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: affine layer -> ReLU -> affine layer.
    h = np.dot(x, w1) + b1
    h_relu = np.maximum(h, 0)
    y_pred = np.dot(h_relu, w2) + b2

    # The loss is the sum of squared errors over all samples and outputs.
    error = y_pred - y
    loss = np.square(error).sum()
    print(t, loss)

    # Backward pass through the output layer. grad_b2 is kept per-sample
    # here; it is reduced over the batch axis in the update step below.
    grad_y_pred = 2.0 * error
    grad_b2 = grad_y_pred
    grad_w2 = np.dot(h_relu.T, grad_y_pred)
    grad_h_relu = np.dot(grad_y_pred, w2.T)

    # ReLU blocks the gradient wherever its input was negative.
    grad_h = np.where(h < 0, 0, grad_h_relu)

    # Backward pass through the hidden layer (grad_b1 is per-sample too).
    grad_b1 = grad_h
    grad_w1 = np.dot(x.T, grad_h)

    # Plain gradient-descent step; bias gradients are summed over the batch.
    w1 -= learning_rate * grad_w1
    b1 -= learning_rate * grad_b1.sum(axis=0)
    w2 -= learning_rate * grad_w2
    b2 -= learning_rate * grad_b2.sum(axis=0)

0 35120482.9527
1 37280192.2504
2 46516324.7899
3 52146231.0408
4 42812156.9911
5 22916914.6023
6 8719193.88474
7 3375360.12822
8 1811838.76348
9 1279340.29625
10 1016482.43641
11 841711.410785
12 708699.489145
13 602404.116901
14 515700.241553
15 444245.191247
16 384822.313467
17 335058.335791
18 293047.895302
19 257315.719798
20 226769.383357
21 200525.089841
22 177874.772161
23 158259.882772
24 141208.541511
25 126304.254384
26 113238.116947
27 101750.341749
28 91617.7575714
29 82659.5881884
30 74702.4715964
31 67632.7761809
32 61333.3416948
33 55708.7810432
34 50684.3152695
35 46180.2661719
36 42132.2251338
37 38489.5836343
38 35204.9701857
39 32239.8734442
40 29558.3275155
41 27130.4526402
42 24930.0323929
43 22933.5867146
44 21117.5669183
45 19463.5036628
46 17956.0406619
47 16580.7714161
48 15323.6285075
49 14173.2914571
50 13119.1323385
51 12152.7938326
52 11265.7638148
53 10450.4725165
54 9700.63199518
55 9010.18436469
56 8374.47120512
57 7789.67648965
58 7249.9332216
59 6751.

## A meaningful example

In [75]:
# Code in file tensor/two_layer_net_numpy.py
# Network dimensions: N samples, D_in input features,
# H hidden units, D_out output values.
N, D_in, H,  D_out = 12, 2, 10, 4

# Hand-written toy data set: every 2-D point is labelled with a one-hot row
# that encodes the signs of its coordinates:
#   (+, +) -> column 0,  (-, +) -> column 1,
#   (-, -) -> column 2,  (+, -) -> column 3.
x = np.array([
    [1, 1], [-1, 1], [-1, -1], [1, -1],
    [10, 1], [-10, 1], [-10, -1], [10, -1],
    [1, 10], [-1, 10], [-1, -10], [1, -10],
])
y = np.array([
    [1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1],
    [1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1],
    [1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1],
])

# Parameters start as standard-normal samples; biases are 1-D vectors that
# broadcast across the batch dimension.
w1 = np.random.randn(D_in, H)
b1 = np.random.randn(H)
w2 = np.random.randn(H, D_out)
b2 = np.random.randn(D_out)

learning_rate = 1e-5
for t in range(500):
    # Forward pass: two affine layers with no activation in between.
    h = np.dot(x, w1) + b1
    y_pred = np.dot(h, w2) + b2

    # The loss is the sum of squared errors over all samples and outputs.
    error = y_pred - y
    loss = np.square(error).sum()
    print(t, loss)

    # Backward pass through the output layer. grad_b2 is kept per-sample
    # here; it is reduced over the batch axis in the update step below.
    grad_y_pred = 2.0 * error
    grad_b2 = grad_y_pred
    grad_w2 = np.dot(h.T, grad_y_pred)

    # Backward pass through the hidden layer (grad_b1 is per-sample too).
    grad_h = np.dot(grad_y_pred, w2.T)
    grad_b1 = grad_h
    grad_w1 = np.dot(x.T, grad_h)

    # Plain gradient-descent step; bias gradients are summed over the batch.
    w1 -= learning_rate * grad_w1
    b1 -= learning_rate * grad_b1.sum(axis=0)
    w2 -= learning_rate * grad_w2
    b2 -= learning_rate * grad_b2.sum(axis=0)

0 38491.3782864
1 25207.6237248
2 17362.4167392
3 12368.2991716
4 9027.5193738
5 6713.92936643
6 5070.69292749
7 3881.03820195
8 3006.7486722
9 2356.35508165
10 1867.55050816
11 1496.92052429
12 1213.66630749
13 995.620449483
14 826.637640828
15 694.841168673
16 591.419372895
17 509.785171521
18 444.980754034
19 393.250967607
20 351.734594167
21 318.239080777
22 291.074979114
23 268.933489413
24 250.795359548
25 235.862743533
26 223.507970518
27 213.2348354
28 204.649207282
29 197.436604472
30 191.345001892
31 186.171586193
32 181.752502686
33 177.95487989
34 174.670595895
35 171.811382912
36 169.304964682
37 167.091994823
38 165.123619173
39 163.359526571
40 161.76638372
41 160.316573488
42 158.987173991
43 157.759129586
44 156.616575466
45 155.5462857
46 154.537220864
47 153.580156309
48 152.667375958
49 151.792419507
50 150.949873315
51 150.135197088
52 149.344580006
53 148.574821116
54 147.823229739
55 147.087542467
56 146.365853898
57 145.65655877
58 144.958303593
59 144.269946172

In [87]:
# Run the forward pass (two affine layers, no activation) on four
# hand-picked 2-D points using the current w1, b1, w2 and b2.
test = np.array([[111, -111], [1, 9], [8, 1], [1, 8]])
h = np.dot(test, w1) + b1
y_pred = np.dot(h, w2) + b2
print(y_pred)

[[ 1.09142174 -7.0996596   1.42970353  4.09225893]
 [ 0.8117118   0.13584796  0.44320844 -1.19339659]
 [ 0.81978398 -0.33605286  0.51781296 -0.84870038]
 [ 0.78496105  0.10433452  0.47573421 -1.17047174]]


In [88]:
# Display the current value of the first-layer weight matrix.
w1

array([[ -4.94620330e-03,   1.39368443e-01,   3.42470316e-01,
         -1.46486552e-01,  -2.54308479e-01,   2.59593938e+00,
          1.90712944e+00,  -1.83493233e+00,  -4.77565628e-02,
         -4.49735241e-01],
       [ -1.40069874e+00,   1.81918208e-03,  -4.41708519e-02,
          8.33067429e-01,   9.17334810e-01,  -6.26893431e-01,
         -5.96155113e-01,  -1.79890889e+00,   1.62922730e-01,
         -1.02690650e+00]])

In [89]:
# Display the current value of the second-layer weight matrix.
w2

array([[-0.81363603,  0.43419112,  0.19364838,  0.0333749 ],
       [ 0.21056058, -1.31721275, -0.38622232,  0.32475315],
       [ 1.29686827,  0.26977449, -1.11172664,  1.04798621],
       [ 0.13537576, -0.20020269,  0.68635664,  0.94918699],
       [-0.3802344 , -0.49056601, -0.73597831, -1.14232822],
       [-0.82956238,  0.0078384 ,  0.11999635, -0.67596608],
       [ 1.47292975, -0.15129894, -0.16927932,  0.47671657],
       [ 0.81921002,  0.27539056, -0.25976912, -0.17101063],
       [ 0.99174538,  0.45992632,  0.4672269 , -0.4380368 ],
       [-0.82789558, -1.56389965,  0.26822651,  0.04785675]])

In [96]:
# Draw a 10x20 array of standard-normal samples; the result is only
# displayed, not assigned to a variable.
np.random.randn(10,20)

array([[-0.70586139,  1.50402015,  0.87135864,  1.03837858, -0.49889948,
         0.4740006 , -1.49154922,  1.22180822, -0.80018929, -0.21411686,
        -1.98892684, -0.31845253, -0.17727626, -0.7235015 , -1.5699794 ,
         3.70015946,  1.34034273, -0.83139057,  0.74621522, -0.28729142],
       [-0.83277571, -0.03439773,  0.48384911,  0.06627897, -0.22701893,
         0.76591947, -1.19481824,  0.27073818, -1.53062417, -1.04585974,
         0.32800924,  1.71715128,  0.8950344 ,  0.33942632, -0.09107685,
         0.2209535 , -0.7701262 , -0.34195191,  0.27456486, -1.49732735],
       [ 0.53497579, -2.21240168,  1.65989027, -0.85141053, -0.48039712,
         1.52119979, -0.2532592 ,  0.72471355, -0.19548291,  0.66672889,
        -0.12514354,  0.51385233, -0.0773705 ,  0.80838632, -1.34134744,
         0.33618543, -0.63265954, -0.62348408, -0.0747078 , -0.56123047],
       [-0.91518108, -0.52247888,  1.56844924,  0.5879926 , -1.04118448,
         0.52894346,  0.16652937,  0.0348101 , -