In [59]:
import torch
import numpy as np
import operator
import math

In [5]:
x = torch.empty(5, 3)
print(x)

tensor([[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]])


In [6]:
x = torch.rand(5, 3)
print(x)

tensor([[0.2012, 0.7006, 0.0685],
        [0.8754, 0.0235, 0.1121],
        [0.8929, 0.1105, 0.0917],
        [0.0814, 0.0635, 0.8166],
        [0.9105, 0.9210, 0.0853]])


In [8]:
x = torch.zeros(5, 3, dtype = torch.long)
print(x)

tensor([[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]])


In [9]:
x = torch.tensor([5.5, 3])
print(x)

tensor([5.5000, 3.0000])


In [10]:
x = x.new_ones(5, 3)
print(x)

tensor([[1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]])


In [16]:
x.size()

torch.Size([5, 3])

In [18]:
y = torch.rand(5, 3)
result = torch.empty(5, 3)
torch.add (x, y, out = result)

tensor([[1.4262, 1.0661, 1.0662],
        [1.6769, 1.0176, 1.9241],
        [1.8122, 1.8169, 1.8032],
        [1.3747, 1.8122, 1.2546],
        [1.7174, 1.9286, 1.4377]])

In [19]:
x = torch.randn(4, 4)
y = x.view(16)
z = x.view(-1, 8)
z

tensor([[-1.0202, -1.1443, -0.1051,  2.2824,  0.3574, -0.1758, -0.8805,  1.7793],
        [ 1.0300, -0.0786, -0.3269,  0.2332,  0.0331,  0.1424,  0.3752,  0.3183]])

CUDA Tensors

In [21]:
print(torch.cuda.is_available())
if torch.cuda.is_available():
    device = torch.device("cuda")
    y = torch.ones_like(x, device = device)
    x = x.to(device)
    z = x + y
    print(z)
    print(z.to("cpu", torch.double))

False


In [22]:
#tensor可以在gpu上，但numpy只有cpu，所以一般都是np转为tensor放到gpu运算，然后再转回numpy并放到cpu里
y.to("cpu").data.numpy()
y.cpu().data.numpy()

array([-1.0201975 , -1.1442864 , -0.10506658,  2.2824323 ,  0.3573812 ,
       -0.17584214, -0.88049424,  1.7792723 ,  1.0299932 , -0.07858746,
       -0.32689717,  0.23323706,  0.03314139,  0.1423604 ,  0.3752281 ,
        0.31828701], dtype=float32)

用numpy实现两层神经网络
----------------
一个全连接ReLU神经网络，一个隐藏层，没有bias。用来从x预测y，使用L2 Loss
- h = W_1X + b_1
- a = max (0, h) ReLU
- y_{hat} = W_2a + b_2

实现
- forward pass
- loss
- backward pass

numpy ndarray只是一个普通的n维array，并不涉及任何深度学习或者梯度gradient的知识。只是一种用来计算数学运算的数据结构

In [37]:
N, D_in, H, D_out = 64, 1000, 100, 10
#64个输入，1000维度，hidden是100维度，输出10维度
#64个训练数据一个batch，把一个1000维度的向量转成一个10维度的向量

#随机创建一些训练数据
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = math.pow(10, -6)

for it in range(50):
    #forward pass
    h = x.dot(w1)
    h_relu = np.maximum(h, 0)
    y_pred = h_relu.dot(w2)
    
    #compute loss
    #MSE mean square error
    loss = np.square(y_pred - y).sum()
    print(it, loss)
    
    #backward pass
    #compute the gradient
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.T.dot(grad_y_pred)
    grad_h_relu = grad_y_pred.dot(w2.T)
    grad_h = grad_h_relu.copy()
    grad_h[h < 0] = 0
    grad_w1 = x.T.dot(grad_h)
    
    #update weights of w1 and w2
    w1 -= learning_rate * grad_w1
    w2 = w2 - learning_rate * grad_w2

0 37330079.9821741
1 31141540.215710815
2 28895152.73736138
3 25755639.577937238
4 20276683.309540555
5 13961857.319894487
6 8637280.106683142
7 5148183.258450883
8 3140292.6277650776
9 2049105.3749066382
10 1445036.9350677927
11 1091533.661590245
12 866827.2291749764
13 712188.8905861593
14 598114.3194623955
15 509185.82570199104
16 437904.52453823196
17 379485.29022308456
18 330720.57711180503
19 289628.576905093
20 254734.85746594996
21 224932.21133046155
22 199274.61230761284
23 177080.16045152151
24 157808.08275309845
25 141028.7806726255
26 126345.90107098638
27 113441.09583735152
28 102065.27420329615
29 92010.4478221159
30 83098.91305846235
31 75190.52280873308
32 68157.42504827632
33 61875.923908946854
34 56261.48556820596
35 51228.48500897418
36 46708.78706976758
37 42641.244425472934
38 38976.83110366805
39 35670.40327172875
40 32683.089063409698
41 29979.16442706629
42 27527.978337167668
43 25301.599765721796
44 23277.07860867127
45 21435.1011697373
46 19756.95286878866
47 

在pytorch上再实现一遍

In [40]:
N, D_in, H, D_out = 64, 1000, 100, 10
#64个输入，1000维度，hidden是100维度，输出10维度
#64个训练数据一个batch，把一个1000维度的向量转成一个10维度的向量

#随机创建一些训练数据
# x = np.random.randn(N, D_in)
# y = np.random.randn(N, D_out)

# w1 = np.random.randn(D_in, H)
# w2 = np.random.randn(H, D_out)

x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

w1 = torch.randn(D_in, H)
w2 = torch.randn(H, D_out)

learning_rate = math.pow(10, -6)

for it in range(50):
    #forward pass
    #h = x.dot(w1)
    #matrix mulitply
    h = x.mm(w1)
    #h_relu = np.maximum(h, 0)
    h_relu = h.clamp(min = 0)
    y_pred = h_relu.mm(w2)
    
    #compute loss
    #MSE mean square error
    #loss = np.square(y_pred - y).sum()
    loss = (y_pred - y).pow(2).sum().item()
    print(it, loss)
    
    #backward pass
    #compute the gradient
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.t().mm(grad_y_pred)
    grad_h_relu = grad_y_pred.mm(w2.t())
    grad_h = grad_h_relu.clone()
    grad_h[h < 0] = 0
    grad_w1 = x.t().mm(grad_h)
    
    #update weights of w1 and w2
    w1 -= learning_rate * grad_w1
    w2 = w2 - learning_rate * grad_w2

0 26721898.0
1 23825340.0
2 26743670.0
3 31681644.0
4 34588864.0
5 30907394.0
6 21719118.0
7 11979791.0
8 5813523.0
9 2808557.25
10 1527440.25
11 972467.6875
12 707769.0
13 560086.875
14 463833.15625
15 393350.03125
16 338012.8125
17 292901.25
18 255313.5
19 223614.59375
20 196629.171875
21 173497.640625
22 153576.8125
23 136341.6875
24 121376.515625
25 108329.4765625
26 96915.40625
27 86900.484375
28 78078.0859375
29 70280.5234375
30 63374.34765625
31 57244.2734375
32 51794.13671875
33 46938.29296875
34 42597.2890625
35 38709.98046875
36 35220.8984375
37 32083.5078125
38 29257.837890625
39 26707.953125
40 24404.84765625
41 22322.162109375
42 20435.26953125
43 18724.5078125
44 17172.291015625
45 15761.6640625
46 14477.662109375
47 13308.326171875
48 12243.1552734375
49 11270.3984375


简单的autograd

In [44]:
x = torch.tensor(1., requires_grad = True)
w = torch.tensor(2., requires_grad = True)
b = torch.tensor(3., requires_grad = True)

y = w * x + b # y = 2 * 1 + 3
y.backward()

print(x.grad)
print(w.grad)
print(b.grad)


tensor(2.)
tensor(1.)
tensor(1.)


再进行简化之后的网络

In [55]:
w1.grad.zero_()
w2.grad.zero_()

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],


In [65]:
N, D_in, H, D_out = 64, 1000, 100, 10
#64个输入，1000维度，hidden是100维度，输出10维度
#64个训练数据一个batch，把一个1000维度的向量转成一个10维度的向量

#随机创建一些训练数据
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

w1 = torch.randn(D_in, H, requires_grad = True)
w2 = torch.randn(H, D_out, requires_grad = True)

learning_rate = math.pow(10, -6)

for it in range(50):
    #forward pass
    y_pred = x.mm(w1).clamp(min = 0).mm(w2)
    
    #compute loss
    #MSE mean square error
    loss = (y_pred - y).pow(2).sum()
    print(it, loss.item())
    

    #backward pass
    #compute the gradient
    loss.backward()
    
    #update weights of w1 and w2
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad
        w1.grad.zero_()
        w2.grad.zero_()

0 33126524.0
1 32853148.0
2 37433264.0
3 39726992.0
4 34150600.0
5 22079134.0
6 11144883.0
7 5134465.0
8 2607425.0
9 1609302.875
10 1168911.75
11 931114.8125
12 774803.5
13 658245.4375
14 565665.75
15 489707.0625
16 426358.96875
17 372965.125
18 327607.125
19 288879.8125
20 255635.4375
21 226957.328125
22 201907.84375
23 180152.75
24 161198.9375
25 144603.59375
26 130012.1484375
27 117155.3828125
28 105773.84375
29 95674.765625
30 86689.8671875
31 78677.3515625
32 71520.703125
33 65114.27734375
34 59356.875
35 54205.24609375
36 49587.8359375
37 45418.60546875
38 41649.375
39 38236.63671875
40 35141.9453125
41 32330.95703125
42 29772.37890625
43 27441.37890625
44 25317.513671875
45 23377.189453125
46 21602.2265625
47 19979.7734375
48 18492.9921875
49 17129.4609375


In [66]:
w1.grad

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

pytorch:nn

In [69]:
import torch.nn as nn
N, D_in, H, D_out = 64, 1000, 100, 10
#64个输入，1000维度，hidden是100维度，输出10维度
#64个训练数据一个batch，把一个1000维度的向量转成一个10维度的向量

#随机创建一些训练数据
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H, bias = False),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out, bias = False)
)

torch.nn.init.normal_(model[0].weight)
torch.nn.init.normal_(model[2].weight)
# model = model.cuda()

loss_fn = nn.MSELoss(reduction = 'sum')

learning_rate = math.pow(10, -6)

for it in range(50):
    #forward pass
#     y_pred = x.mm(w1).clamp(min = 0).mm(w2)
    y_pred = model(x)
    
    #compute loss
    #MSE mean square error
    loss = loss_fn(y_pred, y)
    print(it, loss.item())
    
    model.zero_grad()
    #backward pass
    #compute the gradient
    loss.backward()
    
    #update weights of w1 and w2
    with torch.no_grad():
#         w1 -= learning_rate * w1.grad
#         w2 -= learning_rate * w2.grad
#         w1.grad.zero_()
#         w2.grad.zero_()
        for param in model.parameters():
            param -= learning_rate * param.grad


0 34791804.0
1 30468078.0
2 29244070.0
3 26309362.0
4 20529724.0
5 13640735.0
6 8082017.0
7 4613452.0
8 2736262.0
9 1762712.75
10 1241866.75
11 941071.5625
12 749775.6875
13 616923.25
14 518124.0
15 440983.40625
16 378822.09375
17 327739.9375
18 285153.625
19 249329.671875
20 218922.28125
21 192974.875
22 170758.46875
23 151600.921875
24 135005.96875
25 120582.1640625
26 107979.0
27 96945.296875
28 87244.7265625
29 78683.2109375
30 71101.4375
31 64373.5234375
32 58389.11328125
33 53052.5859375
34 48286.9375
35 44019.609375
36 40191.24609375
37 36748.21484375
38 33646.203125
39 30849.09375
40 28323.447265625
41 26037.79296875
42 23965.669921875
43 22083.193359375
44 20372.12109375
45 18814.517578125
46 17392.083984375
47 16092.6171875
48 14904.8330078125
49 13819.5751953125


In [72]:
model

Sequential(
  (0): Linear(in_features=1000, out_features=100, bias=False)
  (1): ReLU()
  (2): Linear(in_features=100, out_features=10, bias=False)
)

pytorch:optim

In [74]:
import torch.nn as nn
N, D_in, H, D_out = 64, 1000, 100, 10
#64个输入，1000维度，hidden是100维度，输出10维度
#64个训练数据一个batch，把一个1000维度的向量转成一个10维度的向量

#随机创建一些训练数据
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H, bias = False),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out, bias = False)
)

# torch.nn.init.normal_(model[0].weight)
# torch.nn.init.normal_(model[2].weight)
# model = model.cuda()

loss_fn = nn.MSELoss(reduction = 'sum')
#优化。有SGD，Adam
optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate)

learning_rate = math.pow(10, -6)

for it in range(50):
    #forward pass
#     y_pred = x.mm(w1).clamp(min = 0).mm(w2)
    y_pred = model(x)
    
    #compute loss
    #MSE mean square error
    loss = loss_fn(y_pred, y)
    print(it, loss.item())
    
    optimizer.zero_grad()
    #backward pass
    #compute the gradient
    loss.backward()
    
    #update weights of w1 and w2
#     with torch.no_grad():
# #         w1 -= learning_rate * w1.grad
# #         w2 -= learning_rate * w2.grad
# #         w1.grad.zero_()
# #         w2.grad.zero_()
#         for param in model.parameters():
#             param -= learning_rate * param.grad

    optimizer.step()

0 570.1884765625
1 570.0304565429688
2 569.8724365234375
3 569.7144775390625
4 569.5565185546875
5 569.3986206054688
6 569.24072265625
7 569.0830078125
8 568.9251708984375
9 568.7674560546875
10 568.6097412109375
11 568.4521484375
12 568.2945556640625
13 568.1370849609375
14 567.9796142578125
15 567.822265625
16 567.6649169921875
17 567.5076904296875
18 567.3504028320312
19 567.1931762695312
20 567.0359497070312
21 566.8788452148438
22 566.7218017578125
23 566.5648193359375
24 566.4078369140625
25 566.2508544921875
26 566.093994140625
27 565.9371337890625
28 565.7803955078125
29 565.6239013671875
30 565.467529296875
31 565.3112182617188
32 565.1549682617188
33 564.9987182617188
34 564.8425903320312
35 564.6864013671875
36 564.5302734375
37 564.3742065429688
38 564.2181396484375
39 564.0621337890625
40 563.9061279296875
41 563.7501831054688
42 563.59423828125
43 563.4384155273438
44 563.28271484375
45 563.1270141601562
46 562.971435546875
47 562.81591796875
48 562.6604614257812
49 562.5

pytorch models

In [77]:
import torch.nn as nn
N, D_in, H, D_out = 64, 1000, 100, 10
#64个输入，1000维度，hidden是100维度，输出10维度
#64个训练数据一个batch，把一个1000维度的向量转成一个10维度的向量

#随机创建一些训练数据
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

class TwoLayerNet(torch.nn.Module):
    def __init__(self, D_in, H, D_out):
        super(TwoLayerNet, self).__init__()
        self.linear1 = torch.nn.Linear(D_in, H, bias = False)
        self.relu = torch.nn.ReLU()
        self.linear2 = torch.nn.Linear(H, D_out, bias = False)
    def forward(self, x):
        y_pred = self.linear1(x)
        y_pred = self.relu(y_pred)
        y_pred = self.linear2(y_pred)
        return y_pred

model = TwoLayerNet(D_in, H, D_out)
# torch.nn.init.normal_(model[0].weight)
# torch.nn.init.normal_(model[2].weight)
# model = model.cuda()

loss_fn = nn.MSELoss(reduction = 'sum')
#优化。有SGD，Adam
optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate)

learning_rate = math.pow(10, -6)

for it in range(50):
    #forward pass
#     y_pred = x.mm(w1).clamp(min = 0).mm(w2)
    y_pred = model(x)
    
    #compute loss
    #MSE mean square error
    loss = loss_fn(y_pred, y)
    print(it, loss.item())
    
    optimizer.zero_grad()
    #backward pass
    #compute the gradient
    loss.backward()
    
    #update weights of w1 and w2
#     with torch.no_grad():
# #         w1 -= learning_rate * w1.grad
# #         w2 -= learning_rate * w2.grad
# #         w1.grad.zero_()
# #         w2.grad.zero_()
#         for param in model.parameters():
#             param -= learning_rate * param.grad

    optimizer.step()

0 750.524169921875
1 750.33203125
2 750.14013671875
3 749.9481201171875
4 749.7562255859375
5 749.5643310546875
6 749.3724365234375
7 749.1805419921875
8 748.9887084960938
9 748.7969970703125
10 748.6053466796875
11 748.4137573242188
12 748.22216796875
13 748.0308227539062
14 747.8395385742188
15 747.6482543945312
16 747.4571533203125
17 747.2659301757812
18 747.0747680664062
19 746.8836669921875
20 746.692626953125
21 746.5015258789062
22 746.3104858398438
23 746.1195678710938
24 745.9286499023438
25 745.73779296875
26 745.5469970703125
27 745.356201171875
28 745.1655883789062
29 744.9749145507812
30 744.7843627929688
31 744.5938720703125
32 744.4033813476562
33 744.2129516601562
34 744.0224609375
35 743.8320922851562
36 743.6417236328125
37 743.4514770507812
38 743.2611694335938
39 743.0709228515625
40 742.880859375
41 742.69091796875
42 742.5009765625
43 742.3109741210938
44 742.12109375
45 741.9312133789062
46 741.7413940429688
47 741.5516357421875
48 741.3621215820312
49 741.17248