## pytorch实现两层神经网络
- 文档：https://pytorch.org/docs/torch

In [1]:
import torch

In [2]:
# Echo the installed PyTorch version as this cell's output.
torch.__version__

'1.0.1'

## 用numpy实现两层神经网络
一个全连接ReLU神经网络，一个隐藏层，没有bias。用来从x预测y，使用L2 Loss。
- $h = XW_1$
- $a = \max(0, h)$
- $\hat{y} = aW_2$

这一实现完全使用numpy来计算前向传播、loss和反向传播。
- forward pass
- loss
- backward pass

numpy ndarray是一个普通的n维array。它不知道任何关于深度学习或者梯度(gradient)的知识，也不知道计算图(computation graph)，只是一种用来计算数学运算的数据结构。

In [3]:
import numpy as np

In [18]:
# Problem sizes: number of samples, input features, hidden units, output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random training inputs and targets drawn from a standard normal distribution.
x, y = np.random.randn(N, D_in), np.random.randn(N, D_out)
print("训练集维度：", x.shape[1])

# Randomly initialised weights of the two bias-free linear layers.
w1, w2 = np.random.randn(D_in, H), np.random.randn(H, D_out)

learning_rate = 1e-6
for it in range(501):
    # ---- Forward pass: linear -> ReLU -> linear ----
    h = np.dot(x, w1)                # (N, H)
    # np.maximum compares its two inputs element-wise and keeps the larger one,
    # so every negative pre-activation is replaced by 0.
    h_relu = np.maximum(h, 0)        # (N, H)
    y_pred = np.dot(h_relu, w2)      # (N, D_out)

    # ---- Loss: sum of squared errors over all samples and outputs ----
    residual = y_pred - y
    loss = np.sum(np.square(residual))

    # ---- Backward pass: apply the chain rule by hand ----
    grad_y_pred = 2.0 * residual                  # (N, D_out)
    grad_w2 = np.dot(h_relu.T, grad_y_pred)       # (H, D_out)
    grad_h_relu = np.dot(grad_y_pred, w2.T)       # (N, H)
    # ReLU passes the gradient through only where its input was not negative.
    grad_h = np.where(h < 0, 0.0, grad_h_relu)
    grad_w1 = np.dot(x.T, grad_h)                 # (D_in, H)

    # ---- Gradient-descent step, updating both weight matrices in place ----
    w1 -= grad_w1 * learning_rate
    w2 -= grad_w2 * learning_rate

    # Report the loss every 50 iterations.
    if not it % 50:
        print("itor: {} | loss:{}".format(it, loss))

训练集维度： 1000
itor: 0 | loss:36379916.30562085
itor: 50 | loss:11522.84048949815
itor: 100 | loss:312.54151381266536
itor: 150 | loss:13.699522200570192
itor: 200 | loss:0.7797040963869617
itor: 250 | loss:0.05274861775962181
itor: 300 | loss:0.004007803190526088
itor: 350 | loss:0.00032858051495920253
itor: 400 | loss:2.82897568570412e-05
itor: 450 | loss:2.5142220375801123e-06
itor: 500 | loss:2.2822002515655443e-07


## PyTorch: Tensors
- 使用PyTorch tensors来创建前向神经网络，计算损失，以及反向传播。
- 一个PyTorch Tensor很像一个numpy的ndarray。但是它和numpy ndarray最大的区别是，PyTorch Tensor可以在CPU或者GPU上运算。如果想要在GPU上运算，就需要把Tensor换成cuda类型。

In [35]:
# Problem sizes: number of samples, input features, hidden units, output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random training inputs and targets drawn from a standard normal distribution.
x, y = torch.randn(N, D_in), torch.randn(N, D_out)

# Randomly initialised weights of the two bias-free linear layers.
w1, w2 = torch.randn(D_in, H), torch.randn(H, D_out)

learning_rate = 1e-6
for it in range(501):
    # ---- Forward pass: linear -> ReLU -> linear ----
    h = torch.mm(x, w1)               # (N, H)
    h_relu = torch.clamp(h, min=0)    # (N, H); clamping at 0 is ReLU
    y_pred = torch.mm(h_relu, w2)     # (N, D_out)

    # ---- Loss: sum of squared errors, extracted as a plain Python float ----
    loss = torch.sum(torch.pow(y_pred - y, 2)).item()

    # ---- Backward pass: apply the chain rule by hand ----
    grad_y_pred = 2.0 * (y_pred - y)                  # (N, D_out)
    grad_w2 = torch.mm(torch.t(h_relu), grad_y_pred)  # (H, D_out)
    grad_h_relu = torch.mm(grad_y_pred, torch.t(w2))  # (N, H)
    # ReLU passes the gradient through only where its input was not negative.
    grad_h = grad_h_relu.masked_fill(h < 0, 0)
    grad_w1 = torch.mm(torch.t(x), grad_h)            # (D_in, H)

    # ---- Gradient-descent step, updating both weight tensors in place ----
    w1.sub_(learning_rate * grad_w1)
    w2.sub_(learning_rate * grad_w2)

    # Report the loss every 50 iterations.
    if not it % 50:
        print("itor: {} | loss:{}".format(it, loss))

itor: 0 | loss:26421104.0
itor: 50 | loss:8897.6162109375
itor: 100 | loss:248.85116577148438
itor: 150 | loss:12.546610832214355
itor: 200 | loss:0.842868447303772
itor: 250 | loss:0.06756731867790222
itor: 300 | loss:0.006269404664635658
itor: 350 | loss:0.0008211490930989385
itor: 400 | loss:0.0001843837380874902
itor: 450 | loss:6.254202889977023e-05
itor: 500 | loss:2.910074545070529e-05
