<a href="https://colab.research.google.com/github/titiac/testPytorch/blob/main/testPytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Example: a two-layer fully connected network with a ReLU activation,
# trained with hand-written NumPy forward and backward passes.

import numpy as np

# N is the batch size, D_in the input dimension,
# H the hidden dimension and D_out the output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random input batch and random targets.
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# w1 maps input -> hidden, w2 maps hidden -> output.
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6  # gradient-descent step size
for t in range(500):
  # Forward pass: linear layer, ReLU, linear layer.
  h = x @ w1
  h_relu = np.maximum(h, 0)
  y_pred = h_relu @ w2

  # Loss is the sum of squared differences between prediction and target.
  residual = y_pred - y
  loss = np.sum(residual ** 2)
  print(t, loss)

  # Backward pass: apply the chain rule by hand, from the loss back to the
  # weights.
  #   loss   = sum((y_pred - y) ** 2)  ->  d loss / d y_pred = 2 * (y_pred - y)
  #   y_pred = h_relu @ w2             ->  d loss / d w2     = h_relu.T @ grad_y_pred
  #                                        d loss / d h_relu = grad_y_pred @ w2.T
  #   h_relu = max(h, 0)               ->  gradient is zeroed wherever h < 0
  #   h      = x @ w1                  ->  d loss / d w1     = x.T @ grad_h
  grad_y_pred = 2.0 * residual
  grad_w2 = h_relu.T @ grad_y_pred
  grad_h_relu = grad_y_pred @ w2.T
  grad_h = np.where(h < 0, 0.0, grad_h_relu)
  grad_w1 = x.T @ grad_h

  # Gradient-descent step, updating the weight arrays in place.
  w1 -= learning_rate * grad_w1
  w2 -= learning_rate * grad_w2

  # Debug aid: uncomment to dump the tensors of the final iteration.
  # if(t == 499):
  #   print(f"x: \n{x}\n")
  #   print(f"h: \n{h}\n")
  #   print(f"h_relu: \n{h_relu}\n")
  #   print(f"y_pred: \n{y_pred}\n")
  #   print(f"loss: \n{loss}\n")

# Pytorch 导引
**Numpy** 是一个很好的框架，但它不能利用GPU进行加速，对于现代神经网络来说，GPU通常提供50倍或更高的加速，因此**Numpy**不适用于现代深度学习。

**PyTorch** 的张量在概念上与 NumPy 数组相同：张量是一个 n 维数组，PyTorch 也提供了许多对这些张量进行操作的函数。常见的 NumPy 计算都可以用 PyTorch 张量来完成。

与 NumPy 不同的是，PyTorch 可以利用 GPU 来加速计算。

下面使用 PyTorch 张量实现同样的两层神经网络，其中反向传播由 autograd 自动完成。
这个小例子在 CPU 上运行可能反而比在 GPU 上更快。


In [None]:
# Code in file autograd/two_layer_net_autograd.py
import torch

# Run on the CPU by default.
device = torch.device('cpu')
# device = torch.device('cuda')  # Uncomment this line to run on the GPU instead

# N is the batch size, D_in the input dimension,
# H the hidden dimension and D_out the output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random tensors holding the inputs and the targets.
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

# Random weight tensors. requires_grad=True asks autograd to compute
# gradients for them during the backward pass.
w1 = torch.randn(D_in, H, device=device, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
  # Forward pass. Because w1 and w2 require gradients, these tensor
  # operations are recorded in a computational graph, so the backward pass
  # does not have to be written by hand and no intermediate values need to
  # be kept around. torch.mm is the matrix product (the counterpart of
  # np.dot for 2-D arrays); clamp(min=0) acts as the ReLU.
  y_pred = torch.mm(torch.mm(x, w1).clamp(min=0), w2)

  # Sum-of-squares loss. loss is a Tensor of shape (); loss.item() returns
  # its value as a plain Python number.
  loss = torch.sum((y_pred - y) ** 2)
  print(t, loss.item())

  # Autograd backward pass: afterwards w1.grad and w2.grad hold the
  # gradient of the loss with respect to w1 and w2 respectively.
  loss.backward()

  # The gradient-descent step only needs to mutate the weights in place;
  # torch.no_grad() stops PyTorch from building a graph for the update.
  with torch.no_grad():
    for weight in (w1, w2):
      weight.sub_(learning_rate * weight.grad)
      # Reset the gradient manually once the backward pass has been used.
      weight.grad.zero_()

In [None]:
# Code in file autograd/two_layer_net_custom_function.py
import torch

class MyReLU(torch.autograd.Function):
  """
  ReLU written as a custom autograd Function.

  A custom autograd Function is created by subclassing
  torch.autograd.Function and providing static forward and backward
  methods that operate on Tensors.
  """

  @staticmethod
  def forward(ctx, x):
    """
    Compute the forward pass.

    Receives a context object (ctx) and the input Tensor, and returns the
    output Tensor. The input is stored on the context object so that the
    backward pass can retrieve it.
    """
    ctx.save_for_backward(x)
    return torch.clamp(x, min=0)

  @staticmethod
  def backward(ctx, grad_output):
    """
    Compute the backward pass.

    Receives the context object and the gradient of the loss with respect
    to the output of forward, and returns the gradient of the loss with
    respect to the input of forward.
    """
    (saved_input,) = ctx.saved_tensors
    # The gradient is blocked wherever the input was negative and passed
    # through unchanged everywhere else (including at exactly zero).
    return grad_output.masked_fill(saved_input < 0, 0)


# Run on the CPU by default.
device = torch.device('cpu')
# device = torch.device('cuda')  # Uncomment this line to run on the GPU instead

# N is the batch size, D_in the input dimension,
# H the hidden dimension and D_out the output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random tensors holding the inputs and the targets.
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

# Random weight tensors that autograd tracks gradients for.
w1 = torch.randn(D_in, H, device=device, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
  # Forward pass: the custom ReLU is invoked through MyReLU.apply.
  y_pred = torch.mm(MyReLU.apply(torch.mm(x, w1)), w2)

  # Sum-of-squares loss, printed as a plain Python number.
  loss = torch.sum((y_pred - y) ** 2)
  print(t, loss.item())

  # Let autograd compute the backward pass.
  loss.backward()

  with torch.no_grad():
    for weight in (w1, w2):
      # Gradient-descent step, applied in place.
      weight.sub_(learning_rate * weight.grad)
      # Reset the gradient manually once the backward pass has been used.
      weight.grad.zero_()
