<a href="https://colab.research.google.com/github/xiaochengJF/DeepLearning/blob/master/PyTorch%E6%95%99%E7%A8%8B.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Tensor基本操作

In [0]:
from __future__ import print_function
import torch
x = torch.Tensor(5, 3)  # 构造一个未初始化的5*3的矩阵
print(x)
x = torch.rand(5, 3)  # 构造一个随机初始化的矩阵
print(x)
x.size() 

tensor([[1.0741e-36, 0.0000e+00, 3.7835e-44],
        [0.0000e+00,        nan, 1.2125e+25],
        [1.3733e-14, 6.4069e+02, 4.3066e+21],
        [1.1824e+22, 4.3066e+21, 6.3828e+28],
        [3.8016e-39, 4.2284e+21, 1.3563e-19]])
tensor([[0.0840, 0.9314, 0.5300],
        [0.3165, 0.1866, 0.5186],
        [0.2178, 0.1689, 0.3623],
        [0.5791, 0.0739, 0.9455],
        [0.5083, 0.4031, 0.4666]])


torch.Size([5, 3])

In [0]:
# torch.Size 事实上是一个tuple, 所以其支持相关的操作*
y = torch.rand(5, 3)
x + y # 语法一
torch.add(x, y) # 语法二

tensor([[0.9815, 1.2067, 1.3755],
        [0.6702, 0.5304, 0.9676],
        [0.7938, 0.7058, 0.8588],
        [1.5058, 0.9331, 1.7468],
        [1.4405, 0.6854, 0.6840]])

In [0]:
# 另外输出tensor也有两种写法
result = torch.Tensor(5, 3)
torch.add(x, y, out=result) 
y.add_(x) # 特别注明：任何可以改变tensor内容的操作都会在方法名后加一个下划线'_'，例如：x.copy_(y), x.t_(), 这俩都会改变x的值

tensor([[0.9815, 1.2067, 1.3755],
        [0.6702, 0.5304, 0.9676],
        [0.7938, 0.7058, 0.8588],
        [1.5058, 0.9331, 1.7468],
        [1.4405, 0.6854, 0.6840]])

In [0]:
# 此处演示tensor和numpy数据结构的相互转换
a = torch.ones(5)
b = a.numpy()
# 此处演示当修改numpy数组之后,与之相关联的tensor也会相应的被修改
a.add_(1)
print(a)
print(b)

tensor([2., 2., 2., 2., 2.])
[2. 2. 2. 2. 2.]


In [0]:
# 将numpy的Array转换为torch的Tensor
import numpy as np
a = np.ones(5)
b = torch.from_numpy(a)
np.add(a, 1, out=a)
print(a)
print(b)

[2. 2. 2. 2. 2.]
tensor([2., 2., 2., 2., 2.], dtype=torch.float64)


In [0]:
# 另外除了CharTensor之外，所有的tensor都可以在CPU运算和GPU预算之间相互转换
# 使用CUDA函数来将Tensor移动到GPU上
# 当CUDA可用时会进行GPU的运算
if torch.cuda.is_available():
    x = x.cuda()
    y = y.cuda()
    x + y

autograd.Variable 这是这个包中最核心的类。 它包装了一个Tensor，并且几乎支持所有的定义在其上的操作。一旦完成了你的运算，你可以调用 .backward()来自动计算出所有的梯度。
可以通过属性 .data 来访问原始的tensor，而关于这一Variable的梯度则集中于 .grad 属性中。
![替代文字](https://pic4.zhimg.com/80/v2-08e0530dfd6879ff2bee56cfc5cc5073_hd.jpg)

在自动求导中非常重要的类 Function
Variable 和 Function 二者相互联系并且构建了一个描述整个运算过程的无环图。每个Variable拥有一个 .creator 属性，其引用了一个创建Variable的 Function。(除了用户创建的Variable其 creator 部分是 None)。

如果你想要进行求导计算，你可以在Variable上调用.backward()。 如果Variable是一个标量（例如它包含一个单元素数据），你无需对backward()指定任何参数，然而如果它有更多的元素，你需要指定一个和tensor的形状想匹配的grad_output参数

更多关于Variable 和 Function的文档:https://link.zhihu.com/?target=http%3A//pytorch.org/docs/autograd.html

In [0]:
from torch.autograd import Variable
x = Variable(torch.ones(2, 2), requires_grad = True)
y = x + 2
#y.creator  # 错误????????????????????
y.grad  # 貌似creater被grad取代了

In [0]:
# y 是作为一个操作的结果创建的因此y有一个creator ?????????????????
z = y * y * 3
out = z.mean()  # ?????????????????

In [0]:
# 现在我们来使用反向传播
out.backward()

# out.backward()和操作out.backward(torch.Tensor([1.0]))是等价的
# 在此处输出 d(out)/dx
x.grad

tensor([[4.5000, 4.5000],
        [4.5000, 4.5000]])

In [0]:
x = torch.randn(3)
x = Variable(x, requires_grad = True)  # 用上一个随机初始化的tensor初始化变量x?????????????????????????
y = x * 2
while y.data.norm() < 1000:
    y = y * 2
gradients = torch.FloatTensor([0.1, 1.0, 0.0001])  # ?????????????????????
y.backward(gradients)
x.grad

tensor([5.1200e+01, 5.1200e+02, 5.1200e-02])

***
# 搭建神经网络
用 torch.nn 包搭建神经网络  

nn建立在autograd的基础上来进行模型的定义和微分  
**一个典型的神经网络的训练过程：**

1、定义一个有着可学习的参数（或者权重）的神经网络  
2、对着一个输入的数据集进行迭代:  
3、用神经网络对输入进行处理  
4、计算代价值 (对输出值的修正到底有多少)  
5、将梯度传播回神经网络的参数中  
6、更新网络中的权重  
$\qquad$通常使用简单的更新规则: weight = weight + learning_rate * gradient  

  
  
  


##定义一个神经网络
定义一个<font color=geen>**forward函数**</font>，backward会自动地生成， 可以在forward函数中使用所有的Tensor中的操作，模型中可学习的参数会由<font color=geen>**net.parameters()**</font>返回。

In [18]:
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 6, 5) # 1 input image channel, 6 output channels, 5x5 square convolution kernel
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1   = nn.Linear(16*5*5, 120) # an affine operation: y = Wx + b
        self.fc2   = nn.Linear(120, 84)
        self.fc3   = nn.Linear(84, 10)

    def forward(self, x):
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))  # Max pooling over a (2, 2) window
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)  # If the size is a square you can only specify a single number
        x = x.view(-1, self.num_flat_features(x))  # ？？？？？？？？？？？？？？？？？
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
    
    def num_flat_features(self, x):
        size = x.size()[1:] # all dimensions except the batch dimension
        num_features = 1
        for s in size:
            num_features *= s
        return num_features

net = Net()
net

Net(
  (conv1): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=400, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)

## 输入
**注意：** torch.nn包只接受小批量样本，而非单个样本。     
例如：nn.Conv2d能够接受四维的$TensornSamples \times nChannels \times Height \times Width$批量样本，
如果非要用单个样本，使用input.unsqueeze(0)来加一个假维度就可以了。

In [19]:
params = list(net.parameters())
print(len(params))
print(params[0].size()) # conv1's .weight

input = Variable(torch.randn(1, 1, 32, 32))
out = net(input)

10
torch.Size([6, 1, 5, 5])


## 反向传播

In [0]:
net.zero_grad() # 对所有的参数的梯度缓冲区进行归零
out.backward(torch.randn(1, 10)) # 使用随机的梯度进行反向传播

##计算loss

In [21]:
output = net(input)
target = Variable(torch.range(1, 10))  # a dummy target, for example
criterion = nn.MSELoss()
loss = criterion(output, target)
loss

  
  return F.mse_loss(input, target, reduction=self.reduction)


tensor(38.5381, grad_fn=<MseLossBackward>)

In [22]:
print(loss.grad_fn) # MSELoss
print(loss.grad_fn.previous_functions[0][0]) # Linear
print(loss.grad_fn.previous_functions[0][0].previous_functions[0][0]) # ReLU

<MseLossBackward object at 0x7f13515d3b38>


AttributeError: ignored

调用loss.backward()，看看 conv1's在进行反馈之后的偏置梯度如何

In [23]:
# 调用loss.backward(), 看看 conv1's在进行反馈之后的偏置梯度如何
net.zero_grad() # 归零操作
print('conv1.bias.grad before backward')
print(net.conv1.bias.grad)
loss.backward()
print('conv1.bias.grad after backward')
print(net.conv1.bias.grad)

conv1.bias.grad before backward
tensor([0., 0., 0., 0., 0., 0.])
conv1.bias.grad after backward
tensor([-0.0193, -0.0114,  0.0311,  0.0051, -0.0441, -0.0320])


***
## 更新权重

最简单的就是**随机梯度下降法(SGD)：**
  ~~~
weight = weight - learning_rate * gradient
  ~~~
**简单的python实现：**
  ~~~
learning_rate = 0.01
for f in net.parameters():
    f.data.sub_(f.grad.data * learning_rate)
~~~
还有许多不同种类的方法：**SGD, Nesterov-SGD, Adam, RMSProp, etc**，这些方法都可以用**torch.optim包**来实现

1、创建 optimizer

In [0]:
import torch.optim as optim
# 创建 optimizer
optimizer = optim.SGD(net.parameters(), lr = 0.01)

2、使用 optimizer 实现梯度下降

In [0]:
optimizer.zero_grad()  # 缓存区梯度归零
output = net(input)
loss = criterion(output, target)
loss.backward()
optimizer.step()  # 开始更新

In [0]:
x = torch.tensor([[1.,2.,3.],[4.,5.,6.]],requires_grad=True)
y = x+1
print(y)
z = 2*y*y
print(z)
J = torch.mean(z)

tensor([[2., 3., 4.],
        [5., 6., 7.]], grad_fn=<AddBackward0>)
tensor([[ 8., 18., 32.],
        [50., 72., 98.]], grad_fn=<MulBackward0>)


In [0]:
z.backward(torch.tensor([[1.,1.,1.],[1.,1.,1.]]))
x.grad

tensor([[ 8., 12., 16.],
        [20., 24., 28.]])

In [0]:
z.backward()

RuntimeError: ignored

In [0]:
J.backward()

In [0]:
x.grad

tensor([[ 8., 12., 16.],
        [20., 24., 28.]])

In [0]:
x = torch.ones(2,4,requires_grad=True)
print(x)

tensor([[1., 1., 1., 1.],
        [1., 1., 1., 1.]], requires_grad=True)


In [0]:
y = x + 2
print(y)

tensor([[3., 3., 3., 3.],
        [3., 3., 3., 3.]], grad_fn=<AddBackward0>)


In [0]:
y.requires_grad

True

In [0]:
x.requires_grad_(False)

tensor([[1., 1., 1., 1.],
        [1., 1., 1., 1.]])

In [0]:
print(x.requires_grad,y.requires_grad)

False True


In [0]:
from torch.autograd import Variable
x = torch.rand(5)
print(x)
x = Variable(x,requires_grad = True)
y = x * 2
print(y)
grads = torch.FloatTensor([1,1,1,1,1])
y.backward(grads)#如果y是scalar的话，那么直接y.backward()，然后通过x.grad方式，就可以得到var的梯度
x.grad           #如果y不是scalar，那么只能通过传参的方式给x指定梯度

tensor([0.3219, 0.7271, 0.3582, 0.1854, 0.4333])
tensor([0.6437, 1.4541, 0.7165, 0.3709, 0.8666], grad_fn=<MulBackward0>)


tensor([2., 2., 2., 2., 2.])

In [0]:
import torch

x = torch.randn(2,1,7,3)
conv = torch.nn.Conv2d(1,8,(2,3))
res = conv(x)

print(res.shape)

torch.Size([2, 8, 6, 1])


In [0]:
x

tensor([[[[-0.6527, -1.2410, -1.1945],
          [-0.0185, -2.1766,  0.6402],
          [-0.7816, -1.0634, -0.2349],
          [ 0.2766,  0.1649, -0.1483],
          [-1.8347, -1.0713, -0.4455],
          [ 0.8888,  1.0780, -0.9985],
          [ 0.2306, -1.0541,  0.2395]]],


        [[[-0.0297, -0.6147,  0.8194],
          [-0.0355,  0.5811, -1.1233],
          [-0.2102,  0.5853, -0.1647],
          [ 1.0932, -0.9402, -1.8629],
          [-0.6348,  1.0294, -0.7822],
          [-0.2458, -0.6021, -0.5502],
          [ 0.6221,  1.1570, -0.1598]]]])

In [0]:
b = torch.Tensor([-1])
b

tensor([-1.])

In [0]:
torch.FloatTensor(b)

tensor([-1.])

In [0]:
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):#需要继承这个类
    def __init__(self):
        super(Net, self).__init__()
        #建立了两个卷积层，self.conv1, self.conv2，注意，这些层都是不包含激活函数的
        self.conv1 = nn.Conv2d(1, 6, 5) # 1 input image channel, 6 output channels, 5x5 square convolution kernel
        self.conv2 = nn.Conv2d(6, 16, 5)
        #三个全连接层
        self.fc1   = nn.Linear(16*5*5, 120) # an affine operation: y = Wx + b
        self.fc2   = nn.Linear(120, 84)
        self.fc3   = nn.Linear(84, 10)
net = Net()
net


Net(
  (conv1): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=400, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)

In [0]:
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

class Net(nn.Module):#需要继承这个类
    def __init__(self):
        super(Net, self).__init__()
        #建立了两个卷积层，self.conv1, self.conv2，注意，这些层都是不包含激活函数的
        self.conv1 = nn.Conv2d(1, 6, 5) # 1 input image channel, 6 output channels, 5x5 square convolution kernel
        self.conv2 = nn.Conv2d(6, 16, 5)
        #三个全连接层
        self.fc1   = nn.Linear(16*5*5, 120) # an affine operation: y = Wx + b
        self.fc2   = nn.Linear(120, 84)
        self.fc3   = nn.Linear(84, 10)

    def forward(self, x): #注意，2D卷积层的输入data维数是 batchsize*channel*height*width
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2)) # Max pooling over a (2, 2) window
        x = F.max_pool2d(F.relu(self.conv2(x)), 2) # If the size is a square you can only specify a single number
        x = x.view(-1, self.num_flat_features(x))
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
    
    def num_flat_features(self, x):
        size = x.size()[1:] # all dimensions except the batch dimension
        num_features = 1
        for s in size:
            num_features *= s
        return num_features

net = Net()
net


Net(
  (conv1): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=400, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)

In [0]:
params = list(net.parameters())
print(len(params))
print(params[0].size()) # conv1's .weight

10
torch.Size([6, 1, 5, 5])


In [0]:
len(list(net.parameters()))

10

In [0]:
input = Variable(torch.randn(1, 1, 32, 32))
out = net(input) #这个地方就神奇了，明明没有定义__call__()函数啊，所以只能猜测是父类实现了，并且里面还调用了forward函数
out              #查看源码之后，果真如此。那么，forward()是必须要声明的了，不然会报错

tensor([[ 0.0674, -0.0822,  0.0657, -0.1416, -0.1394, -0.0737,  0.0129, -0.1327,
          0.0250, -0.0900]], grad_fn=<AddmmBackward>)

In [0]:
import torch.optim as optim
# create your optimizer
optimizer = optim.SGD(net.parameters(), lr = 0.01)

# in your training loop:
optimizer.zero_grad() # 如果不置零，Variable 的梯度在每次 backward 的时候都会累加。

output = net(input) # 这里就体现出来动态建图了，你还可以传入其他的参数来改变网络的结构

loss = criterion(output, target)
loss.backward()
optimizer.step() # Does the update

NameError: ignored

In [0]:
output = net(input)
target = Variable(torch.range(1, 10))  # a dummy target, for example
criterion = nn.MSELoss()
loss = criterion(output, target)
loss

  
  return F.mse_loss(input, target, reduction=self.reduction)


tensor(39.1590, grad_fn=<MseLossBackward>)

In [0]:
# For illustration, let us follow a few steps backward
print(loss.creator) # MSELoss
print(loss.creator.previous_functions[0][0]) # Linear
print(loss.creator.previous_functions[0][0].previous_functions[0][0]) # ReLU

AttributeError: ignored

In [0]:
# 现在我们应当调用loss.backward(), 之后来看看 conv1's在进行反馈之后的偏置梯度如何
net.zero_grad() # 归零操作
print('conv1.bias.grad before backward')
print(net.conv1.bias.grad)
loss.backward()
print('conv1.bias.grad after backward')
print(net.conv1.bias.grad)

conv1.bias.grad before backward
None
conv1.bias.grad after backward
tensor([0.0719, 0.0485, 0.0808, 0.0707, 0.0149, 0.0770])


In [0]:
learning_rate = 0.01
for f in net.parameters():
    f.data.sub_(f.grad.data * learning_rate)