## Pytorch Hands-on Tutorial
(Adapted from Adam’s lecture: https://www.youtube.com/watch?v=Rv9naeLXolY)

### Basics

#### Tensor creation

In [155]:
import torch
x = torch.FloatTensor(5, 5) # ByteTensor, DoubleTensor, HalfTensor, cuda.*Tensor
x # x is filled with uninitialized data, might give "Overflow when unpacking long" error
x = torch.randn(5, 5)
x # x is filled with random numbers sampled from normal(mean=0, sigma=1)
x = torch.zeros(5, 5)
x # x is filled with zeros
x.shape
x.shape[0]
x.size()

False

#### Tensor operations
Reference: (Pytorch documentation)[http://pytorch.org/docs/master/]

In [10]:
x.add(1) # the result is stored in a new tensor 
x # x is unchanged
torch.add(x, 1) # the result is stored in a new tensor
x # x is unchanged
x.add_(1) # inplace operation
x # x is changed
x.numpy() # share the same memory, cheap

array([[ 2.80779743,  3.71639204,  3.74747205,  3.10655522,  0.38993454],
       [ 4.36135912,  3.05552649,  4.68370914,  3.41711283,  3.52829456],
       [ 0.78680897,  2.4560442 ,  0.92385507,  2.66148829,  1.61206424],
       [ 2.87399626,  3.8152256 ,  2.20971227,  2.61522675,  3.20394945],
       [ 5.45448875,  1.04150343,  3.3412509 ,  2.08594298,  2.36381507]], dtype=float32)

#### Different ways to create a tensor

In [15]:
#### For 0.4, from Simon's note
torch.tensor([[1], [2], [3]], dtype=torch.half) # create a tensor with half precision
torch.tensor(1) # scalar
torch.tensor([1, 2.3]).dtype # type inference
torch.tensor([1, 2]).dtype # type inference

# torch.*_like takes in an input Tensor instead of a shape.
# It returns a Tensor with same attributes as the input Tensor by default unless otherwise specified
x = torch.randn(3, dtype=torch.float64)
torch.zeros_like(x)
torch.zeros_like(x, dtype=torch.int)

# tensor.new_* can also create Tensors with same attributes as tensor, but it always takes in a shape argument:
x = torch.randn(3, dtype=torch.float64)
x.new_ones(2)
x.new_ones(4, dtype=torch.int)

#### Moving things between CPU and GPU

In [16]:
# Writing device-agnostic code 
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# then whenever you get a new Tensor or Module
# this won't copy if they are already on the desired device
data = torch.randn(5, 5)
input = data.to(device)
l1 = torch.nn.Linear(10, 10)
model = l1.to(device)

#### Sharing storage with other tensors


In [20]:
x = torch.zeros(5, 5)
y = x[:2] # y is sharing storage with x
y
y.add_(1) # this is also mutating x
y
x # print, the first two rows of x have changed


 1.9286  0.6241  1.9697  1.0252  2.7235
 0.1873  0.1642  3.2338  1.4829  2.2798
 1.6903 -0.3187  0.1675 -0.0294 -0.2337
 0.4560 -0.7905 -0.9105 -0.5309  1.5335
-0.4103  1.1746 -0.3844 -0.2450  2.3123
[torch.FloatTensor of size (5,5)]

#### Serialization


Pickle usage is not recommended

In [21]:
import pickle
x = torch.zeros(5, 5)
y = x[:2]
pickle.dump((x, y), open('test', 'wb')) 
a, b = pickle.load(open('test', 'rb')) # a, b don't share the same storage any more
b.add_(1) # this doesn't change a
b # print
a # print


 1.9286  0.6241  1.9697  1.0252  2.7235
 0.1873  0.1642  3.2338  1.4829  2.2798
 1.6903 -0.3187  0.1675 -0.0294 -0.2337
 0.4560 -0.7905 -0.9105 -0.5309  1.5335
-0.4103  1.1746 -0.3844 -0.2450  2.3123
[torch.FloatTensor of size (5,5)]

torch.save/torch.load()

In [22]:
x = torch.zeros(5, 5)
y = x[:2]
torch.save((x, y), 'foo')
a, b = torch.load('foo') # a, b still share the same storage
b.add_(1) # this changes a
b # print
a # print


 2.9286  1.6241  2.9697  2.0252  3.7235
 1.1873  1.1642  4.2338  2.4829  3.2798
 1.6903 -0.3187  0.1675 -0.0294 -0.2337
 0.4560 -0.7905 -0.9105 -0.5309  1.5335
-0.4103  1.1746 -0.3844 -0.2450  2.3123
[torch.FloatTensor of size (5,5)]

#### Autograd

This is based on 0.3.1 with Variable, we should adjust it accordingly depending on if 0.4 release can make to the workshop. 
Variable is a wrapper around Tensor that holds the computation history, and the gradient with respect to this tensor.

In [240]:
from torch.autograd import Variable
x = torch.randn(5, 5)
x = Variable(x)
#### For 0.4#####
x = torch.randn(1)
x.requires_grad

False

In [241]:
y = x * 2
y # print

tensor([ 0.5865])

In [242]:
y.requires_grad

False

In [243]:
x = torch.randn(1, requires_grad=True)
x.requires_grad

True

In [244]:
y = x * 2
y.requires_grad

True

In [245]:
y.backward() # This works because y only has a single element. In cases where y is a multi-dimensional tensor, we should create scalar output from y and backprop from there
x.grad # This is dy / dx

tensor([ 2.])

In [246]:
#### For 0.4#####
x = torch.randn(1, requires_grad=True)
y = torch.randn(1, requires_grad=True)
z = 2 * x + y
z.backward()
y.grad # dz / dy

tensor([ 1.])

In [247]:
x.grad # dz / dx

tensor([ 2.])

#### Derivatives


In [65]:
# As long as any of the input requires grad, the output requires grad as well.
x = torch.ones(5, 5, requires_grad=True)
y = x ** 2 + 2 * x + 1
z = y * 2
y.requires_grad # print
z.requires_grad # print
z.sum().backward() # .sum() is a good way to get a scalar output from z and then be able to backprop through the model
y.grad # doesn't have value, because we don't store gradient for intermediate tensors by default
x.grad # has value

tensor([[ 8.,  8.,  8.,  8.,  8.],
        [ 8.,  8.,  8.,  8.,  8.],
        [ 8.,  8.,  8.,  8.,  8.],
        [ 8.,  8.,  8.,  8.,  8.],
        [ 8.,  8.,  8.,  8.,  8.]])

In [66]:
z.sum().backward() # Cannot backward a second time, efficient: once you differentiate a graph, we destroy it. 

RuntimeError: Trying to backward through the graph a second time, but the buffers have already been freed. Specify retain_graph=True when calling backward the first time.

In [67]:
x = torch.ones(5, 5, requires_grad=True)
y = x ** 2 + 2 * x + 1
z = y * 2

z.sum().backward(retain_graph=True) # If you really need to keep the gradients, this works.
z.sum().backward(retain_graph=True) # We backward twice to see its effect
x.grad # x.grad doubles since it accumulate the grad twice

tensor([[ 16.,  16.,  16.,  16.,  16.],
        [ 16.,  16.,  16.,  16.,  16.],
        [ 16.,  16.,  16.,  16.,  16.],
        [ 16.,  16.,  16.,  16.,  16.],
        [ 16.,  16.,  16.,  16.,  16.]])

In [72]:
x = torch.ones(5, 5, requires_grad=True)
y = x ** 2 + 2 * x + 1
z = y * 2
z.backward(torch.ones(5, 5)) # if we don't use .sum(), we can also specify gradient for each element in output tensor
x.grad # dz / dx

tensor([[ 8.,  8.,  8.,  8.,  8.],
        [ 8.,  8.,  8.,  8.,  8.],
        [ 8.,  8.,  8.,  8.,  8.],
        [ 8.,  8.,  8.,  8.,  8.],
        [ 8.,  8.,  8.,  8.,  8.]])

##### Higher order optimization

In [84]:
x = torch.ones(5, 5, requires_grad=True)
y = x ** 2 + 2 * x + 1
z = y * 2
x_grad, = torch.autograd.grad(z.sum(), x, create_graph=True) # We destroy the graph by default. If you care about how the grad was calculated, you should use this flag
x_grad.grad_fn
loss = z.sum() + torch.norm(x_grad) # We can then do this to also penalize large gradients 
loss.backward()
x.grad

tensor([[ 8.8000,  8.8000,  8.8000,  8.8000,  8.8000],
        [ 8.8000,  8.8000,  8.8000,  8.8000,  8.8000],
        [ 8.8000,  8.8000,  8.8000,  8.8000,  8.8000],
        [ 8.8000,  8.8000,  8.8000,  8.8000,  8.8000],
        [ 8.8000,  8.8000,  8.8000,  8.8000,  8.8000]])

#### Volatile flag

If a variable is volatile, it implicitly doesn't require grad
In 0.4, this will be deprecated, you should use torch.no_grad() instead

In [87]:
x = Variable(torch.randn(5, 5), volatile=True) # volatile take precedence over requires_grad flag, good for evaluation.
 

  """Entry point for launching an IPython kernel.


In [89]:
with torch.no_grad():
    # evaluation
    pass

torch.set_grad_enabled(False)
# blabla
    

#### Change a Variable

In [110]:
x = torch.randn(5, 5, requires_grad=True) # pre-0.4: Variable(torch.randn(5, 5), requires_grad=True)
y = x * 2
z = y * y

In [111]:
y[1:2] = 0 # Variables can be modified inplace
y # print

tensor([[-1.3981,  2.7427,  2.3629, -1.5718, -2.7228],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.3136, -3.0554, -0.5079,  1.7996,  1.7493],
        [-0.8151, -1.2129,  4.0082,  1.6945,  0.8834],
        [ 2.3605,  0.3543,  1.5048, -1.6850, -0.8497]])

In [None]:
y.sum().backward()
x.grad # print, the second row of y doesn't depend on x any more. 

In [115]:
# Since in our case dz / dy is dependent on y, and we changed the value of y after z is computed,
# the previously computed gradients will be wrong
z.sum().backward() 

RuntimeError: Trying to backward through the graph a second time, but the buffers have already been freed. Specify retain_graph=True when calling backward the first time.

In [121]:
x = torch.randn(5, 5, requires_grad=True) # pre-0.4: Variable(torch.randn(5, 5), requires_grad=True)
y = x * 2
z = y * y
y[1:2] = 0
z = y * y # need to recalculate z to make sure gradient is correct, after in-place operation of y
z.sum().backward()

### nn Module


In [125]:
import torch.nn as nn
# torch.nn has modules for common layers and loss functions

#### nn.Linear

In [383]:
# Implement linear layer manually
# y = xW^T + b
W = torch.randn(3, 2, requires_grad=True)
b = torch.randn(3, requires_grad=True)
x = torch.randn(2)
y = torch.matmul(x, torch.t(W)) + b

print("x:", x)
print()
y.sum().backward()
print(W.grad)
print(b.grad)

# The gradient for every row of W is dy / dx * x

x: tensor([-1.0614,  1.8644])

tensor([[-1.0614,  1.8644],
        [-1.0614,  1.8644],
        [-1.0614,  1.8644]])
tensor([ 1.,  1.,  1.])


In [377]:
# We can also use nn.Linear
linear = nn.Linear(2, 3)

In [378]:
linear # print

Linear(in_features=2, out_features=3, bias=True)

In [379]:
# nn only works with batchs, leading dimension for batch is required
x = torch.randn(2) # pre-0.4: Variable(torch.rands(5))
# here the number of samples in the batch is 4, and each sample is a tensor of torch.Size([5])
x

tensor([-0.3104,  0.4423])

In [380]:
y = linear(x) 
y

tensor([ 0.3516,  0.5766, -0.2015])

In [381]:
y.requires_grad # Linear is layer with learnable parameters

True

In [382]:
y.backward(torch.ones(3))
linear.weight.grad

tensor([[-0.3104,  0.4423],
        [-0.3104,  0.4423],
        [-0.3104,  0.4423]])

#### nn.Sequential

In [132]:
c = nn.Sequential(
    nn.Linear(5, 10),
    nn.Sigmoid(),
    nn.Linear(10, 5)
)

In [133]:
c(x)

tensor([[ 0.1842, -0.0729, -0.1844, -0.2492,  0.6756],
        [ 0.0322, -0.0618, -0.4573, -0.2712,  0.6674],
        [ 0.1052, -0.0329, -0.4848, -0.3079,  0.7856],
        [-0.0029, -0.1427, -0.4209, -0.2858,  0.7788]])

#### Loss function

In [145]:
output = torch.zeros(4, 5, requires_grad=True)
criterion = nn.MSELoss()
target = torch.ones(4, 5)
loss = criterion(output, target)
loss

tensor(1.)

In [146]:
loss.backward()
output.grad

tensor([[-0.1000, -0.1000, -0.1000, -0.1000, -0.1000],
        [-0.1000, -0.1000, -0.1000, -0.1000, -0.1000],
        [-0.1000, -0.1000, -0.1000, -0.1000, -0.1000],
        [-0.1000, -0.1000, -0.1000, -0.1000, -0.1000]])

#### Customized module
nn Modules are easy to create modules with learnable parameters, but for layers like sigmoid which doesn't have learnable parameters, we don't actually need to create a variable to store it, and we can just use the functional API.

In [229]:
import torch.nn.functional as F

class MyModule(nn.Module):
    def __init__(self, num_input, num_output):
        super(MyModule, self).__init__()
        self.linear1a = nn.Linear(num_input, num_output // 2)
        self.linear1b = nn.Linear(num_input, num_output // 2)
        self.weight = nn.Parameter(torch.randn(num_input))
        
    def forward(self, x):
        x_w = self.weight * x
        x_a = F.sigmoid(self.linear1a(x_w)) # alternatively, we can do self.sigmoid = nn.Sigmoid() and then use self.sigmoid here
        x_b = F.tanh(self.linear1b(x_w))
        x_2 = x_a + x_b
        return x_2
    

In [230]:
net = MyModule(10, 40)

In [231]:
net # self.weight won't show here because it's not a submodule of nn.Module, unlike other layers

MyModule(
  (linear1a): Linear(in_features=10, out_features=20, bias=True)
  (linear1b): Linear(in_features=10, out_features=20, bias=True)
)

In [232]:
net.weight # but we can still access it like this

Parameter containing:
tensor([ 1.3597,  0.6631,  0.4937,  1.9683, -2.1145,  0.4908,  1.4367,
         0.5062,  0.2233, -1.5731])

In [233]:
net.parameters() # lazy generator, more memory efficient

<generator object Module.parameters at 0x1196ce888>

In [234]:
list(net.parameters()) # return a list

[Parameter containing:
 tensor([ 1.3597,  0.6631,  0.4937,  1.9683, -2.1145,  0.4908,  1.4367,
          0.5062,  0.2233, -1.5731]), Parameter containing:
 tensor([[-0.2576, -0.1966, -0.0793, -0.0573, -0.3150, -0.3097, -0.2373,
          -0.2488,  0.0254, -0.0453],
         [-0.1284, -0.2624, -0.2582, -0.1382, -0.0062,  0.2397,  0.2294,
           0.3058, -0.3135, -0.1735],
         [-0.1767,  0.0095,  0.1932,  0.3122, -0.1010, -0.0084, -0.0896,
           0.1888,  0.2653,  0.1977],
         [-0.0082,  0.0060, -0.1862, -0.2656, -0.0911,  0.2171,  0.1016,
           0.1136,  0.2675, -0.3076],
         [-0.2813,  0.1856, -0.2866, -0.1865,  0.0996, -0.0111, -0.2480,
          -0.0058,  0.2700, -0.0102],
         [ 0.2518,  0.0036, -0.1820,  0.0772, -0.0133, -0.0062,  0.1908,
           0.0964,  0.2201, -0.0469],
         [ 0.0820, -0.0560,  0.1339, -0.1544,  0.1690,  0.1904, -0.0691,
           0.2197,  0.1762,  0.1700],
         [-0.1888, -0.1120, -0.0924,  0.0693, -0.0245, -0.1220,  0.1

In [235]:
net.linear1a # indexing a layer

Linear(in_features=10, out_features=20, bias=True)

In [236]:
net(torch.ones(10))

tensor([ 1.3206,  0.6968, -0.3083,  0.2719, -0.1071,  0.4292,  0.3404,
         0.3785,  0.0274,  0.6607,  0.8987, -0.0660,  1.3234,  1.0281,
         0.6534,  1.1839, -0.3487,  1.0072, -0.4281,  0.3137])

### Torchvision package

In [192]:
import torchvision

In [193]:
net = torchvision.models.resnet18()
net # print

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace)
      (conv2): Co

In [194]:
# Module indexing
net.layer4[1].conv1

Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)

In [201]:
list(net.parameters()) # list all parameters

[Parameter containing:
 tensor([[[[-1.5621e-02,  2.4440e-02,  2.8120e-02,  ..., -8.6362e-03,
            -2.9720e-02,  1.3076e-02],
           [-1.4154e-02, -3.4821e-03, -2.7462e-02,  ...,  6.8957e-03,
             3.8500e-02,  3.6217e-03],
           [-4.0210e-02, -2.8557e-02,  8.4798e-03,  ..., -2.0184e-02,
             1.1764e-02,  6.0403e-03],
           ...,
           [-5.3917e-03,  1.9962e-02,  5.6974e-02,  ..., -7.7565e-03,
            -1.1616e-02, -1.7233e-02],
           [-2.2773e-02,  1.8171e-02, -5.7931e-02,  ...,  5.4715e-03,
            -4.0303e-02,  2.4557e-02],
           [-1.1588e-03,  4.3717e-02, -5.4032e-04,  ..., -7.6246e-03,
             3.7155e-02,  3.6510e-02]],
 
          [[ 2.7597e-02, -4.5002e-03, -9.2732e-03,  ..., -9.5263e-03,
             4.4152e-02,  1.4242e-02],
           [ 5.8133e-03,  3.4428e-02, -3.6560e-02,  ..., -2.7198e-02,
            -1.5470e-02, -1.1621e-02],
           [ 4.4464e-03, -1.5390e-02,  1.9589e-02,  ..., -6.8363e-03,
             3.4

In [197]:
sum(p.numel() for p in net.parameters()) # get total number of parameters

11689512

In [202]:
list(net.named_modules())

[('', ResNet(
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (

#### Use PyTorch with numpy (or other libraries) [TODO]: needs work


In [238]:
import numpy as np
import scipy.ndiimage.filters as filters
import matplotlib.pyplot as plt

import torch
from torch.autograd import Variable
import torch.nn.functional as F

def show(var):
    img = var.data.squeeze(0).permute(1, 2, 0).contiguous().numpy()
    plt.imshow(img)
    plt.show()
    
x = np.zeros((5, 5), np.float)
x[2, 2] = 1
raw_filter = torch.from_numpy(filters.gaussian_filter(x, 1))
gaussian_filter = torch.zeros(3, 3, 5, 5)
for i in range(3):
    gaussian_filter[i, i] = raw_filter

ModuleNotFoundError: No module named 'scipy.ndiimage'

In [146]:
plt.imshow(raw_filter.numpy()) # plot gaussian filter
plt.show()

NameError: name 'plt' is not defined

In [147]:
class Blur(nn.Module):
    def __init__(self):
        self.register_buffer('gaussian', gaussian_filter) # not learnable
    
    def forward(self, x):
        return F.conv2d(x, Variable(self.gaussian))

In [148]:
blur = Blur()
x = Variable(torch.randn(1, 3, 200, 200)) # 1 imagen, 3 channels
blur(x)

NameError: name 'gaussian_filter' is not defined

In [149]:
show(x)

NameError: name 'show' is not defined