## Pytorch Hands-on Tutorial
(Adapted from Adam’s lecture: https://www.youtube.com/watch?v=Rv9naeLXolY)

### Basics

#### Tensor creation

In [155]:
import torch
x = torch.FloatTensor(5, 5) # ByteTensor, DoubleTensor, HalfTensor, cuda.*Tensor
# x is filled with uninitialized data
x = torch.randn(5, 5)
x.shape
x.size()

False

#### Tensor operations
Reference: (Pytorch documentation)[http://pytorch.org/docs/master/]

In [10]:
x.add(1)
torch.add(x, 1)
x.add_(1) # inplace operation
x.numpy() # share the same memory, cheap

array([[ 2.80779743,  3.71639204,  3.74747205,  3.10655522,  0.38993454],
       [ 4.36135912,  3.05552649,  4.68370914,  3.41711283,  3.52829456],
       [ 0.78680897,  2.4560442 ,  0.92385507,  2.66148829,  1.61206424],
       [ 2.87399626,  3.8152256 ,  2.20971227,  2.61522675,  3.20394945],
       [ 5.45448875,  1.04150343,  3.3412509 ,  2.08594298,  2.36381507]], dtype=float32)

#### CUDA Tensors

In [None]:
x = torch.randn(5, 5)
x.cuda() # TODO: revisit this with 0.4 device object
with torch.cuda.device(1):
    x = torch.cuda.FloatTensor(1) # on device 1, the second device
    x.cuda() # on device 1
    x.cuda(0) # on device 0
# first cuda call initialize cuda context, so it's slow
# x.cuda() is one GPU but x is still on CPU, to replace, use
x = x.cuda()

In [15]:
#### For 0.4, from Simon's note
device = torch.device("cuda:1")
x = torch.randn(5, 5, dtype=torch.float64, device=device)
x.requires_grad # print
x = torch.zeros(3, requires_grad=True)
x.requires_grad # print

cuda = torch.device("cuda")
torch.tensor([[1], [2], [3]], dtype=torch.half, device=cuda)
torch.tensor(1) # scalar
torch.tensor([1, 2.3]).dtype # type inference
torch.tensor([1, 2]).dtype # type inference

# torch.*_like takes in an input Tensor instead of a shape. It returns a Tensor with same attributes as the input Tensor by default unless otherwise specified
x = torch.randn(3, dtype=torch.float64)
torch.zeros_like(x)
torch.zeros_like(x, dtype=torch.int)

# tensor.new_* can also create Tensors with same attributes as tensor, but it always takes in a shape argument:
x = torch.randn(3, dtype=torch.float64)
x.new_ones(2)
x.new_ones(4, dtype=torch.int)

# .to() method
cpu = torch.device("cpu")
cuda = torch.device("cuda")
x = torch.randn(3, dtype=torch.double) 
y = torch.zeros(2, dtype=torch.half, device=cuda)
x.to(cuda) # move to a different device
y.to(torch.double) # cast to a different type
x.to(cuda, torch.half) # move and cast at the same time
y.to(x) # move and cast to the same device and dtype as a tensor
# module.to() works similarly but is in-place and does not support taking in a Tensor

# Writing device-agnostic code 
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# then whenever you get a new Tensor or Module
# this won't copy if they are already on the desired device
input = data.to(device)
model = MyModule(...).to(device)

#### Serialization


In [20]:
x = torch.randn(5, 5)
y = x[:2] # y is sharing storage with x
y.add_(1) # this is also mutating x
x # print, the first two rows of x have changed


 1.9286  0.6241  1.9697  1.0252  2.7235
 0.1873  0.1642  3.2338  1.4829  2.2798
 1.6903 -0.3187  0.1675 -0.0294 -0.2337
 0.4560 -0.7905 -0.9105 -0.5309  1.5335
-0.4103  1.1746 -0.3844 -0.2450  2.3123
[torch.FloatTensor of size (5,5)]

Pickle usage is not recommended

In [21]:
import pickle
pickle.dump((x, y), open('test', 'wb')) 
a, b = pickle.load(open('test', 'rb')) # a, b don't share the same storage any more
b.add_(1) # this doesn't change a
a # print


 1.9286  0.6241  1.9697  1.0252  2.7235
 0.1873  0.1642  3.2338  1.4829  2.2798
 1.6903 -0.3187  0.1675 -0.0294 -0.2337
 0.4560 -0.7905 -0.9105 -0.5309  1.5335
-0.4103  1.1746 -0.3844 -0.2450  2.3123
[torch.FloatTensor of size (5,5)]

torch.save/torch.load()

In [22]:
torch.save((x, y), 'foo')
a, b = torch.load('foo') # a, b still share the same storage
b.add_(1) # this changes a
a # print


 2.9286  1.6241  2.9697  2.0252  3.7235
 1.1873  1.1642  4.2338  2.4829  3.2798
 1.6903 -0.3187  0.1675 -0.0294 -0.2337
 0.4560 -0.7905 -0.9105 -0.5309  1.5335
-0.4103  1.1746 -0.3844 -0.2450  2.3123
[torch.FloatTensor of size (5,5)]

#### Autograd

This is based on 0.3.1 with Variable, we should adjust it accordingly depending on if 0.4 release can make to the workshop. 
Variable is a wrapper around Tensor that holds the computation history, and the gradient with respect to this tensor.

In [157]:
from torch.autograd import Variable
x = torch.ones(5, 5)
x = Variable(x)
#### For 0.4#####
# x = torch.ones(5, 5, requires_grad=True)
x.requires_grad
x # print
y = x ** 2 + 2 * x + 1
y # print 
y.grad_fn # Shows the grad_fn
x.grad_fn # User created Variables never has grad_fn, this is None. leaf variables

In [159]:
x.requires_grad


True

#### Derivatives


In [32]:
z = y * 2
z.backward() # FAIL: z must be a scalar, x doesn't require grad.

RuntimeError: element 0 of variables does not require grad and does not have a grad_fn

In [45]:
# As long as any of the input requires grad, the output requires grad as well.
x = torch.ones(5, 5)
x = Variable(x, requires_grad=True)
y = x ** 2 + 2 * x + 1
z = y * 2
y.requires_grad # print
z.requires_grad # print
z.sum().backward()
x.grad # print
y.grad #print

In [49]:
z.sum().backward() # Cannot backward a second time, efficient: once you differentiate a graph, we destroy it. 

In [56]:
z.sum().backward(retain_graph=True) # This works.
x.grad # x.grad doubles since it accumulate the grad


 8  8  8  8  8
 8  8  8  8  8
 8  8  8  8  8
 8  8  8  8  8
 8  8  8  8  8
[torch.FloatTensor of size (5,5)]

In [61]:
z.backward(torch.ones(5, 5) * 2, retain_graph=True) # du/dz = torch.ones(5, 5) * 2
x.grad


 16  16  16  16  16
 16  16  16  16  16
 16  16  16  16  16
 16  16  16  16  16
 16  16  16  16  16
[torch.FloatTensor of size (5,5)]

#### Symbolic

In [83]:
x = torch.ones(5, 5)
x = Variable(x, requires_grad=True)
y = x ** 2 + 2 * x + 1
z = y * 2

In [75]:
#x_grad, y_grad = torch.autograd.grad(z.sum(), x)
x_grad, y_grad = torch.autograd.grad(z.sum(), (x,y))
x.grad # pure functional, doesn't change x.grad
x_grad # print
y_grad # print


 2  2  2  2  2
 2  2  2  2  2
 2  2  2  2  2
 2  2  2  2  2
 2  2  2  2  2
[torch.FloatTensor of size (5,5)]

In [84]:
torch.autograd.backward([x.sum(), y.sum()], retain_graph=True)
# Equivalent to, but more efficient by sharing the graph downstream
# x.sum().backward()
# y.sum().backward()

##### Higher order optimization

In [86]:
x_grad, = torch.autograd.grad(z.sum(), x, create_graph=True) # default to dump the graph, if you care about how the grad was calculated, you should use this flag
x_grad.grad_fn # In some cases, loss = loss + norm(grad) and then call backward()

<AddBackward1 at 0x7f9fd81d5160>

#### Volatile flag

If a variable is volatile, it implicitly doesn't require grad
In 0.4, this will be deprecated, you should use torch.no_grad() instead

In [87]:
x = Variable(torch.randn(5, 5), volatile=True) # volatile take precedence over requires_grad flag, good for evaluation.
 

  """Entry point for launching an IPython kernel.


In [89]:
with torch.no_grad():
    # evaluation
    pass

torch.set_grad_enabled(False)
# blabla
    

#### Change a Variable

In [112]:
x = Variable(torch.randn(5, 5), requires_grad=True)
y = x * 2
z = y * y

In [113]:
y[1:2] = 0 # Variables can be modified inplace
y # print


 0.4008  0.3784  2.9285 -0.7939 -0.5596
 0.0000  0.0000  0.0000  0.0000  0.0000
-2.1118  0.2188  0.2066  1.6685  1.4737
 0.6268  0.6557 -1.7691  3.2598 -1.9490
-3.0023 -0.8143  2.1804 -0.6113  0.6959
[torch.FloatTensor of size (5,5)]

In [106]:
y.sum().backward()
x.grad # print, the second row of y doesn't depend on x any more. 


  1.0479   6.0266  14.0994  -7.4395  -1.4479
 15.5501  -2.4441   4.6461  -3.3696   4.3458
  3.2041  -3.3341  -8.3297  -7.1601  -0.7443
-17.1637   5.3590  -4.1770  11.3721   2.1357
-15.5063   5.1080   3.2012  -2.8212   2.3345
[torch.FloatTensor of size (5,5)]

In [115]:
z.sum().backward() 
# we save y for z, and detect the error when you do backward.
# You can use it, but too much inplace operations are  hard to debug. 

RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation

### nn Module


In [116]:
import torch.nn as nn

In [117]:
linear = nn.Linear(5, 10)

In [118]:
linear # print


Linear(in_features=5, out_features=10, bias=True)

In [119]:
# nn only works with batchs, leading dimension for batch is required
x = Variable(torch.ones(4, 5))

In [121]:
y = linear(x) 
y.requires_grad # Linear is layer with learnable parameters

#### nn.Sequential

In [123]:
c = nn.Sequential(
    nn.Linear(5, 10),
    nn.Sigmoid(),
    nn.Linear(10, 5)
)

In [124]:
c(x)


-0.1148 -0.0654 -0.8692  0.8922  0.1524
-0.1148 -0.0654 -0.8692  0.8922  0.1524
-0.1148 -0.0654 -0.8692  0.8922  0.1524
-0.1148 -0.0654 -0.8692  0.8922  0.1524
[torch.FloatTensor of size (4,5)]

#### Customized module
nn Modules are easy to create modules with learnable parameters, but for layers like sigmoid which doesn't have learnable parameters, we have a functional API for it.

In [125]:
import torch.nn.functional as F

class MyModule(nn.Module):
    def __init__(self, num_input, num_output):
        super(MyModule, self).__init__()
        self.linear1a = nn.Linear(num_input, num_output // 2)
        self.linear1b = nn.Linear(num_input, num_output // 2)
        self.linear2a = nn.Linear(num_output // 2, num_output // 2)
        self.linear2b = nn.Linear(num_output // 2, num_output // 2)
        
    def forward(self, x):
        x_a = F.sigmoid(self.linear1a(x))
        x_b = F.tanh(self.linear1b(x))
        x_2 = x_a + x_b
        return self.linear2a(x_2), self.linear2b(x_2)
    

In [127]:
net = MyModule(10, 40)

In [132]:
net # print
net.parameters() # lazy generator, more memory efficient
list(net.parameters()) # return a list
net.linear1a # indexing a layer

Linear(in_features=10, out_features=20, bias=True)

### Torchvision package

In [133]:
import torchvision

In [137]:
net = torchvision.models.resnet18()
net # print
# Module indexing
net.layer4[1].conv1

Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)

In [144]:
list(net.parameters()) 
sum(p.numel() for p in net.parameters())
list(net.named_modules())
net.cuda() # You don't need to move every weight one by one. This is slightly different from tensor.cuda() since that returns a clone of tensor on the gpu device
# module.cuda() replaces all weights inplace
type(net.conv1.weight.data) # Get the tensor type of a variable
net.cpu() # Move the model back to cpu
net.double() # Case weights to double precision, remember to give a double precision input

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace)
      (conv2): Co

#### External python libraries


In [145]:
import numpy as np
import scipy.ndiimage.filters as filters
import matplotlib.pyplot as plt

import torch
from torch.autograd import Variable
import torch.nn.functional as F

def show(var):
    img = var.data.squeeze(0).permute(1, 2, 0).contiguous().numpy()
    plt.imshow(img)
    plt.show()
    
x = np.zeros((5, 5), np.float)
x[2, 2] = 1
raw_filter = torch.from_numpy(filters.gaussian_filter(x, 1))
gaussian_filter = torch.zeros(3, 3, 5, 5)
for i in range(3):
    gaussian_filter[i, i] = raw_filter

ModuleNotFoundError: No module named 'scipy'

In [146]:
plt.imshow(raw_filter.numpy()) # plot gaussian filter
plt.show()

NameError: name 'plt' is not defined

In [147]:
class Blur(nn.Module):
    def __init__(self):
        self.register_buffer('gaussian', gaussian_filter) # not learnable
    
    def forward(self, x):
        return F.conv2d(x, Variable(self.gaussian))

In [148]:
blur = Blur()
x = Variable(torch.randn(1, 3, 200, 200)) # 1 imagen, 3 channels
blur(x)

NameError: name 'gaussian_filter' is not defined

In [149]:
show(x)

NameError: name 'show' is not defined

#### List of Modules in customized module
We don't support registering arbitrary structure since it slows down a lot.

In [150]:
class MyModule(nn.Module):
    def __init__(self, num_input, num_output):
        super(MyModule, self).__init__()
        self.linear1a = nn.Linear(num_input, num_output // 2)
        self.linear1b = nn.Linear(num_input, num_output // 2)
        self.linear2a = nn.Linear(num_output // 2, num_output // 2)
        self.linear2b = nn.Linear(num_output // 2, num_output // 2)
        self.modules = nn.ModuleList([nn.Lieanr(5, 2), nn.Linear(5, 2)])
        
    def forward(self, x):
        for module in self.modules:
            pass
        x_a = F.sigmoid(self.linear1a(x))
        x_b = F.tanh(self.linear1b(x))
        x_2 = x_a + x_b
        return self.linear2a(x_2), self.linear2b(x_2)

In [151]:
# Use pretrained models to form your own
mynet = nn.Sequential([[net.conv1, net.bn1, net.relu, net.maxpool]])

TypeError: list is not a Module subclass

### Extending Pytorch


In [152]:
# Inherit from Function
class LinearFunction(Function):

    # Note that both forward and backward are @staticmethods
    @staticmethod
    # bias is an optional argument
    def forward(ctx, input, weight, bias=None):
        ctx.save_for_backward(input, weight, bias)
        output = input.mm(weight.t())
        if bias is not None:
            output += bias.unsqueeze(0).expand_as(output)
        return output

    # This function has only a single output, so it gets only one gradient
    @staticmethod
    def backward(ctx, grad_output):
        # This is a pattern that is very convenient - at the top of backward
        # unpack saved_tensors and initialize all gradients w.r.t. inputs to
        # None. Thanks to the fact that additional trailing Nones are
        # ignored, the return statement is simple even when the function has
        # optional inputs.
        input, weight, bias = ctx.saved_tensors
        grad_input = grad_weight = grad_bias = None

        # These needs_input_grad checks are optional and there only to
        # improve efficiency. If you want to make your code simpler, you can
        # skip them. Returning gradients for inputs that don't require it is
        # not an error.
        if ctx.needs_input_grad[0]:
            grad_input = grad_output.mm(weight)
        if ctx.needs_input_grad[1]:
            grad_weight = grad_output.t().mm(input)
        if bias is not None and ctx.needs_input_grad[2]:
            grad_bias = grad_output.sum(0).squeeze(0)

        return grad_input, grad_weight, grad_bias

linear = LinearFunction.apply

NameError: name 'Function' is not defined