In [3]:
import math
import logging

import torch
import torch.nn as nn
from torch.nn import functional as F



In [16]:
# Random Initiation 
# Self Attention Init
B = 16
T = 100
DIM = 512
x = torch.rand(B,T,DIM)

In [17]:
x.shape

torch.Size([16, 100, 512])

In [18]:
# View 
# Splitting into heads
num_heads = 8
x = x.view(B,T,num_heads,DIM//num_heads)
x.shape

torch.Size([16, 100, 8, 64])

In [19]:
# Transpose 
print(x.transpose(1,2).shape)
print(x.transpose(0,2).shape)

torch.Size([16, 8, 100, 64])
torch.Size([8, 100, 16, 64])


### BroadCasting 

In [18]:
# Basics 
from torch import tensor 

In [8]:
a = tensor([1,2,3])

In [9]:
a > 0 

tensor([True, True, True])

In [17]:
c = tensor([10.,20,30]); c.shape

torch.Size([3])

In [11]:
m = tensor([[1., 2, 3], [4,5,6], [7,8,9]]); m

tensor([[1., 2., 3.],
        [4., 5., 6.],
        [7., 8., 9.]])

In [12]:
m + c 

tensor([[11., 22., 33.],
        [14., 25., 36.],
        [17., 28., 39.]])

In [13]:
t = c.expand_as(m)

In [14]:
t.stride(), t.shape

((0, 1), torch.Size([3, 3]))

In [16]:
t.storage()

 10.0
 20.0
 30.0
[torch.FloatStorage of size 3]

In [24]:
# Unsqueeze: Adds extra dimension into input axis. 
print(c)
print(c.unsqueeze(0)) # Extra dim at index 0
print(c.unsqueeze(1)) # Extra dim at index 1

tensor([10., 20., 30.])
tensor([[10., 20., 30.]])
tensor([[10.],
        [20.],
        [30.]])


In [25]:
# Unsqueeze: Adds extra dimension into input axis. 
print(c.shape)
print(c.unsqueeze(0).shape) # Extra dim at index 0
print(c.unsqueeze(1).shape) # Extra dim at index 1

torch.Size([3])
torch.Size([1, 3])
torch.Size([3, 1])


In [26]:
# Unsqueeze: Alternate Implementation 
print(c.shape)
print(c[None,:].shape) # Extra dim at index 0
print(c[:,None].shape) # Extra dim at index 1

torch.Size([3])
torch.Size([1, 3])
torch.Size([3, 1])


In [31]:
print(c)
print(c.expand_as(m))
print(c[None,:].expand_as(m))


tensor([10., 20., 30.])
tensor([[10., 20., 30.],
        [10., 20., 30.],
        [10., 20., 30.]])
tensor([[10., 20., 30.],
        [10., 20., 30.],
        [10., 20., 30.]])


In [32]:
print(c[:,None])
print(c[:,None].expand_as(m))

tensor([[10.],
        [20.],
        [30.]])
tensor([[10., 10., 10.],
        [20., 20., 20.],
        [30., 30., 30.]])


In [38]:
# Sum Dim 
m 

tensor([[1., 2., 3.],
        [4., 5., 6.],
        [7., 8., 9.]])

In [34]:
m.sum()

tensor(45.)

In [35]:
m.sum(dim=0)

tensor([12., 15., 18.])

In [36]:
m.sum(dim=1)

tensor([ 6., 15., 24.])

In [39]:
# Einstein Summation

In [41]:
def matmul(a,b): return torch.einsum('ik,kj->ij', a, b)

In [42]:
p = torch.rand(5,512)
q = torch.rand(512,10)

In [45]:
matmul(p,q).shape

torch.Size([5, 10])

In [8]:
# View
import torch 
x = torch.rand(10000,784)
x.view(-1,1,28,28).shape

torch.Size([10000, 1, 28, 28])

# Advanced PyTorch 

## Use AutoGrad to Calculate Grad on a Random function. 

In [2]:
import torch 

In [17]:
x = torch.tensor([5., 1.], requires_grad=True)
a = torch.tensor([2., 3.], requires_grad=True)
b = torch.tensor([6., 4.], requires_grad=True)

In [13]:
Q = 3*a**3 - b**2

In [14]:
Q

tensor([-12.,  65.], grad_fn=<SubBackward0>)

In [15]:
external_grad = torch.tensor([1., 1.])
Q.backward(gradient=external_grad)

In [16]:
print(9*a**2 == a.grad)
print(-2*b == b.grad)

tensor([True, True])
tensor([True, True])


In [9]:
a.grad

tensor([36., 81.])

In [18]:
P = a@x 

In [20]:
P.backward()

In [22]:
a.grad

tensor([5., 1.])

## Calculate the time taken by ech operation in  forward pass and backward? 


In [24]:
x = torch.randn((1, 1), requires_grad=True)
with torch.autograd.profiler.profile() as prof:
    for _ in range(100):  # any normal python code, really!
        y = x ** 2
        y.backward()

print(prof.key_averages().table(sort_by="self_cpu_time_total"))

-----------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                               Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
-----------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                          aten::pow        27.68%       3.594ms        40.66%       5.278ms      26.391us           200  
                          aten::mul        15.10%       1.961ms        23.11%       3.000ms      15.001us           200  
                        aten::copy_        10.32%       1.340ms        10.32%       1.340ms       6.700us           200  
                         aten::add_         9.98%       1.296ms         9.98%       1.296ms      13.087us            99  
                       PowBackward0         9.80%       1.273ms        57.89%       7.515ms      75.146us           100  
                        

## Define a new Operation in PyTorch and it's Forward and Backward Pass. 
https://pytorch.org/docs/stable/notes/extending.html#extending-torch-autograd

In [22]:
from torch.autograd import Function 
class Exp(Function):
    @staticmethod
    def forward(ctx, i):
        result = i.exp()
        ctx.save_for_backward(result)
        return result

    @staticmethod
    def backward(ctx, grad_output):
        import pdb;pdb.set_trace()
        result, = ctx.saved_tensors
        return grad_output * result

#Use it by calling the apply method:
import torch 
in_ = torch.tensor([1.,2.],requires_grad=True)
output = Exp.apply(in_)

In [23]:
output

tensor([2.7183, 7.3891], grad_fn=<ExpBackward>)

In [24]:
external_grad = torch.tensor([1., 1.])
output.backward(gradient=external_grad)

> [0;32m<ipython-input-22-055d647126c5>[0m(12)[0;36mbackward[0;34m()[0m
[0;32m     10 [0;31m    [0;32mdef[0m [0mbackward[0m[0;34m([0m[0mctx[0m[0;34m,[0m [0mgrad_output[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     11 [0;31m        [0;32mimport[0m [0mpdb[0m[0;34m;[0m[0mpdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 12 [0;31m        [0mresult[0m[0;34m,[0m [0;34m=[0m [0mctx[0m[0;34m.[0m[0msaved_tensors[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     13 [0;31m        [0;32mreturn[0m [0mgrad_output[0m [0;34m*[0m [0mresult[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     14 [0;31m[0;34m[0m[0m
[0m
ipdb> p ctx
<torch.autograd.function.ExpBackward object at 0x7f55e3cf4e40>
ipdb> ctx.saved_tensors
(tensor([2.7183, 7.3891], grad_fn=<ExpBackward>),)
ipdb> p grad_output
tensor([1., 1.])
ipdb> q


BdbQuit: 

In [21]:
in_.grad

tensor([2.7183, 7.3891])

In [27]:
# Inherit from Function
class LinearFunction(Function):

    # Note that both forward and backward are @staticmethods
    @staticmethod
    # bias is an optional argument
    def forward(ctx, input, weight, bias=None):
        ctx.save_for_backward(input, weight, bias)
        output = input.mm(weight.t())
        if bias is not None:
            output += bias.unsqueeze(0).expand_as(output)
        return output

    # This function has only a single output, so it gets only one gradient
    @staticmethod
    def backward(ctx, grad_output):
        # This is a pattern that is very convenient - at the top of backward
        # unpack saved_tensors and initialize all gradients w.r.t. inputs to
        # None. Thanks to the fact that additional trailing Nones are
        # ignored, the return statement is simple even when the function has
        # optional inputs.
        input, weight, bias = ctx.saved_tensors
        grad_input = grad_weight = grad_bias = None

        # These needs_input_grad checks are optional and there only to
        # improve efficiency. If you want to make your code simpler, you can
        # skip them. Returning gradients for inputs that don't require it is
        # not an error.
        if ctx.needs_input_grad[0]:
            grad_input = grad_output.mm(weight)
        if ctx.needs_input_grad[1]:
            grad_weight = grad_output.t().mm(input)
        if bias is not None and ctx.needs_input_grad[2]:
            grad_bias = grad_output.sum(0)

        return grad_input, grad_weight, grad_bias
linear = LinearFunction.apply

In [28]:
from torch.autograd import gradcheck

# gradcheck takes a tuple of tensors as input, check if your gradient
# evaluated with these tensors are close enough to numerical
# approximations and returns True if they all verify this condition.
input = (torch.randn(20,20,dtype=torch.double,requires_grad=True), torch.randn(30,20,dtype=torch.double,requires_grad=True))
test = gradcheck(linear, input, eps=1e-6, atol=1e-4)
print(test)

True


## AutoGrad Mechanics: 
https://pytorch.org/docs/stable/notes/autograd.html

## Example of reverse mode autodiff
https://colab.research.google.com/drive/1VpeE6UvEPRz9HmsHh1KS0XxXjYu533EC

## What is the Optimizer? 

Let's replace our previous manually coded optimization step:

```python
with torch.no_grad():
    for p in model.parameters(): p -= p.grad * lr
    model.zero_grad()
```

and instead use just:

```python
opt.step()
opt.zero_grad()
```


In [30]:
class Optimizer():
    def __init__(self, params, lr=0.5): self.params,self.lr=list(params),lr
        
    def step(self):
        with torch.no_grad():
            for p in self.params: p -= p.grad * self.lr

    def zero_grad(self):
        for p in self.params: p.grad.data.zero_()

In [33]:
from torch import nn 
m = 784
nh = 50
model = nn.Sequential(nn.Linear(m,nh), nn.ReLU(), nn.Linear(nh,10))

In [34]:
opt = Optimizer(model.parameters())