## Outline
* PyTorch
* What are tensors
* Initialising, slicing, reshaping tensors
* Numpy and PyTorch interfacing
* GPU support for PyTorch + Enabling GPUs on Google Colab
* Speed comparisons, Numpy -- PyTorch -- PyTorch on GPU
* Autodiff concepts and application
* Writing a basic learning loop using autograd


In [1]:
import torch
import numpy as np
import matplotlib.pyplot as plt

## Initialise tensors

In [2]:
x = torch.ones(3, 2)
print(x)
x = torch.zeros(3, 2)
print(x)
x = torch.rand(3, 2)
print(x)

tensor([[1., 1.],
        [1., 1.],
        [1., 1.]])
tensor([[0., 0.],
        [0., 0.],
        [0., 0.]])
tensor([[0.8130, 0.6875],
        [0.9981, 0.1580],
        [0.7532, 0.2163]])


In [3]:
x = torch.empty(3, 2)
print(x)
y = torch.zeros_like(x)
print(y)

tensor([[3.0580e-35, 0.0000e+00],
        [3.3631e-44, 0.0000e+00],
        [       nan, 0.0000e+00]])
tensor([[0., 0.],
        [0., 0.],
        [0., 0.]])


In [4]:
x = torch.linspace(0, 1, steps=5)
print(x)

tensor([0.0000, 0.2500, 0.5000, 0.7500, 1.0000])


In [5]:
x = torch.tensor([[1, 2], 
                 [3, 4], 
                 [5, 6]])
print(x)

tensor([[1, 2],
        [3, 4],
        [5, 6]])


## Slicing tensors

In [6]:
print(x.size())
print(x[:, 1]) 
print(x[0, :]) 

torch.Size([3, 2])
tensor([2, 4, 6])
tensor([1, 2])


In [7]:
y = x[1, 1]
print(y)
print(y.item())

tensor(4)
4


## Reshaping tensors

In [8]:
print(x)
y = x.view(2, 3)
print(y)

tensor([[1, 2],
        [3, 4],
        [5, 6]])
tensor([[1, 2, 3],
        [4, 5, 6]])


In [9]:
y = x.view(6,-1) 
print(y)

tensor([[1],
        [2],
        [3],
        [4],
        [5],
        [6]])


## Simple Tensor Operations

In [10]:
x = torch.ones([3, 2])
y = torch.ones([3, 2])
z = x + y
print(z)
z = x - y
print(z)
z = x * y
print(z)

tensor([[2., 2.],
        [2., 2.],
        [2., 2.]])
tensor([[0., 0.],
        [0., 0.],
        [0., 0.]])
tensor([[1., 1.],
        [1., 1.],
        [1., 1.]])


In [11]:
z = y.add(x)
print(z)
print(y)

tensor([[2., 2.],
        [2., 2.],
        [2., 2.]])
tensor([[1., 1.],
        [1., 1.],
        [1., 1.]])


In [47]:
z = y.add_(x) # result to z and y also
print(z)
print(y)

RuntimeError: ignored

## Numpy <> PyTorch

In [13]:
x_np = x.numpy()
print(type(x), type(x_np))
print(x_np)

<class 'torch.Tensor'> <class 'numpy.ndarray'>
[[1. 1.]
 [1. 1.]
 [1. 1.]]


In [14]:
a = np.random.randn(5)
print(a)
a_pt = torch.from_numpy(a)
print(type(a), type(a_pt))
print(a_pt)

[ 1.64013399  1.24587804 -0.11715579  0.40086871 -1.42542099]
<class 'numpy.ndarray'> <class 'torch.Tensor'>
tensor([ 1.6401,  1.2459, -0.1172,  0.4009, -1.4254], dtype=torch.float64)


In [15]:
np.add(a, 1, out=a)
print(a)
print(a_pt) 

[ 2.64013399  2.24587804  0.88284421  1.40086871 -0.42542099]
tensor([ 2.6401,  2.2459,  0.8828,  1.4009, -0.4254], dtype=torch.float64)


In [16]:
%%time
for i in range(100):
  a = np.random.randn(100,100)
  b = np.random.randn(100,100)
  c = np.matmul(a, b)

CPU times: user 161 ms, sys: 102 ms, total: 263 ms
Wall time: 149 ms


In [17]:
%%time
for i in range(100):
  a = torch.randn([100, 100])
  b = torch.randn([100, 100])
  c = torch.matmul(a, b)

CPU times: user 70.4 ms, sys: 79.6 ms, total: 150 ms
Wall time: 101 ms


In [18]:
%%time
for i in range(10):
  a = np.random.randn(10000,10000)
  b = np.random.randn(10000,10000)
  c = a + b

CPU times: user 1min 27s, sys: 716 ms, total: 1min 28s
Wall time: 1min 28s


In [19]:
%%time
for i in range(10):
  a = torch.randn([10000, 10000])
  b = torch.randn([10000, 10000])
  c = a + b

CPU times: user 25.3 s, sys: 13.9 ms, total: 25.3 s
Wall time: 25.3 s


## CUDA support

In [48]:
print(torch.cuda.device_count())

1


In [49]:
print(torch.cuda.device(0))
print(torch.cuda.get_device_name(0))

<torch.cuda.device object at 0x7f006791a048>
Tesla K80


In [50]:
cuda0 = torch.device('cuda:0')

In [23]:
a = torch.ones(3, 2, device=cuda0)
b = torch.ones(3, 2, device=cuda0)
c = a + b
print(c)

tensor([[2., 2.],
        [2., 2.],
        [2., 2.]], device='cuda:0')


In [24]:
print(a)

tensor([[1., 1.],
        [1., 1.],
        [1., 1.]], device='cuda:0')


In [25]:
%%time
for i in range(10):
  a = np.random.randn(10000,10000)
  b = np.random.randn(10000,10000)
  np.add(b, a)

CPU times: user 1min 28s, sys: 200 ms, total: 1min 28s
Wall time: 1min 28s


In [26]:
%%time
for i in range(10):
  a_cpu = torch.randn([10000, 10000])
  b_cpu = torch.randn([10000, 10000])
  b_cpu.add_(a_cpu)

CPU times: user 25.3 s, sys: 200 ms, total: 25.5 s
Wall time: 25.5 s


In [27]:
%%time
for i in range(10):
  a = torch.randn([10000, 10000], device=cuda0)
  b = torch.randn([10000, 10000], device=cuda0)
  b.add_(a)

CPU times: user 880 µs, sys: 4 ms, total: 4.88 ms
Wall time: 9.18 ms


In [28]:
%%time
for i in range(10):
  a = np.random.randn(10000,10000)
  b = np.random.randn(10000,10000)
  np.matmul(b, a)

CPU times: user 20min 23s, sys: 5.31 s, total: 20min 29s
Wall time: 11min 10s


In [29]:
%%time
for i in range(10):
  a_cpu = torch.randn([10000, 10000])
  b_cpu = torch.randn([10000, 10000])
  torch.matmul(a_cpu, b_cpu)

CPU times: user 4min 49s, sys: 195 ms, total: 4min 49s
Wall time: 4min 49s


In [30]:
%%time
for i in range(10):
  a = torch.randn([10000, 10000], device=cuda0)
  b = torch.randn([10000, 10000], device=cuda0)
  torch.matmul(a, b)

CPU times: user 9.55 ms, sys: 8.99 ms, total: 18.5 ms
Wall time: 19.5 ms


## Autodiff

In [31]:
x = torch.ones([3, 2], requires_grad=True)
print(x)

tensor([[1., 1.],
        [1., 1.],
        [1., 1.]], requires_grad=True)


In [32]:
y = x + 5
print(y)

tensor([[6., 6.],
        [6., 6.],
        [6., 6.]], grad_fn=<AddBackward0>)


In [33]:
z = y*y + 1
print(z)

tensor([[37., 37.],
        [37., 37.],
        [37., 37.]], grad_fn=<AddBackward0>)


In [34]:
t = torch.sum(z)
print(t)

tensor(222., grad_fn=<SumBackward0>)


In [35]:
t.backward()

In [36]:
print(x.grad) # partial dervative t w.r to x

tensor([[12., 12.],
        [12., 12.],
        [12., 12.]])


$t = \sum_i z_i, z_i = y_i^2 + 1, y_i = x_i + 5$

$\frac{\partial t}{\partial x_i} = \frac{\partial z_i}{\partial x_i} = \frac{\partial z_i}{\partial y_i} \frac{\partial y_i}{\partial x_i} = 2y_i \times 1$


At x = 1, y = 6, $\frac{\partial t}{\partial x_i} = 12$

In [37]:
x = torch.ones([3, 2], requires_grad=True)
y = x + 5
r = 1/(1 + torch.exp(-y))
print(r)
s = torch.sum(r)
s.backward()
print(x.grad)

tensor([[0.9975, 0.9975],
        [0.9975, 0.9975],
        [0.9975, 0.9975]], grad_fn=<MulBackward0>)
tensor([[0.0025, 0.0025],
        [0.0025, 0.0025],
        [0.0025, 0.0025]])


In [52]:
x = torch.ones([3, 2], requires_grad=True)
y = x + 5
r = 1/(1 + torch.exp(-y))
a = torch.ones([3, 2])
r.backward(a)
print(x.grad)

tensor([[0.0025, 0.0025],
        [0.0025, 0.0025],
        [0.0025, 0.0025]])


$\frac{\partial{s}}{\partial{x}} = \frac{\partial{s}}{\partial{r}} \cdot \frac{\partial{r}}{\partial{x}}$

For the above code $a$ represents $\frac{\partial{s}}{\partial{r}}$ and then $x.grad$ gives directly $\frac{\partial{s}}{\partial{x}}$



## Autodiff example that looks like what we have been doing

In [58]:
x = torch.randn([20, 1], requires_grad=True)
y = 3*x - 2

In [64]:
w = torch.tensor([1.], requires_grad=True) #tensor size - 1
b = torch.tensor([1.], requires_grad=True)
print(w.item(),b.item())
y_hat = w*x + b

loss = torch.sum((y_hat - y)**2)

1.0 1.0


In [60]:
print(loss)

tensor(284.3013, grad_fn=<SumBackward0>)


In [61]:
loss.backward()

In [62]:
print(w.grad, b.grad)

tensor([-107.8957]) tensor([117.6037])


## Do it in a loop

In [67]:
learning_rate = 0.01

w = torch.tensor([1.], requires_grad=True)
b = torch.tensor([1.], requires_grad=True)

print(w.item(), b.item())

for i in range(10):
  
  x = torch.randn([20, 1])
  y = 3*x - 2
  
  y_hat = w*x + b
  loss = torch.sum((y_hat - y)**2)
  print('--')
  loss.backward()
  
  with torch.no_grad():  #compute until minimum error
    w -= learning_rate * w.grad
    b -= learning_rate * b.grad
    
    w.grad.zero_() #freshing fro each iteration
    b.grad.zero_()

  print(w.item(), b.item())
  

1.0 1.0
--
1.6038391590118408 -0.2625373601913452
--
2.097586154937744 -1.0080065727233887
--
2.7363381385803223 -1.4815884828567505
--
2.8204002380371094 -1.6788045167922974
--
2.9242725372314453 -1.8136911392211914
--
2.9649055004119873 -1.8946359157562256
--
2.9633376598358154 -1.9331893920898438
--
2.9763641357421875 -1.9574577808380127
--
2.9844417572021484 -1.975643515586853
--
2.9930670261383057 -1.985713243484497


## Do it for a large problem

In [78]:
%%time
learning_rate = 0.001
N = 1000
epochs = 2000

w = torch.rand([N], requires_grad=True)
b = torch.ones([1], requires_grad=True)

print(torch.mean(w).item(), b.item())

for i in range(epochs):
  
  x = torch.randn([N])
  y = torch.dot(3*torch.ones([N]), x) - 2
  
  y_hat = torch.dot(w, x) + b
  loss = torch.sum((y_hat - y)**2)
  
  loss.backward()
  
  with torch.no_grad():
    w -= learning_rate * w.grad
    b -= learning_rate * b.grad
    
    w.grad.zero_()
    b.grad.zero_()

  print(torch.mean(w).item(), b.item())
  

0.5065366625785828 1.0
0.5083543062210083 1.0977109670639038
0.5156665444374084 1.2785065174102783
0.5312063694000244 0.9660989046096802
0.533312976360321 1.0636186599731445
0.5339282155036926 1.120650291442871
0.534656286239624 1.1655592918395996
0.5349912047386169 1.115417242050171
0.5461324453353882 0.875601589679718
0.5469363331794739 0.8162160515785217
0.5474629402160645 0.7568464279174805
0.5500800013542175 0.6322444677352905
0.5512059926986694 0.5618998408317566
0.5533401966094971 0.4792526364326477
0.5675188302993774 0.21040025353431702
0.5695200562477112 0.08418001234531403
0.5784176588058472 0.266879677772522
0.5785346627235413 0.2734367549419403
0.579788863658905 0.22067521512508392
0.5798280835151672 0.21422450244426727
0.5801082253456116 0.19343465566635132
0.5840624570846558 0.35067683458328247
0.5882899761199951 0.48974335193634033
0.5983710885047913 0.2386692762374878
0.5982858538627625 0.279886394739151
0.6069653034210205 0.5014638304710388
0.6073411107063293 0.5537828

In [46]:
%%time
learning_rate = 0.001
N = 10000000
epochs = 200

w = torch.rand([N], requires_grad=True, device=cuda0)
b = torch.ones([1], requires_grad=True, device=cuda0)

# print(torch.mean(w).item(), b.item())

for i in range(epochs):
  
  x = torch.randn([N], device=cuda0)
  y = torch.dot(3*torch.ones([N], device=cuda0), x) - 2
  
  y_hat = torch.dot(w, x) + b
  loss = torch.sum((y_hat - y)**2)
  
  loss.backward()
  
  with torch.no_grad():
    w -= learning_rate * w.grad
    b -= learning_rate * b.grad
    
    w.grad.zero_()
    b.grad.zero_()

  #print(torch.mean(w).item(), b.item())
  

CPU times: user 781 ms, sys: 428 ms, total: 1.21 s
Wall time: 1.25 s
