# Getting Started with PyTorch

In [None]:
%matplotlib inline
import torch
import math
import numpy as np
import matplotlib.pyplot as plt

## 1. Initializing tensors from lists and arrays
Initialize a tensor from a list:

In [None]:
# A nested Python list with two rows and two columns is the raw input.
data = [[1, 2],[3, 4]]
# torch.tensor builds the tensor directly from that list.
x_data = torch.tensor(data)

Initialize a tensor from a `numpy` array:

In [None]:
# Turn the same nested list into a NumPy array first, then hand it to torch.
np_array = np.array(data)
x_np = torch.from_numpy(np_array)

These should be identical:

In [None]:
# Compare the list-built and the NumPy-built tensors entry by entry.
x_data == x_np

In [None]:
# Seed NumPy's global RNG so the sampled matrix is the same on every run.
np.random.seed(22)
# Draw a 3x3 array of uniform samples and convert it to a tensor.
arr = np.random.uniform(size=(3,3))
x_rand = torch.from_numpy(arr)

Tensor slicing and indexing:

In [None]:
# first row: index 0 on the first axis, everything on the second
x_rand[0,:]

In [None]:
# first column: everything on the first axis, index 0 on the second
x_rand[:,0]

In [None]:
# last row: a negative index counts from the end of the first axis
x_rand[-1,:]

In [None]:
# last column: a negative index counts from the end of the second axis
x_rand[:,-1]

In [None]:
# Show the full tensor so the row/column slices can be checked against it.
x_rand

## 2. Native tensor initialization

In [None]:
# this creates an empty 2x2 tensor of floats
# note that you can't specify another dtype with this constructor
torch.Tensor(2,2)

In [None]:
# by contrast, with torch.empty
# one can create tensors of arbitrary dtype
torch.empty(2,2, dtype=torch.float)

In [None]:
# 3x3 tensor filled with ones (shape passed as a tuple)
torch.ones((3,3))

In [None]:
# 2x2 tensor filled with zeros (shape passed as a tuple)
torch.zeros((2,2))

In [None]:
# 11 evenly spaced values from 0 to 10
torch.linspace(0,10,11)

In [None]:
# the integers 0 through 10
torch.arange(11)

In [None]:
# Seed torch's RNG first so the 3x3 random draw below is repeatable.
torch.manual_seed(22)
torch.rand((3,3))

# what is 'torch.random.seed()'?

Reshape the tensor:

In [None]:
# Four consecutive integers starting at 1 (arange stops before 5) ...
a = torch.arange(1,5)
# ... rearranged into a 2x2 layout using the tensor method.
a.reshape((2,2))

In [None]:
# Same reshape, this time through the torch-level function.
torch.reshape(a, (2,2))

In [None]:
x_ones = torch.ones_like(x_data) # retains the properties (shape, datatype) of x_data
print(f"Ones Tensor: \n {x_ones} \n")

In [None]:
# Fix the seed so the random values printed below are repeatable.
torch.manual_seed(22)
x_rand = torch.rand_like(x_data, dtype=torch.float) # overrides the datatype of x_data
print(f"Random Tensor: \n {x_rand} \n")

In [None]:
# A single shape tuple is shared by the three factory calls below.
shape = (2, 3)
rand_tensor = torch.rand(shape)
ones_tensor = torch.ones(shape)
zeros_tensor = torch.zeros(shape)

# Print each tensor under its own heading.
print(f"Random Tensor: \n {rand_tensor} \n")
print(f"Ones Tensor: \n {ones_tensor} \n")
print(f"Zeros Tensor: \n {zeros_tensor}")

Attributes of a tensor:

In [None]:
# A 3x4 random tensor whose metadata is inspected below.
tensor = torch.rand(3,4)

# shape, dtype and device are plain attributes of the tensor.
print(f"Shape of tensor: {tensor.shape}")
print(f"Datatype of tensor: {tensor.dtype}")
print(f"Device tensor is stored on: {tensor.device}")

Torch data types can be specified at initialization, or tensors can be converted using the `to()` method:

In [None]:
# dtype chosen at construction time: 16-bit integers
a = torch.ones((2, 3), dtype=torch.int16)
print(a)

# 64-bit floats, scaled by a factor of 20
b = 20. * torch.rand((2, 3), dtype=torch.float64)
print(b)

# dtype changed after the fact: b converted to 32-bit integers
c = b.to(torch.int32)
print(c)

## 3. Operations on Tensors

By default, tensors are created on the CPU. We need to explicitly move tensors to the GPU using
the ``.to`` method (after checking for GPU availability). Keep in mind that copying large tensors
across devices can be expensive in terms of time and memory!

In [None]:
# We move our tensor to the GPU if available
# (when no GPU is detected, `tensor` is left unchanged)
if torch.cuda.is_available():
    tensor = tensor.to('cuda')

Arithmetic operations on tensors:

In [None]:
ones = torch.ones(2, 3)
print(ones)

twos = 2 * torch.ones(2, 3) # the scalar multiplies every element
print(twos)

threes = ones + twos       # addition allowed because the shapes are identical
print(threes)              # tensors are added element-wise
print(threes.shape)        # same dimensions as the input tensors

# Two tensors whose shapes (2, 3) and (3, 2) do not line up:
r1 = torch.rand(2, 3)
r2 = torch.rand(3, 2)
# uncomment this line to get a runtime error
# r3 = r1 + r2

Each element of the tensor `twos` raised to the power specified in the second tensor:

In [None]:
# Base: a 2x2 tensor of twos. Exponents: a second 2x2 tensor, applied entry by entry.
twos = 2 * torch.ones(2, 2)
powers2 = twos ** torch.tensor([[1, 2], [3, 4]])
print(powers2)

Tensor broadcasting performs an operation between tensors of similar shapes by repeating the elements of the tensor with lower dimensions. Broadcasting is important for deep learning, for example multiplying a tensor of learned weights by a batch of input tensors, as this operation is applied to each instance in the batch separately, and returning a tensor of identical shape.

In [None]:
rand = torch.rand(2, 4)
# The (1, 4) row of twos is broadcast across both rows of `rand`.
doubled = rand * (2 * torch.ones(1, 4))

print(rand)
print(doubled)

The rules for broadcasting are:

+ Each tensor must have at least one dimension - no empty tensors.
+ **Comparing the dimension sizes of the two tensors, going from last to first:**
    + Each dimension must be equal, or 
    + One of the dimensions must be of size 1, or
    + The dimension does not exist in one of the tensors

More examples of broadcasting:

In [None]:
# a has shape (4, 3, 2); each example multiplies it by a lower-rank random tensor.
a = torch.ones(4, 3, 2)

# (3, 2) matches a's last two dims; the missing leading dim is filled in by broadcasting
b = a * torch.rand(   3, 2) # 3rd & 2nd dims identical to a, dim 1 absent
print(b)

# (3, 1): the size-1 last dim is stretched to match a's last dim
c = a * torch.rand(   3, 1) # 3rd dim = 1, 2nd dim identical to a
print(c)

# (1, 2): the size-1 dim is stretched to match a's middle dim
d = a * torch.rand(   1, 2) # 3rd dim identical to a, 2nd dim = 1
print(d)

Here’s a small sample of the mathematical operations available:

In [None]:
r = (torch.rand(2, 2) - 0.5) * 2 # values between -1 and 1
print('A random matrix, r:')
print(r)

# Common mathematical operations are supported:
print('\nAbsolute value of r:')
print(torch.abs(r))

# ...as are trigonometric functions:
print('\nInverse sine of r:')
print(torch.asin(r))

# ...and linear algebra operations like determinant and singular value decomposition
print('\nDeterminant of r:')
print(torch.det(r))
print('\nSingular value decomposition of r:')
# torch.linalg.svd replaces the deprecated torch.svd. It returns (U, S, Vh),
# where Vh is the already-transposed right factor, so r == U @ diag(S) @ Vh.
svd_r = torch.linalg.svd(r)
print(svd_r)

# ...and statistical and aggregate operations:
# torch.std_mean returns the pair (std, mean), so the label names them in that order.
print('\nStandard deviation and average of r:')
print(torch.std_mean(r))
print('\nMaximum value of r:')
print(torch.max(r))

Get unique values of a tensor:

In [None]:
# The input repeats 1 and 2; unique reduces it to the distinct values.
torch.unique(torch.tensor([1, 2, 1, 2, 1, 2]))

Concatenate tensors:

In [None]:
# Join three copies of `tensor` along dim 1.
t1 = torch.cat([tensor, tensor, tensor], dim=1)
print(t1)
print(t1.shape)

In [None]:
# Join two copies of `tensor` along dim 0 instead.
t2 = torch.cat([tensor, tensor], dim=0)
print(t2)
print(t2.shape)

`torch.stack` creates an extra dimension for accumulating tensors:

In [None]:
# Unlike cat, stack places the two copies along a newly created dimension (dim=0).
t3 = torch.stack([tensor, tensor], dim=0)
print(t3)
print(t3.shape)

Matrix multiplication:

In [None]:
# This computes the matrix multiplication between two tensors. y1, y2, y3 will have the same value
# (y1 uses the @ operator, y2 the method form; y3 is produced in a later cell)
y1 = tensor @ tensor.T
y2 = tensor.matmul(tensor.T)

In [None]:
# Entry-by-entry check that the operator form and the method form agree.
y1 == y2

In [None]:
# Pre-allocate the output buffer with the shape of the product (rows x rows of
# `tensor`), not the shape of `tensor` itself: a mismatched `out=` buffer gets
# resized by PyTorch with a deprecation warning. new_empty keeps the dtype and
# device of `tensor`.
y3 = tensor.new_empty((tensor.shape[0], tensor.shape[0]))
torch.matmul(tensor, tensor.T, out=y3)

In [None]:
# y3 was filled through the out= argument; compare it with the operator result.
y1 == y3

Element-wise product:

In [None]:
# This computes the element-wise product. z1, z2, z3 will have the same value
# (z1 uses the * operator, z2 the method form)
z1 = tensor * tensor
z2 = tensor.mul(tensor)

# z3 starts as a same-shaped random buffer and is then written to through out=
z3 = torch.rand_like(tensor)
torch.mul(tensor, tensor, out=z3)

To add a dummy dimension to a tensor, use `unsqueeze`:

In [None]:
a = torch.rand(3, 226, 226)
# insert a new dimension at position 0
b = a.unsqueeze(0)

# print both shapes to compare before and after
print(a.shape)
print(b.shape)

To reshape a tensor use `reshape`:

In [None]:
output3d = torch.rand(6, 20, 20)
print(output3d.shape)

# collapse all three dimensions into one of length 6 * 20 * 20
input1d = output3d.reshape((6 * 20 * 20,))
print(input1d.shape)

# reshape is also available as a function of the torch module:
print(torch.reshape(output3d, (6 * 20 * 20,)).shape)

If you have a one-element tensor, for example by aggregating all
values of a tensor into one value, you can convert it to a Python
numerical value using ``item()``:

In [None]:
# sum() aggregates all values of the tensor into one ...
agg = tensor.sum()
# ... and item() converts that one-element tensor to a Python number
agg_item = agg.item()
print(agg_item, type(agg_item))

**In-place operations**
Operations that store the result into the operand are called in-place. They are denoted by a ``_`` suffix.
For example: ``x.copy_(y)``, ``x.t_()``, will change ``x``.

In [None]:
print(tensor, "\n")
# add_ (note the trailing underscore) stores the result back into `tensor` itself
tensor.add_(5)
print(tensor)

<div class="alert alert-info"><h4>Note</h4><p>In-place operations save some memory, but can be problematic when computing derivatives because of an immediate loss
     of history. Hence, their use is discouraged.</p></div>
     
As with any object in Python, assigning a tensor to a variable makes the variable a label of the tensor, and does not copy it. For example:

In [None]:
a = torch.ones(2, 2)
b = a             # b is a second label for the same tensor, not a copy

a[0, 1] = 561     # we change a...
print(b)          # ...and b is also altered

To create a separate copy, the `clone()` method can be used:

In [None]:
a = torch.ones(2, 2)
b = a.clone()          # an independent copy of a

assert b is not a      # different objects in memory...
print(torch.eq(a, b))  # ...but still with the same contents!

a[0, 1] = 561          # a changes...
print(b)               # ...but b is still all ones

**If your source tensor has autograd enabled, then so will the clone.** In many cases, this will be what you want. For example, if your model has multiple computation paths in its forward() method, and both the original tensor and its clone contribute to the model’s output, then to enable model learning you want autograd turned on for both tensors. 

On the other hand, if you’re doing a computation where neither the original tensor nor its clone need to track gradients, then as long as the source tensor has autograd turned off, you’re good to go.

There is a third case, though: Imagine you’re performing a computation in your model’s forward() function, where gradients are turned on for everything by default, but you want to pull out some values mid-stream to generate some metrics. In this case, you don’t want the cloned copy of your source tensor to track gradients - performance is improved with autograd’s history tracking turned off. For this, you can use the .detach() method on the source tensor:

In [None]:
a = torch.rand(2, 2, requires_grad=True) # turn on autograd
print(a)

# plain clone: the copy keeps autograd enabled, like its source
b = a.clone()
print(b)

# detach first, then clone: the copy does not track gradients
c = a.detach().clone()
print(c)

# print a once more to compare it with the copies above
print(a)

Tensor to NumPy array:

In [None]:
t = torch.ones(5)
print(f"t: {t}")
# numpy() returns an array tied to t's data (demonstrated in the next cell)
n = t.numpy()
print(f"n: {n}")

A change in the tensor is reflected in the NumPy array:

In [None]:
# in-place add on the tensor; n came from t.numpy() and is printed to show it follows
t.add_(1)
print(f"t: {t}")
print(f"n: {n}")

NumPy array to tensor:

In [None]:
n = np.ones(5)
# from_numpy returns a tensor tied to the array's data (demonstrated in the next cell)
t = torch.from_numpy(n)

Changes in the NumPy array are reflected in the tensor:

In [None]:
# out=n writes the sum back into n itself; t is printed to show it follows
np.add(n, 1, out=n)
print(f"t: {t}")
print(f"n: {n}")

## 4. Automatic differentiation with `torch.autograd`

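Training a neural network requires the gradients of the loss function with
respect to the model parameters. To compute those gradients, PyTorch has a built-in differentiation engine
called ``torch.autograd``. It supports automatic computation of gradients for any
computational graph.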

Consider the simplest one-layer neural network, with input ``x``,
parameters ``w`` and ``b``, and some loss function. It can be defined in
PyTorch in the following manner:

In [None]:
x = torch.ones(5)  # input tensor
y = torch.zeros(3)  # expected output
# w and b are the parameters, so gradient tracking is switched on for them
w = torch.randn(5, 3, requires_grad=True)
b = torch.randn(3, requires_grad=True)
z = torch.matmul(x, w)+b
loss = torch.nn.functional.binary_cross_entropy_with_logits(z, y) # z is passed as raw logits (no activation applied beforehand)

A function that we apply to tensors to construct computational graph is
in fact an object of class ``Function``. This object knows how to
compute the function in the *forward* direction, and also how to compute
its derivative during the *backward propagation* step. A reference to
the backward propagation function is stored in ``grad_fn`` property of a
tensor.

In [None]:
# grad_fn holds the reference to each tensor's backward propagation function
print('Gradient function for z =', z.grad_fn)
print('Gradient function for loss =', loss.grad_fn)

To optimize weights of parameters in the neural network, we need to
compute the derivatives of our loss function with respect to parameters,
namely, we need $\frac{\partial loss}{\partial w}$ and
$\frac{\partial loss}{\partial b}$ under some fixed values of
``x`` and ``y``. To compute those derivatives, we call
``loss.backward()``, and then retrieve the values from ``w.grad`` and
``b.grad``:

In [None]:
# backward() computes the derivatives; they are then read from .grad on w and b
loss.backward()
print(w.grad)
print(b.grad)

<div class="alert alert-info"><h4>Note</h4><p>

+ We can only obtain the ``grad`` properties for the leaf
    nodes of the computational graph, which have ``requires_grad`` property
    set to ``True``. For all other nodes in our graph, gradients will not be
    available.
    
+ We can only perform gradient calculations using
    ``backward`` once on a given graph, for performance reasons. If we need
    to do several ``backward`` calls on the same graph, we need to pass
    ``retain_graph=True`` to the ``backward`` call.</p></div>
    
    
By default, all tensors with ``requires_grad=True`` are tracking their
computational history and support gradient computation. However, there are
cases when we do not need that, for example, after we have
trained the model and just want to apply it to some test data, i.e. we
only want to do *forward* computations through the network. We can stop
tracking computations by surrounding our computation code with
a ``torch.no_grad()`` block:

In [None]:
# Outside no_grad: z is built from w and b, which require gradients
z = torch.matmul(x, w)+b
print(z.requires_grad)

# Inside no_grad: the same expression is evaluated without tracking
with torch.no_grad():
    z = torch.matmul(x, w)+b
print(z.requires_grad)

Another way to achieve the same result is to use the ``detach()`` method
on the tensor:

In [None]:
z = torch.matmul(x, w)+b
# detach() gives a tensor that no longer tracks gradients
z_det = z.detach()
print(z_det.requires_grad)

There are reasons you might want to disable gradient tracking:
  - To mark some parameters in your neural network as **frozen parameters**. This is
    a very common scenario for finetuning a pretrained network
  - To **speed up computations** when you are only doing forward pass, because computations on tensors that do
    not track gradients would be more efficient.
    

In many cases, we have a scalar loss function, and we need to compute
the gradient with respect to some parameters. However, there are cases
when the output function is an arbitrary tensor. In this case, PyTorch
allows you to compute a so-called **Jacobian product**, and not the actual
gradient.

For a vector function $\vec{y}=f(\vec{x})$, where
$\vec{x}=\langle x_1,\dots,x_n\rangle$ and
$\vec{y}=\langle y_1,\dots,y_m\rangle$, a gradient of
$\vec{y}$ with respect to $\vec{x}$ is given by **Jacobian
matrix**:

\begin{align}J=\left(\begin{array}{ccc}
      \frac{\partial y_{1}}{\partial x_{1}} & \cdots & \frac{\partial y_{1}}{\partial x_{n}}\\
      \vdots & \ddots & \vdots\\
      \frac{\partial y_{m}}{\partial x_{1}} & \cdots & \frac{\partial y_{m}}{\partial x_{n}}
      \end{array}\right)\end{align}

Instead of computing the Jacobian matrix itself, PyTorch allows you to
compute the **Jacobian product** $v^T\cdot J$ for a given input vector
$v=(v_1 \dots v_m)$. This is achieved by calling ``backward`` with
$v$ as an argument. The size of $v$ should be the same as
the size of the original tensor, with respect to which we want to
compute the product:

In [None]:
# 5x5 identity matrix with gradient tracking switched on
inp = torch.eye(5, requires_grad=True)
# non-scalar output: (inp + 1) squared, entry by entry
out = (inp+1).pow(2)

In [None]:
# Show both tensors before any backward call is made.
print(f"Input: {inp}")
print(f"Output: {out}")

In [None]:
# out is not a scalar, so backward receives a tensor argument of inp's shape.
# retain_graph=True keeps the graph so backward can be called on it again.
out.backward(torch.ones_like(inp), retain_graph=True)
print("First call\n", inp.grad)
out.backward(torch.ones_like(inp), retain_graph=True)
print("\nSecond call\n", inp.grad)

When we call ``backward`` for the second time with the same
argument, the value of the gradient is different. This happens because
when doing ``backward`` propagation, PyTorch **accumulates the
gradients**.

In [None]:
# Reset the accumulated gradient in place before calling backward again.
inp.grad.zero_()
out.backward(torch.ones_like(inp), retain_graph=True)
print("\nCall after zeroing gradients\n", inp.grad)

Another example of autograd:

In [None]:
# 25 evenly spaced points from 0 to 2*pi, with gradient tracking switched on
a = torch.linspace(0., 2. * math.pi, steps=25, requires_grad=True)
print(a)

In [None]:
b = torch.sin(a)
# plot detached versions of a and b, which carry no autograd tracking
plt.plot(a.detach(), b.detach())

When we print `b`, we see an indicator that it is tracking its computation history:

In [None]:
# b was computed from a, which requires gradients
print(b)

This `grad_fn` gives us a hint that when we execute the backpropagation step and compute gradients, we’ll need to compute the derivative of $\sin(x)$ for all this tensor’s inputs.

Let’s perform some more computations:

In [None]:
# scale the sine values by 2 ...
c = 2 * b
print(c)

# ... then shift them by 1
d = c + 1
print(d)

Finally, let’s compute a single-element output. When you call .backward() on a tensor with no arguments, it expects the calling tensor to contain only a single element, as is the case when computing a loss function.

In [None]:
# reduce d to a single-element tensor so backward() can be called without arguments
out = d.sum()
print(out)

Each `grad_fn` stored with our tensors allows you to walk the computation all the way back to its inputs with its next_functions property. We can see below that drilling down on this property on `d` shows us the gradient functions for all the prior tensors. Note that `a.grad_fn` is reported as None, indicating that this was an input to the function with no history of its own.

In [None]:
# Walk d's backward graph one level at a time through next_functions.
print('d:')
print(d.grad_fn)
print(d.grad_fn.next_functions)
print(d.grad_fn.next_functions[0][0].next_functions)
print(d.grad_fn.next_functions[0][0].next_functions[0][0].next_functions)
print(d.grad_fn.next_functions[0][0].next_functions[0][0].next_functions[0][0].next_functions)
# For comparison, the grad_fn stored directly on each earlier tensor.
print('\nc:')
print(c.grad_fn)
print('\nb:')
print(b.grad_fn)
print('\na:')
print(a.grad_fn)

Call the `backward()` method on the output, and check the input’s `grad` property to inspect the gradients:

In [None]:
# out holds a single element, so backward() needs no argument
out.backward()

In [None]:
# out = sum(2 * sin(a) + 1), so the gradient with respect to a is 2 * cos(a)
print(a.grad)
plt.plot(a.detach(), a.grad.detach())

Let’s define a small model and examine how it changes after a single training batch. First, define a few constants, our model, and some stand-ins for inputs and outputs:

In [None]:
# Sizes shared by the model definition and the stand-in data below.
BATCH_SIZE = 16
DIM_IN = 1000
HIDDEN_SIZE = 100
DIM_OUT = 10

class TinyModel(torch.nn.Module):
    """Two-layer fully connected network: Linear -> ReLU -> Linear.

    Args:
        dim_in: number of input features (defaults to DIM_IN).
        hidden_size: width of the hidden layer (defaults to HIDDEN_SIZE).
        dim_out: number of output features (defaults to DIM_OUT).
    """

    def __init__(self, dim_in=DIM_IN, hidden_size=HIDDEN_SIZE, dim_out=DIM_OUT):
        super().__init__()

        # Layer sizes come from the constants above instead of repeated literals,
        # so the model stays in sync with the shapes of some_input / ideal_output.
        self.layer1 = torch.nn.Linear(dim_in, hidden_size)
        self.relu = torch.nn.ReLU()
        self.layer2 = torch.nn.Linear(hidden_size, dim_out)

    def forward(self, x):
        """Apply layer1, ReLU and layer2 in sequence and return the result."""
        x = self.layer1(x)
        x = self.relu(x)
        x = self.layer2(x)
        return x

# Random stand-ins for a batch of inputs and the matching target outputs.
some_input = torch.randn(BATCH_SIZE, DIM_IN, requires_grad=False)
ideal_output = torch.randn(BATCH_SIZE, DIM_OUT, requires_grad=False)

model = TinyModel()

One thing you might notice is that we never specify `requires_grad=True` for the model’s layers. Within a subclass of `torch.nn.Module`, it’s assumed that we want to track gradients on the layers’ weights for learning.

If we look at the layers of the model, we can examine the values of the weights, and verify that no gradients have been computed yet:

In [None]:
# First 10 weights in row 0 of layer2, followed by that layer's gradient
# (no backward pass has been run yet at this point).
print(model.layer2.weight[0,:10])
print(model.layer2.weight.grad)

Compute the prediction and $L^2$-loss on `some_input`:

In [None]:
pred = model(some_input)
# sum of squared differences between the target and the prediction
loss = (ideal_output - pred).pow(2).sum()
print(loss)

Compute the gradients via backpropagation:

In [None]:
loss.backward()
# same slice of weights as before, now alongside the matching slice of gradients
print(model.layer2.weight[0][0:10])
print(model.layer2.weight.grad[0][0:10])

We can see that the gradients have been computed for each learning weight, but the weights remain unchanged, because we haven’t run the optimizer yet.

Initialize and run the optimizer:

In [None]:
# SGD over all model parameters with learning rate 0.001
optim = torch.optim.SGD(model.parameters(), lr=0.001)
# step() updates the weights using the gradients computed by the earlier backward()
optim.step()

Now the weights in `layer2` are updated:

In [None]:
# Same slices again, for comparison with the values printed before optim.step().
print(model.layer2.weight[0][0:10])
print(model.layer2.weight.grad[0][0:10])

One important thing about the process: After calling optimizer.step(), you need to call optimizer.zero_grad(), or else every time you run loss.backward(), the gradients on the learning weights will accumulate:

In [None]:
print(model.layer2.weight.grad[0][0:10])

# Five backward passes with no zero_grad() in between: each call adds its
# gradients on top of the ones already stored in .grad.
for i in range(0, 5):
    prediction = model(some_input)
    loss = (ideal_output - prediction).pow(2).sum()
    loss.backward()

print(model.layer2.weight.grad[0][0:10])

# set_to_none=False makes zero_grad() fill the gradients with zeros rather
# than replacing them with None (the default in recent PyTorch releases),
# which would make the indexing in the final print fail.
optim.zero_grad(set_to_none=False)

print(model.layer2.weight.grad[0][0:10])

After running the cell above, you should see that after running loss.backward() multiple times, the magnitudes of most of the gradients will be much larger. Failing to zero the gradients before running your next training batch will cause the gradients to blow up in this manner, causing incorrect and unpredictable learning results.

The simplest way to enable / disable gradient tracking is to change the `requires_grad` flag on a tensor directly:

In [None]:
a = torch.ones(2, 3, requires_grad=True)
print(a)

# computed while a tracks gradients
b1 = a * 2
print(b1)

# flip the flag on the tensor directly, then repeat the same computation
a.requires_grad = False
b2 = a * 2
print(b2)

If you only need autograd turned off temporarily, a better way is to use the torch.no_grad():

In [None]:
a = 2 * torch.ones(2, 3, requires_grad=True)
b = 3 * torch.ones(2, 3, requires_grad=True)

# before the no_grad block: tracked as usual
c1 = a + b
print(c1)

# inside the no_grad block: tracking is switched off
with torch.no_grad():
    c2 = a + b

print(c2)

# after the block: tracking is back on
c3 = a * b
print(c3)

`torch.no_grad()` can also be used as a function or method decorator:

In [None]:
def add_tensors1(x, y):
    """Return x + y with autograd behaving as usual."""
    total = x + y
    return total

@torch.no_grad()
def add_tensors2(x, y):
    """Return x + y; the decorator runs the whole body under no_grad."""
    total = x + y
    return total


a = 2 * torch.ones(2, 3, requires_grad=True)
b = 3 * torch.ones(2, 3, requires_grad=True)

# undecorated version: the sum tracks gradients
c1 = add_tensors1(a, b)
print(c1)

# decorated version: the sum does not track gradients
c2 = add_tensors2(a, b)
print(c2)

Autograd tracks every step of your computation in detail. Such a computation history, combined with timing information, would make a handy profiler - and autograd has that feature:

In [None]:
# Pick the device once; run_on_gpu mirrors that choice for the profiler flag.
device = torch.device('cpu')
run_on_gpu = False
if torch.cuda.is_available():
    device = torch.device('cuda')
    run_on_gpu = True

# Create the operands directly on the selected device; otherwise the profiled
# work would stay on the CPU even when CUDA profiling is requested.
x = torch.randn(2, 3, requires_grad=True, device=device)
y = torch.rand(2, 3, requires_grad=True, device=device)
z = torch.ones(2, 3, requires_grad=True, device=device)

# Profile 1000 repetitions of a small divide-then-multiply update of z.
with torch.autograd.profiler.profile(use_cuda=run_on_gpu) as prf:
    for _ in range(1000):
        z = (z / x) * y

print(prf.key_averages().table(sort_by='self_cpu_time_total'))

There is an API on autograd that allows you to calculate the Jacobian and the Hessian matrices of a particular function for particular inputs.

In [None]:
def exp_adder(x, y):
    """Return 2 * exp(x) + 3 * y."""
    scaled_exp = 2 * x.exp()
    scaled_y = 3 * y
    return scaled_exp + scaled_y

inputs = (torch.rand(1), torch.rand(1)) # arguments for the function
print(inputs)
# the Jacobian of exp_adder is evaluated at these inputs
torch.autograd.functional.jacobian(exp_adder, inputs)

The `torch.autograd.functional.hessian()` method works identically (assuming your function is twice differentiable), but returns a matrix of all second derivatives.

## 5. Data manipulation layers
Max pooling reduces a tensor by combining cells, and assigning the maximum value of the input cells to the output cell.

In [None]:
my_tensor = torch.rand(1, 6, 6)
print(my_tensor)

# pooling window of size 3
maxpool_layer = torch.nn.MaxPool2d(kernel_size=3)
print(maxpool_layer(my_tensor))

Normalization layers re-center and normalize the output of one layer before feeding it to another. Centering and scaling the intermediate tensors has a number of beneficial effects, such as letting you use higher learning rates without exploding/vanishing gradients.

In [None]:
# random values scaled by 20 and shifted by 5, so the mean is far from zero
my_tensor = torch.rand(1, 4, 4) * 20 + 5
print(my_tensor)

print(my_tensor.mean())

# normalize the tensor with a BatchNorm1d layer built for 4 features
norm_layer = torch.nn.BatchNorm1d(num_features=4)
normed_tensor = norm_layer(my_tensor)
print(normed_tensor)

# mean after normalization, for comparison with the mean printed above
print(normed_tensor.mean())

Dropout layers work by randomly setting parts of the input tensor to zero during training - dropout layers are always turned off for inference. This forces the model to learn against this masked or reduced dataset. For example:

In [None]:
my_tensor = torch.rand(1, 4, 4)

# Dropout with probability 0.4; the layer is applied twice to the same input
# so the two printed results can be compared.
dropout = torch.nn.Dropout(0.4)
print(dropout(my_tensor))
print(dropout(my_tensor))