In [1]:
import torch
import numpy as np

In [2]:
torch.__version__

'1.3.1'

In [3]:
# Similar to numpy
x_torch = torch.ones(5, 4)
print(x_torch)
print(x_torch.shape) #alias for x.size()
print()
y_np = np.ones((5,4))
print(y_np)
print(y_np.shape)

tensor([[1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.]])
torch.Size([5, 4])

[[1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]]
(5, 4)


In [4]:
# Rand
rand = torch.rand(2,2)
print('rand:', rand)
# From List
from_list = torch.tensor([[1.,2.], [4., 5.]])
print('from_list:', from_list)
x = rand * from_list
print('element wise:', x)
y = rand @ from_list
print('matrix multiplication: ', y)

rand: tensor([[4.0064e-01, 1.6530e-02],
        [3.1573e-04, 3.2196e-01]])
from_list: tensor([[1., 2.],
        [4., 5.]])
element wise: tensor([[4.0064e-01, 3.3060e-02],
        [1.2629e-03, 1.6098e+00]])
matrix multiplication:  tensor([[0.4668, 0.8839],
        [1.2881, 1.6104]])


In [5]:
#  broadcasting also works as expected
arange = torch.arange(2)[:, None]
print('arange:', arange)
res = from_list + arange
print('res:', res)

arange: tensor([[0],
        [1]])
res: tensor([[1., 2.],
        [5., 6.]])


In [6]:
# in place
res.add_(2)
res

tensor([[3., 4.],
        [7., 8.]])

Any operation that mutates a tensor in-place is post-fixed with an ``_``.
    For example: ``x.copy_(y)``, ``x.t_()``, will change ``x``.




In [7]:
#But some functionalities are diffrent
#You have been warned
x_torch = x_torch.view(2, 2, -1)
print(x_torch)

y_np = y_np.reshape(2, 2, -1)
print(y_np)

# In torch there is also `reshape` function. But it beahves difrentlly.
# It sometimes can make a copy of an array instead of a reference.


tensor([[[1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.]],

        [[1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.]]])
[[[1. 1. 1. 1. 1.]
  [1. 1. 1. 1. 1.]]

 [[1. 1. 1. 1. 1.]
  [1. 1. 1. 1. 1.]]]


If you have a one element tensor, use ``.item()`` to get the value as a
Python number



In [8]:
x = torch.randn(1)
print(x)
print(x.item(), type(x.item()))

tensor([0.3955])
0.3955190181732178 <class 'float'>


# Numpy bridge
Converting a Torch Tensor to a NumPy array and vice versa is a breeze.
Transformation do not need copying underlying C array. Only `python` properties of the object has to be changed

### Conclussion: It is fast
Sometimes it is easier to transofrm data in numpy

In [9]:
a = torch.ones(5)
print(a)

tensor([1., 1., 1., 1., 1.])


In [10]:
b = a.numpy()
print(b)

[1. 1. 1. 1. 1.]


See how the numpy array changed in value.


In [11]:
a.add_(1)
print(a)
print(b)

tensor([2., 2., 2., 2., 2.])
[2. 2. 2. 2. 2.]


### Converting NumPy Array to Torch Tensor

See how changing the np array changed the Torch Tensor automatically



In [12]:
a = np.ones(5)
b = torch.from_numpy(a)
np.add(a, 1, out=a)
print(a)
print(b)

[2. 2. 2. 2. 2.]
tensor([2., 2., 2., 2., 2.], dtype=torch.float64)


All the Tensors on the CPU except a CharTensor support converting to
NumPy and back.

# CUDA Tensors


Tensors can be moved onto any device using the ``.to`` method. To interact with each other both tensors have to be on the same device. 





In [13]:
# let us run this cell only if CUDA is available
# We will use ``torch.device`` objects to move tensors in and out of GPU
if torch.cuda.is_available():
    device = torch.device("cuda")          # a CUDA device object
    y = torch.ones_like(x, device=device)  # directly create a tensor on GPU
    x = x.to(device)                       # or just use strings ``.to("cuda")``
    z = x + y
    print(z)
    print(z.to("cpu", torch.double))       # ``.to`` can also change dtype together!

tensor([1.3955], device='cuda:0')
tensor([1.3955], dtype=torch.float64)


### Recommended way for comapatibillity

On systems with multiple GPU's there will be {`cuda:1`, `cuda:2`...} and so on.

There is no automatic way to decide which GPU is "free"

In [14]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
tensor = torch.zeros(2,2).to(device)
tensor

tensor([[0., 0.],
        [0., 0.]], device='cuda:0')

In [15]:
cpu = torch.ones(1)
try:
    cpu + tensor
except Exception as e:
    print(type(e), e)

<class 'RuntimeError'> expected device cpu but got device cuda:0



Autograd: Automatic Differentiation
===================================

The ``autograd`` package provides automatic differentiation for all operations
on Tensors. It is a define-by-run framework, which means that your backprop is
defined by how your code is run, and that every single iteration can be
different.


Tensor
--------

1. If you set its attribute``.requires_grad`` as ``True``, it starts to track all operations on it. 

2. When
you finish your computation you can call ``.backward()`` and have all the
gradients computed automatically. The gradient for this tensor will be
accumulated into ``.grad`` attribute.

3. To stop a tensor from tracking history, you can call ``.detach()`` to detach
it from the computation history, and to prevent future computation from being
tracked.

4. To prevent tracking history (and using memory), you can also wrap the code block
in ``with torch.no_grad():``. This can be particularly helpful when **evaluating a
model** because the model may have trainable parameters with
``requires_grad=True``, but for which we don't need the gradients.

There’s one more class which is very important for autograd
implementation - a ``Function``.

``Tensor`` and ``Function`` are interconnected and build up an acyclic
graph, that encodes a complete history of computation. Each tensor has
a ``.grad_fn`` attribute that references a ``Function`` that has created
the ``Tensor`` (except for Tensors created by the user - their
``grad_fn is None``).


Create a tensor and set ``requires_grad=True`` to track computation with it



In [16]:
x = torch.ones(2, 3, requires_grad=True)
print(x)

tensor([[1., 1., 1.],
        [1., 1., 1.]], requires_grad=True)


Do a tensor operation:



In [17]:
y = x + 2
print(y)
print(y.requires_grad)

tensor([[3., 3., 3.],
        [3., 3., 3.]], grad_fn=<AddBackward0>)
True


``y`` was created as a result of an operation, so it has a ``grad_fn``.



In [18]:
print(y.grad_fn)

<AddBackward0 object at 0x7fd25690a4e0>


Do more operations on ``y``



In [19]:
z = y * y * 3
out = z.mean()

print(z, out)

tensor([[27., 27., 27.],
        [27., 27., 27.]], grad_fn=<MulBackward0>) tensor(27., grad_fn=<MeanBackward0>)


``.requires_grad_( ... )`` changes an existing Tensor's ``requires_grad``
flag in-place. The input flag defaults to ``False`` if not given.



In [20]:
a = torch.randn(2, 2)
print(a.requires_grad)
a.requires_grad_(True)
print(a.requires_grad)


False
True


Gradients
---------
Let's backprop now.
Because ``out`` contains a single scalar, ``out.backward()`` is
equivalent to ``out.backward(torch.tensor(1.))``.



In [21]:
out.backward()

Print gradients d(out)/dx




In [22]:
print(x.grad)

tensor([[3., 3., 3.],
        [3., 3., 3.]])


Now let's take a look at an example of vector-Jacobian product:



In [23]:
x_1 = torch.ones(2,2, requires_grad=True)
x_2 = (x_1 ** 2 * 3).sum()
x_2.backward(torch.tensor(17.))
print(x_1.grad)
x_3 = (x_1 * 20).sum()
x_3.backward(torch.tensor(2.))
print(x_1.grad)

tensor([[102., 102.],
        [102., 102.]])
tensor([[142., 142.],
        [142., 142.]])


You can also stop autograd from tracking history on Tensors
with ``.requires_grad=True`` either by wrapping the code block in
``with torch.no_grad():``



In [24]:
print(x.requires_grad)
print((x ** 2).requires_grad)

with torch.no_grad():
	print((x ** 2).requires_grad)

True
True
False


Or by using ``.detach()`` to get a new Tensor with the same
content but that does not require gradients:



In [25]:
print(x.requires_grad)
y = x.detach()
print(y.requires_grad)
print(x.eq(y).all())

True
False
tensor(True)



# Neural Networks


Neural networks can be constructed using the ``torch.nn`` package.

Now that you had a glimpse of ``autograd``, ``nn`` depends on
``autograd`` to define models and differentiate them.
An ``nn.Module`` contains layers, and a method ``forward(input)``\ that
returns the ``output``.

For example, look at this network that classifies digit images:


It is a simple feed-forward network. It takes the input, feeds it
through several layers one after the other, and then finally gives the
output.

A typical training procedure for a neural network is as follows:

- Define the neural network that has some learnable parameters (or
  weights)
- Iterate over a dataset of inputs
- Process input through the network
- Compute the loss (how far is the output from being correct)
- Propagate gradients back into the network’s parameters
- Update the weights of the network, typically using a simple update rule:
  ``weight = weight - learning_rate * gradient``

Define the network
------------------

Let’s define this network:



In [26]:
import torch
# torch modules
import torch.nn as nn
# torch funct
import torch.nn.functional as F



class Net(nn.Module):

    def __init__(self):
        super(Net, self).__init__()
        # an affine operations: y = Wx + b
        self.fc1 = nn.Linear(32 * 32, 200)
        self.fc2 = nn.Linear(200, 10)
        
    def forward(self, x):
        # x.shape => BATCH_SIZE x Height X Width 
        x = x.view(x.shape[0], -1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return x 
    

net = Net()
print(net)

Net(
  (fc1): Linear(in_features=1024, out_features=200, bias=True)
  (fc2): Linear(in_features=200, out_features=10, bias=True)
)


You just have to define the ``forward`` function, and the ``backward``
function (where gradients are computed) is automatically defined for you
using ``autograd``.
You can use any of the Tensor operations in the ``forward`` function.

The learnable parameters of a model are returned by ``net.parameters()``



In [27]:
params = list(net.parameters())
print(len(params))
print(params[0].size())  # fc1 .weight

4
torch.Size([200, 1024])


Let's try a random 32x32 input.
Note: expected input size of this net (LeNet) is 32x32. To use this net on
the MNIST dataset, please resize the images from the dataset to 32x32.



In [28]:
input = torch.randn(1, 1, 32, 32)
out = net(input)
print(out)

tensor([[0.2206, 0.0917, 0.0000, 0.0000, 0.1850, 0.2828, 0.0000, 0.3440, 0.0000,
         0.1580]], grad_fn=<ReluBackward0>)


Zero the gradient buffers of all parameters and backprops with random
gradients:



In [29]:
net.zero_grad()
out.backward(torch.randn(1, 10))

``torch.nn`` only supports mini-batches. The entire ``torch.nn``
    package only supports inputs that are a mini-batch of samples, and not
    a single sample.






Loss Function
-------------
A loss function takes the (output, target) pair of inputs, and computes a
value that estimates how far away the output is from the target.





In [30]:
output = net(input)
target = torch.randn(10)  # a dummy target, for example
target = target.view(1, -1)  # make it the same shape as output
criterion = nn.MSELoss()

loss = criterion(output, target)
print(loss)

tensor(0.6399, grad_fn=<MseLossBackward>)


Now, if you follow ``loss`` in the backward direction, using its
``.grad_fn`` attribute, you will see a graph of computations that looks
like this:



    input -> view -> linear -> relu -> linear -> relu -> linear
          -> MSELoss
          -> loss

So, when we call ``loss.backward()``, the whole graph is differentiated
w.r.t. the loss, and all Tensors in the graph that has ``requires_grad=True``
will have their ``.grad`` Tensor accumulated with the gradient.




Backprop
--------
To backpropagate the error all we have to do is to ``loss.backward()``.
You need to clear the existing gradients though, else gradients will be
accumulated to existing gradients.


Now we shall call ``loss.backward()``, and have a look at linear bias
gradients before and after the backward.



In [31]:
net.zero_grad()     # zeroes the gradient buffers of all parameters

print('fc1.bias.grad before backward')
print(net.fc1.bias.grad)

loss.backward()

print('fc1.bias.grad after backward')
print(net.fc1.bias.grad)

fc1.bias.grad before backward
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.])
fc1.bias.grad after backward
tensor([-0.0223,  0.0000,  0.0000,  0.0000,  0.0000, -0.0090,  0.000

Update the weights
------------------
The simplest update rule used in practice is the Stochastic Gradient
Descent (SGD):

 ``weight = weight - learning_rate * gradient``

We can implement this using simple Python code:

```
learning_rate = 0.01
for f in net.parameters():
    f.data.sub_(f.grad.data * learning_rate)
```
However, as you use neural networks, you want to use various different
update rules such as SGD, Nesterov-SGD, Adam, RMSProp, etc.
To enable this, we built a small package: ``torch.optim`` that
implements all these methods. Using it is very simple:



In [32]:
import torch.optim as optim

# create your optimizer
optimizer = optim.SGD(net.parameters(), lr=0.01)

# in your training loop:
optimizer.zero_grad()   # zero the gradient buffers
output = net(input)
loss = criterion(output, target)
loss.backward()
optimizer.step()    # Does the update

# Preparing a dataset

A lot of effort in solving any machine learning problem goes in to preparing the data. PyTorch provides many tools to make data loading easy and hopefully, to make your code more readable.

Dataset class
-------------

``torch.utils.data.Dataset`` is an abstract class representing a
dataset.
Your custom dataset should inherit ``Dataset`` and override the following
methods:

-  ``__len__`` so that ``len(dataset)`` returns the size of the dataset.
-  ``__getitem__`` to support the indexing such that ``dataset[i]`` can
   be used to get $i$'th sample


Sample of our dataset will be a dict
``{'image': image, 'label': label}``.

### my_SimpleDataset

Dataset containes images build form zeroes and ones.
Our machine_learning task is to figure out if we have **more ones than zeros** in the picture. (This task is trivial)


In [33]:
PICTURE_SIZE = 10

class SimpleDataset(torch.utils.data.Dataset):
    def __init__(self, size):
        self.data = torch.zeros(size, PICTURE_SIZE, PICTURE_SIZE)
        self.data[torch.rand_like(self.data) > 0.5] = 1.
        
        self.labels = torch.zeros(size)
        self.labels[self.data.mean(dim=[1,2]) > 0.5] = 1.
    
    def __getitem__(self, item):
        # Accepts scalars, tuples and dictionaries
        return {
                'image': self.data[item],
                'label': self.labels[item]
        }
    def __len__(self):
        return self.data.shape[0]

dataset = SimpleDataset(2**13)
# __len__
print('dataset_len:', len(dataset))

# __getitem__
print(dataset[15])

for x in dataset:
    pass
print('looping sucesfull')

dataset_len: 8192
{'image': tensor([[1., 1., 1., 0., 0., 1., 1., 0., 1., 1.],
        [0., 0., 1., 0., 0., 1., 1., 1., 1., 0.],
        [0., 0., 1., 0., 1., 1., 0., 0., 0., 1.],
        [0., 0., 0., 0., 0., 0., 1., 0., 0., 1.],
        [1., 0., 1., 1., 1., 1., 0., 1., 0., 1.],
        [0., 1., 1., 1., 1., 1., 1., 1., 0., 1.],
        [0., 1., 0., 0., 1., 0., 1., 0., 0., 1.],
        [1., 1., 1., 1., 0., 1., 1., 0., 0., 1.],
        [1., 1., 0., 1., 0., 1., 1., 0., 0., 1.],
        [1., 0., 0., 1., 0., 0., 1., 1., 1., 0.]]), 'label': tensor(1.)}
looping sucesfull


However, we are losing a lot of features by using a simple ``for`` loop to
iterate over the data. In particular, we are missing out on:

-  Batching the data
-  Shuffling the data
-  Load the data in parallel using ``multiprocessing`` workers.

``torch.utils.data.DataLoader`` is an iterator which provides all these
features. Parameters used below should be clear.

In [34]:
BATCH_SIZE = 64

dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=BATCH_SIZE,
        shuffle=True,
        num_workers=2
)

### NeuralNet definition

In [35]:
HIDDEN_SIZE = 800

class Net2(nn.Module):
    def __init__(self):
        super(Net2, self).__init__()
        self.hidden = nn.Sequential(
                nn.Linear(PICTURE_SIZE * PICTURE_SIZE, HIDDEN_SIZE),
                nn.ReLU(),
                nn.Linear(HIDDEN_SIZE, 1)
        )
    def forward(self, x):
        x = x.view(-1, PICTURE_SIZE * PICTURE_SIZE)
        x = self.hidden(x)
        return x

In [36]:
EPOCHS = 20

# Move model weigths to device
device = ('cuda:0' if torch.cuda.is_available() else 'cpu')
net = Net2().to(device)

optimizer = torch.optim.SGD(net.parameters(), lr=0.001)

for e in range(EPOCHS):
    loss_avg = 0.
    acc_avg = 0.
    for i, batch_data in (enumerate(dataloader)):
        # Start with zeroing .grad fields in model
        net.zero_grad()
        
        # Move data to device
        images = batch_data['image'].to(device)
        labels = batch_data['label'].to(device)
        
        #Run neural net
        out = net(images)

        #Compute loss and apply autograd for model parameters
        loss = ((out.view(-1) - labels)** 2).sum()
        loss.backward()
        
        #Update network weigths
        optimizer.step()

        # Statistics generation
        res = torch.where(
                out.view(-1) > 0.5,
                torch.tensor(1., device=device),
                torch.tensor(0., device=device)
        )
        acc = (res == labels).sum()
        loss_avg += loss
        acc_avg += acc
        if i % 30 == 0 and i != 0:
            print(f'epoch: {e + 1:3d} step: {(i * 30):6d} loss: {loss_avg / 30:.3f} acc: {acc_avg / (BATCH_SIZE * 30):.3f}')
            acc_avg = 0.
            loss_avg = 0.
        # End of statistics generation

        






epoch:   1 step:    900 loss: 207.521 acc: 0.379
epoch:   1 step:   1800 loss: 17.871 acc: 0.370
epoch:   1 step:   2700 loss: 17.141 acc: 0.464
epoch:   1 step:   3600 loss: 16.798 acc: 0.467
epoch:   2 step:    900 loss: 16.733 acc: 0.517
epoch:   2 step:   1800 loss: 15.717 acc: 0.537
epoch:   2 step:   2700 loss: 15.542 acc: 0.543
epoch:   2 step:   3600 loss: 15.276 acc: 0.577
epoch:   3 step:    900 loss: 15.188 acc: 0.656
epoch:   3 step:   1800 loss: 14.561 acc: 0.654
epoch:   3 step:   2700 loss: 14.214 acc: 0.648
epoch:   3 step:   3600 loss: 13.831 acc: 0.694
epoch:   4 step:    900 loss: 13.513 acc: 0.745
epoch:   4 step:   1800 loss: 12.247 acc: 0.753
epoch:   4 step:   2700 loss: 11.523 acc: 0.760
epoch:   4 step:   3600 loss: 11.064 acc: 0.773
epoch:   5 step:    900 loss: 10.788 acc: 0.804
epoch:   5 step:   1800 loss: 9.585 acc: 0.810
epoch:   5 step:   2700 loss: 9.390 acc: 0.809
epoch:   5 step:   3600 loss: 11.080 acc: 0.781
epoch:   6 step:    900 loss: 9.428 acc: 

Used materials:
* Deep Learning with PyTorch: A 60 Minute Blitz https://pytorch.org/tutorials/beginner/deep_learning_60min_blitz.html
* Writing Custom Datasets, DataLoaders and Transforms https://pytorch.org/tutorials/beginner/data_loading_tutorial.html