In [1]:
from __future__ import print_function
import torch

In [13]:
# 1) Tensors: the basic n-dimensional array type in PyTorch

# Allocate without initializing: the values are whatever was already in memory.
x = torch.empty((5, 3))
print(x)

# Random values drawn uniformly from [0, 1).
x = torch.rand((5, 3))
print(x)

# All zeros, with dtype torch.long.
x = torch.zeros((5, 3), dtype=torch.long)
print(x)

# Build a tensor directly from Python data.
x = torch.tensor([5.5, 3])
print(x)

# Derive a tensor from an existing one: new_* methods take the target size;
# the dtype is overridden to double here.
x = x.new_ones((5, 3), dtype=torch.double)
print(x)

# *_like functions keep the size of their argument; the dtype is overridden to float.
x = torch.randn_like(x, dtype=torch.float)
print(x)

# Shape of the final tensor.
print(x.shape)

tensor([[7.2256e+31, 3.2605e-12, 1.8179e+31],
        [2.0703e-19, 2.0615e-19, 7.0062e+22],
        [3.0304e+35, 1.9338e+34, 5.0782e+31],
        [4.2964e+24, 1.7443e+28, 1.0804e+27],
        [1.2735e-14, 1.1819e+22, 7.0976e+22]])
tensor([[0.3149, 0.4621, 0.0845],
        [0.9574, 0.8686, 0.2046],
        [0.2491, 0.9609, 0.3284],
        [0.0239, 0.6233, 0.5115],
        [0.4846, 0.4398, 0.6052]])
tensor([[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]])
tensor([5.5000, 3.0000])
tensor([[1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]], dtype=torch.float64)
tensor([[ 0.5224, -0.4999, -0.7121],
        [ 1.6048,  0.7248, -1.7530],
        [-1.3934,  1.0264,  1.6323],
        [ 1.4303,  1.0740, -2.2985],
        [ 0.4919, -1.0918, -0.8380]])
torch.Size([5, 3])


In [21]:
# 2) Operations: arithmetic, indexing and reshaping on tensors

# The same addition written four ways
y = torch.rand(5, 3)

print(x + y)             # operator syntax
print(torch.add(x, y))   # functional syntax

result = torch.empty(5, 3)   # pre-allocated tensor that receives the sum
torch.add(x, y, out=result)
print(result)

y.add_(x)                # in-place: y now holds the sum
print(y)

# Any method with a trailing underscore mutates its tensor in place,
# e.g. x.copy_(y) and x.t_() both change x.

# NumPy-like indexing
print(x[:, 1])  # every row, second column (column index 1)

# Resizing/reshaping with view
a = torch.randn(4, 4)
b = a.view(16)
c = a.view(2, 8)
d = a.view(-1, 8)  # -1 is inferred from the other dimensions: here the row count becomes 2
print(a.size(), b.size(), c.size(), d.size())

# A one-element tensor can be unwrapped into a plain Python number with .item()
oneelem = torch.randn(1)
print(oneelem)
print(oneelem.item())

tensor([[ 0.9918, -0.0119,  0.1160],
        [ 2.1655,  0.8636, -1.3477],
        [-0.5542,  1.8365,  2.3669],
        [ 2.3406,  1.6816, -1.3072],
        [ 1.3443, -1.0447, -0.6841]])
tensor([[ 0.9918, -0.0119,  0.1160],
        [ 2.1655,  0.8636, -1.3477],
        [-0.5542,  1.8365,  2.3669],
        [ 2.3406,  1.6816, -1.3072],
        [ 1.3443, -1.0447, -0.6841]])
tensor([[ 0.9918, -0.0119,  0.1160],
        [ 2.1655,  0.8636, -1.3477],
        [-0.5542,  1.8365,  2.3669],
        [ 2.3406,  1.6816, -1.3072],
        [ 1.3443, -1.0447, -0.6841]])
tensor([[ 0.9918, -0.0119,  0.1160],
        [ 2.1655,  0.8636, -1.3477],
        [-0.5542,  1.8365,  2.3669],
        [ 2.3406,  1.6816, -1.3072],
        [ 1.3443, -1.0447, -0.6841]])
tensor([-0.4999,  0.7248,  1.0264,  1.0740, -1.0918])
torch.Size([4, 4]) torch.Size([16]) torch.Size([2, 8]) torch.Size([2, 8])
tensor([-1.0428])
-1.0428111553192139


In [25]:
# 3) NumPy bridge: a torch tensor and the NumPy array obtained from it share
#    their underlying memory, so changing one changes the other.

a_torch = torch.ones(5)
print(a_torch)

# tensor -> ndarray
b_torch = a_torch.numpy()
print(b_torch)

# Mutate the tensor in place; the array shows the same change.
a_torch.add_(1)
print(a_torch)
print(b_torch)

# ndarray -> tensor: the memory is shared in this direction as well
import numpy as np
aa = np.ones(5)
bb = torch.from_numpy(aa)
aa += 1  # in-place add on the array; bb shows the same change
print(aa)
print(bb)

tensor([1., 1., 1., 1., 1.])
[1. 1. 1. 1. 1.]
tensor([2., 2., 2., 2., 2.])
[2. 2. 2. 2. 2.]
[2. 2. 2. 2. 2.]
tensor([2., 2., 2., 2., 2.], dtype=torch.float64)


In [27]:
# 4) CUDA tensors: the .to() method moves a tensor onto another device

cuda_available = torch.cuda.is_available()
print(cuda_available)

# The recorded run printed False (no CUDA on that laptop), so the branch below was skipped.
if cuda_available:
    device = torch.device('cuda')
    cuda_torch = torch.ones_like(x, device=device)  # created directly on the CUDA device
    x = x.to(device)                                # moved to the CUDA device
    zz = x + cuda_torch
    print(zz)
    print(zz.to("cpu", torch.double))               # .to() can change device and dtype together

False


In [40]:
"""
5) Brief introduction to Autograd:

autograd package provides automatic differentiation for all operations on tensors. define-by-run framework,
so backpropagation is defined by how the code is run, and every iteration can be different

by default, autograd tracks all operations on the tensor.
when finished with the computation, I can call .backward() and have all gradients computed automatically.
this result will be at .grad attribute.
this requires memory.

stopping: .detach()
preventing: with torch.no_grad():

Tensor and Function are interconnected and build up an acyclic graph
that encodes a complete history of a computation - each tensor has a .grad_fn attribute that references
a Function that has created the Tensor

if you want to compute the derivatives, you can call .backward() on a tensor. if tensor is a scalar,
you don't need to specify any arguments to backward(), but if it has more than that,
you need to specify a gradient argument that is a tensor of a matching shape.
"""
# Track operations on a 2x2 tensor of ones.
grad_tensor = torch.ones(2, 2, requires_grad=True)
print(grad_tensor)

grad_ops = grad_tensor + 2
print(grad_ops)
print(grad_ops.grad_fn)  # result of an addition, so grad_fn is an AddBackward node

gradgrad = grad_ops * grad_ops * 3
out = gradgrad.mean()
print(gradgrad, out)

# requires_grad is False for a freshly created tensor and can be switched on in place.
a = torch.randn(2, 2)
a = ((a * 3) / (a - 1))
print(a.requires_grad)
a.requires_grad_(True)
b = (a * a).sum()
print(b.grad_fn)  # grad fn defined as SumBackward

# out holds a single scalar, so this is equivalent to out.backward(torch.tensor(1.)).
out.backward()
# d(out)/d(grad_tensor) = 1.5 * (grad_tensor + 2), i.e. 4.5 for every element
print(grad_tensor.grad)

# Generally speaking, torch.autograd is an engine for computing vector-Jacobian products

# NOTE: this rebinds x and y, which earlier cells also use.
x = torch.randn(3, requires_grad=True)

y = x * 2
# The norm is only a loop condition, so it is computed on a detached tensor;
# detach() is used instead of the legacy .data attribute.
while y.detach().norm() < 1000:
    y = y * 2

print(y)
# y is not a scalar, so backward() needs a gradient argument of matching shape.
v = torch.tensor([0.1, 1.0, 0.0001], dtype=torch.float)
y.backward(v)
print(x.grad)

tensor([[1., 1.],
        [1., 1.]], requires_grad=True)
tensor([[3., 3.],
        [3., 3.]], grad_fn=<AddBackward0>)
<AddBackward0 object at 0x1131aa5f8>
tensor([[27., 27.],
        [27., 27.]], grad_fn=<MulBackward0>) tensor(27., grad_fn=<MeanBackward0>)
False
<SumBackward0 object at 0x113087198>
tensor([[4.5000, 4.5000],
        [4.5000, 4.5000]])
tensor([863.6843, 495.4630, 740.8290], grad_fn=<MulBackward0>)
tensor([5.1200e+01, 5.1200e+02, 5.1200e-02])


In [41]:
# https://pytorch.org/docs/stable/autograd.html#function

"""
Records operation history and defines formulas for differentiating ops.

Extending autograd: https://pytorch.org/docs/stable/notes/extending.html#extending-torch-autograd

Every operation performed on Tensors creates a new function object,
that performs the computation, and records that it happened.
The history is retained in the form of a DAG of functions,
with edges denoting data dependencies (input <- output).
Then, when backward is called, the graph is processed in the topological ordering,
by calling backward() methods of each Function object, and passing returned gradients on to next Functions.

Normally, the only way users interact with functions is by creating subclasses and defining new operations.
This is a recommended way of extending torch.autograd.
"""

''

In [42]:
"""
6) Neural nets on top of autograd:

nn depends on autograd to define models and differentiate them.
nn.Module contains layers, and a method forward(input) returns the output

A typical training procedure for a neural network is as follows:

1. Define the neural network that has some learnable parameters (or weights)
2. Iterate over a dataset of inputs
3. Process input through the network
4. Compute the loss (how far is the output from being correct)
5. Propagate gradients back into the network’s parameters
6. Update the weights of the network, typically using a simple update rule:
   weight = weight - learning_rate * gradient
"""
import torch
import torch.nn as nn
import torch.nn.functional as functional

class Net(nn.Module):
    """Small convolutional network: two conv + max-pool stages, then three linear layers.

    Only forward() is written by hand. The backward pass, where gradients are
    computed, is defined automatically by autograd, so any Tensor operation
    may be used inside forward().
    """

    def __init__(self):
        super(Net, self).__init__()

        # Convolutions with 3x3 square kernels: 1 input image channel -> 6 -> 16 channels
        self.conv1 = nn.Conv2d(1, 6, 3)
        self.conv2 = nn.Conv2d(6, 16, 3)

        # Affine layers (y = Wx + b); fc1 consumes 16 * 6 * 6 flattened features
        self.fc1 = nn.Linear(16 * 6 * 6, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        """Run a batch through the network and return the output of fc3 (10 values per sample)."""
        # Stage 1: conv -> ReLU -> max-pool with an explicit (2, 2) window
        x = self.conv1(x)
        x = functional.relu(x)
        x = functional.max_pool2d(x, (2, 2))

        # Stage 2: for a square window a single number is enough
        x = self.conv2(x)
        x = functional.relu(x)
        x = functional.max_pool2d(x, 2)

        # Flatten every dimension except the batch dimension
        x = x.view(-1, self.num_flat_features(x))

        x = functional.relu(self.fc1(x))
        x = functional.relu(self.fc2(x))
        return self.fc3(x)

    def num_flat_features(self, x):
        """Return the product of all dimensions of x except the first (batch) one."""
        count = 1
        for axis in range(1, x.dim()):
            count *= x.size(axis)
        return count

# Build one instance of the network; printing an nn.Module lists its registered sub-layers.
# `net` is reused by the cells that follow.
net = Net()
print(net)


Net(
  (conv1): Conv2d(1, 6, kernel_size=(3, 3), stride=(1, 1))
  (conv2): Conv2d(6, 16, kernel_size=(3, 3), stride=(1, 1))
  (fc1): Linear(in_features=576, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)


In [43]:
# net.parameters() yields the learnable parameters of the model
params = list(net.parameters())
print(len(params))
print(params[0].shape)

# nn only supports mini-batches, not a single sample:
# nn.Conv2d takes a 4-D tensor of nSamples x nChannels x Height x Width,
# so a single 1-channel 32x32 input is passed as a batch of size 1.
input_vector = torch.randn(1, 1, 32, 32)
out = net(input_vector)
print(out)

# Zero the gradient buffers of all parameters, then backpropagate
# a random gradient that has the same (1, 10) shape as the output.
net.zero_grad()
out.backward(torch.randn(1, 10))

10
torch.Size([6, 1, 3, 3])


In [48]:
# Loss function: compare the network output with a random dummy target
output = net(input_vector)
target = torch.randn(10).view(1, -1)  # reshaped to (1, 10) to match the output
criterion = nn.MSELoss()

loss = criterion(output, target)
print(loss)

# Follow grad_fn backwards from the loss through the recorded graph
loss_node = loss.grad_fn
print(loss_node)  # MSE loss
linear_node = loss_node.next_functions[0][0]
print(linear_node)  # linear (shown as an Addmm node in the recorded output)
# NOTE: the first input of the linear node is an AccumulateGrad leaf in the
# recorded output, not a ReLU node.
print(linear_node.next_functions[0][0])

tensor(0.8506, grad_fn=<MseLossBackward>)
<MseLossBackward object at 0x1131b8ac8>
<AddmmBackward object at 0x1131aa7f0>
<AccumulateGrad object at 0x1131b8ac8>


In [49]:
# Backpropagation: clear the existing gradients, then compare conv1's bias
# gradient before and after calling backward() on the loss.

net.zero_grad()
conv1_bias = net.conv1.bias

print('conv1.bias.grad before backward')
print(conv1_bias.grad)

loss.backward()

print('conv1.bias.grad after backward')
print(conv1_bias.grad)

conv1.bias.grad before backward
tensor([0., 0., 0., 0., 0., 0.])
conv1.bias.grad after backward
tensor([-0.0012, -0.0011,  0.0016,  0.0094,  0.0061,  0.0110])


In [50]:
# update the weights

import torch.optim as optim

# Plain-Python gradient descent: weight = weight - learning_rate * gradient.
# The update runs under torch.no_grad() so autograd does not record it;
# this replaces in-place edits through the legacy .data attribute.
learning_rate = 0.01
with torch.no_grad():
    for f in net.parameters():
        if f.grad is None:  # no gradient was computed for this parameter; nothing to apply
            continue
        f.sub_(f.grad * learning_rate)

# optimizer (update rules) - SGD, nesterov-SGD, Adam, RMSProp, ...
# Reuses learning_rate so the manual step and the optimizer stay in sync.
optimizer = optim.SGD(net.parameters(), lr=learning_rate)

# in the training loop:
optimizer.zero_grad()  # gradients accumulate across backward() calls, so reset them first
output = net(input_vector)
loss = criterion(output, target)
loss.backward()
optimizer.step()  # apply the update

In [60]:
# reference: https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict

"""
A state_dict is simply a Python dictionary object that maps each layer to its parameter tensor
Note that only layers with learnable parameters (convolutional layers, linear layers, etc.)
and registered buffers (batchnorm’s running_mean) have entries in the model’s state_dict

Optimizer objects (torch.optim) also have a state_dict, which contains information about the optimizer’s state,
as well as the hyperparameters used
"""
# Print model's state_dict: each entry name with the size of its tensor
print("Model's state_dict:")
for param_name, param_value in net.state_dict().items():
    print(param_name, "\t", param_value.size())

# Print optimizer's state_dict
print("Optimizer's state_dict:")
for var_name, var_value in optimizer.state_dict().items():
    print(var_name, "\t", var_value)

# Save the model's state_dict to disk
torch.save(net.state_dict(), 'torch-net.state')

# this should load the entire weights for each layer, and equivalent tensor
# then my defined tensor to simple matrix should transfer the weights.
# then matrix multiplication HE operation.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a trusted source.
# The loaded dict is only displayed as the cell result; it is not copied back into net.
torch.load('torch-net.state')

Model's state_dict:
conv1.weight 	 torch.Size([6, 1, 3, 3])
conv1.bias 	 torch.Size([6])
conv2.weight 	 torch.Size([16, 6, 3, 3])
conv2.bias 	 torch.Size([16])
fc1.weight 	 torch.Size([120, 576])
fc1.bias 	 torch.Size([120])
fc2.weight 	 torch.Size([84, 120])
fc2.bias 	 torch.Size([84])
fc3.weight 	 torch.Size([10, 84])
fc3.bias 	 torch.Size([10])
Optimizer's state_dict:
state 	 {}
param_groups 	 [{'lr': 0.01, 'momentum': 0, 'dampening': 0, 'weight_decay': 0, 'nesterov': False, 'params': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]}]


OrderedDict([('conv1.weight',
              tensor([[[[ 0.0766,  0.1466, -0.2599],
                        [ 0.1632,  0.0884,  0.1685],
                        [-0.2542,  0.2027, -0.0301]]],
              
              
                      [[[-0.1326, -0.0435,  0.2925],
                        [-0.0614, -0.2994, -0.3049],
                        [-0.2435, -0.2369, -0.2051]]],
              
              
                      [[[ 0.2613,  0.2208,  0.3055],
                        [-0.2224, -0.1650, -0.0793],
                        [-0.2238,  0.2733, -0.2582]]],
              
              
                      [[[ 0.0802,  0.2403,  0.0256],
                        [-0.2966,  0.0563,  0.2104],
                        [ 0.1387, -0.2813,  0.0400]]],
              
              
                      [[[ 0.2897, -0.1730,  0.0207],
                        [ 0.1945,  0.1883,  0.1839],
                        [ 0.2531,  0.1738, -0.2217]]],
              
              
               