In [1]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn import linear_model
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
%matplotlib inline

# Tensor manipulation

## size & dimension

In [2]:
# 24 consecutive values laid out as a (2, 4, 3) tensor so every axis has a distinct length.
# The dtype is explicit: with integer bounds, current PyTorch returns int64 from arange,
# whereas the output recorded below is floating point.
x = torch.arange(0, 24, dtype=torch.float32).view(2, 4, 3)
x

tensor([[[  0.,   1.,   2.],
         [  3.,   4.,   5.],
         [  6.,   7.,   8.],
         [  9.,  10.,  11.]],

        [[ 12.,  13.,  14.],
         [ 15.,  16.,  17.],
         [ 18.,  19.,  20.],
         [ 21.,  22.,  23.]]])

In [3]:
# size() returns the full shape; dim() returns the number of axes (the rank)
x.size(), x.dim()

(torch.Size([2, 4, 3]), 3)

In [4]:
# size(i) returns the length of axis i as a plain Python int
x.size(0), x.size(1), x.size(2)

(2, 4, 3)

## dtypes

In [5]:
# Integer-only data is stored as a 64-bit integer tensor
x = torch.tensor([1, 2, 3])
print("{} {}".format(x, x.dtype))

tensor([ 1,  2,  3]) torch.int64


In [6]:
# A single float literal is enough to promote the whole tensor to float32
x = torch.tensor([1.2, 2, 3])
print("{} {}".format(x, x.dtype))

tensor([ 1.2000,  2.0000,  3.0000]) torch.float32


## permute & transpose

In [7]:
# Rebuild the (2, 4, 3) example tensor (x was reassigned in the dtype section).
# The dtype is explicit: with integer bounds, current PyTorch returns int64 from arange,
# whereas the output recorded below is floating point.
x = torch.arange(0, 24, dtype=torch.float32).view(2, 4, 3)
x

tensor([[[  0.,   1.,   2.],
         [  3.,   4.,   5.],
         [  6.,   7.,   8.],
         [  9.,  10.,  11.]],

        [[ 12.,  13.,  14.],
         [ 15.,  16.,  17.],
         [ 18.,  19.,  20.],
         [ 21.,  22.,  23.]]])

In [8]:
print(x.size())
# permute(2, 0, 1): new axis 0 is old axis 2, new axis 1 is old axis 0, new axis 2 is old axis 1
print(x.permute(2, 0, 1).size())

torch.Size([2, 4, 3])
torch.Size([3, 2, 4])


## squeeze & unsqueeze

In [9]:
x = torch.randn(1, 2, 2, 1, 2)
print(x)
# Without an argument, every size-1 axis is removed
print(torch.squeeze(x))
# With an argument, only that axis is removed (axes 0 and 3 both have size 1 here)
for axis in (0, 3):
    print(torch.squeeze(x, axis))

tensor([[[[[-0.8638,  1.5170]],

          [[ 0.2666, -1.9463]]],


         [[[-0.6103, -0.2184]],

          [[-2.1175, -0.5560]]]]])
tensor([[[-0.8638,  1.5170],
         [ 0.2666, -1.9463]],

        [[-0.6103, -0.2184],
         [-2.1175, -0.5560]]])
tensor([[[[-0.8638,  1.5170]],

         [[ 0.2666, -1.9463]]],


        [[[-0.6103, -0.2184]],

         [[-2.1175, -0.5560]]]])
tensor([[[[-0.8638,  1.5170],
          [ 0.2666, -1.9463]],

         [[-0.6103, -0.2184],
          [-2.1175, -0.5560]]]])


In [10]:
x = torch.tensor([1, 2, 3])
# Row vector, shape (1, 3): unsqueeze(0) and view(1, -1) are equivalent
print(torch.unsqueeze(x, 0))
print(x.view(1, -1))
# Column vector, shape (3, 1): unsqueeze(1) and view(-1, 1) are equivalent
print(torch.unsqueeze(x, 1))
print(x.view(-1, 1))

tensor([[ 1,  2,  3]])
tensor([[ 1,  2,  3]])
tensor([[ 1],
        [ 2],
        [ 3]])
tensor([[ 1],
        [ 2],
        [ 3]])


## torch.cat

In [11]:
x = torch.randn(3, 3)
y = torch.randn(3, 3)
pair = (x, y)
# dim=0 -> stack vertically (rows are appended): (3, 3) + (3, 3) -> (6, 3)
print(torch.cat(pair, dim=0))
# dim=1 -> stack horizontally (columns are appended): (3, 3) + (3, 3) -> (3, 6)
print(torch.cat(pair, dim=1))

tensor([[ 1.5668,  1.4982, -0.4502],
        [ 1.8740,  0.3094, -1.2693],
        [ 0.0587,  0.8242, -0.1666],
        [ 1.0118,  0.4650, -1.1774],
        [-1.6743,  0.3937, -1.7300],
        [-0.9730, -0.3424, -0.6019]])
tensor([[ 1.5668,  1.4982, -0.4502,  1.0118,  0.4650, -1.1774],
        [ 1.8740,  0.3094, -1.2693, -1.6743,  0.3937, -1.7300],
        [ 0.0587,  0.8242, -0.1666, -0.9730, -0.3424, -0.6019]])


In [12]:
# axis=0 -> stack vertically (rows are appended): (3, 3) + (3, 3) -> (6, 3)
print(np.concatenate([x.numpy(), y.numpy()], axis=0))
# axis=1 -> stack horizontally (columns are appended): (3, 3) + (3, 3) -> (3, 6)
print(np.concatenate([x.numpy(), y.numpy()], axis=1))

[[ 1.5667955   1.4981884  -0.45018178]
 [ 1.8740447   0.30939355 -1.269309  ]
 [ 0.0587256   0.8242405  -0.16662787]
 [ 1.0117911   0.4649585  -1.177354  ]
 [-1.6743116   0.3937247  -1.7299552 ]
 [-0.97302073 -0.34240177 -0.60191107]]
[[ 1.5667955   1.4981884  -0.45018178  1.0117911   0.4649585  -1.177354  ]
 [ 1.8740447   0.30939355 -1.269309   -1.6743116   0.3937247  -1.7299552 ]
 [ 0.0587256   0.8242405  -0.16662787 -0.97302073 -0.34240177 -0.60191107]]


## calculate product of matrix and vector

In [13]:
# x is a (2, 3) matrix; y is a (3, 1) column vector
x = torch.tensor([[1, 2, 3], [3, 2, 1]])
y = torch.tensor([1, 2, 3]).unsqueeze(1)
x, y

(tensor([[ 1,  2,  3],
         [ 3,  2,  1]]), tensor([[ 1],
         [ 2],
         [ 3]]))

In [14]:
# `@` and torch.mm give the same matrix product: (2, 3) @ (3, 1) -> (2, 1)
print(x @ y)
print(torch.mm(x, y))

tensor([[ 14],
        [ 10]])
tensor([[ 14],
        [ 10]])


In [15]:
# torch.dot computes the inner product of two vectors: 1*3 + 2*2 + 3*1 = 10
torch.dot(x[0], x[1])

tensor(10)

# Activation function

## softmax
$f_i(x) = \frac{e^{x_i}}{\sum_j{e^{x_j}}}$

In [16]:
# Float literals produce a float32 tensor directly
x = torch.tensor([[1.0, 2.0, 3.0]])
print(x)
# Always name the axis to normalize over; dim=1 normalizes each row
row_probs = F.softmax(x, dim=1)
print(row_probs)

tensor([[ 1.,  2.,  3.]])
tensor([[ 0.0900,  0.2447,  0.6652]])


In [17]:
# Two rows: each one is normalized independently because dim=1
x = torch.tensor([[1.0, 1.0, 1.0],
                  [1.0, 2.0, 3.0]])
print(x)
row_probs = F.softmax(x, dim=1)
print(row_probs)

tensor([[ 1.,  1.,  1.],
        [ 1.,  2.,  3.]])
tensor([[ 0.3333,  0.3333,  0.3333],
        [ 0.0900,  0.2447,  0.6652]])


In [18]:
# Rank-3 input: with an explicit `dim`, softmax normalizes every last-axis row independently.
# NOTE(review): the earlier caveat here ("F.log_softmax cannot handle input over 3-rank",
# marked deprecated) presumably referred to calls without `dim` on old PyTorch versions — confirm.
x = torch.tensor([[[1, 1, 1], [1, 2, 3]], 
                  [[1, 1, 1], [3, 2, 1]]], dtype=torch.float32)
print(x)
print(F.softmax(x, dim=-1))

tensor([[[ 1.,  1.,  1.],
         [ 1.,  2.,  3.]],

        [[ 1.,  1.,  1.],
         [ 3.,  2.,  1.]]])
tensor([[[ 0.3333,  0.3333,  0.3333],
         [ 0.0900,  0.2447,  0.6652]],

        [[ 0.3333,  0.3333,  0.3333],
         [ 0.6652,  0.2447,  0.0900]]])


In [19]:
# x[:1] keeps the leading axis, so the input is still rank 3 (shape (1, 2, 3))
print(x[:1])
print(F.softmax(x[:1], dim=-1))

tensor([[[ 1.,  1.,  1.],
         [ 1.,  2.,  3.]]])
tensor([[[ 0.3333,  0.3333,  0.3333],
         [ 0.0900,  0.2447,  0.6652]]])


## log_softmax
$f_i(x) = \log{\frac{e^{x_i}}{\sum_j{e^{x_j}}}}$

In [20]:
x = torch.tensor([1.0, 2.0, 3.0])
print(x)
# log_softmax in one call ...
fused = F.log_softmax(x, dim=-1)
print(fused)
# ... prints the same values as taking the log of softmax in two steps
two_step = F.softmax(x, dim=-1).log()
print(two_step)

tensor([ 1.,  2.,  3.])
tensor([-2.4076, -1.4076, -0.4076])
tensor([-2.4076, -1.4076, -0.4076])


## sigmoid
$f(x) = \frac{1}{1 + e^{-x}}$

In [21]:
x = torch.tensor([1, 2, 3], dtype=torch.float32)
print(x)
# torch.sigmoid replaces the deprecated F.sigmoid (same values, no deprecation warning)
print(torch.sigmoid(x))

tensor([ 1.,  2.,  3.])
tensor([ 0.7311,  0.8808,  0.9526])


## tanh
$f(x) = \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$

In [22]:
x = torch.tensor([1, 2, 3], dtype=torch.float32)
print(x)
# torch.tanh replaces the deprecated F.tanh (same values, no deprecation warning)
print(torch.tanh(x))

tensor([ 1.,  2.,  3.])
tensor([ 0.7616,  0.9640,  0.9951])


## ReLU
$\mathrm{ReLU}(x) = \max(0, x)$

In [23]:
# The dtype is explicit: with integer bounds, current PyTorch returns int64 from arange,
# whereas the output recorded below is floating point.
x = torch.arange(-3, 4, dtype=torch.float32)
print(x)
# Negative entries are clamped to 0; non-negative entries pass through unchanged
print(F.relu(x))

tensor([-3., -2., -1.,  0.,  1.,  2.,  3.])
tensor([ 0.,  0.,  0.,  0.,  1.,  2.,  3.])


# Loss function

## MSE loss

In [24]:
# Predictions (x) and targets (y) as float32 vectors
x = torch.tensor([1.0, 2.0, 3.0, 4.0])
y = torch.tensor([1.0, 1.0, 2.0, 2.0])
x, y

(tensor([ 1.,  2.,  3.,  4.]), tensor([ 1.,  1.,  2.,  2.]))

In [25]:
# Default MSELoss averages the squared errors: (0 + 1 + 1 + 4) / 4 = 1.5
loss_func = nn.MSELoss()
loss_func(x, y)

tensor(1.5000)

In [26]:
# reduction='sum' adds up the squared errors instead of averaging them, giving the SSE loss
# (this replaces the deprecated `size_average=False`): 0 + 1 + 1 + 4 = 6
loss_func = nn.MSELoss(reduction='sum')
loss_func(x, y)

tensor(6.)

## smooth_L1_loss
$$
loss(x, y) = 
\begin{cases}
    0.5 * (x - y)^2, & \text{if } |x - y| < 1 \\
    |x - y| - 0.5,   & \text{otherwise}
\end{cases}
$$

It is less sensitive to outliers than the `MSELoss` and in some cases prevents exploding gradients.  
Also known as the Huber loss.

In [27]:
# Same predictions (x) and targets (y) as in the MSE example
x = torch.tensor([1.0, 2.0, 3.0, 4.0])
y = torch.tensor([1.0, 1.0, 2.0, 2.0])
x, y

(tensor([ 1.,  2.,  3.,  4.]), tensor([ 1.,  1.,  2.,  2.]))

In [28]:
# Per-element terms for |x - y| = 0, 1, 1, 2 are 0, 0.5, 0.5, 1.5; their mean is 0.625
loss_func = nn.SmoothL1Loss()
loss_func(x, y)

tensor(0.6250)

## NLLLoss: negative log likelihood  
$$
loss(x, class) = -x[class] \\
\text{where x is a vector of log likelihood of each class}
$$

The negative log likelihood loss. It is useful to train a classification
problem with n classes. 

In [29]:
# x: raw scores for 2 samples x 2 classes; y: the target class index of each sample
x = torch.tensor([[1.0, 2.0],
                  [3.0, 3.0]])
y = torch.tensor([1, 0])
x, y

(tensor([[ 1.,  2.],
         [ 3.,  3.]]), tensor([ 1,  0]))

In [30]:
# Convert the raw scores to per-class log-probabilities along the last axis,
# which is the input NLLLoss expects
x = F.log_softmax(x, dim=-1)
x

tensor([[-1.3133, -0.3133],
        [-0.6931, -0.6931]])

In [31]:
# Mean of -x[i, y[i]] over the samples: (0.3133 + 0.6931) / 2 = 0.5032
loss_func = nn.NLLLoss()
loss_func(x, y)

tensor(0.5032)

## cross entropy loss
$$
\begin{split}
loss(x, class) & = -\log\left(\frac{e^{x[class]}}{\sum_j e^{x[j]}}\right) \\
               & = -x[class] + \log\left(\sum_j e^{x[j]}\right)
\end{split}
$$

In [32]:
# Raw (un-normalized) scores and target class indices, as in the NLLLoss example
x = torch.tensor([[1.0, 2.0],
                  [3.0, 3.0]])
y = torch.tensor([1, 0])
x, y

(tensor([[ 1.,  2.],
         [ 3.,  3.]]), tensor([ 1,  0]))

In [33]:
loss_func = nn.CrossEntropyLoss()
# CrossEntropyLoss takes the raw scores directly: no separate log_softmax step is needed.
# The value equals the log_softmax -> NLLLoss result computed earlier (0.5032).
loss_func(x, y)

tensor(0.5032)

# Optimizer

## SGD
Args:  
params (iterable): iterable of parameters to optimize or dicts defining parameter groups  
lr (float): learning rate  
momentum (float, optional): momentum factor (default: 0)  
weight_decay (float, optional): weight decay (L2 penalty) (default: 0)  
dampening (float, optional): dampening for momentum (default: 0)  
nesterov (bool, optional): enables Nesterov momentum (default: False)  

## Adadelta
Arguments:  
params (iterable): iterable of parameters to optimize or dicts defining parameter groups  
rho (float, optional): coefficient used for computing a running average of squared gradients (default: 0.9)  
eps (float, optional): term added to the denominator to improve numerical stability (default: 1e-6)  
lr (float, optional): coefficient that scales delta before it is applied to the parameters (default: 1.0)  
weight_decay (float, optional): weight decay (L2 penalty) (default: 0)  

## Adam
Arguments:  
params (iterable): iterable of parameters to optimize or dicts defining parameter groups  
lr (float, optional): learning rate (default: 1e-3)  
betas (Tuple[float, float], optional): coefficients used for computing running averages of gradient and its square (default: (0.9, 0.999))  
eps (float, optional): term added to the denominator to improve numerical stability (default: 1e-8)  
weight_decay (float, optional): weight decay (L2 penalty) (default: 0)  

## L-BFGS
Arguments:  
lr (float): learning rate (default: 1)  
max_iter (int): maximal number of iterations per optimization step (default: 20)  
max_eval (int): maximal number of function evaluations per optimization step (default: max_iter * 1.25).  
tolerance_grad (float): termination tolerance on first order optimality (default: 1e-5).  
tolerance_change (float): termination tolerance on function value/parameter changes (default: 1e-9).  
history_size (int): update history size (default: 100).  

In [34]:
# Synthetic regression problem: y is the sum of the N features plus unit Gaussian noise,
# so every true coefficient is 1.
M = 500  # number of samples
N = 5    # number of features

# A dedicated, seeded generator makes the data (and every fit below) reproducible
# without touching NumPy's global random state.
rng = np.random.RandomState(0)
x = rng.randn(M, N).astype(np.float32)
y = x.sum(axis=1) + rng.randn(M).astype(np.float32)

# Ordinary least squares gives the reference coefficients that the
# gradient-based optimizers below are compared against.
ols = linear_model.LinearRegression()
ols.fit(x, y)
ols.coef_

array([0.97211564, 1.0640465 , 1.052424  , 1.0333021 , 1.0172589 ],
      dtype=float32)

In [35]:
# data
# Use the GPU when one is available, otherwise fall back to the CPU so the notebook runs anywhere
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# torch.as_tensor accepts both ndarrays and tensors, so re-running this cell is safe
# (torch.from_numpy would fail on the second run because x is already a tensor)
x = torch.as_tensor(x, device=device)
y = torch.as_tensor(y, device=device)
x.size(), y.size()

(torch.Size([500, 5]), torch.Size([500]))

In [36]:
# test for SGD
# network, placed on whatever device holds the data (no GPU required)
fc = nn.Linear(5, 1).to(x.device)

# loss function: mean squared error (`reduction='mean'` replaces the deprecated `size_average=True`)
loss_func = nn.MSELoss(reduction='mean')
optimizer = optim.SGD(fc.parameters(), lr=0.01)

def closure():
    """Clear gradients, run one full-batch forward/backward pass and return the loss."""
    optimizer.zero_grad()
    y_pred = fc(x)
    # MUST put the predicted variable at the first argument.
    # y is reshaped to a column so its shape matches y_pred.
    loss = loss_func(y_pred, y.view(-1, 1))
    loss.backward()
    return loss

for epoch in range(10000):
    optimizer.step(closure=closure)
print(list(fc.parameters()))

[Parameter containing:
tensor([[ 0.9721,  1.0640,  1.0524,  1.0333,  1.0173]], device='cuda:0'), Parameter containing:
tensor(1.00000e-02 *
       [-4.2673], device='cuda:0')]


In [37]:
# test for L-BFGS
# network, placed on whatever device holds the data (no GPU required)
fc = nn.Linear(5, 1).to(x.device)

# loss function: mean squared error (`reduction='mean'` replaces the deprecated `size_average=True`)
loss_func = nn.MSELoss(reduction='mean')
optimizer = optim.LBFGS(fc.parameters())

def closure():
    """Clear gradients, run one full-batch forward/backward pass and return the loss.

    L-BFGS re-evaluates the objective several times inside a single `step`
    (up to `max_iter` iterations), so it needs a callable rather than a
    precomputed loss.
    """
    optimizer.zero_grad()
    y_pred = fc(x)
    # MUST put the predicted variable at the first argument.
    # y is reshaped to a column so its shape matches y_pred.
    loss = loss_func(y_pred, y.view(-1, 1))
    loss.backward()
    return loss

# Performs a single optimization step
optimizer.step(closure=closure)

list(fc.parameters())

[Parameter containing:
 tensor([[ 0.9721,  1.0640,  1.0524,  1.0333,  1.0173]], device='cuda:0'),
 Parameter containing:
 tensor(1.00000e-02 *
        [-4.2671], device='cuda:0')]

In [38]:
# test for Adam
# network, placed on whatever device holds the data (no GPU required)
fc = nn.Linear(5, 1).to(x.device)

# loss function: mean squared error (`reduction='mean'` replaces the deprecated `size_average=True`)
loss_func = nn.MSELoss(reduction='mean')

optimizer = optim.Adam(fc.parameters())

def closure():
    """Clear gradients, run one full-batch forward/backward pass and return the loss."""
    optimizer.zero_grad()
    y_pred = fc(x)
    # MUST put the predicted variable at the first argument.
    # y is reshaped to a column so its shape matches y_pred.
    loss = loss_func(y_pred, y.view(-1, 1))
    loss.backward()
    return loss

for epoch in range(10000):
    optimizer.step(closure=closure)
print(list(fc.parameters()))

[Parameter containing:
tensor([[ 0.9721,  1.0640,  1.0524,  1.0333,  1.0173]], device='cuda:0'), Parameter containing:
tensor(1.00000e-02 *
       [-4.2673], device='cuda:0')]


# Dropout

In [39]:
# Dropout rescales the surviving values, so it needs a floating-point input.
# With integer bounds, current PyTorch returns int64 from arange; request float32 explicitly.
x = torch.arange(0, 10, dtype=torch.float32)
print(x)

# p=0.5: each element is zeroed with probability 0.5 and survivors are scaled by 1 / (1 - p) = 2
dropout_layer = nn.Dropout(0.5)
dropout_layer(x)

tensor([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9.])


tensor([  0.,   2.,   0.,   0.,   0.,  10.,   0.,  14.,  16.,   0.])

In [40]:
# p=0.2: surviving elements are scaled by 1 / (1 - 0.2) = 1.25
dropout_layer = nn.Dropout(0.2)
dropout_layer(x)

tensor([  0.0000,   1.2500,   2.5000,   3.7500,   5.0000,   6.2500,
          7.5000,   8.7500,  10.0000,  11.2500])

In [41]:
# eval() puts the layer in inference mode (training == False); the input passes through unchanged
dropout_layer.eval()
print(dropout_layer.training)
dropout_layer(x)

False


tensor([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9.])

In [42]:
# train() re-enables dropout: elements are zeroed at random again and survivors are rescaled
dropout_layer.train()
print(dropout_layer.training)
dropout_layer(x)

True


tensor([  0.0000,   1.2500,   0.0000,   3.7500,   5.0000,   6.2500,
          7.5000,   0.0000,  10.0000,   0.0000])