In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# CUDA Test

In [2]:
# CUDA test
torch.cuda.is_available()

False

In [3]:
x = torch.Tensor([1.0])
xx = x.cuda()
xx

AssertionError: Torch not compiled with CUDA enabled

In [4]:
# CUDNN test
from torch.backends import cudnn
cudnn.is_acceptable(xx)

NameError: name 'xx' is not defined

# Tensor Manipulation
## Tensor: Size & Dimension

For a N-dimensional Tensor x, its dimensions are numbered (0, 1, 2, ..., N-1). 
* The 0 dimension is the most outer dimension.
* The N-1 dimension (or -1 dimension) is the most inner dimension.

In [5]:
x = torch.arange(0, 24).view(2, 4, 3)
x

tensor([[[ 0,  1,  2],
         [ 3,  4,  5],
         [ 6,  7,  8],
         [ 9, 10, 11]],

        [[12, 13, 14],
         [15, 16, 17],
         [18, 19, 20],
         [21, 22, 23]]])

In [6]:
print(x.size())
print(x.shape)
print(x.dim())

torch.Size([2, 4, 3])
torch.Size([2, 4, 3])
3


In [7]:
print(x.size(0))
print(x.size(1))
print(x.size(2))

2
4
3


## Tensor: Dtypes

In [8]:
x = torch.tensor([1, 2, 3])
print(x)
print(x.dtype)

tensor([1, 2, 3])
torch.int64


In [9]:
x = torch.tensor([1.2, 2, 3])
print(x)
print(x.dtype)

tensor([1.2000, 2.0000, 3.0000])
torch.float32


## Tensor: Permute & Transpose

In [10]:
x = torch.arange(0, 24).view(2, 4, 3)
x

tensor([[[ 0,  1,  2],
         [ 3,  4,  5],
         [ 6,  7,  8],
         [ 9, 10, 11]],

        [[12, 13, 14],
         [15, 16, 17],
         [18, 19, 20],
         [21, 22, 23]]])

In [11]:
# (2, 0, 1) are the ORIGINAL dimensions
# The original dimension 2 changes to the dimension 0 now. (The first group elements are 0, 1, 2)
# The original dimension 0 changes to the dimension 1 now. (The first group elements are 0, 12)
# The original dimension 1 changes to the dimension 2 now. (The first group elements are 0, 3, 6, 9)
x.permute(2, 0, 1)

tensor([[[ 0,  3,  6,  9],
         [12, 15, 18, 21]],

        [[ 1,  4,  7, 10],
         [13, 16, 19, 22]],

        [[ 2,  5,  8, 11],
         [14, 17, 20, 23]]])

In [12]:
# Equivalent to x.permute(n-1, n-2, ..., 0), if x is n-dimensional. 
x.T

tensor([[[ 0, 12],
         [ 3, 15],
         [ 6, 18],
         [ 9, 21]],

        [[ 1, 13],
         [ 4, 16],
         [ 7, 19],
         [10, 22]],

        [[ 2, 14],
         [ 5, 17],
         [ 8, 20],
         [11, 23]]])

In [13]:
# torch.t() only applies to 2D Tensor. 
x[0].t()

tensor([[ 0,  3,  6,  9],
        [ 1,  4,  7, 10],
        [ 2,  5,  8, 11]])

## Tensor: Squeeze & Unsqueeze

In [14]:
x = torch.randn(1, 2, 2, 1, 2)
x

tensor([[[[[ 0.7703,  1.6178]],

          [[ 1.0111, -1.2006]]],


         [[[ 0.2087,  1.7830]],

          [[ 2.0622,  0.2360]]]]])

In [15]:
# Squeeze: Remove dimensions. 
x.squeeze()

tensor([[[ 0.7703,  1.6178],
         [ 1.0111, -1.2006]],

        [[ 0.2087,  1.7830],
         [ 2.0622,  0.2360]]])

In [16]:
x.squeeze(0)

tensor([[[[ 0.7703,  1.6178]],

         [[ 1.0111, -1.2006]]],


        [[[ 0.2087,  1.7830]],

         [[ 2.0622,  0.2360]]]])

In [17]:
# Unsqueeze: Insert dimensions. 
x = torch.tensor([1, 2, 3])
x.unsqueeze(0)

tensor([[1, 2, 3]])

In [18]:
x.unsqueeze(1)

tensor([[1],
        [2],
        [3]])

In [19]:
x.unsqueeze(-1)

tensor([[1],
        [2],
        [3]])

## Tensor: Concatenate

In [20]:
x = torch.randn(2, 3)
y = torch.randn(2, 3)
# dim=0 means concatenating along the most outer dimension. (This dimension changes size.)
# In this 2D case, it is equivalent to concatenating along rows. 
torch.cat([x, y], dim=0)

tensor([[ 0.9538,  0.3839, -0.2705],
        [ 0.4907, -0.0826, -1.0331],
        [-0.1848, -1.0522,  2.2946],
        [-1.3760,  0.5522, -0.2061]])

In [21]:
# dim=1 means concatenating along the dimension 1. (This dimension changes size.)
# In this 2D case, it is equivalent to concatenating along columns. 
# In this 2D case, it is equivalent to dim=-1. 
torch.cat([x, y], dim=1)

tensor([[ 0.9538,  0.3839, -0.2705, -0.1848, -1.0522,  2.2946],
        [ 0.4907, -0.0826, -1.0331, -1.3760,  0.5522, -0.2061]])

In [22]:
# axis=0 means concatenating along the most outer dimension.
np.concatenate([x.numpy(), y.numpy()], axis=0)

array([[ 0.9537738 ,  0.38392898, -0.27051544],
       [ 0.49068993, -0.08256959, -1.033057  ],
       [-0.18484667, -1.0521578 ,  2.294553  ],
       [-1.3759791 ,  0.5522005 , -0.20614688]], dtype=float32)

In [23]:
# axis=1 means concatenating along the dimension 1.
print(np.concatenate([x.numpy(), y.numpy()], axis=1))

[[ 0.9537738   0.38392898 -0.27051544 -0.18484667 -1.0521578   2.294553  ]
 [ 0.49068993 -0.08256959 -1.033057   -1.3759791   0.5522005  -0.20614688]]


## Tensor: Repeat & Tile

In [24]:
x = torch.arange(6).view(2, 3)
x

tensor([[0, 1, 2],
        [3, 4, 5]])

In [25]:
# torch.repeat behaves similar to numpy.tile, instead of numpy.repeat
# Repeat along the dimension 0 for 2 times. 
# Repeat along the dimension 1 for 3 times. (Overlap the original dimension 0.)
# Repeat along the dimension 2 for 4 times. (Overlap the original dimension 1.)
x.repeat(2, 3, 4)

tensor([[[0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2],
         [3, 4, 5, 3, 4, 5, 3, 4, 5, 3, 4, 5],
         [0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2],
         [3, 4, 5, 3, 4, 5, 3, 4, 5, 3, 4, 5],
         [0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2],
         [3, 4, 5, 3, 4, 5, 3, 4, 5, 3, 4, 5]],

        [[0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2],
         [3, 4, 5, 3, 4, 5, 3, 4, 5, 3, 4, 5],
         [0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2],
         [3, 4, 5, 3, 4, 5, 3, 4, 5, 3, 4, 5],
         [0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2],
         [3, 4, 5, 3, 4, 5, 3, 4, 5, 3, 4, 5]]])

# Tensor Calculation

In [26]:
x = torch.arange(3, dtype=torch.float)
y = torch.arange(3, dtype=torch.float)
print(x)
print(y)

# torch.dot: Vector-Vector inner production
print(x.dot(y))
# torch.matmal: Vector-Vector -> Scalar
print(x.matmul(y))

tensor([0., 1., 2.])
tensor([0., 1., 2.])
tensor(5.)
tensor(5.)


In [27]:
x = torch.arange(6, dtype=torch.float).view(2, 3)
y = torch.arange(6, dtype=torch.float).view(3, 2)
print(x)
print(y)

# torch.mm: Matrix-Matrix production
print(x.mm(y))
# torch.matmal: Matrix-Matrix -> Matrix
print(x.matmul(y))

tensor([[0., 1., 2.],
        [3., 4., 5.]])
tensor([[0., 1.],
        [2., 3.],
        [4., 5.]])
tensor([[10., 13.],
        [28., 40.]])
tensor([[10., 13.],
        [28., 40.]])


In [28]:
x = torch.arange(12, dtype=torch.float).view(2, 2, 3)
y = torch.arange(12, dtype=torch.float).view(2, 3, 2)
print(x)
print(y)

# torch.bmm: Batched Matrix-Matrix production
print(x.bmm(y))
# torch.matmal: Batched Matrix-Batched Matrix -> Batched Matrix
print(x.matmul(y))

tensor([[[ 0.,  1.,  2.],
         [ 3.,  4.,  5.]],

        [[ 6.,  7.,  8.],
         [ 9., 10., 11.]]])
tensor([[[ 0.,  1.],
         [ 2.,  3.],
         [ 4.,  5.]],

        [[ 6.,  7.],
         [ 8.,  9.],
         [10., 11.]]])
tensor([[[ 10.,  13.],
         [ 28.,  40.]],

        [[172., 193.],
         [244., 274.]]])
tensor([[[ 10.,  13.],
         [ 28.,  40.]],

        [[172., 193.],
         [244., 274.]]])


In [29]:
x = torch.arange(12, dtype=torch.float).view(2, 2, 3)
y = torch.arange(6, dtype=torch.float).view(1, 3, 2)
print(x)
print(y)

# torch.matmal: The non-matrix (i.e. batch) dimensions are broadcasted. 
print(x.matmul(y))

tensor([[[ 0.,  1.,  2.],
         [ 3.,  4.,  5.]],

        [[ 6.,  7.,  8.],
         [ 9., 10., 11.]]])
tensor([[[0., 1.],
         [2., 3.],
         [4., 5.]]])
tensor([[[10., 13.],
         [28., 40.]],

        [[46., 67.],
         [64., 94.]]])


In [30]:
x = torch.arange(6, dtype=torch.float).view(2, 3)
y = torch.arange(3, dtype=torch.float)
print(x)
print(y)

# torch.matmal: Matrix-Vector -> Vector
x.matmul(y)

tensor([[0., 1., 2.],
        [3., 4., 5.]])
tensor([0., 1., 2.])


tensor([ 5., 14.])

# Activation Functions
## Softmax

$$
f_i(x) = \frac{e^{x_i}}{\sum_j{e^{x_j}}}
$$

In [31]:
x = torch.tensor([[1, 1, 1], 
                  [1, 2, 3]], dtype=torch.float)

# Specify the most outer dimension
print(F.softmax(x, dim=0))
# Specify the most inner dimension
print(F.softmax(x, dim=-1))

tensor([[0.5000, 0.2689, 0.1192],
        [0.5000, 0.7311, 0.8808]])
tensor([[0.3333, 0.3333, 0.3333],
        [0.0900, 0.2447, 0.6652]])


In [32]:
x = torch.tensor([[[1, 1, 1], 
                   [1, 2, 3]], 
                  [[1, 1, 1], 
                   [3, 2, 1]]], dtype=torch.float)

# Specify the most inner dimension
F.softmax(x, dim=-1)

tensor([[[0.3333, 0.3333, 0.3333],
         [0.0900, 0.2447, 0.6652]],

        [[0.3333, 0.3333, 0.3333],
         [0.6652, 0.2447, 0.0900]]])

In [33]:
# F.log_softmax cannot handle input over 3-rank, leading to wrong result (Depreciated)
x = torch.tensor([[[1, 1, 1], [1, 2, 3]], 
                  [[1, 1, 1], [3, 2, 1]]], dtype=torch.float32)
print(x)
print(F.softmax(x, dim=-1))

tensor([[[1., 1., 1.],
         [1., 2., 3.]],

        [[1., 1., 1.],
         [3., 2., 1.]]])
tensor([[[0.3333, 0.3333, 0.3333],
         [0.0900, 0.2447, 0.6652]],

        [[0.3333, 0.3333, 0.3333],
         [0.6652, 0.2447, 0.0900]]])


## Log-Softmax
$$ 
f_i(x) = \log \left( \frac{e^{x_i}}{\sum_j{e^{x_j}}} \right)
$$

In [34]:
x = torch.tensor([[1, 1, 1], 
                  [1, 2, 3]], dtype=torch.float)

# Specify the most outer dimension
print(F.log_softmax(x, dim=0))
# Specify the most inner dimension
print(F.log_softmax(x, dim=-1))

# Check...
print(F.softmax(x, dim=-1).log())

tensor([[-0.6931, -1.3133, -2.1269],
        [-0.6931, -0.3133, -0.1269]])
tensor([[-1.0986, -1.0986, -1.0986],
        [-2.4076, -1.4076, -0.4076]])
tensor([[-1.0986, -1.0986, -1.0986],
        [-2.4076, -1.4076, -0.4076]])


## Sigmoid
$$
f(x) = \frac{1}{1 + e^{-x}}
$$

In [35]:
x = torch.tensor([1, 2, 3], dtype=torch.float)

F.sigmoid(x)

tensor([0.7311, 0.8808, 0.9526])

## Tanh
$$
f(x) = \frac{e^x - e^{-x}}{e^x + e^{-x}}
$$

In [36]:
x = torch.tensor([1, 2, 3], dtype=torch.float)

F.tanh(x)

tensor([0.7616, 0.9640, 0.9951])

## ReLU
$$
\mathrm{ReLU}(x) = \max (0, x)  
$$

In [37]:
x = torch.arange(-3, 4, dtype=torch.float)

print(x)
print(F.relu(x))

tensor([-3., -2., -1.,  0.,  1.,  2.,  3.])
tensor([0., 0., 0., 0., 1., 2., 3.])


# Loss Functions
## MSE Loss

In [38]:
x = torch.tensor([1, 2, 3, 4], dtype=torch.float32)
y = torch.tensor([1, 1, 2, 2], dtype=torch.float32)

loss_func = nn.MSELoss()
print(loss_func(x, y))
print(((x - y)**2).mean())

tensor(1.5000)
tensor(1.5000)


In [39]:
# SSE Loss
loss_func = nn.MSELoss(reduction='sum')
print(loss_func(x, y))
print(((x - y)**2).sum())

tensor(6.)
tensor(6.)


## Smooth L1 Loss
$$
\mathrm{SmoothL1Loss}(x, y) = 
\begin{cases}
    0.5 * (x - y)^2, & \text{if } |x - y| < 1 \\
    |x - y| - 0.5,   & \text{otherwise}
\end{cases}
$$

It is less sensitive to outliers than the `MSELoss`. Also known as the Huber loss. 

In [40]:
x = torch.tensor([1, 2, 3, 4], dtype=torch.float)
y = torch.tensor([1, 1, 2, 2], dtype=torch.float)

loss_func = nn.SmoothL1Loss()
loss_func(x, y)

tensor(0.6250)

## NLLLoss (Negative Log-Likelihood Loss) 
$$
\mathrm{NLLLoss}(x, class) = -x[class] \\
$$
where x is a vector of log-likelihood of each class. 

The negative log likelihood loss. It is useful to train a classification task with n classes. 

In [41]:
x = torch.tensor([[1, 1, 1], 
                  [1, 2, 3]], dtype=torch.float)
y = torch.tensor([1, 0])

# Use log_softmax to calculate log-likelihood 
x_ll = F.log_softmax(x, dim=-1)
x_ll

tensor([[-1.0986, -1.0986, -1.0986],
        [-2.4076, -1.4076, -0.4076]])

In [42]:
loss_func = nn.NLLLoss(reduction='sum')
loss_func(x_ll, y)

tensor(3.5062)

## Cross Entropy Loss
$$
\begin{aligned}
\mathrm{CrossEntropyLoss}(x, class) &= -\mathrm{LogSoftmax}(x) [class] \\
                                    &= -\log \left( \frac{e^{x[class]}}{\sum_j e^{x[j]}} \right) \\
                                    &= -x[class] + \log \left( \sum_j e^{x[j]} \right)
\end{aligned}
$$
where $\mathrm{Softmax}(x)$ is a vector of likelihood of each class. 

In [43]:
x = torch.tensor([[1, 1, 1], 
                  [1, 2, 3]], dtype=torch.float)
y = torch.tensor([1, 0])

loss_func = nn.CrossEntropyLoss(reduction='sum')
# NO need to calculate log_likelihood 
# Get the same result with log_softmax -> NLLLoss
loss_func(x, y)

tensor(3.5062)

# Optimizers
## SGD
Args:  
params (iterable): iterable of parameters to optimize or dicts defining parameter groups  
lr (float): learning rate  
momentum (float, optional): momentum factor (default: 0)  
weight_decay (float, optional): weight decay (L2 penalty) (default: 0)  
dampening (float, optional): dampening for momentum (default: 0)  
nesterov (bool, optional): enables Nesterov momentum (default: False)  

## Adadelta
Arguments:  
params (iterable): iterable of parameters to optimize or dicts defining parameter groups  
rho (float, optional): coefficient used for computing a running average of squared gradients (default: 0.9)  
eps (float, optional): term added to the denominator to improve numerical stability (default: 1e-6)  
lr (float, optional): coefficient that scale delta before it is applied to the parameters (default: 1.0)  
weight_decay (float, optional): weight decay (L2 penalty) (default: 0)  

## Adam
Arguments:
params (iterable): iterable of parameters to optimize or dicts defining parameter groups  
lr (float, optional): learning rate (default: 1e-3)  
betas (Tuple [float, float], optional): coefficients used for computing running averages of gradient and its square (default: (0.9, 0.999))  
eps (float, optional): term added to the denominator to improve numerical stability (default: 1e-8)  
weight_decay (float, optional): weight decay (L2 penalty) (default: 0)  

## L-BFGS
Arguments:
lr (float): learning rate (default: 1)  
max_iter (int): maximal number of iterations per optimization step (default: 20)  
max_eval (int): maximal number of function evaluations per optimization step (default: max_iter * 1.25).  
tolerance_grad (float): termination tolerance on first order optimality (default: 1e-5).  
tolerance_change (float): termination tolerance on function value/parameter changes (default: 1e-9).  
history_size (int): update history size (default: 100).  

In [44]:
from sklearn.linear_model import LinearRegression
M = 500
N = 5

np.random.seed(515)
x = np.random.randn(M, N).astype(np.float32)
y = x.sum(axis=1) + np.random.randn(M).astype(np.float32)

# Estimate the model with standard OLS. 
ols = LinearRegression()
ols.fit(x, y)
ols.coef_

array([0.9825298, 0.9661818, 0.9434212, 0.9811514, 1.0246055],
      dtype=float32)

In [45]:
# Map data to Tensors
x = torch.from_numpy(x)
y = torch.from_numpy(y)

In [46]:
# Test for SGD
# Network
fc = nn.Linear(5, 1)

# Loss function
loss_func = nn.MSELoss(reduction='mean')
optimizer = optim.SGD(fc.parameters(), lr=0.01)

def closure():
    optimizer.zero_grad()
    y_pred = fc(x)
    # MUST put the predicted varibale at the first argument
    loss = loss_func(y_pred, y.view(-1, 1))
    loss.backward()
    return loss

for epoch in range(10000):
    optimizer.step(closure=closure)

fc.weight

Parameter containing:
tensor([[0.9825, 0.9662, 0.9434, 0.9811, 1.0246]], requires_grad=True)

In [47]:
# Test for L-BFGS
# Network
fc = nn.Linear(5, 1)

# Loss function
loss_func = nn.MSELoss(reduction='mean')
optimizer = optim.LBFGS(fc.parameters())

def closure():
    optimizer.zero_grad()
    y_pred = fc(x)
    # MUST put the predicted varibale at the first argument
    loss = loss_func(y_pred, y.view(-1, 1))
    loss.backward()
    return loss

# Performs a single optimization step
optimizer.step(closure=closure)

fc.weight

Parameter containing:
tensor([[0.9825, 0.9662, 0.9434, 0.9812, 1.0246]], requires_grad=True)

In [48]:
# Test for Adam
# Network
fc = nn.Linear(5, 1)

# Loss function
loss_func = nn.MSELoss(reduction='mean')
optimizer = optim.Adam(fc.parameters())

def closure():
    optimizer.zero_grad()
    y_pred = fc(x)
    # MUST put the predicted varibale at the first argument
    loss = loss_func(y_pred, y.view(-1, 1))
    loss.backward()
    return loss

for epoch in range(10000):
    optimizer.step(closure=closure)

fc.weight

Parameter containing:
tensor([[0.9825, 0.9662, 0.9434, 0.9812, 1.0246]], requires_grad=True)

# Dropout

In [49]:
torch.manual_seed(0)

x = torch.arange(0, 10, dtype=torch.float)
print(x)

dropout_layer = nn.Dropout(p=0.5)
print(dropout_layer(x))

tensor([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.])
tensor([ 0.,  0.,  0.,  6.,  8.,  0.,  0., 14., 16., 18.])


In [50]:
dropout_layer = nn.Dropout(p=0.2)
print(dropout_layer(x))

tensor([ 0.0000,  1.2500,  2.5000,  0.0000,  5.0000,  6.2500,  7.5000,  0.0000,
         0.0000, 11.2500])


In [51]:
dropout_layer.eval()
print(dropout_layer.training)
print(dropout_layer(x))

False
tensor([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.])


In [52]:
dropout_layer.train()
print(dropout_layer.training)
print(dropout_layer(x))

True
tensor([ 0.0000,  1.2500,  2.5000,  3.7500,  5.0000,  6.2500,  0.0000,  8.7500,
         0.0000, 11.2500])
