# PyTorch Cheatsheet

In [1]:
import numpy as np
import torch
from torch import nn


# Tensors

#### Constructing tensors

In [2]:
x = torch.empty(2, 3, 4)    # creates an empty tensor with size as in brackets
x

tensor([[[-9.7390e+32,  1.2093e-42,  0.0000e+00,  0.0000e+00],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00]],

        [[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00]]])

In [3]:
x = torch.rand(2, 3, 4) # creates a tensor with random variables
x

tensor([[[0.3747, 0.2258, 0.3562, 0.4638],
         [0.4179, 0.7920, 0.6487, 0.5922],
         [0.3470, 0.7915, 0.4666, 0.0724]],

        [[0.1872, 0.6803, 0.6419, 0.4384],
         [0.5298, 0.2240, 0.6821, 0.8202],
         [0.0125, 0.1911, 0.7220, 0.8062]]])

In [4]:
x = torch.zeros(2, 3, 4) # creates a tensor with 0 values
x

tensor([[[0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.]],

        [[0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.]]])

In [5]:
x = torch.ones(2, 3, 4, dtype=torch.int) # creates a tensor with 1 values
x

tensor([[[1, 1, 1, 1],
         [1, 1, 1, 1],
         [1, 1, 1, 1]],

        [[1, 1, 1, 1],
         [1, 1, 1, 1],
         [1, 1, 1, 1]]], dtype=torch.int32)

In [6]:
x.dtype # shows type of values in tensor

torch.int32

In [7]:
x.size() # shows size of tensor

torch.Size([2, 3, 4])

In [8]:
x = torch.tensor([0.3, 4])  # creating a particular tensor
x

tensor([0.3000, 4.0000])

Slicing is also avaliable with torch.tensor (such as with numpy.array).

In [9]:
x = torch.rand(2, 3, 4)
print(x[0, 0, 0])
x[0, 0, 0].item()   # gets an actual value, could be used only with scalar values 
                    # (not the hole or part of the tensor)

tensor(0.4382)


0.43815428018569946

In [10]:
x = torch.rand(4, 4)
print(x)
y = x.view(16) # reshaping the tensor to the line
y

tensor([[0.1936, 0.4871, 0.7111, 0.8741],
        [0.2779, 0.3834, 0.5349, 0.5656],
        [0.2934, 0.8145, 0.2094, 0.4715],
        [0.3662, 0.5761, 0.7963, 0.7209]])


tensor([0.1936, 0.4871, 0.7111, 0.8741, 0.2779, 0.3834, 0.5349, 0.5656, 0.2934,
        0.8145, 0.2094, 0.4715, 0.3662, 0.5761, 0.7963, 0.7209])

In [11]:
x = torch.rand(4, 4)
print(x)
y = x.view(-1, 8) # reshaping the tensor automatically choosing the fist size
y, y.size()

tensor([[0.2045, 0.3150, 0.8065, 0.5565],
        [0.7225, 0.7268, 0.8573, 0.2516],
        [0.1199, 0.2765, 0.2772, 0.8521],
        [0.4304, 0.2469, 0.6202, 0.9457]])


(tensor([[0.2045, 0.3150, 0.8065, 0.5565, 0.7225, 0.7268, 0.8573, 0.2516],
         [0.1199, 0.2765, 0.2772, 0.8521, 0.4304, 0.2469, 0.6202, 0.9457]]),
 torch.Size([2, 8]))

`requires_grad=True` is needed if we will calculate gradient for our tensor later. So this tensor is a variable we need to optimize.

In [12]:
x = torch.ones(5, requires_grad=True)
x

tensor([1., 1., 1., 1., 1.], requires_grad=True)

#### Converting np.array to tensor

In [13]:
a = torch.ones(5)
b = a.numpy()   # converts tensor to np.array
b, type(b)

(array([1., 1., 1., 1., 1.], dtype=float32), numpy.ndarray)

But they are sharing one memory cell (if tensor is on CPU, NOT GPU). So, if you change one of them, these changes will apply to the other too.

In [14]:
a.add_(1)   # a = a + 1
a, b

(tensor([2., 2., 2., 2., 2.]), array([2., 2., 2., 2., 2.], dtype=float32))

In [15]:
a = np.ones(5)
b = torch.from_numpy(a) # converting np.array to torch.tensor
a, b

(array([1., 1., 1., 1., 1.]),
 tensor([1., 1., 1., 1., 1.], dtype=torch.float64))

In [16]:
a += 1  # the same story with sharing memory
a, b

(array([2., 2., 2., 2., 2.]),
 tensor([2., 2., 2., 2., 2.], dtype=torch.float64))

#### Operations with tensors

In [17]:
x = torch.rand(2, 2)
y = torch.rand(2, 2)
z = x + y   # Elementwise addition
x, y, z

(tensor([[0.4980, 0.8795],
         [0.4574, 0.6478]]),
 tensor([[0.1464, 0.7183],
         [0.8562, 0.6711]]),
 tensor([[0.6443, 1.5978],
         [1.3136, 1.3189]]))

In [18]:
x = torch.rand(2, 2)
y = torch.rand(2, 2)
z = torch.add(x, y) # Elementwise addition
x, y, z

(tensor([[0.8336, 0.1083],
         [0.6702, 0.1303]]),
 tensor([[0.2980, 0.0935],
         [0.9309, 0.4018]]),
 tensor([[1.1315, 0.2018],
         [1.6011, 0.5321]]))

In [19]:
x = torch.rand(2, 2)
y = torch.rand(2, 2)
print(x, y)
y.add_(x)   # Functions with _ are inplace operations
x, y

tensor([[0.8100, 0.1251],
        [0.9283, 0.3404]]) tensor([[0.0856, 0.2132],
        [0.5123, 0.8222]])


(tensor([[0.8100, 0.1251],
         [0.9283, 0.3404]]),
 tensor([[0.8956, 0.3383],
         [1.4406, 1.1626]]))

In [20]:
x = torch.rand(2, 2)
y = torch.rand(2, 2)
z = x - y   # elementwise subtruction
x, y, z

(tensor([[0.8689, 0.4082],
         [0.8632, 0.6450]]),
 tensor([[0.3799, 0.8045],
         [0.5644, 0.4554]]),
 tensor([[ 0.4891, -0.3963],
         [ 0.2988,  0.1896]]))

In [21]:
x = torch.rand(2, 2)
y = torch.rand(2, 2)
z = torch.sub(x, y)   # elementwise subtruction
x, y, z

(tensor([[0.6045, 0.6227],
         [0.2960, 0.7843]]),
 tensor([[0.8419, 0.8693],
         [0.8237, 0.3145]]),
 tensor([[-0.2375, -0.2466],
         [-0.5277,  0.4698]]))

In [22]:
x = torch.rand(2, 2)
y = torch.rand(2, 2)
print(x, y)
y.sub_(x)   # Functions with _ are inplace operations
x, y

tensor([[0.6239, 0.9952],
        [0.9643, 0.4517]]) tensor([[0.6626, 0.1762],
        [0.9179, 0.3216]])


(tensor([[0.6239, 0.9952],
         [0.9643, 0.4517]]),
 tensor([[ 0.0387, -0.8190],
         [-0.0463, -0.1301]]))

In [23]:
x = torch.rand(2, 2)
y = torch.rand(2, 2)
z = x * y   # elementwise multiplication
x, y, z

(tensor([[0.4448, 0.7858],
         [0.1357, 0.2543]]),
 tensor([[0.5523, 0.3400],
         [0.4163, 0.0301]]),
 tensor([[0.2457, 0.2672],
         [0.0565, 0.0076]]))

In [24]:
x = torch.rand(2, 2)
y = torch.rand(2, 2)
z = torch.mul(x, y)   # elementwise multiplication
x, y, z

(tensor([[0.7212, 0.0465],
         [0.3764, 0.4470]]),
 tensor([[0.6167, 0.4125],
         [0.7121, 0.1986]]),
 tensor([[0.4448, 0.0192],
         [0.2680, 0.0888]]))

In [25]:
x = torch.rand(2, 2)
y = torch.rand(2, 2)
print(x, y)
y.mul_(x)   # Functions with _ are inplace operations
x, y

tensor([[0.3253, 0.9098],
        [0.4145, 0.0831]]) tensor([[0.7265, 0.9648],
        [0.7026, 0.4358]])


(tensor([[0.3253, 0.9098],
         [0.4145, 0.0831]]),
 tensor([[0.2363, 0.8777],
         [0.2912, 0.0362]]))

In [26]:
x = torch.rand(2, 2)
y = torch.rand(2, 2)
z = x / y   # elementwise division
x, y, z

(tensor([[0.5987, 0.0482],
         [0.2291, 0.1803]]),
 tensor([[0.7077, 0.8180],
         [0.6514, 0.0656]]),
 tensor([[0.8460, 0.0590],
         [0.3518, 2.7499]]))

In [27]:
x = torch.rand(2, 2)
y = torch.rand(2, 2)
z = torch.div(x, y)   # elementwise division
x, y, z

(tensor([[0.6146, 0.3036],
         [0.2903, 0.7932]]),
 tensor([[0.6035, 0.1601],
         [0.2590, 0.3992]]),
 tensor([[1.0184, 1.8968],
         [1.1208, 1.9871]]))

In [28]:
x = torch.rand(2, 2)
y = torch.rand(2, 2)
print(x, y)
y.div_(x)   # Functions with _ are inplace operations
x, y

tensor([[0.4587, 0.4169],
        [0.2566, 0.4475]]) tensor([[0.2330, 0.5729],
        [0.1319, 0.9636]])


(tensor([[0.4587, 0.4169],
         [0.2566, 0.4475]]),
 tensor([[0.5080, 1.3742],
         [0.5140, 2.1533]]))

#### Meeting CUDA

In [29]:
device = torch.device("cpu")
# cheking CUDA is available
if torch.cuda.is_available():
    device = torch.device("cuda")
device

device(type='cuda')

In [30]:
# putting torch.tensors to CUDA
x = torch.ones(5, device=device)

y = torch.ones(4)
y = y.to(device)

In [31]:
y.numpy()

TypeError: can't convert cuda:0 device type tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.

GPU tensors can NOT convert to np.array. We have to put it to CPU first.

In [32]:
y = y.cpu() # or y.to("cpu")

In [33]:
y.numpy()
y

tensor([1., 1., 1., 1.])

# Autograd package

#### Gradient monitoring

In [71]:
x = torch.randn(3, requires_grad=True)
y = x + 2
y

tensor([2.2309, 2.7799, 1.6150], grad_fn=<AddBackward0>)

`grad_fn=<AddBackward0>` means that pytorch made a derivative function for backpropogation. 

In [35]:
z = y * y * 2
z = z.mean()
z

tensor(8.3874, grad_fn=<MeanBackward0>)

In [36]:
z.backward()    # calculates dz/dx

In [37]:
x.grad  # returns the gradients

tensor([2.2925, 3.1803, 2.6451])

In [38]:
z = y * y * 2
z

tensor([ 5.9126, 11.3787,  7.8710], grad_fn=<MulBackward0>)

In [39]:
z.backward()

RuntimeError: grad can be implicitly created only for scalar outputs

So, if z is a vector, we can't use z.backward() with no arguments, 
because it counts the Jacobian product.

In [40]:
v = torch.tensor([1, 1, 1])
z.backward(v)
x.grad

tensor([ 9.1701, 12.7212, 10.5803])

In [41]:
# There are 3 ways to prevent tracking the gradients.
x.requires_grad_(False)
x.detach()  # Creating a new tensor that doesn't require the gradient
with torch.no_grad():
    ...


Whenever we call the `backword` function, then the gradient for this tensor will be accumulated into the `grad` attribute. So, their values will be summed up.

In [42]:
weights = torch.ones(4, requires_grad=True)

for _epoch in range(3):
    model_output = (weights * 3).sum()

    model_output.backward()
    print(weights.grad)

tensor([3., 3., 3., 3.])
tensor([6., 6., 6., 6.])
tensor([9., 9., 9., 9.])


In [43]:
weights = torch.ones(4, requires_grad=True)

for epoch in range(3):
    model_output = (weights * 3).sum()

    model_output.backward()
    print(weights.grad)

    weights.grad.zero_()    # zeroing gradients !!!

tensor([3., 3., 3., 3.])
tensor([3., 3., 3., 3.])
tensor([3., 3., 3., 3.])


In [45]:
weights = torch.ones(4, requires_grad=True)

optimizer = torch.optim.SGD([weights], lr=0.01)   # defining an optimizer
optimizer.step()    # optimizer is making step
optimizer.zero_grad()   # zeroing gradients !!!

#### Backpropagation

$$
loss = (w \cdot x - y)^2
$$
$$
w = 1, x = 1, y = 2
$$
Let's define functions for the Chain rule illustration:
$$
\hat{y} = x \cdot w = 1 \cdot 1 = 1, s = \hat{y} - y = 1 - 2 = -1, loss = s^2 = (-1)^2 = 1
$$
The Chain rule:
$$
\frac{d(loss)}{d(w)} = \frac{d(loss)}{d(s)} \cdot \frac{d(s)}{d(\hat{y})} \cdot \frac{d(\hat{y})}{d(w)} = (2 \cdot s) \cdot (1) \cdot (x) = (2 \cdot (-1)) \cdot (1) \cdot (1) = 1
$$

In [46]:
x = torch.tensor(1.0)
y = torch.tensor(2.)

w = torch.tensor(1., requires_grad=True)


In [47]:
# forword pass and compute the loss
y_hat = w * x
loss = (y_hat - y)**2

In [48]:
# backword pass
loss.backward()
w.grad

tensor(-2.)

After this are going update weights and next forward and backwards...

# Linear Regression

#### Manual implementation

In [3]:
# f = w * x

# In our example let it be f = 2 * x

X = np.array([1, 2, 3, 4], dtype=np.float32)
Y = np.array([2, 4, 6, 8], dtype=np.float32)

w = 0.

# model prediction
def forward(x):
    return w * x

# loss -- MSE
def loss(y, y_predicted):
    return ((y_predicted - y)**2).mean()

# gradient
# MSE = 1 / N * (w * x - y)**2
# dJ/dw = 1 / N * 2 * x * (w * x - y)
def gradient(x, y, y_predicted):
    return np.dot(2 * x, y_predicted - y).mean()


In [4]:
print(f'Prediction before training: f(5) = {forward(5):.3f}')

Prediction before training: f(5) = 0.000


In [5]:
# Training
learning_rate = 0.01
n_iters = 10

for epoch in range(n_iters):
    # prediction = forward pass
    y_pred = forward(X)

    # loss
    l = loss(Y, y_pred)

    # gradients
    dw = gradient(X, Y, y_pred)

    # update weights
    w -= dw * learning_rate

    if epoch % 1 == 0:
        print(f'epoch {epoch + 1}: w = {w:.3f}, loss = {l:.8f}')

print(f'Prediction after training: f(5) = {forward(5):.3f}')

epoch 1: w = 1.200, loss = 30.00000000
epoch 2: w = 1.680, loss = 4.79999924
epoch 3: w = 1.872, loss = 0.76800019
epoch 4: w = 1.949, loss = 0.12288000
epoch 5: w = 1.980, loss = 0.01966083
epoch 6: w = 1.992, loss = 0.00314570
epoch 7: w = 1.997, loss = 0.00050332
epoch 8: w = 1.999, loss = 0.00008053
epoch 9: w = 1.999, loss = 0.00001288
epoch 10: w = 2.000, loss = 0.00000206
Prediction after training: f(5) = 9.999


#### Manual implementation with automatic grad calculation

In [6]:
# f = w * x

# In our example let it be f = 2 * x

X = torch.tensor([1, 2, 3, 4], dtype=torch.float32)
Y = torch.tensor([2, 4, 6, 8], dtype=torch.float32)

w = torch.tensor(0., dtype=torch.float32, requires_grad=True)

# model prediction
def forward(x):
    return w * x

# loss -- MSE
def loss(y, y_predicted):
    return ((y_predicted - y)**2).mean()

# gradient
# MSE = 1 / N * (w * x - y)**2
# dJ/dw = 1 / N * 2 * x * (w * x - y)
# will be automatically calculated by pytorch


In [7]:
print(f'Prediction before training: f(5) = {forward(5):.3f}')

Prediction before training: f(5) = 0.000


In [8]:
# Training
learning_rate = 0.01
n_iters = 20

for epoch in range(n_iters):
    # prediction = forward pass
    y_pred = forward(X)

    # loss
    l = loss(Y, y_pred)

    # gradients -- backward pass
    l.backward() # dl/dw

    # update weights
    with torch.no_grad():
        w -= learning_rate * w.grad
    
    # zero gradients
    w.grad.zero_()

    if epoch % 3 == 0:
        print(f'epoch {epoch + 1}: w = {w:.3f}, loss = {l:.8f}')

print(f'Prediction after training: f(5) = {forward(5):.3f}')

epoch 1: w = 0.300, loss = 30.00000000
epoch 4: w = 0.956, loss = 11.31448650
epoch 7: w = 1.359, loss = 4.26725292
epoch 10: w = 1.606, loss = 1.60939169
epoch 13: w = 1.758, loss = 0.60698116
epoch 16: w = 1.851, loss = 0.22892261
epoch 19: w = 1.909, loss = 0.08633806
Prediction after training: f(5) = 9.612


#### Manual implementation with automatic grad and loss calculation

1. Design model (input, output size, forward pass).
2. Construct the loss and the optimizer.
3. Training loop
    - forward pass: compute predictions
    - backward pass: gradients
    - update our weights

In [9]:
# f = w * x

# In our example let it be f = 2 * x

X = torch.tensor([1, 2, 3, 4], dtype=torch.float32)
Y = torch.tensor([2, 4, 6, 8], dtype=torch.float32)

w = torch.tensor(0., dtype=torch.float32, requires_grad=True)

# model prediction
def forward(x):
    return w * x

# loss -- MSE
# will be automatically calculated by pytorch

# gradient
# MSE = 1 / N * (w * x - y)**2
# dJ/dw = 1 / N * 2 * x * (w * x - y)
# will be automatically calculated by pytorch


In [10]:
print(f'Prediction before training: f(5) = {forward(5):.3f}')

Prediction before training: f(5) = 0.000


In [11]:
# Training
learning_rate = 0.01
n_iters = 20

loss = nn.MSELoss() # defining loss function
optimizer = torch.optim.SGD([w], lr=learning_rate)  # defining optimization algorithm

for epoch in range(n_iters):
    # prediction = forward pass
    y_pred = forward(X)

    # loss
    l = loss(Y, y_pred)

    # gradients -- backward pass
    l.backward() # dl/dw

    # update weights
    optimizer.step()
    
    # zero gradients
    optimizer.zero_grad()

    if epoch % 3 == 0:
        print(f'epoch {epoch + 1}: w = {w:.3f}, loss = {l:.8f}')

print(f'Prediction after training: f(5) = {forward(5):.3f}')

epoch 1: w = 0.300, loss = 30.00000000
epoch 4: w = 0.956, loss = 11.31448650
epoch 7: w = 1.359, loss = 4.26725292
epoch 10: w = 1.606, loss = 1.60939169
epoch 13: w = 1.758, loss = 0.60698116
epoch 16: w = 1.851, loss = 0.22892261
epoch 19: w = 1.909, loss = 0.08633806
Prediction after training: f(5) = 9.612


#### Automatic traning prosess calculation

In [12]:
# f = w * x

# In our example let it be f = 2 * x

X = torch.tensor([[1], [2], [3], [4]], dtype=torch.float32)
Y = torch.tensor([[2], [4], [6], [8]], dtype=torch.float32)

X_test = torch.tensor([5], dtype=torch.float32)

n_samples, n_features = X.shape
print(f"n_samples={n_samples}, n_features={n_features}")

input_size = n_features
output_size = n_features

model = nn.Linear(input_size, output_size)  # define our model -- linear regression

# loss -- MSE
# will be automatically calculated by pytorch

# gradient
# MSE = 1 / N * (w * x - y)**2
# dJ/dw = 1 / N * 2 * x * (w * x - y)
# will be automatically calculated by pytorch

n_samples=4, n_features=1


In [13]:
print(f"Prediction before training: f(5) = {model(X_test).item():.3f}")

Prediction before training: f(5) = -1.605


In [14]:
# Training
learning_rate = 0.01
n_iters = 20

loss = nn.MSELoss()  # defining loss function
optimizer = torch.optim.SGD(
    model.parameters(), lr=learning_rate
)  # defining optimization algorithm

for epoch in range(n_iters):
    # prediction = forward pass
    y_pred = model(X)

    # loss
    l = loss(Y, y_pred)

    # gradients -- backward pass
    l.backward()  # dl/dw

    # update weights
    optimizer.step()

    # zero gradients
    optimizer.zero_grad()

    if epoch % 3 == 0:
        [w, b] = model.parameters()
        print(f"epoch {epoch + 1}: w = {w[0][0].item():.3f}, loss = {l:.8f}")

print(f"Prediction after training: f(5) = {model(X_test).item():.3f}")

epoch 1: w = 0.102, loss = 42.01517487
epoch 4: w = 0.849, loss = 14.05231094
epoch 7: w = 1.282, loss = 4.71018982
epoch 10: w = 1.532, loss = 1.58889174
epoch 13: w = 1.677, loss = 0.54585385
epoch 16: w = 1.762, loss = 0.19712749
epoch 19: w = 1.811, loss = 0.08036137
Prediction after training: f(5) = 9.454


In [15]:
# Defining NN class
class LinearRegression(nn.Module):

    def __init__(self, input_dim, output_dim):
        super().__init__()
        # define layers
        self.lin = nn.Linear(input_dim, output_dim)
    
    def forward(self, x):
        return self.lin(x)


model = LinearRegression(input_size, output_size)
