In [98]:
import numpy as np
import torch
from tqdm.auto import tqdm

In [8]:
# Build a 2x3 array from nested lists of Python floats and inspect the dtype
# NumPy infers for it.
arr = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
arr.dtype

dtype('float64')

In [36]:
# Same nested-list construction with torch: print the inferred dtype first,
# then let the notebook display the tensor itself.
a = torch.tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
print(a.dtype)
a

torch.float32


tensor([[1., 2., 3.],
        [4., 5., 6.]])

In [10]:
# Random 2x3 tensor (same shape as `a`). No seed is set in the cells shown,
# so the displayed values differ from run to run.
b = torch.randn(2, 3)
b

tensor([[ 0.2684,  0.5165,  0.5102],
        [ 0.0191, -0.6206, -0.7226]])

In [11]:
# Element-wise sum of two tensors of the same shape.
a + b

tensor([[1.2684, 2.5165, 3.5102],
        [4.0191, 4.3794, 5.2774]])

In [12]:
# torch.log / torch.exp are applied element-wise, and `*` multiplies
# element-wise as well (it is not a matrix product).
torch.log(a) * torch.exp(b)

tensor([[0.0000, 1.1618, 1.8299],
        [1.4130, 0.8653, 0.8699]])

In [13]:
# Reduce along dim=1: the entries of each row are summed, one value per row.
a.sum(dim=1)

tensor([ 6., 15.])

In [37]:
# Method form of the element-wise exponential.
a.exp()

tensor([[  2.7183,   7.3891,  20.0855],
        [ 54.5981, 148.4132, 403.4288]])

In [38]:
# Function form of the element-wise exponential.
torch.exp(a)

tensor([[  2.7183,   7.3891,  20.0855],
        [ 54.5981, 148.4132, 403.4288]])

### Module `torch.nn`

In [14]:
from torch import nn

In [15]:
# IPython help syntax: show the docstring of nn.Linear.
nn.Linear?

In [16]:
# Linear layer taking 10 input features and producing 20 output features.
m = nn.Linear(in_features=10, out_features=20)

In [19]:
# For nn.Linear(10, 20) the weight has shape (20, 10): (outputs, inputs).
m.weight.shape

torch.Size([20, 10])

In [21]:
# One bias entry per output feature.
m.bias.shape

torch.Size([20])

In [28]:
# Display the layer's weight Parameter with the values it was created with.
m.weight

Parameter containing:
tensor([[ 0.0396,  0.2348, -0.2423,  0.0317, -0.2611,  0.1648, -0.0094,  0.2765,
         -0.1854,  0.0549],
        [ 0.2165, -0.1234, -0.0480, -0.2518, -0.2536,  0.0185,  0.1113, -0.2797,
          0.2622,  0.2261],
        [-0.2583, -0.2331,  0.0629, -0.1224, -0.1140,  0.2575,  0.1226,  0.0546,
         -0.0982,  0.1036],
        [ 0.1150, -0.0920,  0.3155, -0.1483, -0.2483, -0.1029,  0.1410,  0.0962,
          0.2209,  0.1673],
        [ 0.0965,  0.2582,  0.2978,  0.1600,  0.1667,  0.0345, -0.2289, -0.0554,
          0.0463,  0.1088],
        [-0.2658,  0.0392,  0.1408, -0.2819, -0.1762, -0.1330,  0.2910,  0.2217,
          0.0845,  0.3006],
        [-0.0477, -0.1082,  0.0560, -0.1943, -0.1948,  0.0329, -0.0189, -0.2639,
         -0.2066, -0.0566],
        [ 0.2100, -0.2639, -0.0891, -0.0895, -0.1250, -0.1171, -0.1688,  0.1534,
          0.2964,  0.2540],
        [ 0.1191, -0.2045, -0.1911, -0.2563, -0.2796,  0.1011,  0.2600, -0.1555,
          0.2132, -0.2528

### Backpropagation

In [39]:
# Recreate `a`, this time asking autograd to track operations on it.
# No backward pass has run yet, so the gradient slot is still empty.
a = torch.tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], requires_grad=True)
a.grad

In [40]:
# Scalar loss: the sum of the squared entries of `a`.
l = a.pow(2).sum()
l

tensor(91., grad_fn=<SumBackward0>)

In [41]:
# For comparison: a plain tensor holding the same value, created directly
# rather than computed from a tensor that requires grad.
torch.tensor(91.)

tensor(91.)

$$
\frac{\partial l}{\partial a} = \left(\frac{\partial l}{\partial a_1}, \ldots, \frac{\partial l}{\partial a_n}\right) = 2a
$$

$$
l = \sum_i a_i^2 \quad\Rightarrow\quad \frac{\partial l}{\partial a_i} = 2a_i
$$

In [42]:
# Backpropagate from the scalar loss; the result is stored in a.grad.
l.backward()

In [43]:
# Should equal the analytic gradient 2 * a.
a.grad

tensor([[ 2.,  4.,  6.],
        [ 8., 10., 12.]])

In [72]:
b = torch.randn(3, 3, requires_grad=True)
# b2 is an independent leaf tensor with the same values as b, so two different
# losses can be differentiated separately and their gradients compared.
# detach().clone() copies the values without touching the legacy `.data`
# attribute; requires_grad_() turns the copy into a tracked leaf.
b2 = b.detach().clone().requires_grad_(True)
# Show the copied values (without the requires_grad flag).
b2.detach()

tensor([[ 0.2867,  0.3750,  0.3360],
        [-0.4357, -1.3242, -0.9484],
        [-0.6785,  2.0290, -0.7445]])

In [73]:
# Element-wise |b|: the result has the same 3x3 shape as b, i.e. it is not a scalar.
l1 = b.abs()

In [74]:
# backward() without arguments only works on a scalar output. l1 is a 3x3
# tensor, so this call raises a RuntimeError. The error is the point of the
# demo: catch and print it so the notebook still runs from top to bottom.
try:
    l1.backward()
except RuntimeError as err:
    print(f'RuntimeError: {err}')

RuntimeError: grad can be implicitly created only for scalar outputs

In [75]:
# Reduce to a scalar (product of all |b_ij|) so that backward() can be called
# without arguments.
l1 = b.abs().prod()
l1.backward()

In [76]:
# Gradient of l1 with respect to b, filled in by l1.backward().
print('dl1/db')
b.grad

dl1/db


tensor([[ 0.0707,  0.0540,  0.0603],
        [-0.0465, -0.0153, -0.0214],
        [-0.0299,  0.0100, -0.0272]])

In [77]:
# A different scalar loss, built on b2 (the copy of b) so that b.grad is left untouched.
l2 = b2.exp().sum()
l2.backward()

In [78]:
# b2.grad is the gradient of l2 = b2.exp().sum() with respect to b2, so the
# label names l2 and b2 (it previously read 'dl1/db').
print('dl2/db2')
b2.grad

dl1/db


tensor([[1.3321, 1.4550, 1.3994],
        [0.6468, 0.2660, 0.3874],
        [0.5074, 7.6068, 0.4750]])

In [79]:
# Second backward pass into the same leaf b, without clearing b.grad first:
# the new gradient is added to the one already stored there.
l2 = b.exp().sum()
l2.backward()

In [81]:
# b.grad now holds the sum of both gradients.
print('dl1/db + dl2/db')
b.grad

dl1/db + dl2/db


tensor([[1.4027, 1.5090, 1.4597],
        [0.6003, 0.2507, 0.3660],
        [0.4775, 7.6168, 0.4478]])

In [83]:
# Clear the accumulated gradient first, so that after this backward pass
# b.grad contains only dl2/db.
b.grad = None
l2 = b.exp().sum()
l2.backward()

In [84]:
# Gradient of l2 alone with respect to b (b.grad was reset before this pass).
print('dl2/db')
b.grad

dl2/db


tensor([[1.3321, 1.4550, 1.3994],
        [0.6468, 0.2660, 0.3874],
        [0.5074, 7.6068, 0.4750]])

In [86]:
# Random 1-D tensor with 4 entries, tracked by autograd.
x = torch.randn(4, requires_grad=True)
x

tensor([ 0.3978, -0.1723,  0.9669,  0.3814], requires_grad=True)

In [88]:
# A more involved scalar function of x, to show that autograd handles
# arbitrary compositions of differentiable operations.
# x comes from randn and can contain negative entries. The square root is
# therefore taken of |x| (like the log term, which uses 1 + |x|); otherwise
# the loss and its gradient become NaN.
l = (
    torch.exp(x) * torch.sin(x ** 3)
    + torch.log(1 + torch.abs(x)) / torch.sqrt(torch.abs(x))
).mean()
l.backward()

In [89]:
# Displays x itself (unchanged by backward()).
# NOTE(review): x.grad may have been the intended thing to show here - confirm.
x

tensor([ 0.3978, -0.1723,  0.9669,  0.3814], requires_grad=True)

In [91]:
# Fresh random 1-D tensor with 4 entries, tracked by autograd.
x = torch.randn(4, requires_grad=True)
x

tensor([ 0.5015, -0.7863,  0.1651,  0.7255], requires_grad=True)

In [92]:
# argmax returns the integer index of the largest entry; unlike the losses
# above, the displayed result carries no grad_fn.
l = x.argmax()
l

tensor(3)

## Performance

In [131]:
# Random float32 operands for the benchmark: A is 200x100 and B is 100x300,
# so the product A @ B is 200x300.
A = np.random.randn(200, 100).astype(np.float32)
B = np.random.randn(100, 300).astype(np.float32)

**1. Python loops**

In [132]:
%%time
C = np.zeros((200, 300))

for i in range(A.shape[0]):
    for j in range(B.shape[1]):
        for k in range(A.shape[1]):
            C[i, j] += A[i, k] * B[k, j]

CPU times: user 3.26 s, sys: 13.4 ms, total: 3.28 s
Wall time: 3.28 s


**2. Numpy**

In [133]:
%%time
# The same product through NumPy's matrix-multiplication operator.
C1 = A @ B

CPU times: user 2.1 ms, sys: 2.86 ms, total: 4.97 ms
Wall time: 2.59 ms


In [134]:
# Largest absolute difference between the loop result and the NumPy result.
np.abs(C - C1).max()

1.7014681361615658e-05

**3. Torch**

In [135]:
# Turn the NumPy operands into torch tensors.
At = torch.from_numpy(A)
Bt = torch.from_numpy(B)

In [136]:
%%time
# The same product with torch tensors.
C2 = At @ Bt

CPU times: user 2.22 ms, sys: 2.93 ms, total: 5.14 ms
Wall time: 7.89 ms


In [137]:
# Largest absolute difference between the loop result and the torch result.
np.abs(C - C2.numpy()).max()

1.7014681361615658e-05

**4. Torch+MPS (macOS on Apple silicon only: M1/M2/M3)**

In [138]:
# Use the Metal (MPS) backend when this machine supports it and fall back to
# the CPU otherwise, so the following cells also run on other platforms.
device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')
device

device(type='mps')

In [139]:
# Move both operands to the selected device; At and Bt are rebound to the
# moved tensors.
At = At.to(device)
Bt = Bt.to(device)

In [140]:
%%time
C3 = At @ Bt

CPU times: user 1.69 ms, sys: 39.2 ms, total: 40.9 ms
Wall time: 85.2 ms


In [142]:
# Copy the result back to the CPU before converting it to NumPy, then compare
# it with the loop result.
np.abs(C - C3.cpu().numpy()).max()

1.7014681361615658e-05

**5. Torch+CUDA (GPU only, e.g. in Colab)**

In [None]:
# Shell command: print the status reported by the NVIDIA driver tool.
!nvidia-smi

In [None]:
# Select the first CUDA device when CUDA is available, otherwise the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

Not recommended (hard-codes CUDA, so it fails on a machine without a CUDA GPU):

In [None]:
# .cuda() hard-codes the CUDA backend and ignores the `device` chosen earlier.
# Note: A and B are rebound here - they no longer refer to the NumPy operands
# of the earlier benchmark cells but to new random torch tensors.
A = torch.randn(200, 100).cuda()
B = torch.randn(100, 300).cuda()
A, B

Better (device-agnostic; uses the `device` selected above):

In [None]:
# Device-agnostic variant: create the tensors, then move them to `device`.
A = torch.randn(200, 100).to(device)
B = torch.randn(100, 300).to(device)
A, B

In [None]:
# Pass device= to the factory function instead of moving the tensors in a
# separate .to(device) step.
A = torch.randn(200, 100, device=device)
B = torch.randn(100, 300, device=device)
A, B

In [None]:
%%time
C4 = A @ B

In [None]:
# A and B were re-created as new random torch tensors in this section, so the
# loop result C (computed from the earlier NumPy operands) is not a valid
# reference for C4. Compare against a NumPy product of the current A and B.
np.abs(A.cpu().numpy() @ B.cpu().numpy() - C4.cpu().numpy()).max()