In [2]:
import numpy as np
import torch

In [14]:
a = np.array([
    [3., 2., 3.],
    [4., 5., 5.]
])
a

array([[3., 2., 3.],
       [4., 5., 5.]])

In [15]:
a.dtype

dtype('float64')

In [16]:
a = np.arange(6).reshape(2, 3)
a

array([[0, 1, 2],
       [3, 4, 5]])

In [17]:
a.dtype

dtype('int64')

In [18]:
a.sum(axis=1)

array([ 3, 12])

In [21]:
a = torch.tensor([
    [3., 2., 3.],
    [4., 5., 5.]
])
a

tensor([[3., 2., 3.],
        [4., 5., 5.]])

In [22]:
a.shape

torch.Size([2, 3])

In [23]:
a.dtype

torch.float32

In [24]:
b = torch.randn(2, 3)

In [25]:
a + b

tensor([[2.1621, 1.9706, 3.2059],
        [2.2287, 3.9863, 7.0595]])

In [26]:
torch.exp(a) * torch.sin(b / (1 + a) ** 2)

tensor([[-1.0514, -0.0242,  0.2585],
        [-3.8652, -4.1785,  8.4860]])

In [29]:
a.sum(dim=1)

tensor([ 8., 14.])

### Module `torch.nn`

In [30]:
from torch import nn

In [31]:
m = nn.Linear(16, 32)

In [33]:
m.weight.shape

torch.Size([32, 16])

In [34]:
m.bias.shape

torch.Size([32])

In [37]:
m.weight

Parameter containing:
tensor([[-0.2371,  0.1884,  0.1582, -0.1038,  0.2110,  0.0080,  0.1970, -0.0121,
         -0.0140,  0.1950, -0.0003,  0.1218,  0.1931, -0.0900,  0.1377, -0.1761],
        [ 0.0350, -0.1317, -0.0299,  0.2123, -0.0136,  0.0377,  0.0149,  0.2272,
          0.0980,  0.0094,  0.1085, -0.1585, -0.0748,  0.2235, -0.0646,  0.0566],
        [-0.0562,  0.1622, -0.2323,  0.1485,  0.1737, -0.2412, -0.0140, -0.0034,
          0.1272, -0.0396, -0.2141,  0.2277, -0.1338, -0.2223,  0.2253, -0.1279],
        [-0.1788, -0.1062,  0.0319, -0.2034,  0.1251,  0.0298, -0.1290, -0.0395,
          0.1543,  0.0006,  0.1883, -0.0903,  0.1270,  0.1565, -0.1290,  0.0008],
        [ 0.0876,  0.2337, -0.0402,  0.1875,  0.0762, -0.1558,  0.1129, -0.0317,
         -0.1452, -0.1059, -0.1464, -0.0203,  0.1235,  0.2140, -0.1951,  0.1594],
        [ 0.0288,  0.0039, -0.1284,  0.2404,  0.1954,  0.0512, -0.1711,  0.1141,
          0.1629, -0.1116, -0.2275, -0.0529, -0.1554, -0.0803, -0.2002, -0.1263],


### Backpropagation

In [38]:
x = torch.randn(10, 16)

$$
y = xA^T + b
$$

$$
l = \sum_{i, j} y_{ij}
$$

$$
\frac{dl}{dy_{ij}} = 1
$$

$$
\frac{dl}{dA_{jk}} = \sum_{i} \frac{dl}{dy_{ij}} \frac{dy_{ij}}{dA_{jk}} = \sum_{i} x_{ik}
$$

In [40]:
y = m(x)

In [41]:
y.shape

torch.Size([10, 32])

In [42]:
l = y.sum()

In [43]:
l

tensor(22.8026, grad_fn=<SumBackward0>)

In [44]:
torch.tensor(22.8026)

tensor(22.8026)

In [45]:
l.backward()

In [49]:
x.sum(dim=0)

tensor([ 3.3711,  2.1774, -1.4181, -2.0468,  0.7947, -1.1670,  1.4792,  6.0249,
         6.1702, -1.1099, -1.3857,  5.4470,  1.9963,  2.6993, -4.4533,  5.0641])

In [47]:
m.weight.grad

tensor([[ 3.3711,  2.1774, -1.4181, -2.0468,  0.7947, -1.1670,  1.4792,  6.0249,
          6.1702, -1.1099, -1.3857,  5.4470,  1.9963,  2.6993, -4.4533,  5.0641],
        [ 3.3711,  2.1774, -1.4181, -2.0468,  0.7947, -1.1670,  1.4792,  6.0249,
          6.1702, -1.1099, -1.3857,  5.4470,  1.9963,  2.6993, -4.4533,  5.0641],
        [ 3.3711,  2.1774, -1.4181, -2.0468,  0.7947, -1.1670,  1.4792,  6.0249,
          6.1702, -1.1099, -1.3857,  5.4470,  1.9963,  2.6993, -4.4533,  5.0641],
        [ 3.3711,  2.1774, -1.4181, -2.0468,  0.7947, -1.1670,  1.4792,  6.0249,
          6.1702, -1.1099, -1.3857,  5.4470,  1.9963,  2.6993, -4.4533,  5.0641],
        [ 3.3711,  2.1774, -1.4181, -2.0468,  0.7947, -1.1670,  1.4792,  6.0249,
          6.1702, -1.1099, -1.3857,  5.4470,  1.9963,  2.6993, -4.4533,  5.0641],
        [ 3.3711,  2.1774, -1.4181, -2.0468,  0.7947, -1.1670,  1.4792,  6.0249,
          6.1702, -1.1099, -1.3857,  5.4470,  1.9963,  2.6993, -4.4533,  5.0641],
        [ 3.3711,  2.1

In [52]:
x = torch.randn(20, 16)
y = torch.randn(20)
m = nn.Linear(16, 1)

In [54]:
pred = m(x)
pred.shape

torch.Size([20, 1])

In [57]:
l = (y - pred[:, 0]).square().mean()

In [59]:
l.backward()

In [60]:
m.weight.grad

tensor([[ 0.0926,  0.1217,  0.0116, -0.2551,  0.0975,  0.4251,  0.2561, -0.9257,
          0.1846, -0.8890,  1.3015, -0.4800,  0.4307,  0.2382, -0.6516, -0.6163]])

In [62]:
pred.backward()

RuntimeError: grad can be implicitly created only for scalar outputs

In [64]:
amax = torch.argmax(pred[:, 0])

In [65]:
amax

tensor(14)

In [66]:
amax.backward()

RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

In [67]:
torch.abs(pred)

tensor([[0.5166],
        [0.0870],
        [0.8847],
        [0.4556],
        [0.6473],
        [0.1229],
        [0.4733],
        [0.1784],
        [0.0688],
        [0.4110],
        [0.3067],
        [0.0992],
        [0.7249],
        [0.2329],
        [0.9163],
        [0.2615],
        [0.3651],
        [0.3841],
        [0.1237],
        [0.2316]], grad_fn=<AbsBackward0>)

In [74]:
x = torch.randn(20, 16)
y = torch.randn(20)
m = nn.Linear(16, 1)

In [75]:
pred = m(x)
l = (y - pred[:, 0]).square().mean()
l.backward()
m.weight.grad

tensor([[-1.2483,  0.3398, -1.2757, -0.1495,  0.4108,  0.5777, -1.1595,  0.1488,
          0.1003,  0.3777,  0.7127, -0.8972,  0.0938,  0.5058,  0.5470, -0.2129]])

In [76]:
pred = m(x)
l = (y - pred[:, 0]).square().mean()
l.backward()
m.weight.grad

tensor([[-2.4966,  0.6797, -2.5513, -0.2991,  0.8215,  1.1555, -2.3190,  0.2976,
          0.2006,  0.7553,  1.4253, -1.7945,  0.1875,  1.0116,  1.0939, -0.4258]])

### Performance

In [101]:
A = np.random.randn(200, 100).astype(np.float32)
B = np.random.randn(100, 300).astype(np.float32)

**1. Python loops**

In [78]:
%%time
C = np.zeros((200, 300))

for i in range(A.shape[0]):
    for j in range(B.shape[1]):
        for k in range(A.shape[1]):
            C[i, j] += A[i, k] * B[k, j]

CPU times: user 3.49 s, sys: 20.2 ms, total: 3.51 s
Wall time: 3.53 s


**2. Numpy**

In [102]:
%%timeit
C1 = A @ B

197 µs ± 36.9 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


**3. Torch**

In [103]:
At = torch.from_numpy(A)
Bt = torch.from_numpy(B)

In [104]:
%%timeit
C2 = At @ Bt

218 µs ± 14.1 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


**4. Torch+MPS (MacOS with M1/M2/M3 only)**

In [128]:
device = torch.device('mps')
device

device(type='mps')

In [129]:
At = At.to(device)
Bt = Bt.to(device)

In [130]:
%%timeit
C3 = At @ Bt

45.2 µs ± 1.79 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


**5. Torch+CUDA (GPU only, e.g. in Colab)**

In [None]:
%%bash
export CUDA_VISIBLE_DEVICES=0
echo $CUDA_VISIBLE_DEVICES

In [None]:
!nvidia-smi

Not recommended:

In [None]:
A = torch.randn(200, 100).cuda()
B = torch.randn(100, 300).cuda()
A, B

Better:

In [None]:
A = torch.randn(200, 100).to(device)
B = torch.randn(100, 300).to(device)
A, B

In [None]:
A = torch.randn(200, 100, device=device)
B = torch.randn(100, 300, device=device)
A, B

In [None]:
%%timeit
C4 = A @ B