In [2]:
import torch

Кстати тензоры желательно инициализировать...

In [3]:
x = torch.LongTensor(3, 4)
print(x)

tensor([[140338896403408,  94093785865808,              32,              64],
        [ 94105746047532,               0,             112,              64],
        [ 94105747574892,               1,               0,             129]])


<!-- vscode-jupyter-toc -->
<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->
<a id='toc0_'></a>**Содержание**    
- [Autograd](#toc1_)    
- [Накопление градиентов](#toc2_)    
- [Граф градиентов](#toc3_)    

<!-- vscode-jupyter-toc-config
	numbering=false
	anchor=true
	minLevel=1
	maxLevel=6
	/vscode-jupyter-toc-config -->
<!-- /vscode-jupyter-toc -->

# <a id='toc1_'></a>[Autograd](#toc0_)

Tensor object keeps track of how it was created.

`z` knows that it was created by the addition of two tensors `z = x + y`.

In [4]:
x = torch.tensor([1., 2., 3.], requires_grad = True)
print('x: ', x)
y = torch.tensor([10., 20., 30.], requires_grad = True)
print('y: ', y)

z = x + y 
print('\nz = x + y')
print('z:', z)

x:  tensor([1., 2., 3.], requires_grad=True)
y:  tensor([10., 20., 30.], requires_grad=True)

z = x + y
z: tensor([11., 22., 33.], grad_fn=<AddBackward0>)


`s` knows that it was created by the sum of it's numbers

In [5]:
s = z.sum()
print(s)

tensor(66., grad_fn=<SumBackward0>)


У исходных тензоров в атрибуте `grad` появятся частные производные исходя из использованной операции и по форме исходных тензоров

In [6]:
s.backward()
print('x.grad: ', x.grad)
print('y.grad: ', y.grad)

x.grad:  tensor([1., 1., 1.])
y.grad:  tensor([1., 1., 1.])


In [7]:
from torch.autograd import grad

x1 = torch.tensor(2, dtype = torch.float32, requires_grad = True)
x2 = torch.tensor(3, dtype = torch.float32, requires_grad = True)
x3 = torch.tensor(1, dtype = torch.float32, requires_grad = True)
x4 = torch.tensor(4, dtype = torch.float32, requires_grad = True)

z1 = x1 * x2 
z2 = x3 * x4
f = z1 + z2

gradients = grad(outputs=f, inputs = [x1, x2, x3, x4, z1, z2])
gradients

(tensor(3.), tensor(2.), tensor(4.), tensor(1.), tensor(1.), tensor(1.))

A **leaf tensor** is a tensor that is created directly and not as a result of any arithmetic operation.

In the above case, `x1, x2, x3, x4` are leaf tensors while `z1` and `z2` are not.

Instead of specifying all the inputs to calculate the gradient using `grad(outputs=f, inputs = [x1, x2, x3, x4, z1, z2])`, we can use `tensor.backward()` to auto calculate all the gradients

In [8]:
x1 = torch.tensor(2, dtype = torch.float32, requires_grad = True)
x2 = torch.tensor(3, dtype = torch.float32, requires_grad = True)
x3 = torch.tensor(1, dtype = torch.float32, requires_grad = True)
x4 = torch.tensor(4, dtype = torch.float32, requires_grad = True)

z1 = x1 * x2 
z2 = x3 * x4
f = z1 + z2
f.backward()

print(f"Gradient of x1 = {x1.grad}")
print(f"Gradient of x2 = {x2.grad}")
print(f"Gradient of x3 = {x3.grad}")
print(f"Gradient of x4 = {x4.grad}")
print(f"Gradient of z1 = {z1.grad}")
print(f"Gradient of z2 = {z2.grad}")

Gradient of x1 = 3.0
Gradient of x2 = 2.0
Gradient of x3 = 4.0
Gradient of x4 = 1.0
Gradient of z1 = None
Gradient of z2 = None


  return self._grad


# <a id='toc2_'></a>[Накопление градиентов](#toc0_)

In [9]:
class ExampleLinear(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.weight = torch.nn.Parameter(torch.Tensor([1]).float(),
                                         requires_grad=True)

    def forward(self, x):
        return self.weight * x  # model: source (W*x) -> target (2*x)

def calculate_loss(x: torch.Tensor) -> torch.Tensor:
    y = 2 * x           # target
    y_hat = model(x)    # source: W*x
    loss = (y - y_hat) ** 2
    return loss.mean()

In [10]:
model = ExampleLinear()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
data_loader = [torch.tensor([4.0]), torch.tensor([2.0]), torch.tensor([3.0]), torch.tensor([1.0]),]

accumulation_steps = 2
for i, batch in enumerate(data_loader):
    optimizer.zero_grad(set_to_none=True)   # None instead of 0-tensor
    print(f"-> zero_grad: (batch {i}) : W.grad: {model.weight.grad}")

    # The loss needs to be scaled, because the mean should be taken across the whole
    # dataset, which requires the loss to be divided by the number of batches.
    loss = calculate_loss(batch) / accumulation_steps
    print(f"\t-> loss(y=2x, y_hat=Wx) = (2x-Wx)^2/bN/accN = ( 2 * {batch} - {model.weight.data} * {batch} ) ** 2 / 2 / {accumulation_steps} = {loss}")
    loss.backward()

    print(f"\t\t-> loss.backward: (batch {i}) : W: {model.weight.data}")
    print(f"\t\t-> loss.backward: (batch {i}) : W.grad: {model.weight.grad}")
    if (i + 1) % accumulation_steps == 0:
        # Updating the model only after all batches
        optimizer.step()
        print(f"\noptimizer step: (step {i+1}) : W = W - lr*W.grad = W - {optimizer.param_groups[0]['lr']} * {model.weight.grad} = {model.weight.data}\n")

print(f"(final) : W: {model.weight.data}")
print(f"(final) : grad: {model.weight.grad}")

-> zero_grad: (batch 0) : W.grad: None
	-> loss(y=2x, y_hat=Wx) = (2x-Wx)^2/bN/accN = ( 2 * tensor([4.]) - tensor([1.]) * tensor([4.]) ) ** 2 / 2 / 2 = 8.0
		-> loss.backward: (batch 0) : W: tensor([1.])
		-> loss.backward: (batch 0) : W.grad: tensor([-16.])
-> zero_grad: (batch 1) : W.grad: None
	-> loss(y=2x, y_hat=Wx) = (2x-Wx)^2/bN/accN = ( 2 * tensor([2.]) - tensor([1.]) * tensor([2.]) ) ** 2 / 2 / 2 = 2.0
		-> loss.backward: (batch 1) : W: tensor([1.])
		-> loss.backward: (batch 1) : W.grad: tensor([-4.])

optimizer step: (step 2) : W = W - lr*W.grad = W - 0.1 * tensor([-4.]) = tensor([1.4000])

-> zero_grad: (batch 2) : W.grad: None
	-> loss(y=2x, y_hat=Wx) = (2x-Wx)^2/bN/accN = ( 2 * tensor([3.]) - tensor([1.4000]) * tensor([3.]) ) ** 2 / 2 / 2 = 1.6200003623962402
		-> loss.backward: (batch 2) : W: tensor([1.4000])
		-> loss.backward: (batch 2) : W.grad: tensor([-5.4000])
-> zero_grad: (batch 3) : W.grad: None
	-> loss(y=2x, y_hat=Wx) = (2x-Wx)^2/bN/accN = ( 2 * tensor([1.]) -

In [24]:
model = ExampleLinear()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
data_loader = [torch.tensor([4.0]), torch.tensor([2.0]), torch.tensor([3.0]), torch.tensor([1.0]),]
MSEloss = torch.nn.MSELoss()

accumulation_steps = 2
for i, batch in enumerate(data_loader):
    print(f"-> zero_grad: (batch {i}) : W.grad: {model.weight.grad}")

    # The loss needs to be scaled, because the mean should be taken across the whole
    # dataset, which requires the loss to be divided by the number of batches.
    loss = MSEloss(model(batch), 2*batch) / accumulation_steps
    print(f"\t-> loss(y=2x, y_hat=Wx) = (2x-Wx)^2/bN/accN = ( 2 * {batch} - {model.weight.data} * {batch} ) ** 2 / 2 / {accumulation_steps} = {loss}")

    print(f"\t\t-> loss.backward: (batch {i}) : W: {model.weight.data}")
    print(f"\t\t-> loss.backward: (batch {i}) : W.grad: {model.weight.grad}")
    if (i + 1) % accumulation_steps == 0:
        loss.backward()
        # Updating the model only after all batches
        optimizer.step()
        optimizer.zero_grad(set_to_none=True)   # None instead of 0-tensor
        print(f"\noptimizer step: (step {i+1}) : W = W - lr*W.grad = W - {optimizer.param_groups[0]['lr']} * {model.weight.grad} = {model.weight.data}\n")

print(f"(final) : W: {model.weight.data}")
print(f"(final) : grad: {model.weight.grad}")

-> zero_grad: (batch 0) : W.grad: None
	-> loss(y=2x, y_hat=Wx) = (2x-Wx)^2/bN/accN = ( 2 * tensor([4.]) - tensor([1.]) * tensor([4.]) ) ** 2 / 2 / 2 = 8.0
		-> loss.backward: (batch 0) : W: tensor([1.])
		-> loss.backward: (batch 0) : W.grad: None
-> zero_grad: (batch 1) : W.grad: None
	-> loss(y=2x, y_hat=Wx) = (2x-Wx)^2/bN/accN = ( 2 * tensor([2.]) - tensor([1.]) * tensor([2.]) ) ** 2 / 2 / 2 = 2.0
		-> loss.backward: (batch 1) : W: tensor([1.])
		-> loss.backward: (batch 1) : W.grad: None

optimizer step: (step 2) : W = W - lr*W.grad = W - 0.1 * None = tensor([1.4000])

-> zero_grad: (batch 2) : W.grad: None
	-> loss(y=2x, y_hat=Wx) = (2x-Wx)^2/bN/accN = ( 2 * tensor([3.]) - tensor([1.4000]) * tensor([3.]) ) ** 2 / 2 / 2 = 1.6200003623962402
		-> loss.backward: (batch 2) : W: tensor([1.4000])
		-> loss.backward: (batch 2) : W.grad: None
-> zero_grad: (batch 3) : W.grad: None
	-> loss(y=2x, y_hat=Wx) = (2x-Wx)^2/bN/accN = ( 2 * tensor([1.]) - tensor([1.4000]) * tensor([1.]) ) ** 2 /

Общий паттерн: 

    accumulation_steps = 10
    for i, batch in enumerate(batches):
        # Scale the loss to the mean of the accumulated batch size
        loss = calculate_loss(batch) / accumulation_steps
        loss.backward()
        if (i + 1) % accumulation_steps == 0:
            optimizer.step()
            # Reset gradients, for the next accumulated batches
            optimizer.zero_grad()

В функции потерь усредняется по фактическому размеру батча, а снаружи добавляется усреднение по "эффективному" размеру батча, в рамках готорого обновляются параметры модели

# <a id='toc3_'></a>[Граф градиентов](#toc0_)

AUTOGRAD MECHANICS

Тензоры собираются в граф, а конкретно дерево - где входные/исходные тензоры соответствуют листьям, а выходные/результирующие - корням.

- Концевые узлы графа, терминальные вершины (листья дерева) - тензоры, созданные конструктором. 
- Узлы ветвления - остальные тензоры, полученные из листьев тензорными операциями, допускающими вычисление производных.

Под капотом тут граф объектов-функций.

Граф строится по ходу прямого прохода по модели (`forward`). When the forwards pass is completed, we evaluate this graph in the backwards pass to compute the gradients. An important thing to note is that the graph is recreated from scratch at every iteration,


In [12]:
a = torch.tensor(2.0, requires_grad=True)
b = torch.tensor(4.0)
c = a + b # => tensor(6., grad_fn=<AddBackward0>)

print(f"{a.requires_grad=}") # => True
print(f"{a.is_leaf=}") # => True
print(f"{b.requires_grad=}") # => False
print(f"{b.is_leaf=}") # => True
print(f"{c.requires_grad=}") # => True
print(f"{c.is_leaf=}") # => False
c

a.requires_grad=True
a.is_leaf=True
b.requires_grad=False
b.is_leaf=True
c.requires_grad=True
c.is_leaf=False


tensor(6., grad_fn=<AddBackward0>)

When defining a custom Python Function, you can use save_for_backward() to save tensors during the forward pass and saved_tensors to retrieve them during the backward pass. See Extending PyTorch for more information.

For operations that PyTorch defines (e.g. torch.pow()), tensors are automatically saved as needed. You can explore (for educational or debugging purposes) which tensors are saved by a certain grad_fn by looking for its attributes starting with the prefix _saved.

In [13]:
x = torch.randn(5, requires_grad=True)
y = x.pow(2)
print(x.equal(y.grad_fn._saved_self))  # True
print(x is y.grad_fn._saved_self)  # True

True
True


Non-determinism

При многопоточном выполнении расчета градиентов может возникать гонка за ресурсы и кривые результаты. В этом случае лучше использовать `torch.autograd.grad()` (который что? кажись, он пересчитывает градиенты от листьев к корням) вместо `backward()` (а он, кажись, просто суммирует листья, которые могут поменять в многопоточном варианте).

Кастомные градиентные функции (питоновские) потокобезопасны т.к. питоновские (GIL), остальные, написанные на CPP (autograd::Function, в т.ч. torch.autograd.grad()) используют мьютексы для блокировок.

Для кастомных CPP функций CPP hooks нужно писать грамотно и потокобезопасно, это проблема уже юзера, на этом полномочия торча все...

**Hooks for saved tensors**

You can control how saved tensors are packed / unpacked by defining a pair of pack_hook / unpack_hook hooks. The pack_hook function should take a tensor as its single argument but can return any python object (e.g. another tensor, a tuple, or even a string containing a filename). The unpack_hook function takes as its single argument the output of pack_hook and should return a tensor to be used in the backward pass. The tensor returned by unpack_hook only needs to have the same content as the tensor passed as input to pack_hook. In particular, any autograd-related metadata can be ignored as they will be overwritten during unpacking.

An example of such pair is:

In [14]:
import os, uuid, tempfile

tmp_dir = tempfile.gettempdir()

class SelfDeletingTempFile():
    def __init__(self):
        self.name = os.path.join(tmp_dir, str(uuid.uuid4()))

    def __del__(self):
        os.remove(self.name)

def pack_hook(tensor):
    temp_file = SelfDeletingTempFile()
    torch.save(tensor, temp_file.name)
    return temp_file

def unpack_hook(temp_file):
    return torch.load(temp_file.name)

tmp_dir

'/tmp/.private/user1'

**Registering hooks for a saved tensor**

You can register a pair of hooks on a saved tensor by calling the register_hooks() method on a SavedTensor object. Those objects are exposed as attributes of a grad_fn and start with the _raw_saved_ prefix.

The pack_hook method is called as soon as the pair is registered. The unpack_hook method is called each time the saved tensor needs to be accessed, either by means of y.grad_fn._saved_self or during the backward pass.

In [15]:
# Only save on disk tensors that have size >= 1000
SAVE_ON_DISK_THRESHOLD = 1000

def pack_hook(x):
    if x.numel() < SAVE_ON_DISK_THRESHOLD:
        return x
    temp_file = SelfDeletingTempFile()
    torch.save(x, temp_file.name)
    return temp_file

def unpack_hook(tensor_or_sctf):
    if isinstance(tensor_or_sctf, torch.Tensor):
        return tensor_or_sctf
    return torch.load(tensor_or_sctf.name)

class Model(torch.nn.Module):
    def forward(self, x):
        with torch.autograd.graph.saved_tensors_hooks(pack_hook, unpack_hook):
          # ... compute output
          output = x
        return output

model = Model()
net = torch.nn.DataParallel(model)

In [16]:
x = torch.randn(5, requires_grad=True)
y = x.pow(2)
y.grad_fn._raw_saved_self.register_hooks(pack_hook, unpack_hook)

In [17]:
# Example what NOT to do (hooks do not go through DataParallel)

with torch.autograd.graph.saved_tensors_hooks(pack_hook, unpack_hook):
    output = net(input)
    
output

<bound method Kernel.raw_input of <ipykernel.ipkernel.IPythonKernel object at 0x7fa32d5857b0>>

using those hooks disables all the optimization in place to reduce Tensor object creation

In [18]:
with torch.autograd.graph.saved_tensors_hooks(lambda x: x, lambda x: x):
    x = torch.randn(5, requires_grad=True)
    y = x * x

y

tensor([0.0631, 0.4398, 1.6231, 0.7432, 0.3383], grad_fn=<MulBackward0>)

Without the hooks, x, y.grad_fn._saved_self and y.grad_fn._saved_other all refer to the same tensor object. 

With the hooks, PyTorch will pack and unpack x into two new tensor objects that share the same storage with the original x (no copy performed).

