In [1]:
import torch

In [2]:
# 3x4 float matrix holding 1..12 row by row. requires_grad=True asks autograd
# to record the operations applied to x so gradients can be computed later.
x = torch.tensor(
    [
        [1.0, 2.0, 3.0, 4.0],
        [5.0, 6.0, 7.0, 8.0],
        [9.0, 10.0, 11.0, 12.0],
    ],
    requires_grad=True,
)

In [3]:
# Use the first GPU when CUDA is available, otherwise stay on the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# Move the raw values first and switch gradient tracking on afterwards.
# Calling .to(device) on a tensor that already requires grad yields a non-leaf
# copy on a CUDA machine (its .grad is not populated without retain_grad() and
# it cannot be handed to an optimizer). Detaching, moving and then calling
# requires_grad_() makes x a leaf tensor on whichever device was selected.
x = x.detach().to(device).requires_grad_()

In [4]:
# Scalar objective: ten times the sum of the squared entries of x.
function = 10 * x.pow(2).sum()

In [5]:
type(function)

torch.Tensor

In [20]:
function

tensor(6500., grad_fn=<MulBackward0>)

In [7]:
# Backpropagate from the scalar `function`; autograd stores d(function)/dx in x.grad.
function.backward()

In [8]:
# For function = 10 * sum(x ** 2) the gradient is 20 * x, element by element.
print(x.grad, '<- gradient')

tensor([[ 20.,  40.,  60.,  80.],
        [100., 120., 140., 160.],
        [180., 200., 220., 240.]]) <- gradient


In [9]:
# grad_fn is the autograd graph node of the last operation that produced
# `function` (here the multiplication by 10).
print(function.grad_fn)

<MulBackward0 object at 0x7f68d39b0f90>


In [10]:
# next_functions has one entry per input of that multiplication: the node that
# produced the tensor operand, and None for the constant factor, which has no
# graph node.
print(function.grad_fn.next_functions)

((<SumBackward0 object at 0x7f68d39c3ed0>, 0), (None, 0))


In [11]:
# One step back in the graph: the node of the .sum() call.
print(function.grad_fn.next_functions[0][0])

<SumBackward0 object at 0x7f68d39cb450>


In [12]:
# Two steps back: the node of the element-wise power x ** 2.
print(function.grad_fn.next_functions[0][0].next_functions[0][0])

<PowBackward0 object at 0x7f68d39b0f90>


In [13]:
# Three steps back: in the recorded run this is AccumulateGrad, the node that
# adds the incoming gradient into the .grad of the leaf tensor x.
print(function.grad_fn.next_functions[0][0].next_functions[0][0].next_functions[0][0])

<AccumulateGrad object at 0x7f68d39ce090>


In [16]:
hasattr(x, 'data')

True

In [40]:
x.data

RuntimeError: CUDA error: unspecified launch failure

In [17]:
# One manual gradient-descent step with step size 0.001.
# The update runs under torch.no_grad() so it is not recorded in the autograd
# graph; this replaces the legacy `.data` access, which bypasses autograd's
# in-place modification checks.
with torch.no_grad():
    x -= 0.001 * x.grad

In [18]:
x.data

tensor([[ 0.9800,  1.9600,  2.9400,  3.9200],
        [ 4.9000,  5.8800,  6.8600,  7.8400],
        [ 8.8200,  9.8000, 10.7800, 11.7600]])

In [19]:
x.grad

tensor([[ 20.,  40.,  60.,  80.],
        [100., 120., 140., 160.],
        [180., 200., 220., 240.]])

In [21]:
# Reset the stored gradient in place (trailing underscore = in-place op), so
# that the next backward() call does not add on top of the old values.
x.grad.zero_()

tensor([[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]])

In [22]:
x.grad

tensor([[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]])

In [31]:
# 2x2 leaf tensor used as the variable of the next objective; gradients with
# respect to it are tracked.
w = torch.tensor(
    [[5.0, 10.0],
     [1.0, 2.0]],
    requires_grad=True,
)

In [32]:
w

tensor([[ 5., 10.],
        [ 1.,  2.]], requires_grad=True)

In [33]:
# Objective: product over all entries of log(log(w + 7)).
function = (w + 7).log().log().prod()

In [34]:
function

tensor(0.5463, grad_fn=<ProdBackward0>)

In [35]:
# Backpropagate through prod(log(log(w + 7))); the gradient is stored in w.grad.
function.backward()

In [36]:
# Show the gradient of `function` with respect to each entry of w.
print(w.grad, '<- gradient')

tensor([[0.0201, 0.0109],
        [0.0449, 0.0351]]) <- gradient


In [24]:
# Start from a fresh leaf tensor so the cell gives the same result on every run.
w = torch.tensor([[5., 10.], [1., 2.]], requires_grad=True)
alpha = 0.001  # gradient-descent step size

# 500 plain gradient-descent steps on f(w) = prod(log(log(w + 7))).
for _ in range(500):
    function = (w + 7).log().log().prod()
    function.backward()
    # Update the leaf in place with autograd recording switched off, using the
    # step size defined above rather than a second hard-coded copy of it.
    with torch.no_grad():
        w -= alpha * w.grad
    # backward() adds into .grad, so clear it before the next iteration.
    w.grad.zero_()

print(w)  # Self-check output

tensor([[4.9900, 9.9948],
        [0.9775, 1.9825]], requires_grad=True)


In [25]:
x = torch.tensor([8., 8.], requires_grad=True)
var_history = []  # value of x before each step, as NumPy arrays
fn_history = []   # value of the objective before each step, as 0-d NumPy arrays

def function_parabola(variable):
    """Return the scalar tensor 10 * sum(variable ** 2)."""
    return 10 * (variable ** 2).sum()

def make_gradient_step(function, variable, lr=0.001):
    """Perform one in-place gradient-descent step on ``variable``.

    Args:
        function: callable mapping ``variable`` to a scalar tensor.
        variable: leaf tensor with ``requires_grad=True``; updated in place.
        lr: step size; defaults to the 0.001 that was previously hard-coded.

    Returns:
        The detached value of ``function(variable)`` from before the update,
        so callers can log it without a second forward pass.
    """
    function_result = function(variable)
    function_result.backward()
    # The parameter update must not be recorded by autograd.
    with torch.no_grad():
        variable -= lr * variable.grad
    # backward() adds into .grad, so clear it for the next step.
    variable.grad.zero_()
    return function_result.detach()

for i in range(500):
    # Record the state before the step; the step returns the objective value
    # evaluated at that same pre-step point.
    var_history.append(x.detach().cpu().numpy().copy())
    fn_history.append(make_gradient_step(function_parabola, x).cpu().numpy().copy())

In [26]:
var_history

[array([8., 8.], dtype=float32),
 array([7.84, 7.84], dtype=float32),
 array([7.6832004, 7.6832004], dtype=float32),
 array([7.5295362, 7.5295362], dtype=float32),
 array([7.3789454, 7.3789454], dtype=float32),
 array([7.2313666, 7.2313666], dtype=float32),
 array([7.086739, 7.086739], dtype=float32),
 array([6.9450045, 6.9450045], dtype=float32),
 array([6.806104, 6.806104], dtype=float32),
 array([6.669982, 6.669982], dtype=float32),
 array([6.5365825, 6.5365825], dtype=float32),
 array([6.405851, 6.405851], dtype=float32),
 array([6.277734, 6.277734], dtype=float32),
 array([6.1521792, 6.1521792], dtype=float32),
 array([6.0291357, 6.0291357], dtype=float32),
 array([5.908553, 5.908553], dtype=float32),
 array([5.790382, 5.790382], dtype=float32),
 array([5.6745744, 5.6745744], dtype=float32),
 array([5.561083, 5.561083], dtype=float32),
 array([5.449861, 5.449861], dtype=float32),
 array([5.3408637, 5.3408637], dtype=float32),
 array([5.2340465, 5.2340465], dtype=float32),
 array([

In [27]:
fn_history

[array(1280., dtype=float32),
 array(1229.312, dtype=float32),
 array(1180.6313, dtype=float32),
 array(1133.8783, dtype=float32),
 array(1088.9767, dtype=float32),
 array(1045.8533, dtype=float32),
 array(1004.4374, dtype=float32),
 array(964.66174, dtype=float32),
 array(926.4611, dtype=float32),
 array(889.7732, dtype=float32),
 array(854.5382, dtype=float32),
 array(820.69855, dtype=float32),
 array(788.19885, dtype=float32),
 array(756.98615, dtype=float32),
 array(727.0096, dtype=float32),
 array(698.22, dtype=float32),
 array(670.57043, dtype=float32),
 array(644.01587, dtype=float32),
 array(618.5128, dtype=float32),
 array(594.0197, dtype=float32),
 array(570.4965, dtype=float32),
 array(547.90485, dtype=float32),
 array(526.2078, dtype=float32),
 array(505.37, dtype=float32),
 array(485.35736, dtype=float32),
 array(466.13715, dtype=float32),
 array(447.67816, dtype=float32),
 array(429.95013, dtype=float32),
 array(412.92407, dtype=float32),
 array(396.5723, dtype=float32),


In [28]:
x = torch.tensor(
    [8., 8.], requires_grad=True)
var_history = []  # value of x before each step, as NumPy arrays
fn_history = []   # value of the objective before each step, as 0-d NumPy arrays

# SGD without momentum applies x <- x - lr * x.grad on every step().
optimizer = torch.optim.SGD([x], lr=0.001)

def function_parabola(variable):
    """Return the scalar tensor 10 * sum(variable ** 2)."""
    return 10 * (variable ** 2).sum()

def make_gradient_step(function, variable, opt=None):
    """Perform one optimizer-driven descent step.

    Args:
        function: callable mapping ``variable`` to a scalar tensor.
        variable: tensor the objective is evaluated at. Which tensors actually
            get updated is decided by the optimizer's parameter list, not by
            this argument.
        opt: optimizer to step; defaults to the module-level ``optimizer``.

    Returns:
        The detached value of ``function(variable)`` from before the update,
        so callers can log it without a second forward pass.
    """
    if opt is None:
        opt = optimizer
    function_result = function(variable)
    function_result.backward()
    opt.step()
    opt.zero_grad()
    return function_result.detach()

for i in range(500):
    # .cpu() keeps the logging working if x is ever placed on a GPU.
    var_history.append(x.detach().cpu().numpy().copy())
    fn_history.append(make_gradient_step(function_parabola, x).cpu().numpy().copy())

In [29]:
var_history

[array([8., 8.], dtype=float32),
 array([7.84, 7.84], dtype=float32),
 array([7.6832004, 7.6832004], dtype=float32),
 array([7.5295362, 7.5295362], dtype=float32),
 array([7.3789454, 7.3789454], dtype=float32),
 array([7.2313666, 7.2313666], dtype=float32),
 array([7.086739, 7.086739], dtype=float32),
 array([6.9450045, 6.9450045], dtype=float32),
 array([6.806104, 6.806104], dtype=float32),
 array([6.669982, 6.669982], dtype=float32),
 array([6.5365825, 6.5365825], dtype=float32),
 array([6.405851, 6.405851], dtype=float32),
 array([6.277734, 6.277734], dtype=float32),
 array([6.1521792, 6.1521792], dtype=float32),
 array([6.0291357, 6.0291357], dtype=float32),
 array([5.908553, 5.908553], dtype=float32),
 array([5.790382, 5.790382], dtype=float32),
 array([5.6745744, 5.6745744], dtype=float32),
 array([5.561083, 5.561083], dtype=float32),
 array([5.449861, 5.449861], dtype=float32),
 array([5.3408637, 5.3408637], dtype=float32),
 array([5.2340465, 5.2340465], dtype=float32),
 array([

In [31]:
fn_history

[array(1280., dtype=float32),
 array(1229.312, dtype=float32),
 array(1180.6313, dtype=float32),
 array(1133.8783, dtype=float32),
 array(1088.9767, dtype=float32),
 array(1045.8533, dtype=float32),
 array(1004.4374, dtype=float32),
 array(964.66174, dtype=float32),
 array(926.4611, dtype=float32),
 array(889.7732, dtype=float32),
 array(854.5382, dtype=float32),
 array(820.69855, dtype=float32),
 array(788.19885, dtype=float32),
 array(756.98615, dtype=float32),
 array(727.0096, dtype=float32),
 array(698.22, dtype=float32),
 array(670.57043, dtype=float32),
 array(644.01587, dtype=float32),
 array(618.5128, dtype=float32),
 array(594.0197, dtype=float32),
 array(570.4965, dtype=float32),
 array(547.90485, dtype=float32),
 array(526.2078, dtype=float32),
 array(505.37, dtype=float32),
 array(485.35736, dtype=float32),
 array(466.13715, dtype=float32),
 array(447.67816, dtype=float32),
 array(429.95013, dtype=float32),
 array(412.92407, dtype=float32),
 array(396.5723, dtype=float32),


In [33]:
# Minimise prod(log(log(w + 7))) with 500 SGD steps of size alpha.
w = torch.tensor(
    [[5., 10.],
     [1., 2.]],
    requires_grad=True,
)
alpha = 0.001
optimizer = torch.optim.SGD(params=[w], lr=alpha)

for _ in range(500):
    function = torch.log(torch.log(w + 7)).prod()
    function.backward()
    # Apply the update, then drop the gradient before the next iteration.
    optimizer.step()
    optimizer.zero_grad()

print(w)

tensor([[4.9900, 9.9948],
        [0.9775, 1.9825]], requires_grad=True)
