In [2]:
import torch

In [3]:
def f(x):
    """Evaluate the quadratic 3*x**2 + 2*x - 1 at ``x``."""
    quadratic_term = 3. * x ** 2
    linear_term = 2. * x
    return quadratic_term + linear_term - 1
# Approximate derivative: move x by eps in each direction and take the slope
# between those two nearby points, which approximates the tangent slope at x.
def approximate_derivative(f, x, eps=1e-6):
    """Estimate the derivative of ``f`` at ``x`` with a central difference.

    Args:
        f: Function of one variable.
        x: Point at which the derivative is estimated.
        eps: Offset applied on each side of ``x``.

    Returns:
        ``(f(x + eps) - f(x - eps)) / (2 * eps)``.
    """
    ahead = f(x + eps)
    behind = f(x - eps)
    return (ahead - behind) / (2. * eps)

# Analytically f'(x) = 6x + 2, so the exact value at x = 1 is 8.
print(approximate_derivative(f, 1.))

7.999999999785956


In [4]:

# Partial derivatives: hold one variable fixed and differentiate with respect
# to the other one.
def g(x1, x2):
    """Evaluate the two-variable function (x1 + 5) * x2**2."""
    shifted = x1 + 5
    squared = x2 ** 2
    return shifted * squared

def approximate_gradient(g, x1, x2, eps=1e-3):
    """Estimate both partial derivatives of ``g`` at ``(x1, x2)``.

    Each partial derivative is obtained by freezing the other argument and
    passing the resulting one-variable function to ``approximate_derivative``.

    Args:
        g: Function of two variables.
        x1: First coordinate of the evaluation point.
        x2: Second coordinate of the evaluation point.
        eps: Offset forwarded to ``approximate_derivative``.

    Returns:
        Tuple ``(dg/dx1, dg/dx2)``.
    """
    def along_x1(value):
        # x2 stays fixed; only the first argument varies.
        return g(value, x2)

    def along_x2(value):
        # x1 stays fixed; only the second argument varies.
        return g(x1, value)

    return (
        approximate_derivative(along_x1, x1, eps),
        approximate_derivative(along_x2, x2, eps),
    )

# Analytic gradient of g at (2, 3): dg/dx1 = x2**2 = 9 and dg/dx2 = 2*(x1 + 5)*x2 = 42.
print(approximate_gradient(g, 2., 3.))

(8.999999999993236, 41.999999999994486)


以上是手动实现的数值求导（中心差分近似）；下面改用 PyTorch 的自动求导来计算同样的梯度。

In [5]:
# Inputs are created with requires_grad=True so autograd tracks them.
x1 = torch.tensor([2.], requires_grad=True)
x2 = torch.tensor([3.], requires_grad=True)
y = g(x1, x2)

# torch.autograd.grad returns a tuple with one gradient per requested input;
# only x1 is requested here, so take the single entry.
# retain_graph=True keeps the graph of y instead of freeing it after this call.
dy_dx1 = torch.autograd.grad(y, x1, retain_graph=True)[0]
print(dy_dx1)

tensor([9.])


下面是另一种求梯度的接口：调用 `y.backward()` 后，梯度会保存在各个输入张量的 `.grad` 属性中。

In [None]:
x1, x2 = (torch.tensor([value], requires_grad=True) for value in (2., 3.))
y = g(x1, x2)

# Compute the partial derivatives (the gradient) by backpropagating from y;
# the results are then read from the .grad attribute of x1 and x2.
y.backward()
print(x1.grad, x2.grad)

下面是对之前梯度下降的简单展示

In [6]:
# Simulate the gradient descent update (what plain SGD does) by hand.
import torch
learning_rate = 0.3
x = torch.tensor(2.0, requires_grad=True)
for _ in range(100):
    z = f(x)
    z.backward()  # fills x.grad with dz/dx
    # Apply the update with autograd recording disabled. torch.no_grad() is the
    # supported way to modify a tensor that requires grad in place; writing
    # through x.data instead bypasses autograd's tracking of the tensor.
    with torch.no_grad():
        x -= learning_rate * x.grad  # this is the equivalent of optimizer.step()
    x.grad.zero_()  # reset the gradient to zero before the next backward()
print(x)

tensor(-0.3333, requires_grad=True)


In [7]:
# Same minimisation of f, this time driven by a torch.optim optimizer.
learning_rate = 0.01
x = torch.tensor(2.0, requires_grad=True)
optimizer = torch.optim.SGD([x], lr=learning_rate, momentum=0.9)
for _ in range(500):
    optimizer.zero_grad()  # clear the gradient left from the previous iteration
    z = f(x)
    z.backward()  # compute dz/dx into x.grad
    # Update x from x.grad. The optimizer is configured with momentum=0.9, so
    # this is a momentum step rather than a bare x -= learning_rate * x.grad.
    optimizer.step()


print(x)


tensor(-0.3333, requires_grad=True)
