In [1]:
import torch

In [3]:
# Prefer the GPU when one is present, but fall back to the CPU so the notebook
# still runs top to bottom on machines without CUDA.
torch.set_default_device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
# Autograd only tracks tensors that ask for it: pass requires_grad=True at
# creation so that gradients with respect to x can be computed later.
x = torch.randn((3,), requires_grad=True)
print(x)

tensor([-0.9881, -0.7402, -0.2209], device='cuda:0', requires_grad=True)


In [10]:
# Build a small computation graph on top of x. Each intermediate result
# records the operation that produced it in grad_fn (AddBackward0,
# MulBackward0, MeanBackward0 in the printed output).
y = torch.add(x, 2)      # elementwise x + 2
z = torch.mul(2 * y, y)  # elementwise 2 * y * y
zz = torch.mean(z)       # scalar: mean over the elements of z

# One tensor per line, in graph order.
print(y, z, zz, sep="\n")


tensor([3., 3., 3.], device='cuda:0', grad_fn=<AddBackward0>)
tensor([18., 18., 18.], device='cuda:0', grad_fn=<MulBackward0>)
tensor(18., device='cuda:0', grad_fn=<MeanBackward0>)


In [5]:
# Backpropagate from the scalar zz: autograd computes d(zz)/dx and stores it in x.grad.
# x is a leaf tensor. By default .grad is only populated for leaf tensors, so
# y.grad and z.grad (intermediate results) print as None; PyTorch may also warn
# when .grad is read on a non-leaf tensor. Call retain_grad() on an
# intermediate tensor before backward() if its gradient is needed.
zz.backward()
print(x.grad)
print(y.grad)
print(z.grad)


tensor([1.3492, 1.6798, 2.3722], device='cuda:0')
None
None


  return func(*args, **kwargs)


In [8]:
# Smallest end-to-end autograd example: loss = mean(x + 2) over 3 elements,
# so every component of d(loss)/dx equals 1/3.
x = torch.ones((3,), requires_grad=True)
y = torch.add(x, 2)
loss = torch.mean(y)

# Leaf tensor first, then the intermediate result, then the scalar loss.
print(x, y, loss, sep="\n")

# Backpropagate from the scalar loss; the gradient lands in x.grad.
loss.backward()
print(x.grad)




tensor([1., 1., 1.], device='cuda:0', requires_grad=True)
tensor([3., 3., 3.], device='cuda:0', grad_fn=<AddBackward0>)
tensor(3., device='cuda:0', grad_fn=<MeanBackward0>)
tensor([0.3333, 0.3333, 0.3333], device='cuda:0')


In [9]:
# Same graph as before, but this time the gradient of the intermediate
# (non-leaf) tensor y is kept as well.
x = torch.ones((3,), requires_grad=True)
y = torch.add(x, 2)

# Non-leaf tensors do not keep .grad by default; retain_grad() must be
# requested before backward() runs.
y.retain_grad()

loss = torch.mean(y)
print(x, y, loss, sep="\n")

loss.backward()
# Both gradients are available now: d(loss)/dx and d(loss)/dy.
print(x.grad, y.grad, sep="\n")

tensor([1., 1., 1.], device='cuda:0', requires_grad=True)
tensor([3., 3., 3.], device='cuda:0', grad_fn=<AddBackward0>)
tensor(3., device='cuda:0', grad_fn=<MeanBackward0>)
tensor([0.3333, 0.3333, 0.3333], device='cuda:0')
tensor([0.3333, 0.3333, 0.3333], device='cuda:0')


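这里讲到了雅可比矩阵（Jacobian matrix）$J$，其中 $J_{ij} = \partial y_i / \partial x_j$。设 $v = \partial L / \partial y$ 是标量函数 $L$ 关于中间变量 $y$ 的梯度，则乘积 $J^{T} \cdot v$ 就是 $L$ 关于 $x$ 的梯度：$J^{T}$ 的每一行（即 $J$ 的每一列）与 $v$ 逐元素相乘再相加，即 $\partial L / \partial x_j = \sum_i \frac{\partial y_i}{\partial x_j} \frac{\partial L}{\partial y_i}$。每个 $x_j$ 会通过不同的中间变量 $y_i$ 影响 $L$，这些路径各不相同，因此不能简单地“约掉”，必须把所有路径的贡献相加，才能得到 $L$ 对每个 $x$ 分量的导数。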

In [11]:
# Adapted from the PyTorch documentation for Tensor.is_leaf.
# Rule of thumb: a tensor is a leaf when it was created directly by the user
# rather than produced by a tracked operation; tensors that do not require
# grad are reported as leaves as well.

a = torch.rand(10, requires_grad=True)                 # created directly, requires grad
b = torch.rand(10, requires_grad=True).cuda()          # see NOTE below
c = torch.rand(10, requires_grad=True) + 2             # result of an addition
d = torch.rand(10).cuda()                              # does not require grad
e = torch.rand(10).cuda().requires_grad_()             # grad enabled after the move
f = torch.rand(10, requires_grad=True, device="cuda")  # created directly on the GPU

# NOTE(review): the recorded output reports b as a leaf. Presumably this is
# because the tensor is already on the CUDA device when .cuda() is called, so
# no copy operation is recorded; starting from a CPU tensor the copy would
# make b a non-leaf. Confirm against the default-device setting in use.
print(a.is_leaf, b.is_leaf, c.is_leaf, d.is_leaf, e.is_leaf, f.is_leaf, sep="\n")

True
True
False
True
True
True


In [2]:
# Three ways to keep autograd from tracking history / building grad_fn.
# Typical use: the weight update inside a training loop must not itself
# become part of the gradient computation.
#   1. x.requires_grad_(False)  - flips the flag on x in place
#   2. x.detach()               - returns a new tensor that does not require grad
#   3. with torch.no_grad():    - wraps a block whose operations are not tracked

x = torch.randn((3,), requires_grad=True)
print(x)

# No. 1: in-place switch; x itself stops requiring gradients from here on.
x.requires_grad_(False)
print(x)

# No. 2: detach() leaves x as it is and hands back a new tensor.
y = x.detach()
print(y)

# No. 3: operations executed inside no_grad() are not recorded in the graph.
with torch.no_grad():
    z = torch.add(x, 2)
print(z)

# NOTE: step 1 already switched off requires_grad on x, so steps 2 and 3 run
# on a tensor that is no longer tracked.
# Outside the context manager the closest equivalent of step 3 is
#   z = (x + 2).detach()
# Calling requires_grad_(False) on a computed (non-leaf) result is rejected
# by PyTorch.





tensor([ 1.5495,  0.0919, -1.6931], requires_grad=True)
tensor([ 1.5495,  0.0919, -1.6931])
tensor([ 1.5495,  0.0919, -1.6931])
tensor([3.5495, 2.0919, 0.3069])


In [4]:
# Training-loop pitfall: backward() adds into .grad instead of overwriting it.
# Without a reset between epochs the second pass reports 6s instead of 3s,
# which is the wrong gradient.

weights = torch.ones((4,), requires_grad=True)

for epoch in (0, 1):
    model_output = torch.sum(weights * 3)
    model_output.backward()

    print("weights grad: ", weights.grad)
    print("weight: ", weights)
    

weights grad:  tensor([3., 3., 3., 3.], device='cuda:0')
weight:  tensor([1., 1., 1., 1.], device='cuda:0', requires_grad=True)
weights grad:  tensor([6., 6., 6., 6.], device='cuda:0')
weight:  tensor([1., 1., 1., 1.], device='cuda:0', requires_grad=True)


In [8]:
# Same loop, now with the gradient reset that makes each epoch independent.
# Zeroing the gradient after every backward pass is essential.

weights = torch.ones((4,), requires_grad=True)

for epoch in (0, 1):
    model_output = torch.sum(weights * 3)
    model_output.backward()

    print("weight: ", weights)
    print("weights grad: ", weights.grad)

    # Reset in place so the next backward() starts from zero.
    weights.grad.zero_()
    print("weights grad: ", weights.grad)

weight:  tensor([1., 1., 1., 1.], device='cuda:0', requires_grad=True)
weights grad:  tensor([3., 3., 3., 3.], device='cuda:0')
weights grad:  tensor([0., 0., 0., 0.], device='cuda:0')
weight:  tensor([1., 1., 1., 1.], device='cuda:0', requires_grad=True)
weights grad:  tensor([3., 3., 3., 3.], device='cuda:0')
weights grad:  tensor([0., 0., 0., 0.], device='cuda:0')


In [10]:
# Manual gradient descent: weight update followed by gradient reset.

weights = torch.ones((4,), requires_grad=True)
lr = 0.01

for epoch in (0, 1):
    model_output = torch.sum(weights * 3)
    model_output.backward()

    print("\n", "weight: ", weights)
    print("weights grad: ", weights.grad)

    # The update itself must not be tracked by autograd, hence no_grad().
    with torch.no_grad():
        weights.sub_(lr * weights.grad)
    print("weight: ", weights)

    # Clear the accumulated gradient before the next epoch.
    weights.grad.zero_()
    print("weights grad: ", weights.grad)


 weight:  tensor([1., 1., 1., 1.], device='cuda:0', requires_grad=True)
weights grad:  tensor([3., 3., 3., 3.], device='cuda:0')
weight:  tensor([0.9700, 0.9700, 0.9700, 0.9700], device='cuda:0', requires_grad=True)
weights grad:  tensor([0., 0., 0., 0.], device='cuda:0')

 weight:  tensor([0.9700, 0.9700, 0.9700, 0.9700], device='cuda:0', requires_grad=True)
weights grad:  tensor([3., 3., 3., 3.], device='cuda:0')
weight:  tensor([0.9400, 0.9400, 0.9400, 0.9400], device='cuda:0', requires_grad=True)
weights grad:  tensor([0., 0., 0., 0.], device='cuda:0')


In [None]:
# Optimizer: let torch.optim.SGD perform the weight update and the gradient
# reset instead of doing both by hand.

weights = torch.ones(4, requires_grad=True)

# SGD expects an iterable of parameters (or parameter-group dicts); passing
# the bare tensor raises a TypeError, so wrap it in a list.
optimizer = torch.optim.SGD([weights], lr=0.01)

# step() only updates parameters that have a gradient, so run one
# forward/backward pass first.
model_output = (weights * 3).sum()
model_output.backward()

optimizer.step()       # weights <- weights - lr * weights.grad
optimizer.zero_grad()  # clear gradients before the next iteration



In [None]:
# Reset the gradient by hand after a backward pass.
weights = torch.ones(4, requires_grad=True)

# z has to be computed from `weights` inside this cell. Otherwise backward()
# never reaches weights, weights.grad stays None and .zero_() fails.
z = (weights * 3).sum()
z.backward()

weights.grad.zero_()


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

# Step 1: Define your model architecture
class MyModel(nn.Module):
    """Two-layer fully connected network: 10 inputs -> 5 hidden units (ReLU) -> 1 output."""

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(in_features=10, out_features=5)
        self.fc2 = nn.Linear(in_features=5, out_features=1)

    def forward(self, x):
        """Return fc2(relu(fc1(x)))."""
        hidden = torch.relu(self.fc1(x))
        return self.fc2(hidden)

# Step 2: Define your loss function (mean squared error)
loss_fn = nn.MSELoss()

# Step 3: Build the model and an optimizer over its parameters
model = MyModel()
optimizer = optim.SGD(params=model.parameters(), lr=0.01)

# Step 4: Perform forward pass on one random sample with 10 features
inputs = torch.randn((1, 10))
outputs = model(inputs)

# Step 5: Calculate the loss against a random target of matching shape
target = torch.randn((1, 1))
loss = loss_fn(outputs, target)

# Step 6: Perform backward pass (clear any stale gradients first)
optimizer.zero_grad()
loss.backward()

# Step 7: Update the model parameters using the optimizer
optimizer.step()
