# (목적)Gradient를 확인한다
## 전체 Gradient에 3을 곱한것과 parameter 별로 gradient에 3을 곱한 것이 차이가 나는지 확인

In [132]:
import torch
import torch.nn as nn

import torch.nn.functional as F

# 예제 데이터 생성
X = torch.randn(100, 2)
y = torch.randint(0, 2, (100,))

# 모델 정의
class ClassificationModel(nn.Module):
    def __init__(self):
        super(ClassificationModel, self).__init__()
        self.fc1 = nn.Linear(2, 2)
        self.fc2 = nn.Linear(2, 2)
    
    def forward(self, x):
        x = self.fc1(x)
        x = self.fc2(x)
        return x

model = ClassificationModel() # without gradient modifier
model_with_modifier = ClassificationModel()

# Cross Entropy Loss와 KL Divergence Loss 정의
cross_entropy_loss = nn.CrossEntropyLoss()
kl_divergence_loss = nn.KLDivLoss(reduction='batchmean')

# Forward Pass
outputs = model(X)
output_with_modifier = model_with_modifier(X)

# Cross Entropy Loss 계산
ce_loss = F.cross_entropy(outputs, y)

# KL Divergence Loss 계산
p = torch.tensor([0.1, 0.2])  # 예시 분포
kl_loss = kl_divergence_loss(torch.log(outputs), p)
# kl_loss = ce_loss

# Gradient 계산 (torch.autograd.grad 사용)
ce_gradient = torch.autograd.grad(ce_loss, model.parameters(), retain_graph=True)
kl_gradient = torch.autograd.grad(kl_loss, model.parameters(), retain_graph=True)

# Gradient 확인
print("Cross Entropy Loss Gradient:")
for grad_param in ce_gradient:
    print(grad_param)

# print("\nKL Divergence Loss Gradient:")
# for grad_param in kl_gradient:
#     print(grad_param)

Cross Entropy Loss Gradient:
tensor([[ 0.0067, -0.0057],
        [-0.1132,  0.0953]])
tensor([ 0.0062, -0.1049])
tensor([[ 0.0287, -0.0587],
        [-0.0287,  0.0587]])
tensor([ 0.0882, -0.0882])


In [118]:
ce_loss *= 2
ce_gradient = torch.autograd.grad(ce_loss, model.parameters(), retain_graph=True)

# Gradient 확인
print("Cross Entropy Loss Gradient:")
for grad_param in ce_gradient:
    print(grad_param)

Cross Entropy Loss Gradient:
tensor([[-0.0280, -0.0529],
        [-0.0470, -0.0888]])
tensor([0.0762, 0.1279])
tensor([[ 0.0792, -0.1051],
        [-0.0792,  0.1051]])
tensor([-0.1973,  0.1973])


In [119]:
kl_loss

tensor(nan, grad_fn=<DivBackward0>)

In [120]:
total_loss = 1 * ce_loss +  2* kl_loss
total_loss
total_gradient = torch.autograd.grad(total_loss, model.parameters(), retain_graph=True)
print(total_gradient)

(tensor([[-0.5257,  0.0265],
        [-0.8808,  0.0445]]), tensor([2.4115, 4.0392]), tensor([[-0.5026,  0.0906],
        [-3.5980,  0.8394]]), tensor([0.8105, 5.8365]))


In [121]:
names_grads_copy = {}

for param_name, ce_grad, kl_grad in zip(model.parameters(), ce_gradient, kl_gradient):
    
    #kl_grad = None
    
    if not kl_grad == None:
        names_grads_copy[param_name] = torch.tensor(1) * ce_grad + torch.tensor(2) *  kl_grad
    else:
        names_grads_copy[param_name] = torch.tensor(1) * ce_grad 
    
for key, value in names_grads_copy.items():
    print(value)

tensor([[-0.5257,  0.0265],
        [-0.8808,  0.0445]])
tensor([2.4115, 4.0392])
tensor([[-0.5026,  0.0906],
        [-3.5980,  0.8394]])
tensor([0.8105, 5.8365])


## 1. Gradient를 추가적으로 구해도 retain_graph=True를 설정하면, 값이 변하지 않는다

In [122]:
ce_gradient = torch.autograd.grad(ce_loss, model.parameters(), retain_graph=True)
for grad_param in ce_gradient:
    print(grad_param)

tensor([[-0.0280, -0.0529],
        [-0.0470, -0.0888]])
tensor([0.0762, 0.1279])
tensor([[ 0.0792, -0.1051],
        [-0.0792,  0.1051]])
tensor([-0.1973,  0.1973])


# 2. 전체 gradient에 3을 곱한다
## 1) 안된다

In [123]:
ce_gradient =  torch.tensor(3) * torch.autograd.grad(ce_loss, model.parameters(), retain_graph=True)
for grad_param in ce_gradient:
    print(grad_param)

tensor([[-0.0280, -0.0529],
        [-0.0470, -0.0888]])
tensor([0.0762, 0.1279])
tensor([[ 0.0792, -0.1051],
        [-0.0792,  0.1051]])
tensor([-0.1973,  0.1973])
tensor([[-0.0280, -0.0529],
        [-0.0470, -0.0888]])
tensor([0.0762, 0.1279])
tensor([[ 0.0792, -0.1051],
        [-0.0792,  0.1051]])
tensor([-0.1973,  0.1973])
tensor([[-0.0280, -0.0529],
        [-0.0470, -0.0888]])
tensor([0.0762, 0.1279])
tensor([[ 0.0792, -0.1051],
        [-0.0792,  0.1051]])
tensor([-0.1973,  0.1973])


## 2) 덧셈 조차 하면 안된다

In [124]:
total_gradient = ce_gradient + kl_gradient
total_gradient

(tensor([[-0.0280, -0.0529],
         [-0.0470, -0.0888]]),
 tensor([0.0762, 0.1279]),
 tensor([[ 0.0792, -0.1051],
         [-0.0792,  0.1051]]),
 tensor([-0.1973,  0.1973]),
 tensor([[-0.0280, -0.0529],
         [-0.0470, -0.0888]]),
 tensor([0.0762, 0.1279]),
 tensor([[ 0.0792, -0.1051],
         [-0.0792,  0.1051]]),
 tensor([-0.1973,  0.1973]),
 tensor([[-0.0280, -0.0529],
         [-0.0470, -0.0888]]),
 tensor([0.0762, 0.1279]),
 tensor([[ 0.0792, -0.1051],
         [-0.0792,  0.1051]]),
 tensor([-0.1973,  0.1973]),
 tensor([[-0.2488,  0.0397],
         [-0.4169,  0.0667]]),
 tensor([1.1676, 1.9556]),
 tensor([[-0.2909,  0.0979],
         [-1.7594,  0.3671]]),
 tensor([0.5039, 2.8196]))

## 3) loss에 3을 곱해줘야 내가 원하는 값을 얻을 수 있다

In [125]:
ce_gradient =  torch.autograd.grad(torch.tensor(3) * ce_loss, model.parameters(), retain_graph=True)
for grad_param in ce_gradient:
    print(grad_param)

tensor([[-0.0840, -0.1587],
        [-0.1410, -0.2665]])
tensor([0.2286, 0.3838])
tensor([[ 0.2376, -0.3153],
        [-0.2376,  0.3153]])
tensor([-0.5920,  0.5920])


# 3. 각 parameter 별로 3을 곱해준다

In [126]:
# Gradient 확인
print("Cross Entropy Loss Gradient:")
ce_gradient = torch.autograd.grad(ce_loss, model.parameters(), retain_graph=True)
for grad_param in ce_gradient:
    print(torch.tensor(3) * grad_param)

Cross Entropy Loss Gradient:
tensor([[-0.0840, -0.1587],
        [-0.1410, -0.2665]])
tensor([0.2286, 0.3838])
tensor([[ 0.2376, -0.3153],
        [-0.2376,  0.3153]])
tensor([-0.5920,  0.5920])


## 4. weight에 2를 곱하고 gradient를 구하면 값이 달라진다

In [127]:
# 각 레이어의 가중치에 2를 곱함
for param in model.parameters():
    param.data = param.data * 2

ce_gradient = torch.autograd.grad(ce_loss, model.parameters(), retain_graph=True)
for grad_param in ce_gradient:
    print(grad_param)

tensor([[-0.0280, -0.0529],
        [-0.0470, -0.0888]])
tensor([0.0762, 0.1279])
tensor([[ 0.0792, -0.1051],
        [-0.0792,  0.1051]])
tensor([-0.1973,  0.1973])


# 5. 각 parameter 별로 3을 더해준다

In [128]:
# Gradient 확인
print("Cross Entropy Loss Gradient:")
ce_gradient = torch.autograd.grad(ce_loss, model.parameters(), retain_graph=True)
for grad_param in ce_gradient:
    print(grad_param)

Cross Entropy Loss Gradient:
tensor([[-0.0280, -0.0529],
        [-0.0470, -0.0888]])
tensor([0.0762, 0.1279])
tensor([[ 0.0792, -0.1051],
        [-0.0792,  0.1051]])
tensor([-0.1973,  0.1973])


In [129]:
# Gradient 확인
print("Cross Entropy Loss Gradient:")
ce_gradient = torch.autograd.grad(ce_loss, model.parameters(), retain_graph=True)
for grad_param in ce_gradient:
    print(torch.tensor(3) + grad_param)

Cross Entropy Loss Gradient:
tensor([[2.9720, 2.9471],
        [2.9530, 2.9112]])
tensor([3.0762, 3.1279])
tensor([[3.0792, 2.8949],
        [2.9208, 3.1051]])
tensor([2.8027, 3.1973])


## 6. Gradient를 조작했을 떄 뭐가 달라지나?

In [130]:
learning_rate = 0.01
with torch.no_grad():
    for param, grad in zip(model.parameters(), ce_gradient):
        param -= learning_rate * grad
        print(param)

Parameter containing:
tensor([[-0.2732, -0.5517],
        [-0.7075, -0.9522]], requires_grad=True)
Parameter containing:
tensor([-1.2873,  0.1420], requires_grad=True)
Parameter containing:
tensor([[0.0468, 0.0778],
        [0.8205, 1.3724]], requires_grad=True)
Parameter containing:
tensor([-0.3823,  0.1888], requires_grad=True)


In [131]:
optimizer_with_modifier = optim.SGD(model_with_modifier.parameters(), lr=0.01)
optimizer_without_modifier = optim.SGD(model_without_modifier.parameters(), lr=0.01)

Cross Entropy Loss Gradient:


RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [2, 2]], which is output 0 of TBackward, is at version 2; expected version 1 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).

In [None]:
learning_rate = 0.01
with torch.no_grad():
    for param, grad in zip(model.parameters(), ce_gradient):
        param -= learning_rate * grad
        print(param)