In [15]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

class CosineDecay(object):
    """Cosine schedule over ``num_loops`` steps, reported as ``1 - schedule``.

    The underlying cosine curve equals ``max_value`` at step 0 and
    ``min_value`` at step ``num_loops``. ``get_value`` returns one minus that
    curve, so with ``max_value=1, min_value=0`` the result rises from 0 to 1.

    Args:
        max_value: Value of the cosine curve at step 0.
        min_value: Value of the cosine curve at step ``num_loops``.
        num_loops: Number of steps over which the curve is traversed.
    """

    def __init__(self,
                 max_value,
                 min_value,
                 num_loops):
        self._max_value = max_value
        self._min_value = min_value
        self._num_loops = num_loops

    def get_value(self, i):
        """Return one minus the cosine-interpolated value at step ``i``.

        Args:
            i: Step index. Values below 0 are treated as 0 and values at or
                beyond ``num_loops`` are treated as ``num_loops``.

        Returns:
            ``1 - value`` where ``value`` lies between ``min_value`` and
            ``max_value`` on the cosine curve.
        """
        if self._num_loops == 0:
            # A zero-length schedule would divide by zero below. Treat it as
            # already finished: a step clamped to num_loops yields a cosine
            # factor of 0, so use the same factor here.
            factor = 0.0
        else:
            # Clamp the step into the schedule's range.
            if i < 0:
                i = 0
            if i >= self._num_loops:
                i = self._num_loops
            # Map the step to [0, pi] and rescale cos from [-1, 1] to [0, 1].
            factor = (math.cos(i * math.pi / self._num_loops) + 1.0) * 0.5
        value = factor * (self._max_value - self._min_value) + self._min_value
        return 1 - value
    
# Schedule over 100 steps built from CosineDecay with max_value=1, min_value=0.
gradient_decay = CosineDecay(max_value=1, min_value=0, num_loops=100)
# Preview the schedule values for the first 20 steps.
list(map(gradient_decay.get_value, range(20)))
# Scratch expression kept from the original cell:
# 1 + 20 * torch.sigmoid(torch.tensor([1]))

[0.0,
 0.0002467198171342,
 0.0009866357858642205,
 0.002219017698460002,
 0.0039426493427610065,
 0.006155829702431115,
 0.008856374635655584,
 0.012041619030626283,
 0.015708419435684462,
 0.019853157161528467,
 0.02447174185242318,
 0.029559615522887217,
 0.035111757055874326,
 0.041122687158009485,
 0.04758647376699021,
 0.054496737905816106,
 0.061846659978068264,
 0.06962898649802818,
 0.07783603724899246,
 0.08645971286271914]

In [4]:
class Global_T(nn.Module):
    """Single learnable scalar routed through a ``GradientReversal`` layer.

    The module owns one parameter, ``global_T`` (initialised to ones of
    shape ``(1,)``), and its forward pass ignores both tensor inputs.
    """

    def __init__(self):
        super().__init__()

        # nn.Parameter requires grad by default.
        self.global_T = nn.Parameter(torch.ones(1))
        self.grl = GradientReversal()

    def forward(self, fake_input1, fake_input2, lambda_):
        """Return ``global_T`` passed through ``self.grl`` with ``lambda_``.

        ``fake_input1`` and ``fake_input2`` are accepted but not used.
        """
        scaled_T = self.grl(self.global_T, lambda_)
        return scaled_T


from torch.autograd import Function
class GradientReversalFunction(Function):
    """
    Autograd function modelled on the Gradient Reversal Layer from:
    Unsupervised Domain Adaptation by Backpropagation (Ganin & Lempitsky, 2015)
    The forward pass returns a copy of the input. In the backward pass the
    upstream gradient is multiplied by lambda_ exactly as given (no sign flip
    is applied here), so a negative lambda_ is what reverses the gradient.
    """

    @staticmethod
    def forward(ctx, x, lambda_):
        # Keep the scale on the context so backward can read it.
        ctx.lambda_ = lambda_
        return x.clone()

    @staticmethod
    def backward(ctx, grads):
        # Build the scale as a tensor with the gradient's dtype and device.
        scale = grads.new_tensor(ctx.lambda_)
        # One entry per forward argument: scaled gradient for x, none for lambda_.
        return scale * grads, None


class GradientReversal(torch.nn.Module):
    """Module wrapper around ``GradientReversalFunction``.

    The scale ``lambda_`` is supplied on every call rather than stored on the
    module.
    """

    def __init__(self):
        super().__init__()

    def forward(self, x, lambda_):
        """Apply ``GradientReversalFunction`` to ``x`` with the given ``lambda_``."""
        result = GradientReversalFunction.apply(x, lambda_)
        return result
    

# Smoke test: call Global_T once with two random tensors and lambda_ = 2.
model = Global_T()
input = torch.rand((24, 24, 24))
input2 = torch.rand((24, 24, 24))

out = model(input, input2, lambda_=2)

print(out)

tensor([1.], grad_fn=<GradientReversalFunctionBackward>)


In [None]:
from models.util import MadKD

# NOTE(security): torch.load unpickles the file, which can execute arbitrary
# code; only load checkpoints that come from a trusted source.
# NOTE(review): presumably the MadKD import above is what lets the pickled
# object be reconstructed — confirm.
m = torch.load("mlp.pt")
# Print the weights of entries 0, 2 and 4 of m.mlp. A plain loop replaces the
# list comprehension so no throwaway list of None values is built and echoed
# as the cell output.
for idx in [0, 2, 4]:
    print(m.mlp[idx].weight)

In [66]:
import torch
import torch.nn as nn
import torch.optim as optim

class GradientReversalLayer(nn.Module):
    """Layer that routes its input through ``GradientReversalFunction``.

    Args:
        weight: Scale handed to the function on every forward call
            (default 1.0).
    """

    def __init__(self, weight=1.0):
        super().__init__()
        # Kept as a plain tensor attribute: neither a parameter nor a buffer.
        self.weight = torch.tensor(weight)

    def forward(self, x):
        """Apply ``GradientReversalFunction`` to ``x`` using the stored weight."""
        reversed_x = GradientReversalFunction.apply(x, self.weight)
        return reversed_x

class GradientReversalFunction(torch.autograd.Function):
    """Identity in the forward pass; gradient scaled by ``-weight`` in backward.

    ``weight`` may be a tensor or a plain Python number.
    """

    @staticmethod
    def forward(ctx, x, weight):
        """Return ``x`` unchanged and remember ``weight`` for the backward pass."""
        if not isinstance(weight, torch.Tensor):
            # save_for_backward only accepts tensors, so promote a plain number
            # to a tensor with x's dtype and device instead of failing.
            weight = torch.tensor(weight, dtype=x.dtype, device=x.device)
        ctx.save_for_backward(weight)
        return x

    @staticmethod
    def backward(ctx, grad_output):
        """Return ``weight * (-grad_output)`` for ``x`` and ``None`` for ``weight``."""
        weight, = ctx.saved_tensors
        grad_input = weight * grad_output.neg()
        return grad_input, None

class SimpleModel(nn.Module):
    """Linear layer (10 -> 1) followed by a ``GradientReversalLayer``."""

    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(10, 1)
        self.grl = GradientReversalLayer()

    def forward(self, x):
        """Run ``x`` through ``fc`` and then through ``grl``."""
        return self.grl(self.fc(x))
    
# Create the model and the optimizer
model = SimpleModel()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# Input data
input_data = torch.randn(1, 10, requires_grad=True)

# Forward pass
# NOTE: SimpleModel.forward already applies its own GradientReversalLayer, so
# this first pass is not free of gradient reversal despite the "before" label.
output_before_grl = model(input_data)
loss_before_grl = output_before_grl.sum()

# Backward pass
optimizer.zero_grad()
loss_before_grl.backward()
print("Gradients before GRL:", model.fc.weight.grad)


# Forward pass (the gradient reversal layer does not change the forward result)
input_data_grl = model(input_data)
# Add a gradient reversal layer on top of the model output
grl_layer = GradientReversalLayer()
output_after_grl = grl_layer(input_data_grl)
loss_after_grl = output_after_grl.sum()

# Backward pass
optimizer.zero_grad()
loss_after_grl.backward()
print("Gradients after GRL:", model.fc.weight.grad)

Gradients before GRL: tensor([[ 0.1056, -0.3990, -0.5509, -0.5313,  0.6619, -0.8576,  0.0689, -0.2342,
         -0.5758, -1.0041]])
Gradients after GRL: tensor([[-0.1056,  0.3990,  0.5509,  0.5313, -0.6619,  0.8576, -0.0689,  0.2342,
          0.5758,  1.0041]])


In [59]:
def kd_loss(y_s, y_t, temperature):
    """Unreduced KL terms between temperature-softened ``y_t`` and ``y_s``.

    Both inputs are divided by ``temperature`` and normalised over ``dim=1``
    (log-softmax for ``y_s``, softmax for ``y_t``). The element-wise KL terms
    are then scaled by ``temperature ** 2``.

    Returns:
        A tensor of per-element terms; no reduction is applied.
    """
    student_log_probs = F.log_softmax(y_s / temperature, dim=1)
    teacher_probs = F.softmax(y_t / temperature, dim=1)
    pointwise_kl = F.kl_div(student_log_probs, teacher_probs, reduction='none')
    return pointwise_kl * (temperature ** 2)

# Load the two saved tensors that are compared below.
# NOTE(review): presumably student (y_s) and teacher (y_t) outputs — confirm.
# NOTE(security): torch.load unpickles the file; only load trusted files.
a = torch.load('y_s_model.pth')
b = torch.load('y_t_model.pth')
# Raise the summarisation threshold and line width used when printing tensors.
torch.set_printoptions(threshold=10000, linewidth=1000)
# Unreduced KD loss at temperature 4.
loss = kd_loss(a, b, 4)
# NOTE(review): this tuple is built and then discarded; it is not the last
# expression of the cell, so it is not displayed.
loss, loss.sum(dim=1), loss.sum(dim=1).shape
# Cell output: the two largest loss entries along dim=1 for the first 10 rows.
torch.topk(loss[0:10,:], k=2, dim=1)

torch.return_types.topk(
values=tensor([[4.5864e+36, 0.0000e+00],
        [0.0000e+00, 0.0000e+00],
        [4.3664e+36, 0.0000e+00],
        [4.3664e+36, 0.0000e+00],
        [4.5864e+36, 0.0000e+00],
        [4.5864e+36, 0.0000e+00],
        [4.5864e+36, 0.0000e+00],
        [4.5864e+36, 1.6489e+30],
        [4.5864e+36, 0.0000e+00],
        [0.0000e+00, 0.0000e+00]], device='cuda:0', grad_fn=<TopkBackward0>),
indices=tensor([[69,  0],
        [ 1,  0],
        [27,  0],
        [27,  0],
        [69,  0],
        [69,  0],
        [69,  0],
        [69, 27],
        [69,  0],
        [ 1,  0]], device='cuda:0'))

In [16]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

def kd_loss(y_s, y_t, temperature):
    """KL divergence between temperature-softened ``y_t`` and ``y_s``.

    Both inputs are divided by ``temperature`` and normalised over ``dim=1``
    (log-softmax for ``y_s``, softmax for ``y_t``). The 'batchmean'-reduced KL
    divergence is then scaled by ``temperature ** 2``.

    Returns:
        A scalar tensor.
    """
    student_log_probs = F.log_softmax(y_s / temperature, dim=1)
    teacher_probs = F.softmax(y_t / temperature, dim=1)
    batch_kl = F.kl_div(student_log_probs, teacher_probs, reduction='batchmean')
    return batch_kl * (temperature ** 2)

# Toy experiment: inspect which weights receive gradients from the combined loss.
# Random (2, 8) input batch; gt is created but not used in this cell.
x = torch.randn(2, 8, requires_grad=True)
gt = torch.ones((2), dtype=torch.long)

# Two independent 8 -> 10 linear maps.
# NOTE(review): presumably ms is the student and mt the teacher — confirm.
ms = nn.Linear(8, 10)
mt = nn.Linear(8, 10)

# 10 -> 2 linear map applied on top of both outputs.
m = nn.Linear(10, 2)

# The optimizer covers ms and m only; it is never stepped in this cell.
opt = optim.SGD(nn.ModuleList([ms, m]).parameters(), lr=0.1)

# p is computed with autograd enabled; q is computed under no_grad.
p = ms(x)  
with torch.no_grad(): 
    q = mt(x)   

# NOTE(review): p was already computed above, before requires_grad is switched
# off on ms — confirm this toggle has the intended effect.
for param in ms.parameters():
    param.requires_grad = False

# train for mlp
p_m = m(p)
with torch.no_grad(): 
    q_m = m(q)

# Restore requires_grad on ms.
for param in ms.parameters():
    param.requires_grad = True
    
# Same map applied to detached inputs, so this branch cannot reach ms.
p_md = m(p.detach())
q_md = m(q.detach())

# Sum of the two KD terms at temperature 4.
ctr = kd_loss(p_md, q_md, 4) + kd_loss(p_m, q_m, 4)
ctr.backward(retain_graph=True)
# Cell output: the gradients stored on ms.weight and m.weight.
ms.weight.grad, m.weight.grad

(tensor([[-4.9106e-02, -7.6506e-02,  5.0916e-02, -3.6265e-02,  7.5159e-02,
           7.8946e-02, -2.7386e-04,  3.9537e-02],
         [ 1.5538e-02,  2.4208e-02, -1.6111e-02,  1.1475e-02, -2.3782e-02,
          -2.4980e-02,  8.6655e-05, -1.2510e-02],
         [-9.3517e-03, -1.4570e-02,  9.6963e-03, -6.9063e-03,  1.4313e-02,
           1.5034e-02, -5.2155e-05,  7.5294e-03],
         [ 4.1071e-03,  6.3987e-03, -4.2584e-03,  3.0331e-03, -6.2861e-03,
          -6.6028e-03,  2.2904e-05, -3.3068e-03],
         [ 3.5342e-02,  5.5062e-02, -3.6645e-02,  2.6101e-02, -5.4093e-02,
          -5.6818e-02,  1.9710e-04, -2.8455e-02],
         [-1.3887e-02, -2.1636e-02,  1.4399e-02, -1.0256e-02,  2.1255e-02,
           2.2326e-02, -7.7446e-05,  1.1181e-02],
         [-1.5707e-02, -2.4471e-02,  1.6285e-02, -1.1600e-02,  2.4040e-02,
           2.5251e-02, -8.7590e-05,  1.2646e-02],
         [ 5.2004e-02,  8.1020e-02, -5.3920e-02,  3.8405e-02, -7.9594e-02,
          -8.3604e-02,  2.9002e-04, -4.1870e-02],


In [21]:
import torch

# Suppose we have a tensor of loss values
# (illustrative only; in practice this might be the loss values of one batch)
losses = torch.tensor([0.0] * 4)

# Mean of the losses over the batch
batch_mean = losses.mean()
print("Batch Mean Loss:", batch_mean.item())


Batch Mean Loss: 0.0


In [42]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

def kd_loss(y_s, y_t, temperature):
    """KL divergence between temperature-softened ``y_t`` and ``y_s``.

    Both inputs are divided by ``temperature`` and normalised over ``dim=1``
    (log-softmax for ``y_s``, softmax for ``y_t``). The 'batchmean'-reduced KL
    divergence is then scaled by ``temperature ** 2``.

    Returns:
        A scalar tensor.
    """
    student_log_probs = F.log_softmax(y_s / temperature, dim=1)
    teacher_probs = F.softmax(y_t / temperature, dim=1)
    batch_kl = F.kl_div(student_log_probs, teacher_probs, reduction='batchmean')
    return batch_kl * (temperature ** 2)

# Toy experiment comparing the gradient on ms.weight for two loss placements.
# Random (2, 3) input batch; gt is created but not used in this cell.
x = torch.randn(2, 3, requires_grad=True)
gt = torch.ones((2), dtype=torch.long)

# Two independent 3 -> 5 linear maps.
# NOTE(review): presumably ms is the student and mt the teacher — confirm.
ms = nn.Linear(3, 5)
mt = nn.Linear(3, 5)

# 5 -> 4 linear map applied on top of both outputs.
m = nn.Linear(5, 4)

# The optimizer covers ms and m only; it is used here just for zero_grad().
opt = optim.SGD(nn.ModuleList([ms, m]).parameters(), lr=0.1)

# way1: KD loss taken on the outputs of m
opt.zero_grad()

# p is computed with autograd enabled; q is computed under no_grad.
p = ms(x)  
with torch.no_grad(): 
    q = mt(x)   

# m's parameters do not require grad while p_m is computed below.
for param in m.parameters():
    param.requires_grad = False

# train for mlp
p_m = m(p)
with torch.no_grad(): 
    q_m = m(q)

# Restore requires_grad on m.
for param in m.parameters():
    param.requires_grad = True
    
# Computed on detached inputs; not used by the loss in this variant.
p_md = m(p.detach())
q_md = m(q.detach())

ctr = kd_loss(p_m, q_m, 4) # the kd_loss(p_md, q_md, 4) term is disabled here
ctr.backward(retain_graph=True)
print(ms.weight.grad, m.weight.grad)

# Reset gradients before the second variant.
opt.zero_grad()

# way2: KD loss taken directly between p and q, so m is not part of the loss
p = ms(x)  
with torch.no_grad(): 
    q = mt(x)   

for param in m.parameters():
    param.requires_grad = False

# train for mlp
# p_m, q_m, p_md and q_md are recomputed below but not used by this variant's loss.
p_m = m(p)
with torch.no_grad(): 
    q_m = m(q)

for param in m.parameters():
    param.requires_grad = True
    
p_md = m(p.detach())
q_md = m(q.detach())

ctr = kd_loss(p, q, 4)
ctr.backward(retain_graph=True)
print(ms.weight.grad, m.weight.grad)

tensor([[-0.3086,  0.1316, -0.2693],
        [ 0.0367, -0.0231,  0.0470],
        [-0.3553,  0.1589, -0.3248],
        [-0.3210,  0.1425, -0.2914],
        [-0.2083,  0.0974, -0.1989]]) None
tensor([[ 0.2311, -0.1373,  0.2795],
        [-0.0049, -0.0013,  0.0024],
        [ 0.0251,  0.0059, -0.0114],
        [-0.0203,  0.0341, -0.0687],
        [-0.2310,  0.0986, -0.2018]]) None


In [None]:
import matplotlib.pyplot as plt
# NOTE(security): torch.load unpickles the file; only load trusted files.
a = torch.load("y_s_model.pt")
b = torch.load("y_t_model.pt")  # loaded but not plotted in this cell

# Detach before converting: Tensor.numpy() raises on a tensor that requires
# grad. NOTE(review): the saved tensor may still carry autograd state — confirm.
plt.plot(a.detach().cpu().numpy())