In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from torch.distributions import Normal, Categorical
import torch.optim as optim
import random
from collections import deque

# 检查GPU可用性
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# 已有数据（保持不变）
seed_value = 1234
np.random.seed(seed_value)
UNIT = 40
MAZE_H = 8
MAZE_W = 8
utilization_ratios_device = 0.1
utilization_ratios_server = 0.1
car_flop = 1.3 * 10**12
car_power = 20
# UAV_flop = 1.3 * 10**12*0.4##0.641 * 10**12
UAV_flop = 0.641 * 10**12
# UAV_power = 20##30
UAV_power = 30
e_flop = 330 * 10**12
e_power = 450
d_fai = 5*10**-29
trans_v_up = 100*1024*1024/4 #553081138.4484484
trans_v_dn = 20*1024*1024/4
p_cm = 0.1
# nums_data = np.array([5, 3, 4, 7, 9])  # 客户端本地数据量
partition_point = [0, 1, 2, 3, 4, 5, 6]

num_img_UAV = 3
num_img_car = 1

device_load = [0.3468e9, 0.3519e9, 2.3408e9, 2.3409e9, 5.3791e9, 9.6951e9, 12.077e9]
server_load = [11.7321e9, 11.727e9, 9.7381e9, 9.738e9, 6.6998e9, 2.3838e9, 0.0019e9]
exchanged_data = [2359296, 2359296, 2359296, 2359296, 1179628, 589824, 294912]
privacy_leak = [0.96122, 0.608901, 0.57954889, 0.593044, 0.535525, 0.007155, 0.054303]

# 转换为NumPy数组
np_partition = np.array(partition_point)
np_device = np.array(device_load)
np_server = np.array(server_load)
np_exchanged = np.array(exchanged_data)
np_privacy = np.array(privacy_leak)



Using device: cuda


In [2]:
def cost_cal(num_data, v_flop, device_power, partition_index):
    partial_device = np_device[partition_index]
    device_time = partial_device * num_data / (v_flop *utilization_ratios_device)

    partial_server = np_server[partition_index]
    server_time = partial_server * num_data / ( e_flop* utilization_ratios_server + 1e-8)
    # print(f"device_time is {device_time}, server_time is {server_time}, cal_time is {device_time+server_time}")

    feature = np_exchanged[partition_index]
    trans_t_up = feature / trans_v_up * num_data
    # print(f"device_time is {device_time}, server_time is {server_time}, cal_time is {device_time+server_time},trans_t_up is {trans_t_up}")
    energy_cal = ((partial_device * device_power) / v_flop + (
            partial_server * e_power * utilization_ratios_server) / e_flop) * num_data
    energy_trans = num_data * p_cm * trans_t_up
    energy = energy_cal + energy_trans
    # print(f"energy cal is{energy_cal}, trans is {energy_trans}")
    landa_trans = 1
    time_all = device_time + server_time + landa_trans * trans_t_up
    return time_all, energy

In [3]:

for partition_num in range(len(partition_point)):
    time_UAV, energy_UAV = cost_cal(3, UAV_flop, UAV_power, partition_num)
    # time_car, energy_car = cost_cal(1, car_flop, partition_num)
            # total_time += time
    reward = -(time_UAV)*0.4 - (energy_UAV)*0.3 - 0.3*np_privacy[partition_num]
    # print(f"partition is {partition_num+1}:time is {time_car+time_UAV}, energy is {energy_car + energy_UAV}, reward is {reward}")
    print(f" reward is {reward}")
    
print(f"______________")
for partition_num in range(len(partition_point)):
    # time_UAV, energy_UAV = cost_cal(3, UAV_flop, partition_num)
    time_car, energy_car = cost_cal(1, car_flop, car_power, partition_num)
            # total_time += time
    reward = -(time_car)*0.4 - (energy_car)*0.3 - 0.3*np_privacy[partition_num]
    # print(f"partition is {partition_num+1}:time is {time_car+time_UAV}, energy is {energy_car + energy_UAV}, reward is {reward}")
    print(f" reward is {reward}")

 reward is -0.4436326264607857
 reward is -0.3382464115090058
 reward is -0.4501338789805701
 reward is -0.454188480314849
 reward is -0.5551497908531352
 reward is -0.6254742198482486
 reward is -0.7676227740355978
______________
 reward is -0.33035584912587407
 reward is -0.22469910944055943
 reward is -0.23108723524009325
 reward is -0.23513653216783215
 reward is -0.2217401566679699
 reward is -0.08652560594405594
 reward is -0.11402850075757574


In [4]:
class PartitionEnv:
    def __init__(self, np_partition, np_device, np_server, np_exchanged, np_privacy, device, device_index):
        self.np_partition = np_partition
        self.np_device = np_device
        self.np_server = np_server
        self.np_exchanged = np_exchanged
        self.np_privacy = np_privacy
        self.device = device
        self.device_index = device_index
        self.current_step = 0
        self.max_steps = 10
        
    def reset(self):
        self.current_step = 0
        # 将状态转移到正确的设备
        return torch.tensor([self.np_partition[self.current_step]], dtype=torch.float32).to(self.device)
    
    def step(self, action):
        partition_num = int(action)
        if self.device_index == 0:
            num_data = 3  # 假定每次选择相同数量的数据
            v_flop = 0.641 * 10**12
            device_power = 30
        else:
            num_data = 1
            v_flop = 1.3 * 10**12
            device_power = 20
        
        time, energy = cost_cal(num_data, v_flop, device_power, partition_num)
        
        # 计算reward: 负值越小越好
        reward = -(time)*0.4 - (energy)*0.3 - 0.3*self.np_privacy[partition_num]
        # print(f"device {self.device_index} reward = {abs(reward)}")
        self.current_step += 1
        
        done = self.current_step >= 10
        # 将下一个状态转移到正确的设备
        return torch.tensor([self.np_partition[partition_num]] if not done else [0], dtype=torch.float32).to(self.device), reward, done


# 经验回放池
class ReplayBuffer:
    def __init__(self, capacity, alpha=0.6):
        self.buffer = deque(maxlen=capacity)
        self.alpha = alpha
        # 存储优先级张量
        self.priorities = deque(maxlen=capacity)  

    def push(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))
        # 初始优先级设为 1.0（张量形式，与 state 同设备）
        max_priority = torch.tensor(1.0, device=state.device)  
        self.priorities.append(max_priority)  # 存入张量

    def sample(self, batch_size, beta=0.4):
        # 将优先级转为 NumPy 数组（确保在 CPU 上）
        priorities = torch.stack(list(self.priorities)).cpu().detach().numpy() ** self.alpha
        
        probabilities = priorities / priorities.sum()
        indices = np.random.choice(len(self.buffer), batch_size, p=probabilities)
        batch = [self.buffer[idx] for idx in indices]

        # 计算权重（同之前逻辑）
        weights = (len(self.buffer) * probabilities[indices]) ** (-beta)
        weights /= weights.max()

        # 解包并转换为张量（保持设备一致）
        states, actions, rewards, next_states, dones = zip(*batch)
        states = torch.stack(states).to(states[0].device)  # 恢复原始设备
        next_states = torch.stack(next_states).to(next_states[0].device)
        actions = torch.tensor(actions).to(states[0].device)
        rewards = torch.tensor(rewards).to(states[0].device)
        dones = torch.tensor(dones).to(states[0].device).float()
        weights = torch.tensor(weights).to(states[0].device)

        return states, actions, rewards, next_states, dones, weights, indices

    def update_priorities(self, indices, priorities):
        for idx, priority in zip(indices, priorities):
            # 确保 priority 是张量（若不是，需转换）
            if not isinstance(priority, torch.Tensor):
                priority = torch.tensor(priority, device=self.priorities[0].device)
            self.priorities[idx] = priority

# Dueling DQN + Double DQN + Noisy Network
class DuelingQNetwork(nn.Module):
    def __init__(self, input_dim, action_dim, hidden_dim=128):
        super(DuelingQNetwork, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        
        # 分别用于计算状态值和动作优势
        self.value_fc = nn.Linear(hidden_dim, 1)
        self.advantage_fc = nn.Linear(hidden_dim, action_dim)
        
        # Noisy layers
        self.noisy_fc1 = nn.Linear(input_dim, hidden_dim)
        self.noisy_fc2 = nn.Linear(hidden_dim, hidden_dim)

        self.action_dim = action_dim

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        
        value = self.value_fc(x)
        advantage = self.advantage_fc(x)

        q_value = value + (advantage - advantage.mean())
        return q_value

    def noisy_forward(self, x):
        x = F.relu(self.noisy_fc1(x))
        x = F.relu(self.noisy_fc2(x))
        
        value = self.value_fc(x)
        advantage = self.advantage_fc(x)

        q_value = value + (advantage - advantage.mean())
        return q_value

# DQN with Double DQN and Dueling DQN
class RainbowDQN:
    def __init__(self, input_dim, action_dim, gamma=0.89, epsilon=0.2, lr=0.01, buffer_size=10000, batch_size=64):
        self.gamma = gamma
        self.epsilon = epsilon
        self.batch_size = batch_size
        self.action_dim = action_dim
        self.input_dim = input_dim

        self.q_network = DuelingQNetwork(input_dim, action_dim).to(device)
        self.target_q_network = DuelingQNetwork(input_dim, action_dim).to(device)
        self.optimizer = optim.Adam(self.q_network.parameters(), lr=lr)

        self.replay_buffer = ReplayBuffer(buffer_size)
        
        self.update_target_network()
    
    def update_target_network(self):
        self.target_q_network.load_state_dict(self.q_network.state_dict())

    def select_action(self, state):
        if random.random() < self.epsilon:
            return random.randint(0, self.action_dim - 1)
        else:
            state = torch.tensor(state, dtype=torch.float32).to(device)
            with torch.no_grad():
                q_values = self.q_network(state)
            return torch.argmax(q_values).item()

    def compute_loss(self, batch):
        states, actions, rewards, next_states, dones, weights, indices = batch
    
        states = states.to(device)
        next_states = next_states.to(device)
        actions = actions.to(device)
        rewards = rewards.to(device)
        dones = dones.to(device).float()  # 将dones转换为float类型
        weights = weights.to(device)
    
        # Double DQN
        next_q_values = self.q_network(next_states)
        next_q_values_target = self.target_q_network(next_states)
        next_action = torch.argmax(next_q_values, dim=1)
    
        target_q_value = rewards + (1 - dones) * self.gamma * next_q_values_target.gather(1, next_action.unsqueeze(1)).squeeze(1)
    
        # 当前Q网络估计的Q值
        q_value = self.q_network(states).gather(1, actions.unsqueeze(1)).squeeze(1)
    
        # 计算损失
        loss = (weights * (q_value - target_q_value) ** 2).mean()
    
        return loss, target_q_value


    def train(self):
        if len(self.replay_buffer.buffer) < self.batch_size:
            return 0

        # 从经验回放池中采样一个batch
        batch = self.replay_buffer.sample(self.batch_size)
        
        # 计算损失
        loss, target_q_value = self.compute_loss(batch)

        # 执行反向传播
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # 更新优先级
        priorities = (target_q_value - loss).abs() + 1e-5
        self.replay_buffer.update_priorities(batch[6], priorities)

        return loss.item()


In [5]:
# def train_RainbowDQN():
#     device_index = 1
#     env = PartitionEnv(np_partition, np_device, np_server, np_exchanged, np_privacy, device, device_index)
#     agent = RainbowDQN(input_dim=1, action_dim=len(np_partition))

#     num_episodes = 1001
#     for episode in range(num_episodes):
#         state = env.reset()
#         done = False
#         total_reward = 0

#         while not done:
#             action = agent.select_action(state)
#             next_state, reward, done = env.step(action)
#             total_reward += reward

#             # 存储经验
#             agent.replay_buffer.push(state, action, reward, next_state, done)

#             # 更新Q网络
#             loss = agent.train()

#             state = next_state
        
#         if episode % 50 == 0:
#             agent.update_target_network()
#             print(f"device {device_index} Episode {episode}, action {action}")

#     return agent

# # 训练Rainbow DQN代理
# agent = train_RainbowDQN()

In [None]:
def train_UAVRainbowDQN():
    device_index = 0
    env = PartitionEnv(np_partition, np_device, np_server, np_exchanged, np_privacy, device, device_index)
    agent = RainbowDQN(input_dim=1, action_dim=len(np_partition))

    num_episodes = 30001
    for episode in range(num_episodes):
        state = env.reset()
        done = False
        total_reward = 0

        while not done:
            action = agent.select_action(state)
            next_state, reward, done = env.step(action)
            total_reward += reward

            # 存储经验
            agent.replay_buffer.push(state, action, reward, next_state, done)

            # 更新Q网络
            loss = agent.train()

            state = next_state
        
        if episode % 10 == 0:
            agent.update_target_network()
            print(f"device {device_index} Episode {episode}, action {action}")
            history_0.append(reward)

    return agent

# 训练Rainbow DQN代理
history_0 = []
agent = train_UAVRainbowDQN()


  from .autonotebook import tqdm as notebook_tqdm
  state = torch.tensor(state, dtype=torch.float32).to(device)


device 0 Episode 0, action 5
device 0 Episode 10, action 1
device 0 Episode 20, action 1
device 0 Episode 30, action 0
device 0 Episode 40, action 1
device 0 Episode 50, action 1
device 0 Episode 60, action 1
device 0 Episode 70, action 1
device 0 Episode 80, action 1
device 0 Episode 90, action 1
device 0 Episode 100, action 1
device 0 Episode 110, action 1
device 0 Episode 120, action 1
device 0 Episode 130, action 5
device 0 Episode 140, action 1
device 0 Episode 150, action 1
device 0 Episode 160, action 1
device 0 Episode 170, action 1
device 0 Episode 180, action 1
device 0 Episode 190, action 1
device 0 Episode 200, action 2
device 0 Episode 210, action 1
device 0 Episode 220, action 1
device 0 Episode 230, action 1
device 0 Episode 240, action 3
device 0 Episode 250, action 1
device 0 Episode 260, action 1
device 0 Episode 270, action 1
device 0 Episode 280, action 1
device 0 Episode 290, action 1
device 0 Episode 300, action 3
device 0 Episode 310, action 4
device 0 Episode 32

In [None]:
import pandas as pd
df = pd.DataFrame(history_0, columns=['Reward'])
excel_file_path = 'rainbow_0.xlsx'
df.to_excel(excel_file_path, index=False)

print(f"数据已成功保存到 {excel_file_path}")

In [None]:
# def train_UAVppo():
#     device_index = 0###is UAV
#     input_dim = 1  # 只考虑当前分割点的索引
#     action_dim = len(np_partition)  # 可选的分割点数量
#     hidden_dim = 128
#     ppo = PPO(input_dim, action_dim, hidden_dim).to(device)  # 确保模型也在device上
#     optimizer = optim.Adam(ppo.parameters(), lr=1e-3)

#     epochs = 1000
#     gamma = 0.99  # 折扣因子
#     epsilon = 0.2  # PPO的裁剪系数
#     env = PartitionEnv(np_partition, np_device, np_server, np_exchanged, np_privacy, device, device_index)
    
#     for epoch in range(epochs):
#         state = env.reset()
#         done = False
#         total_reward = 0
#         while not done:
#             logits, value = ppo(state)
#             dist = Categorical(logits=logits)
#             action = dist.sample()
#             next_state, reward, done = env.step(action)
#             total_reward += reward

#             # 计算优势（reward的差异与当前价值的差异）
#             advantage = reward + gamma * value - value
            
#             # 计算损失函数
#             log_prob = dist.log_prob(action)
#             ratio = torch.exp(log_prob - dist.log_prob(action))  # 当前和旧的策略比率
#             clip_advantage = torch.clamp(ratio, 1 - epsilon, 1 + epsilon) * advantage
#             loss = -torch.min(ratio * advantage, clip_advantage).mean() + 0.5 * advantage.pow(2).mean()

#             # 优化
#             optimizer.zero_grad()
#             loss.backward()
#             optimizer.step()

#         if epoch % 100 == 0:
#             print(f'device {device_index}, split: {action}')
    
#     # 返回训练后找到的最佳分割点
#     return ppo

# # 训练PPO代理
# ppo_model = train_UAVppo()

In [None]:
# for item in history_0:
#     print(item)