In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from torch.distributions import Normal, Categorical
import torch.optim as optim
import random
from collections import deque

# 检查GPU可用性
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# 已有数据（保持不变）
seed_value = 1234
np.random.seed(seed_value)
UNIT = 40
MAZE_H = 8
MAZE_W = 8
utilization_ratios_device = 0.1
utilization_ratios_server = 0.1
car_flop = 1.3 * 10**12
car_power = 20
# UAV_flop = 1.3 * 10**12*0.4##0.641 * 10**12
UAV_flop = 0.641 * 10**12
# UAV_power = 20##30
UAV_power = 30
e_flop = 330 * 10**12
e_power = 450
d_fai = 5*10**-29
trans_v_up = 100*1024*1024/4 #553081138.4484484
trans_v_dn = 20*1024*1024/4
p_cm = 0.1
# nums_data = np.array([5, 3, 4, 7, 9])  # 客户端本地数据量
partition_point = [0, 1, 2, 3, 4, 5, 6]

num_img_UAV = 3
num_img_car = 1

device_load = [0.3468e9, 0.3519e9, 2.3408e9, 2.3409e9, 5.3791e9, 9.6951e9, 12.077e9]
server_load = [11.7321e9, 11.727e9, 9.7381e9, 9.738e9, 6.6998e9, 2.3838e9, 0.0019e9]
exchanged_data = [2359296, 2359296, 2359296, 2359296, 1179628, 589824, 294912]
privacy_leak = [0.96122, 0.608901, 0.57954889, 0.593044, 0.535525, 0.007155, 0.054303]

# 转换为NumPy数组
np_partition = np.array(partition_point)
np_device = np.array(device_load)
np_server = np.array(server_load)
np_exchanged = np.array(exchanged_data)
np_privacy = np.array(privacy_leak)



Using device: cuda


In [2]:
def cost_cal(num_data, v_flop, device_power, partition_index):
    partial_device = np_device[partition_index]
    device_time = partial_device * num_data / (v_flop *utilization_ratios_device)

    partial_server = np_server[partition_index]
    server_time = partial_server * num_data / ( e_flop* utilization_ratios_server + 1e-8)
    # print(f"device_time is {device_time}, server_time is {server_time}, cal_time is {device_time+server_time}")

    feature = np_exchanged[partition_index]
    trans_t_up = feature / trans_v_up * num_data
    # print(f"device_time is {device_time}, server_time is {server_time}, cal_time is {device_time+server_time},trans_t_up is {trans_t_up}")
    energy_cal = ((partial_device * device_power) / v_flop + (
            partial_server * e_power * utilization_ratios_server) / e_flop) * num_data
    energy_trans = num_data * p_cm * trans_t_up
    energy = energy_cal + energy_trans
    # print(f"energy cal is{energy_cal}, trans is {energy_trans}")
    landa_trans = 1
    time_all = device_time + server_time + landa_trans * trans_t_up
    return time_all, energy

In [3]:

for partition_num in range(len(partition_point)):
    time_UAV, energy_UAV = cost_cal(3, UAV_flop, UAV_power, partition_num)
    # time_car, energy_car = cost_cal(1, car_flop, partition_num)
            # total_time += time
    reward = -(time_UAV)*0.4 - (energy_UAV)*0.3 - 0.3*np_privacy[partition_num]
    # print(f"partition is {partition_num+1}:time is {time_car+time_UAV}, energy is {energy_car + energy_UAV}, reward is {reward}")
    print(f" reward is {reward}")
    
print(f"______________")
for partition_num in range(len(partition_point)):
    # time_UAV, energy_UAV = cost_cal(3, UAV_flop, partition_num)
    time_car, energy_car = cost_cal(1, car_flop, car_power, partition_num)
            # total_time += time
    reward = -(time_car)*0.4 - (energy_car)*0.3 - 0.3*np_privacy[partition_num]
    # print(f"partition is {partition_num+1}:time is {time_car+time_UAV}, energy is {energy_car + energy_UAV}, reward is {reward}")
    print(f" reward is {reward}")

 reward is -0.4436326264607857
 reward is -0.3382464115090058
 reward is -0.4501338789805701
 reward is -0.454188480314849
 reward is -0.5551497908531352
 reward is -0.6254742198482486
 reward is -0.7676227740355978
______________
 reward is -0.33035584912587407
 reward is -0.22469910944055943
 reward is -0.23108723524009325
 reward is -0.23513653216783215
 reward is -0.2217401566679699
 reward is -0.08652560594405594
 reward is -0.11402850075757574


In [4]:
# 修改 ActorCritic 网络的 forward 函数，确保输入维度一致
class ActorCritic(nn.Module):
    def __init__(self, input_dim, action_dim, hidden_dim=128):
        super(ActorCritic, self).__init__()
        # Actor 网络
        self.fc1_actor = nn.Linear(input_dim, hidden_dim)
        self.fc2_actor = nn.Linear(hidden_dim, hidden_dim)
        self.fc3_actor = nn.Linear(hidden_dim, action_dim)  # 输出策略：选择分割点
        
        # Critic 网络
        self.fc1_critic = nn.Linear(input_dim, hidden_dim)
        self.fc2_critic = nn.Linear(hidden_dim, hidden_dim)
        self.fc3_critic = nn.Linear(hidden_dim, 1)  # 输出价值函数：当前状态的价值

    def forward(self, x):
        # Actor
        x_actor = torch.relu(self.fc1_actor(x))
        x_actor = torch.relu(self.fc2_actor(x_actor))
        action_probs = torch.softmax(self.fc3_actor(x_actor), dim=-1)
        
        # Critic
        x_critic = torch.relu(self.fc1_critic(x))
        x_critic = torch.relu(self.fc2_critic(x_critic))
        state_value = self.fc3_critic(x_critic)

        return action_probs, state_value


# 修改环境中的 reset 和 step 方法，确保状态是正确的维度
class PartitionEnv:
    def __init__(self, np_partition, np_device, np_server, np_exchanged, np_privacy, device, device_index):
        self.np_partition = np_partition
        self.np_device = np_device
        self.np_server = np_server
        self.np_exchanged = np_exchanged
        self.np_privacy = np_privacy
        self.device = device
        self.device_index = device_index
        self.current_step = 0
        self.max_steps = 10
        
    def reset(self):
        self.current_step = 0
        # 将状态转移到正确的设备
        return torch.tensor([self.np_partition[self.current_step]], dtype=torch.float32).to(self.device)
    
    def step(self, action):
        partition_num = int(action)
        if self.device_index == 0:
            num_data = 3  # 假定每次选择相同数量的数据
            v_flop = 0.641 * 10**12
            device_power = 30
        else:
            num_data = 1
            v_flop = 1.3 * 10**12
            device_power = 20
        
        time, energy = cost_cal(num_data, v_flop, device_power, partition_num)
        
        # 计算reward: 负值越小越好
        reward = -(time)*0.4 - (energy)*0.3 - 0.3*self.np_privacy[partition_num]
        # print(f"device {self.device_index} reward = {abs(reward)}")
        self.current_step += 1
        
        done = self.current_step >= 10
        # 将下一个状态转移到正确的设备
        return torch.tensor([self.np_partition[partition_num]] if not done else [0], dtype=torch.float32).to(self.device), reward, done


In [5]:
# def train_carAC():
#     device_index = 1  # 假定UAV
#     input_dim = 1  # 只考虑当前分割点的索引
#     action_dim = len(np_partition)  # 可选的分割点数量
#     hidden_dim = 128
#     model_car = ActorCritic(input_dim, action_dim, hidden_dim).to(device)  # 确保模型也在device上
#     optimizer = optim.Adam(model_car.parameters(), lr=1e-3)

#     epochs = 2001  # 增加训练周期
#     gamma = 0.99  # 折扣因子
#     env = PartitionEnv(np_partition, np_device, np_server, np_exchanged, np_privacy, device, device_index)
#     for epoch in range(epochs):
#         state = env.reset()
#         done = False
#         total_reward = 0
#         while not done:
#             # 获取Actor的策略输出和Critic的状态值
#             action_probs, state_value = model_car(state)
            
#             # 根据概率选择动作
#             dist = Categorical(action_probs)
#             action = dist.sample()
            
#             # 计算log概率
#             log_prob = dist.log_prob(action)
            
#             # 执行动作并得到下一个状态和奖励
#             next_state, reward, done = env.step(action)
#             total_reward += reward
            
#             # 计算目标值（TD误差）
#             _, next_state_value = model_car(next_state)
#             target_value = reward + gamma * next_state_value * (1 - done)
            
#             # 计算优势
#             advantage = target_value - state_value
            
#             # 计算损失
#             actor_loss = -log_prob * advantage.detach()  # Actor损失：策略梯度
#             critic_loss = advantage.pow(2)  # Critic损失：均方误差
#             loss = actor_loss + 0.5 * critic_loss.mean()

#             # 优化
#             optimizer.zero_grad()
#             loss.backward()
#             optimizer.step()

#         # 每100个epoch打印一次
#         if epoch % 50 == 0:
#             print(f'Epoch [{epoch}/{epochs}], Total Reward: {action},reward {reward}')
    
#     # 返回训练后找到的最佳分割点
#     return model_car

# # 训练AC代理
# ac_model = train_carAC()

In [6]:
# Epoch [0/2001], Total Reward: 1,reward -0.22469910944055943
# Epoch [50/2001], Total Reward: 5,reward -0.08652560594405594
# Epoch [100/2001], Total Reward: 5,reward -0.08652560594405594
# Epoch [150/2001], Total Reward: 5,reward -0.08652560594405594
# Epoch [200/2001], Total Reward: 5,reward -0.08652560594405594
# Epoch [250/2001], Total Reward: 5,reward -0.08652560594405594
# Epoch [300/2001], Total Reward: 5,reward -0.08652560594405594
# Epoch [350/2001], Total Reward: 5,reward -0.08652560594405594
# Epoch [400/2001], Total Reward: 5,reward -0.08652560594405594
# Epoch [450/2001], Total Reward: 5,reward -0.08652560594405594
# Epoch [500/2001], Total Reward: 5,reward -0.08652560594405594
# Epoch [550/2001], Total Reward: 5,reward -0.08652560594405594
# Epoch [600/2001], Total Reward: 5,reward -0.08652560594405594
# Epoch [650/2001], Total Reward: 5,reward -0.08652560594405594
# Epoch [700/2001], Total Reward: 5,reward -0.08652560594405594
# Epoch [750/2001], Total Reward: 5,reward -0.08652560594405594
# Epoch [800/2001], Total Reward: 5,reward -0.08652560594405594
# Epoch [850/2001], Total Reward: 5,reward -0.08652560594405594
# Epoch [900/2001], Total Reward: 5,reward -0.08652560594405594
# Epoch [950/2001], Total Reward: 5,reward -0.08652560594405594
# Epoch [1000/2001], Total Reward: 5,reward -0.08652560594405594
# Epoch [1050/2001], Total Reward: 5,reward -0.08652560594405594
# Epoch [1100/2001], Total Reward: 5,reward -0.08652560594405594
# Epoch [1150/2001], Total Reward: 5,reward -0.08652560594405594
# Epoch [1200/2001], Total Reward: 5,reward -0.08652560594405594
# Epoch [1250/2001], Total Reward: 5,reward -0.08652560594405594
# Epoch [1300/2001], Total Reward: 5,reward -0.08652560594405594
# Epoch [1350/2001], Total Reward: 5,reward -0.08652560594405594
# Epoch [1400/2001], Total Reward: 5,reward -0.08652560594405594
# Epoch [1450/2001], Total Reward: 5,reward -0.08652560594405594
# Epoch [1500/2001], Total Reward: 5,reward -0.08652560594405594
# Epoch [1550/2001], Total Reward: 5,reward -0.08652560594405594
# Epoch [1600/2001], Total Reward: 5,reward -0.08652560594405594
# Epoch [1650/2001], Total Reward: 5,reward -0.08652560594405594
# Epoch [1700/2001], Total Reward: 5,reward -0.08652560594405594
# Epoch [1750/2001], Total Reward: 5,reward -0.08652560594405594
# Epoch [1800/2001], Total Reward: 5,reward -0.08652560594405594
# Epoch [1850/2001], Total Reward: 5,reward -0.08652560594405594
# Epoch [1900/2001], Total Reward: 5,reward -0.08652560594405594
# Epoch [1950/2001], Total Reward: 5,reward -0.08652560594405594
# Epoch [2000/2001], Total Reward: 5,reward -0.0865256059440559

In [7]:
def train_UAVAC():
    device_index = 1  # 假定UAV
    input_dim = 1  # 只考虑当前分割点的索引
    action_dim = len(np_partition)  # 可选的分割点数量
    hidden_dim = 128
    model_UAV = ActorCritic(input_dim, action_dim, hidden_dim).to(device)  # 确保模型也在device上
    optimizer = optim.Adam(model_UAV.parameters(), lr=1e-3)

    epochs = 1001  # 增加训练周期
    gamma = 0.99  # 折扣因子
    env = PartitionEnv(np_partition, np_device, np_server, np_exchanged, np_privacy, device, device_index)
    for epoch in range(epochs):
        state = env.reset()
        done = False
        total_reward = 0
        while not done:
            # 获取Actor的策略输出和Critic的状态值
            action_probs, state_value = model_UAV(state)
            
            # 根据概率选择动作
            dist = Categorical(action_probs)
            action = dist.sample()
            
            # 计算log概率
            log_prob = dist.log_prob(action)
            
            # 执行动作并得到下一个状态和奖励
            next_state, reward, done = env.step(action)
            total_reward += reward
            
            # 计算目标值（TD误差）
            _, next_state_value = model_UAV(next_state)
            target_value = reward + gamma * next_state_value * (1 - done)
            
            # 计算优势
            advantage = target_value - state_value
            
            # 计算损失
            actor_loss = -log_prob * advantage.detach()  # Actor损失：策略梯度
            critic_loss = advantage.pow(2)  # Critic损失：均方误差
            loss = actor_loss + 0.5 * critic_loss.mean()

            # 优化
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # 每100个epoch打印一次
        if epoch % 10 == 0:
            # print(f'Epoch [{epoch}/{epochs}], Total Reward: {action}, reward {reward}')
            print(reward)
    
    # 返回训练后找到的最佳分割点
    return model_UAV

# 训练AC代理
UAV_model = train_UAVAC()

-0.23108723524009325
-0.23108723524009325
-0.08652560594405594
-0.08652560594405594
-0.08652560594405594
-0.08652560594405594
-0.08652560594405594
-0.08652560594405594
-0.08652560594405594
-0.08652560594405594
-0.08652560594405594
-0.08652560594405594
-0.08652560594405594
-0.08652560594405594
-0.08652560594405594
-0.08652560594405594
-0.08652560594405594
-0.08652560594405594
-0.08652560594405594
-0.08652560594405594
-0.08652560594405594
-0.08652560594405594
-0.08652560594405594
-0.08652560594405594
-0.08652560594405594
-0.08652560594405594
-0.08652560594405594
-0.08652560594405594
-0.08652560594405594
-0.08652560594405594
-0.08652560594405594
-0.08652560594405594
-0.08652560594405594
-0.08652560594405594
-0.08652560594405594
-0.08652560594405594
-0.08652560594405594
-0.08652560594405594
-0.08652560594405594
-0.08652560594405594
-0.08652560594405594
-0.08652560594405594
-0.08652560594405594
-0.08652560594405594
-0.08652560594405594
-0.08652560594405594
-0.08652560594405594
-0.0865256059