In [1]:
import torch
import torch.nn as nn
import torch.nn.utils as utils
import torch.nn.functional as F
import numpy as np
from torch.distributions import Normal, Categorical
import torch.optim as optim
import random
from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler

from collections import deque

# Pick the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Seed every RNG this notebook draws from. Network initialisation, action
# sampling (Categorical.sample) and minibatch shuffling (SubsetRandomSampler)
# all use torch's generator, so seeding NumPy alone does not make a run
# repeatable.
seed_value = 1234
random.seed(seed_value)
np.random.seed(seed_value)
torch.manual_seed(seed_value)

# Existing problem data (values kept unchanged).
UNIT = 40
MAZE_H = 8
MAZE_W = 8
utilization_ratios_device = 0.1
utilization_ratios_server = 0.1
# Compute throughput / power figures for the vehicle ("car"), the UAV and the
# edge server ("e_"). NOTE(review): presumably FLOP/s and watts - confirm units.
car_flop = 1.3 * 10**12
car_power = 20
# Alternative UAV throughput tried earlier: 1.3 * 10**12 * 0.4
UAV_flop = 0.641 * 10**12
# Alternative UAV power tried earlier: 20
UAV_power = 30
e_flop = 330 * 10**12
e_power = 450
d_fai = 5*10**-29
# Uplink / downlink transfer rates; cost_cal divides the exchanged-data size by
# trans_v_up. (An earlier uplink value was 553081138.4484484.)
trans_v_up = 100*1024*1024/4
trans_v_dn = 20*1024*1024/4
p_cm = 0.1
# nums_data = np.array([5, 3, 4, 7, 9])  # local data volume per client
partition_point = [0, 1, 2, 3, 4, 5, 6]

# Number of samples processed per round by each device type.
num_img_UAV = 3
num_img_car = 1

# Per-partition-point tables: entry i belongs to partition point i (7 entries each).
device_load = [0.3468e9, 0.3519e9, 2.3408e9, 2.3409e9, 5.3791e9, 9.6951e9, 12.077e9]
server_load = [11.7321e9, 11.727e9, 9.7381e9, 9.738e9, 6.6998e9, 2.3838e9, 0.0019e9]
exchanged_data = [2359296, 2359296, 2359296, 2359296, 1179628, 589824, 294912]
privacy_leak = [0.96122, 0.608901, 0.57954889, 0.593044, 0.535525, 0.007155, 0.054303]

# Convert the tables to NumPy arrays so they can be indexed by partition number.
np_partition = np.array(partition_point)
np_device = np.array(device_load)
np_server = np.array(server_load)
np_exchanged = np.array(exchanged_data)
np_privacy = np.array(privacy_leak)



Using device: cuda


In [2]:
def cost_cal(num_data, v_flop, device_power, partition_index):
    """Return ``(time_all, energy)`` for running ``num_data`` samples at one partition point.

    The per-partition workloads and payload size are read from the module-level
    arrays ``np_device``, ``np_server`` and ``np_exchanged``; the server figures
    (``e_flop``, ``e_power``), the utilisation ratios, ``trans_v_up`` and
    ``p_cm`` are module-level constants as well.

    Args:
        num_data: number of samples processed.
        v_flop: compute throughput of the local device.
        device_power: power figure of the local device.
        partition_index: index into the per-partition arrays.

    Returns:
        Tuple ``(time_all, energy)``: device time + server time + upload time,
        and computation energy + transmission energy.
    """
    dev_load = np_device[partition_index]
    srv_load = np_server[partition_index]
    payload = np_exchanged[partition_index]

    # Time spent computing on each side; the server denominator carries a small
    # epsilon (1e-8).
    device_time = dev_load * num_data / (v_flop * utilization_ratios_device)
    server_time = srv_load * num_data / (e_flop * utilization_ratios_server + 1e-8)

    # Upload time for all samples.
    trans_t_up = payload / trans_v_up * num_data

    # Computation energy: per-sample device and server terms, scaled by the sample count.
    device_energy_per_item = (dev_load * device_power) / v_flop
    server_energy_per_item = (srv_load * e_power * utilization_ratios_server) / e_flop
    energy_cal = (device_energy_per_item + server_energy_per_item) * num_data

    # NOTE(review): trans_t_up already contains a factor num_data, so this term
    # grows with num_data squared - confirm that this is the intended model.
    energy_trans = num_data * p_cm * trans_t_up

    # Weight of the transmission time in the total (currently 1).
    trans_weight = 1
    time_all = device_time + server_time + trans_weight * trans_t_up
    return time_all, energy_cal + energy_trans

In [3]:

# Print the reward of every partition point for the UAV profile (3 samples),
# using the weights 0.4 (time), 0.3 (energy) and 0.3 (privacy leakage).
for partition_num, leak in enumerate(np_privacy):
    time_UAV, energy_UAV = cost_cal(3, UAV_flop, UAV_power, partition_num)
    reward = -(time_UAV)*0.4 - (energy_UAV)*0.3 - 0.3*leak
    print(f" reward is {reward}")

print("______________")

# Same sweep for the vehicle profile (1 sample).
for partition_num, leak in enumerate(np_privacy):
    time_car, energy_car = cost_cal(1, car_flop, car_power, partition_num)
    reward = -(time_car)*0.4 - (energy_car)*0.3 - 0.3*leak
    print(f" reward is {reward}")

 reward is -0.4436326264607857
 reward is -0.3382464115090058
 reward is -0.4501338789805701
 reward is -0.454188480314849
 reward is -0.5551497908531352
 reward is -0.6254742198482486
 reward is -0.7676227740355978
______________
 reward is -0.33035584912587407
 reward is -0.22469910944055943
 reward is -0.23108723524009325
 reward is -0.23513653216783215
 reward is -0.2217401566679699
 reward is -0.08652560594405594
 reward is -0.11402850075757574


In [4]:


# 定义环境
class PartitionEnv:
    """Fixed-length episodic environment whose action is a partition-point index.

    The observation is a one-element float32 tensor: ``reset`` returns the first
    entry of ``np_partition`` and ``step`` returns the entry selected by the
    action (or 0 once the episode is over).

    ``np_device``, ``np_server`` and ``np_exchanged`` are stored on the instance
    but are not read here; ``cost_cal`` is called without them.

    Args:
        np_partition: array of partition-point values, indexed by action.
        np_device, np_server, np_exchanged: per-partition arrays (stored only).
        np_privacy: per-partition privacy-leak term used in the reward.
        device: torch device that returned state tensors are moved to.
        device_index: 0 selects the UAV profile, any other value the vehicle
            profile (see ``step``).
        episode_length: number of ``step`` calls after which ``done`` becomes
            True. Defaults to 10, the value that used to be hard-coded.

    Raises:
        ValueError: if ``episode_length`` is smaller than 1.
    """

    def __init__(self, np_partition, np_device, np_server, np_exchanged, np_privacy, device, device_index,
                 episode_length=10):
        if episode_length < 1:
            raise ValueError(f"episode_length must be >= 1, got {episode_length}")
        self.np_partition = np_partition
        self.np_device = np_device
        self.np_server = np_server
        self.np_exchanged = np_exchanged
        self.np_privacy = np_privacy
        self.device = device
        self.device_index = device_index
        self.current_step = 0
        # Number of partition points; kept for callers that read it. The episode
        # length is controlled by episode_length, not by this attribute.
        self.max_steps = len(np_partition)
        self.episode_length = episode_length

    def reset(self):
        """Restart the episode and return the initial state tensor on ``self.device``."""
        self.current_step = 0
        return torch.tensor([self.np_partition[self.current_step]], dtype=torch.float32).to(self.device)

    def step(self, action):
        """Apply ``action`` and return ``(next_state, reward, done)``.

        Args:
            action: partition-point index; anything ``int()`` accepts.

        Raises:
            IndexError: if the index is outside ``[0, len(np_partition))``.
                Negative indices are rejected so they cannot silently wrap
                around to the end of the tables.
        """
        partition_num = int(action)
        if not 0 <= partition_num < len(self.np_partition):
            raise IndexError(
                f"action {partition_num} is out of range for {len(self.np_partition)} partition points"
            )

        # Device profile: sample count, compute throughput and power. These
        # literals duplicate the module-level UAV / car constants.
        if self.device_index == 0:
            num_data = 3
            v_flop = 0.641 * 10**12
            device_power = 30
        else:
            num_data = 1
            v_flop = 1.3 * 10**12
            device_power = 20

        elapsed, energy = cost_cal(num_data, v_flop, device_power, partition_num)

        # Reward is the negated weighted cost, so values closer to zero are better.
        reward = -(elapsed)*0.4 - (energy)*0.3 - 0.3*self.np_privacy[partition_num]
        self.current_step += 1

        done = self.current_step >= self.episode_length
        # The next state is the chosen partition value, or 0 when the episode ends.
        next_value = [0] if done else [self.np_partition[partition_num]]
        next_state = torch.tensor(next_value, dtype=torch.float32).to(self.device)
        return next_state, reward, done


In [5]:
class ActorCritic(nn.Module):
    """Actor-critic network with a shared two-layer Tanh trunk.

    The trunk feeds two linear heads: ``actor`` produces ``action_dim`` logits
    and ``critic`` produces a single state-value estimate.
    """

    def __init__(self, state_dim, action_dim, hidden_dim=64):
        super().__init__()
        # Shared feature extractor used by both heads.
        trunk_layers = [
            nn.Linear(state_dim, hidden_dim),
            nn.Tanh(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.Tanh(),
        ]
        self.shared = nn.Sequential(*trunk_layers)
        # Policy head (actor): logits over the discrete actions.
        self.actor = nn.Linear(hidden_dim, action_dim)
        # Value head (critic): scalar state-value estimate.
        self.critic = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        """Return ``(logits, value)`` for the input batch ``x``."""
        features = self.shared(x)
        return self.actor(features), self.critic(features)

    def get_action(self, state):
        """Sample an action for ``state``.

        Returns ``(action, log_prob, value)``: the sampled action as a Python
        int, its log-probability as a tensor, and the critic output as a Python
        float. Because ``.item()`` is used, ``state`` must yield exactly one
        action and one value.
        """
        policy_logits, state_value = self.forward(state)
        policy = Categorical(logits=policy_logits)
        sampled = policy.sample()
        sampled_log_prob = policy.log_prob(sampled)
        return sampled.item(), sampled_log_prob, state_value.item()


# 2. 定义PPO智能体
class PPOAgent:
    """PPO agent (clipped surrogate objective) around an ``ActorCritic`` network.

    Args:
        state_dim: size of one state vector.
        action_dim: number of discrete actions.
        lr: Adam learning rate.
        gamma: discount factor.
        gae_lambda: GAE smoothing factor.
        clip_epsilon: clipping range of the probability ratio.
        K_epochs: number of passes over the trajectory per ``update`` call.
        batch_size: minibatch size used inside ``update``.
    """

    def __init__(self, state_dim, action_dim, lr=3e-4, gamma=0.99, gae_lambda=0.95,
                 clip_epsilon=0.2, K_epochs=10, batch_size=32):
        self.gamma = gamma
        self.gae_lambda = gae_lambda
        self.clip_epsilon = clip_epsilon
        self.K_epochs = K_epochs
        self.batch_size = batch_size
        self.net = ActorCritic(state_dim, action_dim).to(device)
        self.optimizer = optim.Adam(self.net.parameters(), lr=lr)

    def compute_gae(self, rewards, values, dones):
        """Compute generalized advantage estimates and the matching returns.

        Args:
            rewards: reward of each step, length T.
            values: value estimates, length T + 1; ``values[i]`` belongs to the
                state of step ``i`` and ``values[T]`` is the bootstrap value.
            dones: termination flag of each step, length T; a true flag stops
                both the bootstrap and the advantage accumulation.

        Returns:
            ``(advantages, returns)`` as two lists of length T, where
            ``returns[i] = advantages[i] + values[i]``.
        """
        horizon = len(rewards)
        # Filled back-to-front by index, which avoids the quadratic cost of
        # repeatedly inserting at the head of a list.
        advantages = [0.0] * horizon
        advantage = 0
        for i in reversed(range(horizon)):
            not_done = 1 - dones[i]
            delta = rewards[i] + self.gamma * values[i + 1] * not_done - values[i]
            advantage = delta + self.gamma * self.gae_lambda * not_done * advantage
            advantages[i] = advantage
        returns = [a + v for a, v in zip(advantages, values[:-1])]
        return advantages, returns

    def update(self, states, actions, old_log_probs, advantages, returns):
        """Run ``K_epochs`` passes of minibatch PPO updates over one trajectory.

        Args:
            states: sequence of N states (scalars or equal-length vectors).
            actions: sequence of N action indices.
            old_log_probs: sequence of N log-probabilities recorded at sampling
                time; Python floats or one-element tensors.
            advantages: sequence of N advantage estimates.
            returns: sequence of N value targets.

        An empty trajectory is a no-op.
        """
        n_samples = len(states)
        if n_samples == 0:
            return

        # Put the batch wherever the network lives.
        dev = next(self.net.parameters()).device

        # (N,) scalar states become (N, 1); vector states keep their width.
        states = torch.tensor(states, dtype=torch.float32).reshape(n_samples, -1).to(dev)
        actions = torch.tensor(actions, dtype=torch.long).to(dev)
        # float() accepts both plain numbers and one-element tensors and drops
        # any autograd history attached to the recorded log-probabilities.
        old_log_probs = torch.tensor([float(lp) for lp in old_log_probs], dtype=torch.float32).to(dev)
        advantages = torch.tensor(advantages, dtype=torch.float32).to(dev)
        returns = torch.tensor(returns, dtype=torch.float32).to(dev)

        # Normalise advantages. The sample standard deviation of a single value
        # is NaN, which would turn every weight into NaN, so skip it for N == 1.
        if advantages.numel() > 1:
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

        for _ in range(self.K_epochs):
            for idx in BatchSampler(SubsetRandomSampler(range(n_samples)), self.batch_size, False):
                logits, values = self.net(states[idx])
                dist = Categorical(logits=logits)
                new_log_probs = dist.log_prob(actions[idx])
                # Drop only the trailing value dimension so a minibatch of one
                # sample keeps its batch axis.
                values = values.squeeze(-1)

                # Clipped surrogate objective.
                ratio = torch.exp(new_log_probs - old_log_probs[idx])
                surr1 = ratio * advantages[idx]
                surr2 = torch.clamp(ratio, 1 - self.clip_epsilon, 1 + self.clip_epsilon) * advantages[idx]
                policy_loss = -torch.min(surr1, surr2).mean()

                value_loss = F.mse_loss(values, returns[idx])
                total_loss = policy_loss + 0.5 * value_loss

                self.optimizer.zero_grad()
                total_loss.backward()
                utils.clip_grad_norm_(self.net.parameters(), 0.5)
                self.optimizer.step()

    def get_action(self, state):
        """Delegate to the network: returns ``(action, log_prob, value)``."""
        return self.net.get_action(state)

In [6]:
# 3. 训练PPO智能体
def train_ppo(env, agent, episodes=1001):
    """Train ``agent`` on ``env`` with one PPO update per collected episode.

    Args:
        env: environment with ``reset() -> state`` and
            ``step(action) -> (next_state, reward, done)``; states are
            one-element tensors.
        agent: object with ``get_action(state) -> (action, log_prob, value)``,
            ``compute_gae(rewards, values, dones)`` and
            ``update(states, actions, old_log_probs, advantages, returns)``.
        episodes: number of episodes to run.

    Returns:
        The same ``agent`` object, after training.
    """
    print("开始训练PPO智能体...")
    for episode in range(episodes):
        # Per-episode trajectory buffers.
        states, actions, rewards, old_log_probs, values, dones = [], [], [], [], [], []
        state = env.reset()

        # Collect one episode.
        while True:
            states.append(state.cpu().numpy()[0])
            # Rollouts only record numbers, so no autograd graph is needed.
            with torch.no_grad():
                action, log_prob, value = agent.get_action(state.unsqueeze(0))

            next_state, reward, done = env.step(action)

            actions.append(action)
            rewards.append(reward)
            old_log_probs.append(log_prob)
            # `value` is the estimate for the state the action was taken in, so
            # values[i] lines up with states[i] and rewards[i].
            values.append(value)
            dones.append(done)

            state = next_state
            if done:
                break

        # compute_gae needs one value more than there are rewards: the bootstrap
        # value of the state after the last step. The loop only exits on
        # `done`, so that state is terminal and its value is 0.
        values.append(0.0)

        advantages, returns = agent.compute_gae(rewards, values, dones)

        agent.update(states, actions, old_log_probs, advantages, returns)

        # Progress report: reward of the final step of every 10th episode.
        if episode % 10 == 0:
            print(reward)

    print("训练完成！")
    return agent


# 4. 测试并找到最优分割位置
# def find_best_partition(env, agent):
#     print("\n测试各分割点的奖励...")
#     agent.net.eval()  # 切换到评估模式
#     reward_dict = {}
    
#     # 遍历所有分割点测试
#     with torch.no_grad():
#         for partition_num in range(7):
#             state = env.reset()
#             _, reward, _ = env.step(partition_num)  # 直接执行该分割点动作
#             reward_dict[partition_num] = abs(reward)  # 存储奖励绝对值

#     # 找到奖励绝对值最小的分割点
#     best_partition = min(reward_dict, key=reward_dict.get)
#     print(f"\n各分割点的奖励绝对值: {reward_dict}")
#     print(f"最优分割位置为: 分割点 {best_partition} (奖励绝对值: {reward_dict[best_partition]:.4f})")
#     return best_partition


# 5. 主函数执行
if __name__ == "__main__":
    # Build the environment. device_index 0 selects the UAV profile and 1 the
    # vehicle profile; 1 is passed here, so this run trains for the vehicle.
    env = PartitionEnv(
        np_partition=np_partition,
        np_device=np_device,
        np_server=np_server,
        np_exchanged=np_exchanged,
        np_privacy=np_privacy,
        device=device,
        device_index=1,
    )

    # PPO agent with a 1-dimensional state and 7 discrete actions.
    agent = PPOAgent(1, 7)

    # Train for 2001 episodes.
    trained_agent = train_ppo(env, agent, 2001)

    # Looking up the best partition point is currently disabled:
    # best_partition = find_best_partition(env, trained_agent)

开始训练PPO智能体...
-0.08652560594405594
-0.33035584912587407
-0.23108723524009325
-0.23108723524009325
-0.23108723524009325
-0.2217401566679699
-0.23108723524009325
-0.23108723524009325
-0.23108723524009325
-0.23108723524009325
-0.11402850075757574
-0.23108723524009325
-0.11402850075757574
-0.11402850075757574
-0.11402850075757574
-0.11402850075757574
-0.11402850075757574
-0.11402850075757574
-0.11402850075757574
-0.11402850075757574
-0.11402850075757574
-0.11402850075757574
-0.11402850075757574
-0.11402850075757574
-0.11402850075757574
-0.11402850075757574
-0.11402850075757574
-0.11402850075757574
-0.11402850075757574
-0.11402850075757574
-0.11402850075757574
-0.11402850075757574
-0.11402850075757574
-0.11402850075757574
-0.11402850075757574
-0.11402850075757574
-0.11402850075757574
-0.11402850075757574
-0.11402850075757574
-0.11402850075757574
-0.11402850075757574
-0.11402850075757574
-0.11402850075757574
-0.11402850075757574
-0.11402850075757574
-0.11402850075757574
-0.11402850075757574


In [7]:
Episode 0, cation: 4, reward is -0.2217401566679699
Episode 50, cation: 6, reward is -0.11402850075757574
Episode 100, cation: 5, reward is -0.08652560594405594
Episode 150, cation: 5, reward is -0.08652560594405594
Episode 200, cation: 5, reward is -0.08652560594405594
Episode 250, cation: 5, reward is -0.08652560594405594
Episode 300, cation: 5, reward is -0.08652560594405594
Episode 350, cation: 5, reward is -0.08652560594405594
Episode 400, cation: 5, reward is -0.08652560594405594
Episode 450, cation: 5, reward is -0.08652560594405594
Episode 500, cation: 5, reward is -0.08652560594405594
Episode 550, cation: 5, reward is -0.08652560594405594
Episode 600, cation: 5, reward is -0.08652560594405594
Episode 650, cation: 5, reward is -0.08652560594405594
Episode 700, cation: 5, reward is -0.08652560594405594
Episode 750, cation: 5, reward is -0.08652560594405594
Episode 800, cation: 5, reward is -0.08652560594405594
Episode 850, cation: 5, reward is -0.08652560594405594
Episode 900, cation: 5, reward is -0.08652560594405594
Episode 950, cation: 5, reward is -0.08652560594405594
Episode 1000, cation: 5, reward is -0.08652560594405594
Episode 1050, cation: 5, reward is -0.08652560594405594
Episode 1100, cation: 5, reward is -0.08652560594405594
Episode 1150, cation: 5, reward is -0.08652560594405594
Episode 1200, cation: 5, reward is -0.08652560594405594
Episode 1250, cation: 5, reward is -0.08652560594405594
Episode 1300, cation: 5, reward is -0.08652560594405594
Episode 1350, cation: 5, reward is -0.08652560594405594
Episode 1400, cation: 5, reward is -0.08652560594405594
Episode 1450, cation: 5, reward is -0.08652560594405594
Episode 1500, cation: 5, reward is -0.08652560594405594
Episode 1550, cation: 5, reward is -0.08652560594405594
Episode 1600, cation: 5, reward is -0.08652560594405594
Episode 1650, cation: 5, reward is -0.08652560594405594
Episode 1700, cation: 5, reward is -0.08652560594405594
Episode 1750, cation: 5, reward is -0.08652560594405594
Episode 1800, cation: 5, reward is -0.08652560594405594
Episode 1850, cation: 5, reward is -0.08652560594405594
Episode 1900, cation: 5, reward is -0.08652560594405594
Episode 1950, cation: 5, reward is -0.08652560594405594
Episode 2000, cation: 5, reward is -0.08652560594405594
Episode 2050, cation: 5, reward is -0.08652560594405594
Episode 2100, cation: 5, reward is -0.08652560594405594
Episode 2150, cation: 5, reward is -0.08652560594405594
Episode 2200, cation: 5, reward is -0.08652560594405594
Episode 2250, cation: 5, reward is -0.08652560594405594
Episode 2300, cation: 5, reward is -0.08652560594405594
Episode 2350, cation: 5, reward is -0.08652560594405594
Episode 2400, cation: 5, reward is -0.08652560594405594
Episode 2450, cation: 5, reward is -0.08652560594405594
Episode 2500, cation: 5, reward is -0.08652560594405594
Episode 2550, cation: 5, reward is -0.08652560594405594
Episode 2600, cation: 5, reward is -0.08652560594405594
Episode 2650, cation: 5, reward is -0.08652560594405594
Episode 2700, cation: 5, reward is -0.08652560594405594
Episode 2750, cation: 5, reward is -0.08652560594405594
Episode 2800, cation: 5, reward is -0.08652560594405594
Episode 2850, cation: 5, reward is -0.08652560594405594
Episode 2900, cation: 5, reward is -0.08652560594405594
Episode 2950, cation: 5, reward is -0.08652560594405594
Episode 3000, cation: 5, reward is -0.08652560594405594
训练完成！

SyntaxError: invalid character '！' (U+FF01) (2933829182.py, line 62)