In [1]:
#try custom PPO on PccNs-v0
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import network_sim

In [2]:
# 定义策略网络
class ActorCriticNetwork(nn.Module):
    def __init__(self,n_states,n_actions):
        super().__init__()
        self.fc1 = nn.Linear(n_states, 128)
        self.fc2 = nn.Linear(128, n_actions)  # 输出动作概率

    def forward(self, x):
        x = F.relu(self.fc1(x))
        return F.softmax(self.fc2(x),dim=-1)

In [3]:
# 创建 PPO 代理
class PPOAgent:
    def __init__(self,n_states,n_actions,device):
        self.device=device
        self.actor_critic = ActorCriticNetwork(n_states,n_actions).to(self.device)
        self.optimizer = optim.Adam(self.actor_critic.parameters(), lr=1e-3)

    def act(self, state):
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        probs = self.actor_critic(state)
        action = probs.multinomial(1)[0].tolist()
        return action

    def train(self, states, actions, rewards, values):
        states = torch.FloatTensor(states).to(self.device)
        actions = torch.LongTensor(actions).to(self.device)
        rewards = torch.FloatTensor(rewards).to(self.device)
        values = torch.FloatTensor(values).to(self.device)

        # 计算优势函数
        advantages = rewards - values

        # 计算策略梯度
        log_probs = torch.log(self.actor_critic(states)[range(len(states)), actions]).to(self.device)
        actor_loss = -torch.mean(log_probs * advantages).to(self.device)

        # 计算价值函数损失
        critic_loss = F.mse_loss(self.actor_critic(states).squeeze(-1), rewards).to(self.device)

        # 计算总损失
        loss = actor_loss + 0.5 * critic_loss

        # 更新网络参数
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

In [4]:
env=gym.make('PccNs-v0')

History length: 10
Features: ['sent latency inflation', 'latency ratio', 'send ratio']
Getting min obs for ['sent latency inflation', 'latency ratio', 'send ratio']




In [5]:
n_states=env.observation_space.shape
n_actions=env.action_space.shape

In [6]:
agent=PPOAgent(n_states[0],n_actions[0],'cuda')
s=env.reset()

Reward: 0.00, Ewma Reward: 0.00


In [7]:
num_eval_episodes=100
eval_rewards=[]
for episode in range(num_eval_episodes):
    state=env.reset()
    done = False
    episode_reward = 0

    while not done:
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        episode_reward += reward
        state = next_state

    eval_rewards.append(episode_reward)
        
print(f"Average evaluation reward: {np.mean(eval_rewards)}")

Reward: 0.00, Ewma Reward: 0.00
Reward: 301.59, Ewma Reward: 3.02
Reward: 1336.16, Ewma Reward: 16.35
Reward: 873.58, Ewma Reward: 24.92
Reward: 261.84, Ewma Reward: 27.29
Reward: 880.79, Ewma Reward: 35.82
Reward: 304.31, Ewma Reward: 38.51
Reward: 997.78, Ewma Reward: 48.10
Reward: 869.97, Ewma Reward: 56.32
Reward: 1108.74, Ewma Reward: 66.84
Reward: 1404.43, Ewma Reward: 80.22
Reward: 514.28, Ewma Reward: 84.56
Reward: 1123.74, Ewma Reward: 94.95
Reward: 772.99, Ewma Reward: 101.73
Reward: 717.62, Ewma Reward: 107.89
Reward: 600.29, Ewma Reward: 112.82
Reward: 484.91, Ewma Reward: 116.54
Reward: 181.56, Ewma Reward: 117.19
Reward: 415.48, Ewma Reward: 120.17
Reward: 193.95, Ewma Reward: 120.91
Reward: 964.00, Ewma Reward: 129.34
Reward: 459.09, Ewma Reward: 132.64
Reward: 1396.99, Ewma Reward: 145.28
Reward: -34.24, Ewma Reward: 143.48
Reward: 458.35, Ewma Reward: 146.63
Reward: 1037.32, Ewma Reward: 155.54
Reward: 1159.45, Ewma Reward: 165.58
Reward: -173.16, Ewma Reward: 162.19
R

In [8]:
num_iteration=10
num_episodes=100
for i in range(num_iteration):
    for episode in range(num_episodes):
        state=env.reset()
        done=False
        episode_reward=0
        
        while not done:
            action=agent.act(state)
            next_state,reward,done,_=env.step(action)
            episode_reward+=reward
            agent.train([state],[action],[reward],[0])
            state=next_state
            
    print(f"Epoch {i+1}/{num_iteration}：Reward {episode_reward}")


Reward: -458.42, Ewma Reward: 227.50


  states = torch.FloatTensor(states).to(self.device)


Reward: 19.76, Ewma Reward: 225.43
Reward: 467.92, Ewma Reward: 227.85
Reward: 1.60, Ewma Reward: 225.59
Reward: 404.03, Ewma Reward: 227.37
Reward: 265.24, Ewma Reward: 227.75
Reward: 608.49, Ewma Reward: 231.56
Reward: 741.28, Ewma Reward: 236.66
Reward: 275.46, Ewma Reward: 237.05
Reward: 61.65, Ewma Reward: 235.29
Reward: 583.45, Ewma Reward: 238.77
Reward: 400.70, Ewma Reward: 240.39
Reward: -1981.31, Ewma Reward: 218.18
Reward: 719.64, Ewma Reward: 223.19
Reward: 191.93, Ewma Reward: 222.88
Reward: 1266.92, Ewma Reward: 233.32
Reward: -0.21, Ewma Reward: 230.98
Reward: -86.85, Ewma Reward: 227.80
Reward: 536.84, Ewma Reward: 230.89
Reward: 359.19, Ewma Reward: 232.18
Reward: 617.00, Ewma Reward: 236.03
Reward: 668.40, Ewma Reward: 240.35
Reward: 1272.02, Ewma Reward: 250.67
Reward: 427.91, Ewma Reward: 252.44
Reward: 673.39, Ewma Reward: 256.65
Reward: 987.10, Ewma Reward: 263.95
Reward: 700.13, Ewma Reward: 268.31
Reward: 200.86, Ewma Reward: 267.64
Reward: 1022.08, Ewma Reward:

In [9]:
num_eval_episodes=100
eval_rewards=[]
for episode in range(num_eval_episodes):
    state=env.reset()
    done = False
    episode_reward = 0

    while not done:
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        episode_reward += reward
        state = next_state

    eval_rewards.append(episode_reward)
        
print(f"Average evaluation reward: {np.mean(eval_rewards)}")

Reward: -251.13, Ewma Reward: 374.91
Reward: 1121.27, Ewma Reward: 382.38
Reward: 1547.77, Ewma Reward: 394.03
Reward: -984.76, Ewma Reward: 380.24
Reward: 1614.11, Ewma Reward: 392.58
Reward: 432.85, Ewma Reward: 392.98
Reward: 1024.92, Ewma Reward: 399.30
Reward: 671.07, Ewma Reward: 402.02
Reward: 403.31, Ewma Reward: 402.03
Reward: 975.82, Ewma Reward: 407.77
Reward: 307.53, Ewma Reward: 406.77
Reward: 47.34, Ewma Reward: 403.18
Reward: 771.80, Ewma Reward: 406.86
Reward: 19.72, Ewma Reward: 402.99
Reward: 263.76, Ewma Reward: 401.60
Reward: 403.62, Ewma Reward: 401.62
Reward: 842.62, Ewma Reward: 406.03
Reward: 1416.07, Ewma Reward: 416.13
Reward: 814.24, Ewma Reward: 420.11
Reward: 964.38, Ewma Reward: 425.55
Reward: -4493.54, Ewma Reward: 376.36
Reward: 174.72, Ewma Reward: 374.35
Reward: 1025.96, Ewma Reward: 380.86
Reward: 330.88, Ewma Reward: 380.36
Reward: 549.10, Ewma Reward: 382.05
Reward: 710.95, Ewma Reward: 385.34
Reward: -54.20, Ewma Reward: 380.94
Reward: 1132.61, Ewm