In [1]:
#try custom PPO on PccNs-v0
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import network_sim

In [2]:
# 定义策略网络
class ActorCriticNetwork(nn.Module):
    def __init__(self,n_states,n_actions):
        super().__init__()
        self.fc1 = nn.Linear(n_states, 128)
        self.fc2 = nn.Linear(128, n_actions)  # 输出动作概率

    def forward(self, x):
        x = F.relu(self.fc1(x))
        return F.softmax(self.fc2(x),dim=-1)

In [8]:
# 创建 PPO 代理
class PPOAgent:
    def __init__(self,n_states,n_actions,device):
        self.device=device
        self.actor_critic = ActorCriticNetwork(n_states,n_actions).to(self.device)
        self.optimizer = optim.Adam(self.actor_critic.parameters(), lr=1e-3)

    def act(self, state):
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        probs = self.actor_critic(state)
        action = probs.multinomial(1)[0].tolist()
        return action

    def train(self, states, actions, rewards, values):
        states = torch.FloatTensor(states).to(self.device)
        actions = torch.LongTensor(actions).to(self.device)
        rewards = torch.FloatTensor(rewards).to(self.device)
        values = torch.FloatTensor(values).to(self.device)

        # 计算优势函数
        advantages = rewards - values

        # 计算策略梯度
        log_probs = torch.log(self.actor_critic(states)[range(len(states)), actions]).to(self.device)
        actor_loss = -torch.mean(log_probs * advantages).to(self.device)

        # 计算价值函数损失
        critic_loss = F.mse_loss(self.actor_critic(states).squeeze(-1), rewards).to(self.device)

        # 计算总损失
        loss = actor_loss + 0.5 * critic_loss

        # 更新网络参数
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

In [4]:
env=gym.make('PccNs-v0')

History length: 10
Features: ['sent latency inflation', 'latency ratio', 'send ratio']
Getting min obs for ['sent latency inflation', 'latency ratio', 'send ratio']




In [5]:
n_states=env.observation_space.shape
n_actions=env.action_space.shape

In [9]:
agent=PPOAgent(n_states[0],n_actions[0],'cuda')
s=env.reset()

Reward: 0.00, Ewma Reward: 0.00


In [10]:
num_eval_episodes=100
eval_rewards=[]
for episode in range(num_eval_episodes):
    state=env.reset()
    done = False
    episode_reward = 0

    while not done:
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        episode_reward += reward
        state = next_state

    eval_rewards.append(episode_reward)
        
print(f"Average evaluation reward: {np.mean(eval_rewards)}")

Reward: 0.00, Ewma Reward: 0.00
Reward: -780.31, Ewma Reward: -7.80
Reward: 1302.05, Ewma Reward: 5.30
Reward: -3227.75, Ewma Reward: -27.04
Reward: 1221.95, Ewma Reward: -14.55
Reward: 214.60, Ewma Reward: -12.25
Reward: 878.70, Ewma Reward: -3.34
Reward: 366.46, Ewma Reward: 0.35
Reward: -51.17, Ewma Reward: -0.16
Reward: 315.90, Ewma Reward: 3.00
Reward: 581.61, Ewma Reward: 8.79
Reward: 838.75, Ewma Reward: 17.08
Reward: 647.40, Ewma Reward: 23.39
Reward: 900.87, Ewma Reward: 32.16
Reward: 1167.74, Ewma Reward: 43.52
Reward: -2652.61, Ewma Reward: 16.56
Reward: 998.39, Ewma Reward: 26.38
Reward: 396.63, Ewma Reward: 30.08
Reward: 141.16, Ewma Reward: 31.19
Reward: 765.50, Ewma Reward: 38.53
Reward: 859.46, Ewma Reward: 46.74
Reward: 96.69, Ewma Reward: 47.24
Reward: 1263.64, Ewma Reward: 59.40
Reward: 1736.84, Ewma Reward: 76.18
Reward: 584.36, Ewma Reward: 81.26
Reward: 1306.30, Ewma Reward: 93.51
Reward: 386.90, Ewma Reward: 96.45
Reward: -463.19, Ewma Reward: 90.85
Reward: 274.6

In [15]:
num_iteration=10
num_episodes=100
for i in range(num_iteration):
    for episode in range(num_episodes):
        state=env.reset()
        done=False
        episode_reward=0
        
        while not done:
            action=agent.act(state)
            next_state,reward,done,_=env.step(action)
            episode_reward+=reward
            agent.train([state],[action],[reward],[0])
            state=next_state
            
    print(f"Epoch {i+1}/{num_iteration}：Reward {episode_reward}")


Reward: 88.52, Ewma Reward: 302.31


  states = torch.FloatTensor(states).to(self.device)


Reward: 1010.37, Ewma Reward: 309.39
Reward: 1592.44, Ewma Reward: 322.22
Reward: 454.90, Ewma Reward: 323.55
Reward: 706.81, Ewma Reward: 327.38
Reward: 139.83, Ewma Reward: 325.50
Reward: 204.10, Ewma Reward: 324.29
Reward: -922.93, Ewma Reward: 311.82
Reward: 352.20, Ewma Reward: 312.22
Reward: 521.71, Ewma Reward: 314.32
Reward: 1277.06, Ewma Reward: 323.94
Reward: 1208.77, Ewma Reward: 332.79
Reward: 344.75, Ewma Reward: 332.91
Reward: -242.13, Ewma Reward: 327.16
Reward: 1248.94, Ewma Reward: 336.38
Reward: 746.79, Ewma Reward: 340.48
Reward: -267.39, Ewma Reward: 334.40
Reward: 1265.10, Ewma Reward: 343.71
Reward: 336.29, Ewma Reward: 343.64
Reward: 363.65, Ewma Reward: 343.84
Reward: 665.83, Ewma Reward: 347.06
Reward: 1603.64, Ewma Reward: 359.62
Reward: -1487.78, Ewma Reward: 341.15
Reward: -126.51, Ewma Reward: 336.47
Reward: 534.11, Ewma Reward: 338.45
Reward: 793.88, Ewma Reward: 343.00
Reward: 330.21, Ewma Reward: 342.88
Reward: 612.64, Ewma Reward: 345.57
Reward: 1337.39

In [16]:
num_eval_episodes=100
eval_rewards=[]
for episode in range(num_eval_episodes):
    state=env.reset()
    done = False
    episode_reward = 0

    while not done:
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        episode_reward += reward
        state = next_state

    eval_rewards.append(episode_reward)
        
print(f"Average evaluation reward: {np.mean(eval_rewards)}")

Reward: -2349.18, Ewma Reward: 280.61
Reward: 930.49, Ewma Reward: 287.11
Reward: 375.82, Ewma Reward: 288.00
Reward: 700.23, Ewma Reward: 292.12
Reward: 532.98, Ewma Reward: 294.53
Reward: -128.45, Ewma Reward: 290.30
Reward: 855.94, Ewma Reward: 295.95
Reward: 976.72, Ewma Reward: 302.76
Reward: 589.59, Ewma Reward: 305.63
Reward: 703.58, Ewma Reward: 309.61
Reward: 779.73, Ewma Reward: 314.31
Reward: 630.27, Ewma Reward: 317.47
Reward: 909.07, Ewma Reward: 323.39
Reward: 741.61, Ewma Reward: 327.57
Reward: 729.97, Ewma Reward: 331.59
Reward: 390.21, Ewma Reward: 332.18
Reward: 1450.74, Ewma Reward: 343.36
Reward: 1322.65, Ewma Reward: 353.16
Reward: 588.04, Ewma Reward: 355.51
Reward: 367.59, Ewma Reward: 355.63
Reward: 563.36, Ewma Reward: 357.70
Reward: -60.87, Ewma Reward: 353.52
Reward: 544.25, Ewma Reward: 355.43
Reward: 344.81, Ewma Reward: 355.32
Reward: 255.49, Ewma Reward: 354.32
Reward: 1704.00, Ewma Reward: 367.82
Reward: -2958.13, Ewma Reward: 334.56
Reward: -63.83, Ewma