In [1]:

import torch
import torch.nn as nn
import numpy as np
from tqdm import tqdm
from buffer import ReplayBuffer
from PPO import PPO
import gym
from Discriminator import Discriminator
from reward_env import RewardEnv

import contextlib
import network_sim
from stable_baselines3 import PPO as sb3ppo
import pickle



2024-03-30 09:56:16.725938: I tensorflow/stream_executor/platform/default/dso_loader.cc:50] Successfully opened dynamic library libcudart.so.12


In [2]:
# Prefer the GPU, but fall back to the CPU when PyTorch cannot see a CUDA device,
# so the notebook does not name a device that does not exist on the current machine.
# (No cell visible in this notebook reads `device` directly.)
device='cuda' if torch.cuda.is_available() else 'cpu'
# Size argument handed to every ReplayBuffer that CustomEnv creates
# (one for the student, one for the teacher).
BUFFER_SIZE=81920
# Passed to Discriminator as its second constructor argument by TeacherAgent.
hidden_dim=32

In [3]:

class CustomEnv:
    """Thin wrapper around a gym environment that mutes its console output.

    Besides the wrapped environment (`self.env`) it owns two ReplayBuffer
    instances, both constructed with BUFFER_SIZE: `student_buffer` and
    `teacher_buffer`.
    """

    def __init__(self,env_id):
        # Only stderr is muted while the environment is being constructed;
        # stdout stays visible here.
        with contextlib.redirect_stderr(None):
            self.env=gym.make(env_id)
        self.student_buffer=ReplayBuffer(BUFFER_SIZE)
        self.teacher_buffer=ReplayBuffer(BUFFER_SIZE)

    @staticmethod
    @contextlib.contextmanager
    def _muted():
        """Temporarily replace sys.stderr, then sys.stdout, with None."""
        with contextlib.redirect_stderr(None):
            with contextlib.redirect_stdout(None):
                yield

    def reset(self):
        """Reset the wrapped environment and hand back whatever it returns."""
        with self._muted():
            first_observation=self.env.reset()
        return first_observation

    def step(self,action):
        """Forward `action` to the wrapped environment and return its result unchanged."""
        with self._muted():
            outcome=self.env.step(action)
        return outcome

    def get_state_dim(self):
        """Return the `shape` of the wrapped environment's observation space."""
        observation_space=self.env.observation_space
        return observation_space.shape

    def get_action_dim(self):
        """Return the `shape` of the wrapped environment's action space."""
        action_space=self.env.action_space
        return action_space.shape

In [4]:
class ExpertTrajectory(ReplayBuffer):
    """ReplayBuffer that fills itself with transitions produced by an expert policy.

    The expert is a Stable-Baselines3 PPO checkpoint loaded with
    `load_expert_model`; `generate_trajectory` then rolls it out in an
    environment and stores every transition in this buffer.
    """

    def __init__(self, buffer_size):
        super().__init__(buffer_size)
        # Expert policy; stays None until load_expert_model() is called.
        self.model=None
        # Default step budget for generate_trajectory().
        self.max_step=buffer_size*5

    def load_expert_model(self,path:str):
        """Load the Stable-Baselines3 PPO checkpoint at `path` as the expert policy."""
        self.model=sb3ppo.load(path)

    def generate_trajectory(self,env:CustomEnv,max_step=0):
        """Roll the expert out in `env` and store each transition in this buffer.

        Episodes are always run to completion: collection stops at the end of
        the first episode after which at least `self.max_step` transitions have
        been recorded, so the number of stored steps can exceed the budget.

        Args:
            env: environment wrapper exposing `reset()` and `step(action)`.
            max_step: when truthy, replaces `self.max_step` (and persists for
                later calls); 0 keeps the current budget.

        Raises:
            RuntimeError: if no expert model has been loaded yet.
        """
        if self.model is None:
            raise RuntimeError('expert model not loaded; call load_expert_model() first')
        if max_step:
            self.max_step=max_step
        # Count steps ourselves instead of reading the progress bar's counter:
        # tqdm does not advance `n` when the bar is disabled, which would make
        # a `pb.n`-based stop condition loop forever.
        collected=0
        # `total=` is the form tqdm expects for a manually updated bar.
        with tqdm(total=self.max_step) as pb:
            while True:
                s=env.reset()
                d=False
                while not d:
                    a,_=self.model.predict(s)
                    s_,r,d,_=env.step(a)
                    # The two slots that StudentAgent fills from select_action()
                    # have no expert counterpart and are stored as None.
                    self.store(s,a,None,s_,r,None,d)
                    s=s_
                    collected+=1
                    pb.update()
                if collected>=self.max_step:
                    break
                

In [5]:
# Build the shared environment wrapper for the gym id 'PccNs-v0'. CustomEnv also
# creates the student and teacher replay buffers that the agents below share.
env=CustomEnv('PccNs-v0')

History length: 10
Features: ['sent latency inflation', 'latency ratio', 'send ratio']
Getting min obs for ['sent latency inflation', 'latency ratio', 'send ratio']


In [6]:
# Expert buffer constructed with buffer_size=10, which sets its default collection
# budget to 10*5 = 50 steps.
# NOTE(review): 10 looks like a smoke-test value next to BUFFER_SIZE — confirm.
et=ExpertTrajectory(10)

In [7]:
# Load the expert policy from a Stable-Baselines3 PPO checkpoint; the path is
# relative to the notebook's working directory.
et.load_expert_model('verygood.zip')

In [8]:
# Roll the expert out in `env` and store its transitions in `et`. Collection only
# stops at an episode boundary, so more steps than the budget can be recorded
# (the captured run logged 400 steps).
et.generate_trajectory(env)

400it [00:00, 436.81it/s]                     


In [9]:
class StudentAgent:
    """PPO agent (the "student") that acts in the environment.

    Its PPO model is constructed on `env.student_buffer`, the same buffer that
    is exposed here as `replay_buffer`.
    """

    def __init__(self,state_dim,action_dim,env:CustomEnv):
        self.env=env
        self.replay_buffer=env.student_buffer
        self.model=PPO(state_dim,action_dim,env.student_buffer)

    def generate_trajectory(self,step:int):
        """Run `step` complete episodes and store every transition in the model's buffer.

        Despite its name, `step` counts episodes, not environment steps: each
        iteration resets the environment and runs until it reports done.
        """
        num=0
        for _episode in tqdm(range(step)):
            s=self.env.reset()
            d=False
            while not d:
                a,_,l,v=self.model.select_action(s)
                s_,r,d,_=self.env.step(a)
                self.model.buffer.store(s,a,l,s_,r,v,d)
                s=s_
                num+=1
        # `num` is the number of stored transitions across all episodes.
        print('生成',num,'条轨迹')

    def train(self,total_timestep,batch_size):
        """Call `self.model.update(batch_size)` `total_timestep` times."""
        # Iterating the tqdm object already advances the bar once per loop; an
        # extra manual update() would double-count progress while running.
        for _update in tqdm(range(total_timestep)):
            self.model.update(batch_size)

In [33]:
# The observation shape is flattened to a single state dimension with np.prod; the
# first entry of the action shape is used as the action dimension.
# NOTE(review): this cell's recorded execution count (33) is later than cells below
# that already use `sa`; re-running it replaces `sa` with a fresh agent — confirm
# the intended order.
sa=StudentAgent(np.prod(env.get_state_dim()),env.get_action_dim()[0],env)

In [11]:
# SECURITY: torch.load unpickles arbitrary Python objects, so only load
# checkpoints from a trusted source.
sa.model=torch.load('model/student/studentmodel.zip')
# The unpickled model carries its own `buffer` object. Re-attach the shared
# student buffer so that rollouts stored through `sa.model.buffer` land in
# env.student_buffer, which is the buffer TeacherAgent reads from.
sa.model.buffer=sa.replay_buffer

In [22]:
# Collect 8 complete student episodes (the argument counts episodes, not steps)
# into the student model's buffer.
sa.generate_trajectory(8)

100%|██████████| 8/8 [00:02<00:00,  3.06it/s]

生成 3200 条轨迹





In [27]:
# Run 100 student updates, each calling sa.model.update(2048).
sa.train(100,2048)

100%|██████████| 100/100 [00:36<00:00,  2.72it/s]


In [14]:
class TeacherAgent():
    """Reward-producing "teacher" built from a PPO model and a Discriminator.

    The PPO model takes a concatenated (state, action) vector and emits a
    1-dimensional output that is written back as the student's reward. The
    Discriminator is constructed on the student buffer and the expert
    trajectory; its output is stored as the teacher's own reward.
    """

    def __init__(self,state_dim,action_dim,env:CustomEnv,expert_trajectory):
        # Student transitions are read from here and get their reward overwritten.
        self.trajectory_buffer=env.student_buffer
        # Experience buffer for the teacher's own PPO model.
        self.replay_buffer=env.teacher_buffer
        self.model=PPO(state_dim+action_dim,1,self.replay_buffer)
        self.discriminator=Discriminator(state_dim+action_dim,hidden_dim,64,self.trajectory_buffer,expert_trajectory)

    def ComputeReward(self):
        """Relabel every stored student transition with a teacher-generated reward.

        For each student transition the (state, action) pair is fed to the
        teacher PPO model. Its output replaces the student's stored reward. The
        pair is also stored in the teacher buffer, with the discriminator's
        output on that pair as the teacher's reward. Afterwards
        `discriminator.collect_expert()` is called.
        """
        # Only iterate over entries that exist: `index` is capped by the buffer size.
        n_stored=min(self.trajectory_buffer.index,self.trajectory_buffer.buffer_size)
        # Iterating the tqdm object advances the bar by itself; a manual
        # update() on top of that would double-count progress.
        for i in tqdm(range(n_stored)):
            sa_pair=torch.cat((self.trajectory_buffer.state[i],self.trajectory_buffer.action[i]),-1)
            reward,_,l,v=self.model.select_action(sa_pair)
            self.replay_buffer.store(sa_pair,
                                     reward,
                                     l,
                                     self.trajectory_buffer.next_state[i],
                                     self.discriminator.model(sa_pair).detach().cpu().numpy(),
                                     v,
                                     self.trajectory_buffer.done[i],
                                     )
            self.trajectory_buffer.reward[i]=reward
        self.discriminator.collect_expert()

    def trainPPO(self,total_timestep:int,batch_size:int=1024):
        """Call `self.model.update(batch_size)` `total_timestep` times.

        `batch_size` defaults to 1024, the value that was previously hard-coded.
        """
        for _update in range(total_timestep):
            self.model.update(batch_size)

    def trainDiscriminator(self,total_timestep:int):
        """Delegate to `self.discriminator.update(total_timestep, False)`."""
        self.discriminator.update(total_timestep,False)

    def train(self,total_timestep:int,PPO_timestep:int,D_timestep:int):
        """Run `total_timestep` rounds of: relabel rewards, train discriminator, train PPO.

        Args:
            total_timestep: number of outer rounds.
            PPO_timestep: PPO updates per round (passed to trainPPO).
            D_timestep: value passed to trainDiscriminator each round.
        """
        # No manual pb.update(): the for-loop over tqdm already tracks progress.
        for _round in tqdm(range(total_timestep)):
            print('Computing reward...')
            self.ComputeReward()
            print('Training Discriminator...')
            self.trainDiscriminator(D_timestep)
            print('Training PPO...')
            self.trainPPO(PPO_timestep)
        

In [15]:
# Build the teacher with the same flattened state dimension and action dimension as
# the student, sharing `env`'s buffers and using `et` as the expert trajectory.
ta=TeacherAgent(np.prod(env.get_state_dim()),env.get_action_dim()[0],env,et)

In [37]:
# Debug check: display the shared student buffer (it was empty in the recorded run).
env.student_buffer

[]

In [38]:
# Debug check: StudentAgent.__init__ sets this to env.student_buffer.
sa.replay_buffer

[]

In [39]:
# Debug check: the buffer the student's model stores rollouts into.
sa.model.buffer

[]

In [40]:
# Debug check: TeacherAgent.__init__ sets this to env.student_buffer; the teacher
# reads student transitions from it.
ta.trajectory_buffer

[]

In [41]:
# Relabel the stored student transitions. With an empty student buffer the loop
# runs zero times, as in the recorded "0it" output.
ta.ComputeReward()

0it [00:00, ?it/s]


In [42]:
# Calls discriminator.update(100, False).
# NOTE(review): the recorded run raised ValueError here, presumably because the
# student buffer was still empty — generate student trajectories first and confirm.
ta.trainDiscriminator(100)

ValueError: a must be greater than 0 unless no samples are taken

In [None]:
# Run 100 teacher PPO updates; each update uses a batch size of 1024.
ta.trainPPO(100)

In [None]:
# Alternating optimisation: ten outer rounds, each training the student first and
# the teacher second. The printed line announces the index of the current round.
for round_idx in range(10):
    print('第',round_idx,'次更新')
    sa.train(total_timestep=1024,batch_size=4096)
    ta.train(total_timestep=10,PPO_timestep=100,D_timestep=100)

第 0 次更新


  rewards = torch.tensor(rewards, dtype=torch.float32).to(device)
  return F.mse_loss(input, target, reduction=self.reduction)
  0%|          | 4/1024 [00:05<22:25,  1.32s/it]


KeyboardInterrupt: 

In [None]:
# Evaluate the student policy: play 100 episodes and record the summed reward of
# each one in `rewards`.
rewards=[]
for episode in range(100):
    obs=env.reset()
    done=False
    episode_return=0
    while not done:
        action,_,_,_=sa.model.select_action(obs)
        obs,step_reward,done,_=env.step(action)
        episode_return+=step_reward
    rewards.append(episode_return)

In [None]:
# Mean summed reward per episode over the 100 student evaluation episodes.
np.mean(rewards)

440.69136894767644

In [None]:
# Evaluate the expert policy with the same protocol: play 100 episodes and record
# the summed reward of each one in `rewards`.
rewards=[]
for episode in range(100):
    obs=env.reset()
    done=False
    episode_return=0
    while not done:
        action,_=et.model.predict(obs)
        obs,step_reward,done,_=env.step(action)
        episode_return+=step_reward
    rewards.append(episode_return)

In [None]:
# Mean summed reward per episode over the 100 expert evaluation episodes.
np.mean(rewards)

608.6122610776468