In [96]:
import network_sim
import gym
import numpy as np
import torch
from specbuffer import SpecReplayBuffer
from PPO import PPO

In [109]:
class SpecReplayBuffer:
    def __init__(self,buffer_size):
        self.buffer_size=buffer_size
        self.index=0
        self.state=[]
        self.action=[]
        self.logprobs=[]
        self.next_state=[]
        self.reward=[]
        self.state_value=[]
        self.done=[]
        self.teacher_reward=torch.zeros(buffer_size)
        self.islatest=False
        self.state_buffer=[]
        self.isAfter=False
        
        
    def store(self,state,action,log_prob=None,next_state=None,reward=None,state_value=None,done=None):
        #buffer_size必须是400的倍数
        self.state_buffer.append(state)
        
        next_state=torch.FloatTensor(next_state).view(-1)
        action=torch.FloatTensor(action)
        reward=torch.FloatTensor(np.array(reward))
        if self.index>=self.buffer_size:
            index=self.index%self.buffer_size
            if done:
                self.state[int(index/400)]=self.state_buffer
                self.state_buffer=[]
            self.action[index]=action
            self.logprobs[index]=log_prob
            self.next_state[index]=next_state
            self.reward[index]=reward
            self.state_value[index]=state_value
            self.done[index]=done
        else:
            if done:
                self.state.append(self.state_buffer)
                self.state_buffer=[]
            self.action.append(action)
            self.logprobs.append(log_prob)
            self.next_state.append(next_state)
            self.reward.append(reward)
            self.state_value.append(state_value)
            self.done.append(done)
        self.index+=1
        
        
    def sample(self,batch_size,return_index=False):
        sample_index=np.random.choice(min(self.buffer_size,self.index),batch_size,replace=True)
        if return_index:
            return sample_index
        length=len(self.state[0])
        state=[torch.FloatTensor(self.get_state_padding(self.state[int(i/length)],i%length,10,True)[0]).view(-1) for i in sample_index]
        action=[self.action[i] for i in sample_index]
        log_prob=[self.logprobs[i] for i in sample_index]
        next_state=[self.next_state[i] for i in sample_index]
        reward=[self.reward[i] for i in sample_index]
        state_value=[self.state_value[i] for i in sample_index]
        done=[self.done[i] for i in sample_index]
        teacher_reward=[self.teacher_reward[i] for i in sample_index]
        return state,action,log_prob,next_state,reward,state_value,done,teacher_reward
    
    def get_state_padding(self,a:list,index:int,offset:int,zero_padding:bool):
        if not zero_padding:
            # return a[max(0,index-offset):min(index+offset,len(a))]
            return a[max(0,index-offset+1):index+1],a[index+1:min(index+offset+1,len(a)+1)]
        b=a.copy()
        zero_padding=np.copy(b[0])
        for _ in range(offset):
            b.insert(0,zero_padding)
            b.append(zero_padding)
        # return b[max(0,index):min(index+2*offset,len(a)+2*offset)]
        return b[max(0,index+1):index+offset+1],b[index+offset+1:min(index+2*offset+1,len(a)+2*offset+1)]
    
    def get_reward_padding(self,a:list,index:int,offset:int,interval:int,zero_padding:bool):
        low=int(index/interval)*interval
        high=(int(index/interval)+1)*interval
        if not zero_padding:
            # return a[max(0,index-offset):min(index+offset,len(a))]
            return a[max(low,index-offset+1):index+1],a[index+1:min(index+offset+1,high)]
        b=a.copy()
        zero_padding=b[low]
        for _ in range(offset):
        # b.insert(low,zero_padding)
            b.insert(high,zero_padding)
        for _ in range(offset):
            b.insert(low,zero_padding)
        # return b[max(0,index):min(index+2*offset,len(a)+2*offset)]
        return b[max(low,index+1):index+offset+1],b[index+offset+1:min(index+2*offset+1,high+2*offset)]
    
    
    def state_from_index(self,index:list):
        if self.isAfter:
            return self.from_after_state(index)
        else:
            return self.from_before_state(index)
    
    def from_before_state(self,index:list):
        length=len(self.state[0])
        state=[torch.FloatTensor(np.array(self.get_state_padding(self.state[int(i/length)],i%length,10,True)[0])).view(-1) for i in index]
        return torch.stack(state,dim=0).detach()
    
    def from_after_state(self,index:list):
        length=len(self.state[0])
        state=[torch.FloatTensor(np.array(self.get_state_padding(self.state[int(i/length)],i%length,10,True)[1])).view(-1) for i in index]
        return torch.stack(state,dim=0).detach()
    
    def from_after_reward(self,index:list):
        length=len(self.state[0])
        state=[torch.FloatTensor(np.array(self.get_reward_padding(self.reward,i,10,length,True)[1])).view(-1) for i in index]
        return torch.stack(state,dim=0).detach()
        
    def clean(self):#这里暂时有个bug，不能解决append那里，不过暂时用不上这个函数，就先不管了
        self.index=0
        self.state=[]
        self.action=[]
        self.logprobs=[]
        self.next_state=[]
        self.reward=[]
        self.state_value=[]
        self.done=[]
        self.teacher_reward=[]
        
    def setAfter(self,isAfter:bool):
        self.isAfter=isAfter

In [110]:

env=gym.make('PccNs-v0')
replaybuffer=SpecReplayBuffer(1600)

History length: 10
Features: ['sent latency inflation', 'latency ratio', 'send ratio']
Getting min obs for ['sent latency inflation', 'latency ratio', 'send ratio']


  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")


In [111]:
model=PPO(30,1,replaybuffer)

In [112]:
for _ in range(5):
    s=env.reset()
    d=False
    while not d:
        a,action,l,v=model.select_action(s)
        s_,r,d,_=env.step(a)
        replaybuffer.store(s[-3:],a,l,s_,r,v,d)
        s=s_

  logger.warn(
  logger.warn(
  logger.warn(
  logger.deprecation(
  logger.warn(
  logger.warn(f"{pre} is not within the observation space.")


Reward: 0.00, Ewma Reward: 0.00
Reward: 192.45, Ewma Reward: 1.92
Reward: -72.28, Ewma Reward: 1.18
Reward: 111.67, Ewma Reward: 2.29
Reward: 268.57, Ewma Reward: 4.95


In [113]:
replaybuffer.from_after_reward([1,300,399])

tensor([[ 1.5535,  1.5902,  1.6580,  1.7017,  1.7142,  1.6959,  1.6786,  1.7096,
          1.7771,  1.7655],
        [ 0.0912,  0.1404,  0.0691,  0.0296, -0.0024,  0.1644,  0.1390, -0.0394,
          0.0940,  0.1136],
        [ 1.5556,  1.5556,  1.5556,  1.5556,  1.5556,  1.5556,  1.5556,  1.5556,
          1.5556,  1.5556]])

In [None]:
model.update(1024)

def get_state_padding(a:list,index:int,offset:int,zero_padding:bool):
    if not zero_padding:
        # return a[max(0,index-offset):min(index+offset,len(a))]
        return a[max(0,index-offset+1):index+1],a[index+1:min(index+offset+1,len(a)+1)]
    b=a.copy()
    zero_padding=np.copy(b[0])
    for _ in range(offset):
        b.insert(0,zero_padding)
        b.append(zero_padding)
    # return b[max(0,index):min(index+2*offset,len(a)+2*offset)]
    return b[max(0,index+1):index+offset+1],b[index+offset+1:min(index+2*offset+1,len(a)+2*offset+1)]

def get_state(a:list,index:int,offset:int):
    return a[max(0,index-offset):min(index+offset,len(a))]
    # return max(0,index-offset+1),min(index+offset+1,len(a))

In [28]:
policy=model.policy.to('cpu')

In [29]:
torch.save(policy.state_dict(),'modelcpu.pt')

In [22]:
model.state_dict().keys()

odict_keys(['actor.0.weight', 'actor.0.bias', 'actor.2.weight', 'actor.2.bias', 'actor.4.weight', 'actor.4.bias', 'critic.0.weight', 'critic.0.bias', 'critic.2.weight', 'critic.2.bias', 'critic.4.weight', 'critic.4.bias'])

In [11]:
model.policy.load_state_dict()

TypeError: load_state_dict() missing 1 required positional argument: 'state_dict'

In [88]:
def get_state_padding(a:list,index:int,offset:int,interval:int,zero_padding:bool):
    low=int(index/interval)*interval
    high=(int(index/interval)+1)*interval
    if not zero_padding:
        # return a[max(0,index-offset):min(index+offset,len(a))]
        return a[max(low,index-offset+1):index+1],a[index+1:min(index+offset+1,high)]
    b=a.copy()
    zero_padding=b[0]
    for _ in range(offset):
    # b.insert(low,zero_padding)
        b.insert(high,zero_padding)
    for _ in range(offset):
        b.insert(low,zero_padding)
    # return b[max(0,index):min(index+2*offset,len(a)+2*offset)]
    return b[max(low,index+1):index+offset+1],b[index+offset+1:min(index+2*offset+1,high+2*offset)]

In [40]:
import numpy as np

In [92]:
l=[i for i in range(1600)]

In [86]:
l.insert(5,0)

In [87]:
l

[0, 1, 2, 3, 4, 0, 0, 0, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]

In [93]:
get_state_padding(l,399,15,400,True)

([385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [46]:
offset=10
max=20
index=47
low=int(index/max)*max
high=int()
l[max(low,index-offset+1):index+1],l[index+1:min(index+offset+1,high+1)]

([0, 1, 2, 3, 4, 5, 6, 7], [8, 9, 10])

In [95]:
import importlib
from specbuffer import SpecReplayBuffer

In [None]:
importlib.reload(SpecReplayBuffer)

In [None]:
env=gym.make('PccNs-v0')
replaybuffer=ReplayBuffer(1600)

In [1]:
from glob import glob

[os.path.splitext(os.path.basename(file))[0] for file in filelist]

In [19]:
path='/home/tools/DR/MyProject/RILE/model/student/deeptrain/studentmodel0855.pt'

In [23]:
import torch
import torch.nn as nn
from torch.distributions import MultivariateNormal
from torch.distributions import Categorical
import numpy as np
from torch.utils.tensorboard.writer import SummaryWriter

################################## set device ##################################
device = torch.device('cpu')
if (torch.cuda.is_available()):
    device = torch.device('cuda:0')
    torch.cuda.empty_cache()
debug=False

class ActorCritic(nn.Module):
    def __init__(self, state_dim, action_dim, hidden_dim, has_continuous_action_space, action_std_init):
        super(ActorCritic, self).__init__()

        self.has_continuous_action_space = has_continuous_action_space

        if has_continuous_action_space:
            self.action_dim = action_dim
            self.action_var = torch.full(
                (action_dim,), action_std_init * action_std_init).to(device)
        # actor
        if has_continuous_action_space:
            self.actor = nn.Sequential(
                nn.Linear(state_dim, hidden_dim),
                nn.Tanh(),
                nn.Linear(hidden_dim, hidden_dim),
                nn.Tanh(),
                nn.Linear(hidden_dim, action_dim),
                nn.Tanh()
            )
        else:
            self.actor = nn.Sequential(
                nn.Linear(state_dim, hidden_dim),
                nn.Tanh(),
                nn.Linear(hidden_dim, hidden_dim),
                nn.Tanh(),
                nn.Linear(hidden_dim, action_dim),
                nn.Softmax(dim=-1)
            )
        # critic
        self.critic = nn.Sequential(
            nn.Linear(state_dim, hidden_dim),
            nn.Tanh(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.Tanh(),
            nn.Linear(hidden_dim, 1)
        )

    def set_action_std(self, new_action_std):
        if self.has_continuous_action_space:
            self.action_var = torch.full(
                (self.action_dim,), new_action_std * new_action_std).to(device)

    def forward(self):
        raise NotImplementedError

    def act(self, state):
        if self.has_continuous_action_space:
            action_mean = self.actor(state)
            cov_mat = torch.diag(self.action_var).unsqueeze(dim=0)
            dist = MultivariateNormal(action_mean, cov_mat)
        else:
            action_probs = self.actor(state)
            dist = Categorical(action_probs)

        action = dist.sample()
        action_logprob = dist.log_prob(action)
        state_val = self.critic(state)

        return action.detach(), action_logprob.detach(), state_val.detach()

    def evaluate(self, state, action):
        action_mean = self.actor(state)
        action_var = self.action_var.expand_as(action_mean)
        cov_mat = torch.diag_embed(action_var).to(device)
        dist = MultivariateNormal(action_mean, cov_mat)
        if self.action_dim == 1:
            action = action.reshape(-1, self.action_dim)
        action_logprobs = dist.log_prob(action)
        dist_entropy = dist.entropy()
        state_values = self.critic(state)

        return action_logprobs, state_values, dist_entropy


class PPO:
    def __init__(self, state_dim, action_dim, hidden_dim, replay_buffer, isStu=True, lr_actor=1e-3, lr_critic=1e-3, gamma=0.99, K_epochs=80, eps_clip=0.2, has_continuous_action_space=True, action_std_init=0.6):

        self.has_continuous_action_space = has_continuous_action_space

        self.action_std = action_std_init

        self.gamma = gamma
        self.eps_clip = eps_clip
        self.K_epochs = K_epochs

        self.buffer = replay_buffer
        # print(self.buffer,replay_buffer)

        self.policy = ActorCritic(state_dim, action_dim, hidden_dim,
                                  has_continuous_action_space, action_std_init).to(device)
        self.optimizer = torch.optim.Adam([
            {'params': self.policy.actor.parameters(), 'lr': lr_actor},
            {'params': self.policy.critic.parameters(), 'lr': lr_critic}
        ])

        self.policy_old = ActorCritic(
            state_dim, action_dim, hidden_dim, has_continuous_action_space, action_std_init).to(device)
        self.policy_old.load_state_dict(self.policy.state_dict())

        self.MseLoss = nn.MSELoss()
        
        #Tensorboard Writer
        if isStu:
            self.tensorboard_writer=SummaryWriter('./runs/PPOee/Student/')
        else:
            self.tensorboard_writer=SummaryWriter('./runs/PPOee/Teacher/')
        self.loss=[]
        self.value_loss=[]
        self.policy_entropy=[]
        self.ptr=0

    def set_action_std(self, new_action_std):
        self.action_std = new_action_std
        self.policy.set_action_std(new_action_std)
        self.policy_old.set_action_std(new_action_std)

    def decay_action_std(self, action_std_decay_rate, min_action_std):
        self.action_std = self.action_std - action_std_decay_rate
        self.action_std = round(self.action_std, 4)
        if (self.action_std <= min_action_std):
            self.action_std = min_action_std
            print("setting actor output action_std to min_action_std : ",
                  self.action_std)
        else:
            print("setting actor output action_std to : ", self.action_std)
        self.set_action_std(self.action_std)

    def select_action(self, state):
        with torch.no_grad():
            state = torch.FloatTensor(state).view(-1).to(device)
            action, action_logprob, state_val = self.policy_old.act(state)

        return action.detach().cpu().numpy().flatten(), action, action_logprob, state_val

    def update(self, batch_size):
        # 改成sample一些buffer
        index = self.buffer.sample(batch_size, True)

        # Monte Carlo estimate of returns
        rewards = []
        discounted_reward = 0
        for reward, is_terminal in zip(reversed([self.buffer.reward[i] for i in index]), reversed([self.buffer.done[i] for i in index])):
            if is_terminal:
                discounted_reward = 0
            discounted_reward = reward + (self.gamma * discounted_reward)
            rewards.insert(0, discounted_reward)

        # Normalizing the rewards
        rewards = torch.tensor(rewards, dtype=torch.float32).to(device)
        rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-7)

        old_states = self.buffer.state_from_index(index).to(device)
        old_actions = torch.stack([self.buffer.action[i]
                                  for i in index], dim=0).detach().to(device)
        old_logprobs = torch.stack([self.buffer.logprobs[i]
                                   for i in index], dim=0).detach().to(device)
        old_state_values = torch.stack(
            [self.buffer.state_value[i] for i in index], dim=0).detach().to(device)

        # calculate advantages
        advantages = rewards.detach() - old_state_values.detach()

        # Optimize policy for K epochs
        for _ in range(self.K_epochs):

            # Evaluating old actions and values
            logprobs, state_values, dist_entropy = self.policy.evaluate(
                old_states, old_actions)

            # match state_values tensor dimensions with rewards tensor
            # state_values = torch.squeeze(state_values)

            # Finding the ratio (pi_theta / pi_theta__old)
            ratios = torch.exp(logprobs - old_logprobs.detach())

            # Finding Surrogate Loss
            surr1 = ratios * advantages
            surr2 = torch.clamp(ratios, 1-self.eps_clip,
                                1+self.eps_clip) * advantages

            # final loss of clipped objective PPO
            loss = -torch.min(surr1, surr2) + 0.5 * \
                self.MseLoss(state_values, rewards) - 0.01 * dist_entropy
                
            # self.loss.append(loss)
            # self.value_loss.append(self.MseLoss(state_values, rewards))
            # self.policy_entropy.append(dist_entropy)
            print(self.MseLoss(state_values, rewards))
            self.tensorboard_writer.add_scalar("Loss", loss.mean(), self.ptr)
            self.tensorboard_writer.add_scalar("Value Loss", self.MseLoss(state_values, rewards), self.ptr)
            self.tensorboard_writer.add_scalar("Entropy", dist_entropy.mean(), self.ptr)
            self.ptr+=1

            # take gradient step
            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()

        # Copy new weights into old policy
        self.policy_old.load_state_dict(self.policy.state_dict())

        # clear buffer
        # self.buffer.clear()
        
    def WriteTensorboard(self,logdir:str):
        
        for i in range(len(self.loss)):
            self.tensorboard_writer.add_scalar("Loss", self.loss[i], i)
            self.tensorboard_writer.add_scalar("Value Loss", self.value_loss[i], i)
            self.tensorboard_writer.add_scalar("Entropy", self.policy_entropy[i], i)
        

    def save(self, checkpoint_path):
        torch.save(self.policy_old.state_dict(), checkpoint_path)

    def load(self, checkpoint_path):
        self.policy_old.load_state_dict(torch.load(
            checkpoint_path, map_location=lambda storage, loc: storage))
        self.policy.load_state_dict(torch.load(
            checkpoint_path, map_location=lambda storage, loc: storage))


In [24]:
from tqdm import tqdm
class StudentAgent:
    def __init__(self, state_dim, action_dim, env):
        self.env = env
        # self.replay_buffer=self.env.student_buffer
        self.replay_buffer = env.student_buffer
        self.model = PPO(state_dim, action_dim, 32, self.replay_buffer)

    def generate_trajectory(self, step: int):
        pb = tqdm(range(step))
        num = 0
        for i in pb:
            s = self.env.reset()
            d = False
            while not d:
                a, _, l, v = self.model.select_action(s)
                s_, r, d, _ = self.env.step(a)
                self.model.buffer.store(s, a, l, s_, r, v, d)
                s = s_
                num += 1
            # pb.update()
        print('生成', num, '条轨迹')

    def train(self, total_timestep, batch_size):
        pb = tqdm(range(total_timestep))
        for i in pb:
            self.model.update(batch_size)
            pb.update()

In [25]:
class Student(StudentAgent):
    def __init__(self, state_dim, action_dim, env):
        super().__init__(state_dim, action_dim, env)
        
    def generate_trajectory(self, step: int):
        pb=tqdm(range(step))
        for _ in pb:
            s=self.env.reset()
            d=False
            while not d:
                a,action,l,v=self.model.select_action(s)
                s_,r,d,_=self.env.step(a)
                self.model.buffer.store(s[-3:],a,l,s_,r,v,d)
                s=s_
            pb.update()

In [13]:
from CustomEnv import CustomEnv
import network_sim

In [26]:
env=CustomEnv('PccNs-v0',4096)
sa=Student(np.prod(env.get_state_dim()),env.get_action_dim()[0],env)

History length: 10
Features: ['sent latency inflation', 'latency ratio', 'send ratio']
Getting min obs for ['sent latency inflation', 'latency ratio', 'send ratio']


In [27]:
sa.generate_trajectory(5)

100%|██████████| 5/5 [00:01<00:00,  3.13it/s]


In [28]:
sa.model.update(32)

tensor(1.0456, device='cuda:0', grad_fn=<MseLossBackward0>)
tensor(1.0169, device='cuda:0', grad_fn=<MseLossBackward0>)
tensor(0.9952, device='cuda:0', grad_fn=<MseLossBackward0>)
tensor(0.9804, device='cuda:0', grad_fn=<MseLossBackward0>)
tensor(0.9719, device='cuda:0', grad_fn=<MseLossBackward0>)
tensor(0.9689, device='cuda:0', grad_fn=<MseLossBackward0>)
tensor(0.9701, device='cuda:0', grad_fn=<MseLossBackward0>)
tensor(0.9736, device='cuda:0', grad_fn=<MseLossBackward0>)
tensor(0.9773, device='cuda:0', grad_fn=<MseLossBackward0>)
tensor(0.9797, device='cuda:0', grad_fn=<MseLossBackward0>)
tensor(0.9804, device='cuda:0', grad_fn=<MseLossBackward0>)
tensor(0.9795, device='cuda:0', grad_fn=<MseLossBackward0>)
tensor(0.9776, device='cuda:0', grad_fn=<MseLossBackward0>)
tensor(0.9751, device='cuda:0', grad_fn=<MseLossBackward0>)
tensor(0.9727, device='cuda:0', grad_fn=<MseLossBackward0>)
tensor(0.9708, device='cuda:0', grad_fn=<MseLossBackward0>)
tensor(0.9695, device='cuda:0', grad_fn=