In [96]:
import network_sim
import gym
import numpy as np
import torch
from specbuffer import SpecReplayBuffer
from PPO import PPO

In [109]:
class SpecReplayBuffer:
    def __init__(self,buffer_size):
        self.buffer_size=buffer_size
        self.index=0
        self.state=[]
        self.action=[]
        self.logprobs=[]
        self.next_state=[]
        self.reward=[]
        self.state_value=[]
        self.done=[]
        self.teacher_reward=torch.zeros(buffer_size)
        self.islatest=False
        self.state_buffer=[]
        self.isAfter=False
        
        
    def store(self,state,action,log_prob=None,next_state=None,reward=None,state_value=None,done=None):
        #buffer_size必须是400的倍数
        self.state_buffer.append(state)
        
        next_state=torch.FloatTensor(next_state).view(-1)
        action=torch.FloatTensor(action)
        reward=torch.FloatTensor(np.array(reward))
        if self.index>=self.buffer_size:
            index=self.index%self.buffer_size
            if done:
                self.state[int(index/400)]=self.state_buffer
                self.state_buffer=[]
            self.action[index]=action
            self.logprobs[index]=log_prob
            self.next_state[index]=next_state
            self.reward[index]=reward
            self.state_value[index]=state_value
            self.done[index]=done
        else:
            if done:
                self.state.append(self.state_buffer)
                self.state_buffer=[]
            self.action.append(action)
            self.logprobs.append(log_prob)
            self.next_state.append(next_state)
            self.reward.append(reward)
            self.state_value.append(state_value)
            self.done.append(done)
        self.index+=1
        
        
    def sample(self,batch_size,return_index=False):
        sample_index=np.random.choice(min(self.buffer_size,self.index),batch_size,replace=True)
        if return_index:
            return sample_index
        length=len(self.state[0])
        state=[torch.FloatTensor(self.get_state_padding(self.state[int(i/length)],i%length,10,True)[0]).view(-1) for i in sample_index]
        action=[self.action[i] for i in sample_index]
        log_prob=[self.logprobs[i] for i in sample_index]
        next_state=[self.next_state[i] for i in sample_index]
        reward=[self.reward[i] for i in sample_index]
        state_value=[self.state_value[i] for i in sample_index]
        done=[self.done[i] for i in sample_index]
        teacher_reward=[self.teacher_reward[i] for i in sample_index]
        return state,action,log_prob,next_state,reward,state_value,done,teacher_reward
    
    def get_state_padding(self,a:list,index:int,offset:int,zero_padding:bool):
        if not zero_padding:
            # return a[max(0,index-offset):min(index+offset,len(a))]
            return a[max(0,index-offset+1):index+1],a[index+1:min(index+offset+1,len(a)+1)]
        b=a.copy()
        zero_padding=np.copy(b[0])
        for _ in range(offset):
            b.insert(0,zero_padding)
            b.append(zero_padding)
        # return b[max(0,index):min(index+2*offset,len(a)+2*offset)]
        return b[max(0,index+1):index+offset+1],b[index+offset+1:min(index+2*offset+1,len(a)+2*offset+1)]
    
    def get_reward_padding(self,a:list,index:int,offset:int,interval:int,zero_padding:bool):
        low=int(index/interval)*interval
        high=(int(index/interval)+1)*interval
        if not zero_padding:
            # return a[max(0,index-offset):min(index+offset,len(a))]
            return a[max(low,index-offset+1):index+1],a[index+1:min(index+offset+1,high)]
        b=a.copy()
        zero_padding=b[low]
        for _ in range(offset):
        # b.insert(low,zero_padding)
            b.insert(high,zero_padding)
        for _ in range(offset):
            b.insert(low,zero_padding)
        # return b[max(0,index):min(index+2*offset,len(a)+2*offset)]
        return b[max(low,index+1):index+offset+1],b[index+offset+1:min(index+2*offset+1,high+2*offset)]
    
    
    def state_from_index(self,index:list):
        if self.isAfter:
            return self.from_after_state(index)
        else:
            return self.from_before_state(index)
    
    def from_before_state(self,index:list):
        length=len(self.state[0])
        state=[torch.FloatTensor(np.array(self.get_state_padding(self.state[int(i/length)],i%length,10,True)[0])).view(-1) for i in index]
        return torch.stack(state,dim=0).detach()
    
    def from_after_state(self,index:list):
        length=len(self.state[0])
        state=[torch.FloatTensor(np.array(self.get_state_padding(self.state[int(i/length)],i%length,10,True)[1])).view(-1) for i in index]
        return torch.stack(state,dim=0).detach()
    
    def from_after_reward(self,index:list):
        length=len(self.state[0])
        state=[torch.FloatTensor(np.array(self.get_reward_padding(self.reward,i,10,length,True)[1])).view(-1) for i in index]
        return torch.stack(state,dim=0).detach()
        
    def clean(self):#这里暂时有个bug，不能解决append那里，不过暂时用不上这个函数，就先不管了
        self.index=0
        self.state=[]
        self.action=[]
        self.logprobs=[]
        self.next_state=[]
        self.reward=[]
        self.state_value=[]
        self.done=[]
        self.teacher_reward=[]
        
    def setAfter(self,isAfter:bool):
        self.isAfter=isAfter

In [110]:

env=gym.make('PccNs-v0')
replaybuffer=SpecReplayBuffer(1600)

History length: 10
Features: ['sent latency inflation', 'latency ratio', 'send ratio']
Getting min obs for ['sent latency inflation', 'latency ratio', 'send ratio']


  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")


In [111]:
model=PPO(30,1,replaybuffer)

In [112]:
for _ in range(5):
    s=env.reset()
    d=False
    while not d:
        a,action,l,v=model.select_action(s)
        s_,r,d,_=env.step(a)
        replaybuffer.store(s[-3:],a,l,s_,r,v,d)
        s=s_

  logger.warn(
  logger.warn(
  logger.warn(
  logger.deprecation(
  logger.warn(
  logger.warn(f"{pre} is not within the observation space.")


Reward: 0.00, Ewma Reward: 0.00
Reward: 192.45, Ewma Reward: 1.92
Reward: -72.28, Ewma Reward: 1.18
Reward: 111.67, Ewma Reward: 2.29
Reward: 268.57, Ewma Reward: 4.95


In [113]:
replaybuffer.from_after_reward([1,300,399])

tensor([[ 1.5535,  1.5902,  1.6580,  1.7017,  1.7142,  1.6959,  1.6786,  1.7096,
          1.7771,  1.7655],
        [ 0.0912,  0.1404,  0.0691,  0.0296, -0.0024,  0.1644,  0.1390, -0.0394,
          0.0940,  0.1136],
        [ 1.5556,  1.5556,  1.5556,  1.5556,  1.5556,  1.5556,  1.5556,  1.5556,
          1.5556,  1.5556]])

In [None]:
model.update(1024)

def get_state_padding(a:list,index:int,offset:int,zero_padding:bool):
    if not zero_padding:
        # return a[max(0,index-offset):min(index+offset,len(a))]
        return a[max(0,index-offset+1):index+1],a[index+1:min(index+offset+1,len(a)+1)]
    b=a.copy()
    zero_padding=np.copy(b[0])
    for _ in range(offset):
        b.insert(0,zero_padding)
        b.append(zero_padding)
    # return b[max(0,index):min(index+2*offset,len(a)+2*offset)]
    return b[max(0,index+1):index+offset+1],b[index+offset+1:min(index+2*offset+1,len(a)+2*offset+1)]

def get_state(a:list,index:int,offset:int):
    return a[max(0,index-offset):min(index+offset,len(a))]
    # return max(0,index-offset+1),min(index+offset+1,len(a))

In [28]:
policy=model.policy.to('cpu')

In [29]:
torch.save(policy.state_dict(),'modelcpu.pt')

In [22]:
model.state_dict().keys()

odict_keys(['actor.0.weight', 'actor.0.bias', 'actor.2.weight', 'actor.2.bias', 'actor.4.weight', 'actor.4.bias', 'critic.0.weight', 'critic.0.bias', 'critic.2.weight', 'critic.2.bias', 'critic.4.weight', 'critic.4.bias'])

In [11]:
model.policy.load_state_dict()

TypeError: load_state_dict() missing 1 required positional argument: 'state_dict'

In [88]:
def get_state_padding(a:list,index:int,offset:int,interval:int,zero_padding:bool):
    low=int(index/interval)*interval
    high=(int(index/interval)+1)*interval
    if not zero_padding:
        # return a[max(0,index-offset):min(index+offset,len(a))]
        return a[max(low,index-offset+1):index+1],a[index+1:min(index+offset+1,high)]
    b=a.copy()
    zero_padding=b[0]
    for _ in range(offset):
    # b.insert(low,zero_padding)
        b.insert(high,zero_padding)
    for _ in range(offset):
        b.insert(low,zero_padding)
    # return b[max(0,index):min(index+2*offset,len(a)+2*offset)]
    return b[max(low,index+1):index+offset+1],b[index+offset+1:min(index+2*offset+1,high+2*offset)]

In [40]:
import numpy as np

In [92]:
l=[i for i in range(1600)]

In [86]:
l.insert(5,0)

In [87]:
l

[0, 1, 2, 3, 4, 0, 0, 0, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]

In [93]:
get_state_padding(l,399,15,400,True)

([385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [46]:
offset=10
max=20
index=47
low=int(index/max)*max
high=int()
l[max(low,index-offset+1):index+1],l[index+1:min(index+offset+1,high+1)]

([0, 1, 2, 3, 4, 5, 6, 7], [8, 9, 10])

In [95]:
import importlib
from specbuffer import SpecReplayBuffer

In [None]:
importlib.reload(SpecReplayBuffer)

In [None]:
env=gym.make('PccNs-v0')
replaybuffer=ReplayBuffer(1600)

In [1]:
from glob import glob

In [2]:
filelist=glob('model/student/'+'011048'+'/studentmodel*.zip')

In [13]:
import os
for file in filelist:
    os.remove(file)

In [11]:
model.policy_old.state_dict

NameError: name 'model' is not defined

In [10]:
[os.path.splitext(os.path.basename(file))[0] for file in filelist]

['studentmodel1541',
 'studentmodel1336',
 'studentmodel1235',
 'studentmodel1554',
 'studentmodel1210',
 'studentmodel1516',
 'studentmodel1107',
 'studentmodel1426',
 'studentmodel1324',
 'studentmodel1247',
 'studentmodel1401',
 'studentmodel1438',
 'studentmodel1133',
 'studentmodel1349',
 'studentmodel1158',
 'studentmodel1054',
 'studentmodel1312',
 'studentmodel1529',
 'studentmodel1145',
 'studentmodel1503',
 'studentmodel1223',
 'studentmodel1120',
 'studentmodel1259',
 'studentmodel1450',
 'studentmodel1413']

In [19]:
path='/home/tools/DR/MyProject/RILE/model/student/deeptrain/studentmodel0855.pt'

In [20]:
import torch
from PPOee import ActorCritic

In [21]:
model=torch.load(path)

In [22]:
policy=ActorCritic(30,1,256,True,0.6)

In [24]:
policy.load_state_dict(model)

<All keys matched successfully>

In [25]:
policy.eval()

ActorCritic(
  (actor): Sequential(
    (0): Linear(in_features=30, out_features=256, bias=True)
    (1): Tanh()
    (2): Linear(in_features=256, out_features=256, bias=True)
    (3): Tanh()
    (4): Linear(in_features=256, out_features=1, bias=True)
    (5): Tanh()
  )
  (critic): Sequential(
    (0): Linear(in_features=30, out_features=256, bias=True)
    (1): Tanh()
    (2): Linear(in_features=256, out_features=256, bias=True)
    (3): Tanh()
    (4): Linear(in_features=256, out_features=1, bias=True)
  )
)

In [26]:
policy.state_dict()

OrderedDict([('actor.0.weight',
              tensor([[ 0.0374,  0.0195, -0.0213,  ...,  0.0164,  0.1193,  0.1862],
                      [ 0.0241,  0.0792, -0.2097,  ...,  0.0701, -0.0685,  0.0475],
                      [ 0.1589, -0.2461, -0.1575,  ..., -0.0352, -0.1951, -0.1851],
                      ...,
                      [-0.0811,  0.1265,  0.0634,  ..., -0.0206, -0.0903, -0.1365],
                      [ 0.0016,  0.0238,  0.0131,  ..., -0.0886,  0.0612,  0.0979],
                      [-0.2013,  0.0040,  0.0246,  ...,  0.0758, -0.0171, -0.1183]])),
             ('actor.0.bias',
              tensor([ 0.1957,  0.0230, -0.1045, -0.0657, -0.1229, -0.0330, -0.1649, -0.1021,
                       0.0048,  0.1326,  0.1886,  0.0113,  0.0102, -0.1487, -0.1113,  0.0252,
                       0.1452,  0.0172,  0.1518,  0.0358, -0.0635, -0.1791,  0.1580, -0.0981,
                      -0.1446, -0.1661, -0.1724,  0.0569,  0.1991, -0.0569,  0.1114,  0.1207,
                       0.159