In [11]:
import gym
import numpy as np
from my_ppo_2 import PPO
from SAC import SAC
import torch

In [13]:
class ReplayBuffer:
    def __init__(self,buffer_size,state_dim,action_dim):
        self.buffer_size=buffer_size
        self.index=0
        self.state=np.zeros((buffer_size,state_dim))
        self.action=np.zeros((buffer_size,action_dim))
        self.next_state=np.zeros((buffer_size,state_dim))
        self.reward=np.zeros(buffer_size,dtype=float)
        self.done=np.zeros(buffer_size,dtype=bool)
        
    def store(self,state,action,next_state,done):
        index=self.index%self.buffer_size
        self.state[index]=state
        self.action[index]=action
        self.next_state[index]=next_state
        self.reward[index]=0.0
        self.done[index]=done
        self.index+=1
        
    def sample(self,batch_size,return_index=False):
        sample_index=np.random.choice(batch_size,self.buffer_size,replace=True)
        if return_index:
            return sample_index
        state=self.state[sample_index]
        action=self.action[sample_index]
        next_state=self.next_state[sample_index]
        reward=self.reward[sample_index]
        done=self.done[sample_index]
        return state,action,next_state,reward,done
    
    def clean(self):
        self.index=0
            

In [14]:
class CustomEnv:
    def __init__(self,env_id):
        self.env=gym.make(env_id)
        self.replay_buffer=ReplayBuffer(8192,self.env.observation_space.shape[0],self.env.action_space.shape[0])
        
    def reset(self):
        return self.env.reset()
    
    def step(self,state,action):
        state_,reward,done,info=self.env.step(action)
        self.replay_buffer.store(state,action,state_,done)
        return state_,reward,done,info
    
    def get_state_dim(self):
        return self.env.observation_space.shape
    
    def get_action_dim(self):
        return self.env.action_space.shape

In [15]:
class StudentAgent:
    def __init__(self,state_dim,action_dim):
        self.model=PPO(state_dim,action_dim,1e-3,1e-3,0.99,80,0.2,True)
        
    def update(self,replay_buffer,batch_size):
        index=replay_buffer.sample(batch_size,True)
        self.model.update()

In [16]:
custom_env=CustomEnv('Pendulum-v0')

In [17]:
d=False
s=custom_env.reset()
i=0
while not d:
    a=custom_env.env.action_space.sample()
    s_,r,d,_=custom_env.step(s,a)
    s=s_
    i+=1

In [18]:
i=custom_env.replay_buffer.sample(64,True)

In [7]:
sa=StudentAgent(custom_env.get_state_dim()[0],custom_env.get_action_dim()[0])

In [12]:
sa.update(custom_env.replay_buffer,64)

[[-0.95834779 -0.28560377 -0.32741271]
 [-0.99808779  0.06181238 -1.321099  ]
 [-0.99977756 -0.0210911   0.93504803]
 ...
 [-0.99907628 -0.04297188  1.03695861]
 [-0.93054645 -0.36617386  0.17125319]
 [-0.9760131  -0.21771181  1.28945615]]


IndexError: index 1 is out of bounds for dimension 1 with size 1

In [14]:
i=custom_env.replay_buffer.sample(64,return_index=True)

In [9]:
s=torch.FloatTensor(custom_env.replay_buffer.state[i])
a=torch.FloatTensor(custom_env.replay_buffer.action[i])

In [21]:
i=custom_env.replay_buffer.sample(10,True)

In [24]:
custom_env.replay_buffer.state[i]

array([[-0.97553698,  0.21983537, -0.75659917],
       [-0.9831534 ,  0.18278237, -0.82048096],
       [-0.95259862,  0.30422996, -0.23870135],
       ...,
       [-0.99976169,  0.02183023, -0.77791089],
       [-0.99976169,  0.02183023, -0.77791089],
       [-0.96535131,  0.26095373, -0.84728643]])

In [26]:
type(i[0])

numpy.int64