In [1]:
## setup

import gym
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

In [12]:
## hyperparameters

# Gym environment id (see https://gym.openai.com/envs/Pendulum-v0/)
ENV = 'Pendulum-v0'

# Defaults for the Ornstein-Uhlenbeck exploration noise:
# THETA scales the pull back toward the mean, DT is the time step.
THETA = 0.15
DT = 0.01

# Defaults for the experience replay buffer:
# number of stored transitions and size of each sampled minibatch.
BUFFER_CAPACITY = 100_000
BATCH_SIZE = 64

In [9]:
## environment

# Instantiate the environment named by the ENV hyperparameter.
env = gym.make(ENV)

# Sizes of the observation and action vectors.
num_states = env.observation_space.shape[0]
num_actions = env.action_space.shape[0]

# Bounds of the first action component.
upper_bound = env.action_space.high[0]
lower_bound = env.action_space.low[0]

# Report the problem dimensions, one line per quantity.
print(
    f'state space: {num_states}',
    f'action space: {num_actions}',
    f'continuous action max: {upper_bound}',
    f'continuous action min: {lower_bound}',
    sep = '\n',
)

state space: 3
action space: 1
continuous action max: 2.0
continuous action min: -2.0


In [11]:
## ornstein-uhlenbeck process

class OUActionNoise:
    """
    Exploration noise generated by an Ornstein-Uhlenbeck process.

    The noise is temporally correlated (each sample depends on the previous
    one), which helps exploration in physical environments that have momentum.
    In the paper, theta = 0.15, sigma = 0.2.

    https://en.wikipedia.org/wiki/Ornstein%E2%80%93Uhlenbeck_process

    Every call advances the process by one discretized step:

        x = x_prev + theta * (mean - x_prev) * dt + std * sqrt(dt) * N(0, 1)

    Parameters
    ----------
    mean : scalar or array-like
        Value the process is pulled toward. Its shape determines the shape
        of every noise sample.
    std : scalar or array-like
        Scale applied to the Gaussian term.
    theta : float
        Strength of the pull back toward ``mean``.
    dt : float
        Time step of the discretized process (a step size, not a derivative).
    x_initial : scalar or array-like, optional
        State the process starts from after ``reset()``. When None, the
        process starts from zeros shaped like ``mean``.
    """
    def __init__(self, mean, std, theta = THETA, dt = DT, x_initial = None):
        self.theta = theta
        self.mean = mean
        self.std = std
        self.dt = dt
        self.x_initial = x_initial
        self.reset()

    def __call__(self):
        """Advance the process by one step and return the new noise sample."""
        # np.shape works for ndarrays, lists and plain Python scalars alike,
        # whereas reading ``.shape`` directly fails on non-numpy inputs.
        gaussian = np.random.normal(size = np.shape(self.mean))
        x = (
            self.x_prev
             + self.theta * (self.mean - self.x_prev) * self.dt
             + self.std * np.sqrt(self.dt) * gaussian
        )
        # Remember the sample so the next one is correlated with it.
        self.x_prev = x
        return x

    def reset(self):
        """Restart the process from ``x_initial``, or from zeros if it is None."""
        if self.x_initial is not None:
            self.x_prev = self.x_initial
        else:
            self.x_prev = np.zeros_like(self.mean)

In [None]:
## experience replay buffer

class Buffer:
    """
    Experience replay buffer
    """
    def __init__(self, buffer_capacity = BUFFER_CAPACITY, batch_size = BATCH_SIZE):
        """
        Allocate zero-filled storage for the replay buffer.

        buffer_capacity: number of rows in each storage array
        batch_size: stored on the instance as ``batch_size``

        The column counts come from the module-level ``num_states`` and
        ``num_actions``.
        """
        self.buffer_capacity = buffer_capacity
        self.batch_size = batch_size

        # Number of times record() has been called; it starts at zero.
        self.buffer_counter = 0

        # One array per transition component, one row per stored transition.
        rows = self.buffer_capacity
        self.state_buffer = np.zeros((rows, num_states))
        self.action_buffer = np.zeros((rows, num_actions))
        self.reward_buffer = np.zeros((rows, 1))
        self.next_state_buffer = np.zeros((rows, num_states))

    def record(self, obs_tuple):
        """
        Store one transition in the buffer.

        obs_tuple is indexed at positions 0-3, which are written to the
        state, action, reward and next-state buffers respectively.

        The write position is buffer_counter modulo buffer_capacity, so once
        the counter reaches the capacity the position wraps back to 0.
        """
        slot = self.buffer_counter % self.buffer_capacity

        # Buffers listed in the same order as the entries of obs_tuple.
        destinations = (
            self.state_buffer,
            self.action_buffer,
            self.reward_buffer,
            self.next_state_buffer,
        )
        for position, destination in enumerate(destinations):
            destination[slot] = obs_tuple[position]

        self.buffer_counter += 1

    def learn(self):
        """
        This method computes the loss and update the parameters
        """
        record_range = min(self.buffer_counter, self.buffer_capacity)
