In [2]:
import gym
import torch

from algorithms import Reinforce

# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# 1. CartPole environment

In [3]:
""" Environment Information
ref: https://github.com/openai/gym/blob/master/gym/envs/classic_control/cartpole.py#L17

Action Space
    The action is a `ndarray` with shape `(1,)` which can take values `{0, 1}` indicating the direction
     of the fixed force the cart is pushed with.
    | Num | Action                 |
    |-----|------------------------|
    | 0   | Push cart to the left  |
    | 1   | Push cart to the right |
    **Note**: The velocity that is reduced or increased by the applied force is not fixed and it depends on the angle
     the pole is pointing. The center of gravity of the pole varies the amount of energy needed to move the cart underneath it
Observation Space
    The observation is a `ndarray` with shape `(4,)` with the values corresponding to the following positions and velocities:
    | Num | Observation           | Min                 | Max               |
    |-----|-----------------------|---------------------|-------------------|
    | 0   | Cart Position         | -4.8                | 4.8               |
    | 1   | Cart Velocity         | -Inf                | Inf               |
    | 2   | Pole Angle            | ~ -0.418 rad (-24°) | ~ 0.418 rad (24°) |
    | 3   | Pole Angular Velocity | -Inf                | Inf               |
    **Note:** While the ranges above denote the possible values for observation space of each element,
        it is not reflective of the allowed values of the state space in an unterminated episode. Particularly:
    -  The cart x-position (index 0) can take values between `(-4.8, 4.8)`, but the episode terminates
       if the cart leaves the `(-2.4, 2.4)` range.
    -  The pole angle can be observed between  `(-.418, .418)` radians (or **±24°**), but the episode terminates
       if the pole angle is not in the range `(-.2095, .2095)` (or **±12°**)
Rewards
    Since the goal is to keep the pole upright for as long as possible, a reward of `+1` for every step taken,
    including the termination step, is allotted. The threshold for rewards is 475 for v1.
Episode End
    The episode ends if any one of the following occurs:
    1. Termination: Pole Angle is greater than ±12°
    2. Termination: Cart Position is greater than ±2.4 (center of the cart reaches the edge of the display)
    3. Truncation: Episode length is greater than 500 (200 for v0)
"""
# Create the v1 variant described above (500-step episode cap, reward threshold 475).
env = gym.make("CartPole-v1")
# Optional manual check: uncomment to reset the environment and render a frame.
# env.reset()
# env.render()

Sample environment image

<img width=300 src="cartpole.png" />

In [4]:
# Show the observation/action space definitions, then reset to draw an initial state.
print("observation_space: ", env.observation_space)
print("action_space: ", env.action_space)
# In the run recorded here reset() returned only the observation array (see the
# printed sample). NOTE(review): gym >= 0.26 returns (obs, info) instead — confirm
# the installed gym version before re-running.
state = env.reset()
print("sample obs: ", state)

observation_space:  Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)
action_space:  Discrete(2)
sample obs:  [-0.04331493  0.01159795 -0.01566737  0.01882051]


In [5]:
# sample step: apply action 0 (push cart to the left) once and show the raw result.
# In the run recorded here step() returned the 4-tuple (observation, reward, done, info).
# NOTE(review): gym >= 0.26 returns 5 values (terminated/truncated are split) — confirm
# the installed gym version before re-running.
new_state, reward, done, info = env.step(0)
print("sample step: ", (new_state, reward, done, info))

sample step:  (array([-0.04308297, -0.18329585, -0.01529096,  0.3065193 ], dtype=float32), 1.0, False, {})


# 2. REINFORCE: Vanilla Policy Gradient

In [70]:
# Build a new CartPole-v1 environment for training; the one created earlier has
# already been reset and stepped once by the demo cells.
env = gym.make("CartPole-v1")
reinforce_agent = Reinforce(env)
# Train for 500 episodes with max_steps=200 passed to the trainer (the evaluation
# cells further down pass 500/501 instead).
reinforce_agent.train(num_episodes=500, max_steps=200)

100%|██████████| 500/500 [00:03<00:00, 134.83it/s]


In [76]:
# test the trained agent in 1 episode
# max_steps=501 is one above the 500-step cap listed in the environment notes, so a
# 500-entry trajectory with done=True presumably means the environment's own limit
# ended the episode — NOTE(review): confirm against Reinforce.get_trajectory.
trajectory, done = reinforce_agent.get_trajectory(max_steps=501)
print("done status: ", done)
print("step taken (maximum 500): ", len(trajectory))

done status:  True
step taken (maximum 500):  500


In [77]:
# test the trained agent
def evaluate_policy(agent, num_episodes=100, max_steps=500, win_threshold=490):
    """Estimate the fraction of evaluation episodes the agent "wins".

    An episode counts as a win when the trajectory returned by
    ``agent.get_trajectory`` has at least ``win_threshold`` entries.

    Args:
        agent: Object exposing ``get_trajectory(max_steps=...)`` that returns a
            ``(trajectory, done)`` pair. Only ``len(trajectory)`` is used here.
        num_episodes: Number of evaluation episodes to run; must be positive.
        max_steps: Step cap forwarded unchanged to ``agent.get_trajectory``.
        win_threshold: Minimum trajectory length that counts as a win. The
            default of 490 is the CartPole-specific value; pass another value
            for a different environment or a different ``max_steps``.
            (Assuming ``get_trajectory`` honours ``max_steps``, a threshold
            above ``max_steps`` can never be reached.)

    Returns:
        ``wins / num_episodes`` as a float between 0 and 1.

    Raises:
        ValueError: If ``num_episodes`` is not positive. Without this guard a
            value of 0 fails with ZeroDivisionError and a negative value
            silently yields ``-0.0``.
    """
    if num_episodes <= 0:
        raise ValueError(f"num_episodes must be positive, got {num_episodes}")

    wins = 0
    for _ in range(num_episodes):
        # The done flag is not needed: winning is judged by episode length only.
        trajectory, _done = agent.get_trajectory(max_steps=max_steps)
        if len(trajectory) >= win_threshold:
            wins += 1
    return wins / num_episodes

# Run the batch evaluation with evaluate_policy's default max_steps; the function
# returns a fraction, so it is scaled to a percentage for display.
test_episodes = 100
win_rate = evaluate_policy(reinforce_agent, num_episodes=test_episodes)
print(f'agent win rate: {win_rate*100 :.2f}% from {test_episodes} test episodes')

agent win rate: 98.00% from 100 test episodes


In [78]:
# test the trained agent in 1 episode
# NOTE(review): this cell repeats the earlier single-episode check verbatim; each run
# samples a new trajectory, so it only serves as a second spot check after the batch
# evaluation. Consider wrapping the check in a helper instead of duplicating the cell.
trajectory, done = reinforce_agent.get_trajectory(max_steps=501)
print("done status: ", done)
print("step taken (maximum 500): ", len(trajectory))

done status:  True
step taken (maximum 500):  500


In [80]:
# visualize the trained agent on a separate window
# Runs visualize_policy for 10 episodes. NOTE(review): presumably this needs a local
# display for the render window — confirm before running on a headless machine.
reinforce_agent.visualize_policy(num_episodes=10)