In [1]:
import pfrl
import torch
import torch.nn
import gym
import numpy

In [2]:

# Build the CartPole-v0 environment and report its two spaces.
env = gym.make('CartPole-v0')
print('observation space:', env.observation_space)
print('action space:', env.action_space)

# Start an episode and show the observation returned by reset().
obs = env.reset()
print('initial observation:', obs)

# Take one randomly sampled action and show each value returned by step().
action = env.action_space.sample()
obs, r, done, info = env.step(action)
for label, value in (
    ('next observation:', obs),
    ('reward:', r),
    ('done:', done),
    ('info:', info),
):
    print(label, value)

# Uncomment to open a GUI window rendering the current state of the environment
# env.render()

observation space: Box(-3.4028234663852886e+38, 3.4028234663852886e+38, (4,), float32)
action space: Discrete(2)
initial observation: [ 0.04896488  0.04308479 -0.03475799 -0.04262517]
next observation: [ 0.04982658 -0.15152195 -0.0356105   0.23889183]
reward: 1.0
done: False
info: {}


In [4]:
class QFunction(torch.nn.Module):
    """Fully connected Q-function for a discrete action space.

    The network is two ReLU-activated hidden layers followed by a linear
    output layer with one unit per action.

    Args:
        obs_size: Number of input features of the first linear layer.
        n_actions: Number of output units of the last linear layer.
        n_hidden_channels: Width of both hidden layers. Defaults to 50,
            the width that was previously hard-coded.
    """

    def __init__(self, obs_size, n_actions, n_hidden_channels=50):
        super().__init__()
        # Layers are created in the same order as before so that parameter
        # initialization consumes the random number generator identically.
        self.l1 = torch.nn.Linear(obs_size, n_hidden_channels)
        self.l2 = torch.nn.Linear(n_hidden_channels, n_hidden_channels)
        self.l3 = torch.nn.Linear(n_hidden_channels, n_actions)

    def forward(self, x):
        """Compute the action values for ``x``.

        Args:
            x: Tensor fed to the first linear layer, so its last dimension
                must have ``obs_size`` entries.

        Returns:
            The output of the last linear layer wrapped in
            ``pfrl.action_value.DiscreteActionValue``.
        """
        h = x
        h = torch.nn.functional.relu(self.l1(h))
        h = torch.nn.functional.relu(self.l2(h))
        h = self.l3(h)
        return pfrl.action_value.DiscreteActionValue(h)

# Size the network from the environment: the number of entries in the
# observation space's lower bound is the input width, and the number of
# discrete actions is the output width.
n_actions = env.action_space.n
obs_size = env.observation_space.low.size
q_func = QFunction(obs_size=obs_size, n_actions=n_actions)

# Optimize q_func with Adam. eps=1e-2 is for stability.
optimizer = torch.optim.Adam(params=q_func.parameters(), eps=1e-2)

In [5]:
# Discount factor applied to future rewards.
gamma = 0.9

# Explore with epsilon-greedy using a constant epsilon; random actions are
# drawn by sampling the environment's action space.
explorer = pfrl.explorers.ConstantEpsilonGreedy(
    epsilon=0.3,
    random_action_func=env.action_space.sample,
)

# DQN uses Experience Replay, so create a replay buffer with a fixed capacity.
replay_buffer = pfrl.replay_buffers.ReplayBuffer(capacity=10 ** 6)


# Observations from CartPole-v0 are numpy.float64, while PyTorch only accepts
# numpy.float32 by default, so a converter is used as the feature extractor
# function phi.
def phi(x):
    """Return observation ``x`` cast to numpy.float32 (copy=False)."""
    return x.astype(numpy.float32, copy=False)


# Device id of the GPU to use. Set it to -1 to use CPU only.
gpu = -1

# Create the agent that will interact with the environment.
agent = pfrl.agents.DoubleDQN(
    q_func,
    optimizer,
    replay_buffer,
    gamma,
    explorer,
    gpu=gpu,
    phi=phi,
    replay_start_size=500,
    update_interval=1,
    target_update_interval=100,
)

In [6]:

# Train for a fixed number of episodes, capping each episode at
# max_episode_len steps.
n_episodes = 300
max_episode_len = 200
for i in range(1, n_episodes + 1):
    obs = env.reset()
    R = 0  # return (sum of rewards)
    t = 0  # time step
    done = reset = False
    # An episode ends when the environment reports done or when the step
    # cap is reached (reset).
    while not (done or reset):
        # Uncomment to watch the behavior in a GUI window
        # env.render()
        action = agent.act(obs)
        obs, reward, done, _ = env.step(action)
        R += reward
        t += 1
        reset = t == max_episode_len
        agent.observe(obs, reward, done, reset)
    # Report the return every 10 episodes and agent statistics every 50.
    if not i % 10:
        print('episode:', i, 'R:', R)
    if not i % 50:
        print('statistics:', agent.get_statistics())
print('Finished.')

episode: 10 R: 10.0
episode: 20 R: 10.0
episode: 30 R: 12.0
episode: 40 R: 12.0
episode: 50 R: 9.0
statistics: [('average_q', 1.0142096), ('average_loss', 0.15402821278096074), ('cumulative_steps', 571), ('n_updates', 72), ('rlen', 571)]
episode: 60 R: 13.0
episode: 70 R: 13.0
episode: 80 R: 37.0
episode: 90 R: 12.0
episode: 100 R: 11.0
statistics: [('average_q', 5.2103605), ('average_loss', 0.23780513466335834), ('cumulative_steps', 1277), ('n_updates', 778), ('rlen', 1277)]
episode: 110 R: 9.0
episode: 120 R: 11.0
episode: 130 R: 16.0
episode: 140 R: 14.0
episode: 150 R: 47.0
statistics: [('average_q', 8.236747), ('average_loss', 0.32066360825207085), ('cumulative_steps', 2240), ('n_updates', 1741), ('rlen', 2240)]
episode: 160 R: 60.0
episode: 170 R: 65.0
episode: 180 R: 200.0
episode: 190 R: 200.0
episode: 200 R: 151.0
statistics: [('average_q', 9.92435), ('average_loss', 0.10864238673937507), ('cumulative_steps', 9134), ('n_updates', 8635), ('rlen', 9134)]
episode: 210 R: 70.0
epi

In [8]:
# Run 10 evaluation episodes inside the agent's eval_mode() context and
# print the return of each one.
with agent.eval_mode():
    for i in range(10):
        obs = env.reset()
        R = 0  # return (sum of rewards)
        t = 0  # time step
        done = reset = False
        # Stop when the environment reports done or after 200 steps.
        while not (done or reset):
            # Uncomment to watch the behavior in a GUI window
            # env.render()
            action = agent.act(obs)
            obs, r, done, _ = env.step(action)
            R += r
            t += 1
            reset = t == 200
            agent.observe(obs, r, done, reset)
        print('evaluation episode:', i, 'R:', R)

evaluation episode: 0 R: 200.0
evaluation episode: 1 R: 199.0
evaluation episode: 2 R: 200.0
evaluation episode: 3 R: 200.0
evaluation episode: 4 R: 200.0
evaluation episode: 5 R: 200.0
evaluation episode: 6 R: 200.0
evaluation episode: 7 R: 200.0
evaluation episode: 8 R: 200.0
evaluation episode: 9 R: 200.0
