In [295]:
from IPython.core.display import display, HTML 
display(HTML("<style>.container { width:100% !important; }</style>"))
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [296]:
from hard_cartpole import CartPoleEnv

In [297]:
from curio_dqn import DQN

In [298]:
# Instantiate the cart-pole environment class imported from `hard_cartpole`.
env = CartPoleEnv()

In [299]:
# Reset the environment; the returned initial observation is echoed as the
# cell output (a 5-element array in the recorded run).
env.reset()

array([-5.70226325e-03,  5.48336397e-03, -3.13649548e+00,  1.69324305e-03,
        0.00000000e+00])

# 先ずはどこに入れるか考えよう

In [300]:
import chainer
import chainer.functions as F
import chainer.links as L
import chainerrl
import numpy as np

In [301]:
class QFunction(chainer.Chain):
    """Three-layer fully connected Q-network with tanh activations.

    The links are registered as ``l0`` (obs_size -> hidden), ``l1``
    (hidden -> hidden) and ``l2`` (hidden -> n_actions).
    """

    def __init__(self, obs_size, n_actions, n_hidden_channels=50):
        """
        Args:
            obs_size (int): input width of the first linear layer
            n_actions (int): output width of the last linear layer
            n_hidden_channels (int): width of both hidden layers
        """
        super().__init__()
        with self.init_scope():
            self.l0 = L.Linear(obs_size, n_hidden_channels)
            self.l1 = L.Linear(n_hidden_channels, n_hidden_channels)
            self.l2 = L.Linear(n_hidden_channels, n_actions)

    def __call__(self, x, test=False):
        """Compute the action values for ``x``.

        Args:
            x (ndarray or chainer.Variable): An observation
            test (bool): accepted for interface compatibility; it is not
                used by this network

        Returns:
            chainerrl.action_value.DiscreteActionValue: wrapper around the
            output of the last linear layer
        """
        hidden = x
        # Both hidden layers share the same pattern: linear map, then tanh.
        for layer in (self.l0, self.l1):
            hidden = F.tanh(layer(hidden))
        q_values = self.l2(hidden)
        return chainerrl.action_value.DiscreteActionValue(q_values)

# Network dimensions are read from the environment's spaces: the first entry
# of the observation-space shape and the `n` attribute of the action space.
obs_size = env.observation_space.shape[0]
n_actions = env.action_space.n
# Q-function instance (default hidden width); it is later given to the
# optimizer and to the DQN agent.
q_func = QFunction(obs_size, n_actions)

In [302]:
class ForwardPredictor(chainer.Chain):
    """Fully connected network with batch normalization.

    Maps an input of width ``obs_size + n_actions`` to an output of width
    ``obs_size`` through three hidden layers, each followed by batch
    normalization and ReLU.
    """

    def __init__(self, obs_size, n_actions, n_hidden_channels=128):
        """
        Args:
            obs_size (int): output width; also part of the input width
            n_actions (int): added to ``obs_size`` to form the input width
            n_hidden_channels (int): width of the three hidden layers
        """
        super().__init__()
        with self.init_scope():
            self.l0 = L.Linear(obs_size+n_actions, n_hidden_channels)
            self.l1 = L.Linear(n_hidden_channels, n_hidden_channels)
            self.l2 = L.Linear(n_hidden_channels, n_hidden_channels)
            self.l3 = L.Linear(n_hidden_channels, obs_size)
            self.bn1 = L.BatchNormalization(n_hidden_channels)
            self.bn2 = L.BatchNormalization(n_hidden_channels)
            self.bn3 = L.BatchNormalization(n_hidden_channels)

    def _forward(self, x):
        """Run the layer stack under whatever Chainer config is active."""
        h = F.relu(self.bn1(self.l0(x)))
        h = F.relu(self.bn2(self.l1(h)))
        h = F.relu(self.bn3(self.l2(h)))
        return self.l3(h)

    def __call__(self, x, test=False):
        """
        Args:
            x (ndarray or chainer.Variable): input batch whose feature width
                is ``obs_size + n_actions`` (the input size of ``l0``).
                NOTE(review): presumably an observation concatenated with an
                encoded action -- confirm against the caller.
            test (bool): when True, the forward pass runs with
                ``chainer.config.train`` set to False so that the batch
                normalization layers operate in test mode. When False
                (default), the currently active Chainer configuration is
                left untouched.

        Returns:
            chainer.Variable: output of the final linear layer, of width
            ``obs_size``.
        """
        if test:
            # Only override the mode when explicitly requested; forcing
            # train=True for test=False would defeat an outer
            # `using_config('train', False)` set by the caller.
            with chainer.using_config('train', False):
                return self._forward(x)
        return self._forward(x)

# リプレイバッファから学習するタイミングで行うこと
- 評価の際に: target net で評価するときに，同時に r_e も評価するようにする．
- 評価が終わったら，そのときに学習する．

In [303]:
# Alternative Q-function built from ChainerRL's ready-made fully connected
# model. NOTE(review): `_q_func` is not referenced by any other visible cell
# (the optimizer and the agent use `q_func`) -- confirm whether it is needed.
_q_func = chainerrl.q_functions.FCStateQFunctionWithDiscreteAction(
    obs_size, n_actions,
    n_hidden_layers=2, n_hidden_channels=50)
# Forward-prediction network; passed to the DQN agent as `f_pred`.
f_pred = ForwardPredictor(obs_size,n_actions)

In [304]:
# Adam optimizer (eps=1e-2) attached to the Q-function.
optimizer = chainer.optimizers.Adam(eps=1e-2)
# `setup` is the last expression, so its return value is the cell output.
optimizer.setup(q_func)

<chainer.optimizers.adam.Adam at 0x7fabb8cb1358>

In [311]:
# Separate Adam optimizer (eps=1e-7) attached to the forward predictor.
optimizer_f = chainer.optimizers.Adam(eps=1e-7)
# `setup` is the last expression, so its return value is the cell output.
optimizer_f.setup(f_pred)

<chainer.optimizers.adam.Adam at 0x7fabb91b7cc0>

In [312]:
# Discount factor applied to future rewards.
gamma = 0.95

# Exploration policy: epsilon-greedy with a constant epsilon; random actions
# are sampled from the environment's action space.
explorer = chainerrl.explorers.ConstantEpsilonGreedy(
    epsilon=0.3,
    random_action_func=env.action_space.sample,
)

# DQN learns from experience replay; this is the buffer and its capacity.
replay_buffer = chainerrl.replay_buffer.ReplayBuffer(capacity=10 ** 6)


def phi(x):
    """Feature extractor: cast an observation to float32.

    Chainer works with numpy.float32 by default, so observations are
    converted before they reach the networks. ``copy=False`` avoids a copy
    when the array already has that dtype.
    """
    return x.astype(np.float32, copy=False)


# Build the agent that interacts with the environment. `f_pred` and
# `optimizer_f` are the extra arguments taken by the DQN class from curio_dqn.
agent = DQN(
    q_func,
    optimizer,
    replay_buffer,
    gamma,
    explorer,
    replay_start_size=500,
    update_interval=1,
    target_update_interval=100,
    phi=phi,
    f_pred=f_pred,
    optimizer_f=optimizer_f,
)

In [None]:
# Training schedule: number of episodes and the step cap per episode.
n_episodes = 2000
max_episode_len = 200

for i in range(1, n_episodes + 1):
    # Start of an episode: initial observation, no reward yet, not finished.
    obs, reward, done = env.reset(), 0, False
    R, t = 0, 0  # R: return (sum of rewards), t: time step
    # Step until the environment signals termination or the cap is reached.
    while not (done or t >= max_episode_len):
        # Uncomment to watch the behaviour
        # env.render()
        action = agent.act_and_train(obs, reward)
        obs, reward, done, _ = env.step(action)
        R += reward
        t += 1
    # Report every 10th episode, before the end-of-episode update below.
    if i % 10 == 0:
        print('episode:', i, 'R:', R, 'statistics:', agent.get_statistics())
    agent.stop_episode_and_train(obs, reward, done)

print('Finished.')

variable(1.1096793)
variable(0.8137543)
variable(2.8652036)
variable(0.78155744)
variable(2.102071)
variable(1.9977939)
variable(1.0350251)
variable(1.5689986)
variable(1.0909193)
variable(0.94089055)
variable(1.3149151)
variable(1.0200441)
variable(0.5556964)
variable(2.4698207)
variable(1.4039607)
variable(1.6842427)
variable(2.3531907)
variable(2.621223)
variable(1.4081852)
variable(1.2630684)
variable(1.2195623)
variable(1.3888046)
variable(2.073272)
variable(0.49924555)
variable(1.0051355)
variable(2.2647574)
variable(1.8503747)
variable(1.3614168)
variable(2.0219612)
variable(1.2721227)
variable(0.9671509)
variable(0.68511426)
variable(1.9063814)
variable(0.77114)
variable(2.2136168)
variable(3.0927532)
variable(1.802523)
variable(2.3910692)
variable(1.1731372)
variable(0.69379973)
variable(0.66574156)
variable(0.9296825)
variable(1.3230853)
variable(2.470442)
variable(1.6307651)
variable(2.1296968)
variable(1.2069625)
variable(0.6798753)
variable(0.8846041)
variable(0.88536316)


In [None]:
# Scratch check: first row of the 2x2 identity matrix, i.e. the vector [1., 0.].
np.eye(2)[0]