# ChainerRL Quickstart Guide

* 下記の資料の写経
* https://github.com/chainer/chainerrl/blob/master/examples/quickstart/quickstart.ipynb

In [2]:
import chainer
import chainer.functions as F
import chainer.links as L
import chainerrl
import gym
import numpy as np
import seaborn as sns

In [3]:
# Apply seaborn's "darkgrid" style for any plots drawn later in the notebook.
sns.set_style("darkgrid")

* env.resetで環境を初期状態にする
* env.stepでactionを実行し、次の状態に移動。そして、次のobservationとreward、terminalかどうか、その他の情報を返す。
* env.renderで現在のstateを描画（表示）する。

In [5]:
# Create the CartPole-v0 environment and print its observation/action spaces.
env = gym.make('CartPole-v0')
print('observation space;', env.observation_space)
print('action space;', env.action_space)

# Reset the environment; reset() returns the initial observation.
obs = env.reset()
env.render()
print("initial observation:",obs)

# Sample one random action, apply it, and print the four values that
# step() returns: next observation, reward, done flag and extra info.
action = env.action_space.sample()
obs, r, done, info = env.step(action)
print('next observation:', obs)
print('reward:', r)
print('done:', done)
print('info:', info)

observation space; Box(4,)
action space; Discrete(2)
initial observation: [-0.01292212  0.02662745 -0.00172632 -0.04792824]
next observation: [-0.01238957  0.22177411 -0.00268489 -0.34115533]
reward: 1.0
done: False
info: {}


上記で環境が定義できた。これからagentを定義する。

ChainerRLには色々なagentsがある

DQNを使うにはQ-function（Qf）を定義する必要がある。Qfはobservationを受け取り、agentが取れるactionごとに期待される将来の収益（expected future return）を返す。

ChainerRLでは、Qfをchainer.Linkで定義できる。

outputはchainerrl.action_value.DiscreteActionValueでwrapされている。これは、chainerrl.action_value.ActionValueを実装したクラスである。

Qfのoutputsをwrapすることで、ChainerRLはこのような離散行動のQf（discrete-action Q-function）とNAFs（Normalized Advantage Functions）を同じように扱える。


In [6]:
class QFunction(chainer.Chain):
    """Fully connected Q-function with two tanh hidden layers.

    The network maps an observation to one value per discrete action and
    wraps the result in ``chainerrl.action_value.DiscreteActionValue``.
    """

    def __init__(self, obs_size, n_actions, n_hidden_channels=50):
        """Create the three linear layers.

        Args:
            obs_size (int): input size of the first layer
            n_actions (int): output size of the last layer
            n_hidden_channels (int): width of both hidden layers
        """
        super().__init__()
        with self.init_scope():
            self.l0 = L.Linear(obs_size, n_hidden_channels)
            self.l1 = L.Linear(n_hidden_channels, n_hidden_channels)
            self.l2 = L.Linear(n_hidden_channels, n_actions)

    def __call__(self, x, test=False):
        """Evaluate the Q-function on an observation.

        Args:
            x (ndarray or chainer.Variable): An observation
            test (bool): a flag indicating whether it is in test mode
                (accepted but not used by this implementation)

        Returns:
            chainerrl.action_value.DiscreteActionValue: the wrapped output
            of the final linear layer.
        """
        hidden = x
        # Both hidden layers use the same tanh non-linearity.
        for layer in (self.l0, self.l1):
            hidden = F.tanh(layer(hidden))
        q_values = self.l2(hidden)
        return chainerrl.action_value.DiscreteActionValue(q_values)

# Size the network from the environment: the first dimension of the
# observation space shape and the number of discrete actions.
obs_size = env.observation_space.shape[0]
n_actions = env.action_space.n
q_func = QFunction(obs_size, n_actions)

* 下記でも定義できるようだ

In [7]:
# Alternative: build a Q-function from ChainerRL's predefined class with the
# same sizes (2 hidden layers of 50 channels). It is bound to a separate name
# (_q_func), so q_func defined above is left untouched.
_q_func = chainerrl.q_functions.FCStateQFunctionWithDiscreteAction(
    obs_size, n_actions,
    n_hidden_layers=2, n_hidden_channels=50)

* modelをupdateする。Adamを用いている。

In [8]:
# Use Adam to optimize q_func. eps=1e-2 is for stability.
optimizer = chainer.optimizers.Adam(eps=1e-2)
# Attach the optimizer to the Q-function defined earlier in the notebook.
optimizer.setup(q_func)

* DQN agentを作るには、さらにいくつかのパラメータと設定を指定する必要がある（下のコードではDoubleDQNを使っている）。

In [9]:
# Set the discount factor that discounts future rewards.
gamma = 0.95

# Use epsilon-greedy for exploration: a constant epsilon of 0.3, with random
# actions drawn by sampling the environment's action space.
explorer = chainerrl.explorers.ConstantEpsilonGreedy(
    epsilon=0.3, random_action_func=env.action_space.sample)

# DQN uses Experience Replay.
# Specify a replay buffer and its capacity (here 10 ** 6 entries).
replay_buffer = chainerrl.replay_buffer.ReplayBuffer(capacity=10 ** 6)

# Observations from CartPole-v0 are numpy.float64, whereas Chainer only
# accepts numpy.float32 by default, so a feature extractor function phi
# converts each observation before it is used.
def phi(x):
    """Return observation ``x`` as float32, without copying if it already is."""
    return x.astype(np.float32, copy=False)

# Now create an agent that will interact with the environment.
# The agent is a DoubleDQN built from the Q-function, optimizer, replay
# buffer, discount factor and explorer defined above, with phi as its
# observation converter.
# NOTE(review): replay_start_size / update_interval / target_update_interval
# presumably control when updates start, how often the model is updated and
# how often the target network is synced -- confirm against the ChainerRL docs.
agent = chainerrl.agents.DoubleDQN(
    q_func, optimizer, replay_buffer, gamma, explorer,
    replay_start_size=500, update_interval=1,
    target_update_interval=100, phi=phi)

* agentと環境が作れたので、強化学習を始める。
* 探索的行動を選ぶために、agent.act_and_trainを用いる。episodeが終わったら、agent.stop_episode_and_trainを呼ぶ。agent.get_statistics経由でagentのtraining統計が得られる

In [10]:
n_episodes = 200
max_episode_len = 200
for i in range(1, n_episodes + 1):
    # Every episode starts from a freshly reset environment.
    obs = env.reset()
    reward, done = 0, False
    R = 0  # return (sum of rewards)
    t = 0  # time step
    # Run until the environment reports termination or the step cap is hit.
    while t < max_episode_len and not done:
        # Uncomment to watch the behaviour
        # env.render()
        action = agent.act_and_train(obs, reward)
        obs, reward, done, _ = env.step(action)
        R += reward
        t += 1
    # Report progress on every 10th episode, before the episode is closed.
    if not i % 10:
        print('episode:', i,
              'R:', R,
              'statistics:', agent.get_statistics())
    # Hand the final transition to the agent and end the episode.
    agent.stop_episode_and_train(obs, reward, done)
print('Finished.')

episode: 10 R: 11.0 statistics: [('average_q', 0.0030674470130931056), ('average_loss', 0)]
episode: 20 R: 9.0 statistics: [('average_q', 0.02775660827377906), ('average_loss', 0)]
episode: 30 R: 12.0 statistics: [('average_q', 0.03084203221441138), ('average_loss', 0)]
episode: 40 R: 10.0 statistics: [('average_q', 0.05007766855455882), ('average_loss', 0)]
episode: 50 R: 9.0 statistics: [('average_q', 0.12055369996585956), ('average_loss', 0.1534333034869617)]
episode: 60 R: 9.0 statistics: [('average_q', 0.2627134552193112), ('average_loss', 0.26728088507099557)]
episode: 70 R: 21.0 statistics: [('average_q', 0.5386227645304451), ('average_loss', 0.24845479360569162)]
episode: 80 R: 14.0 statistics: [('average_q', 0.8805583186682817), ('average_loss', 0.21616087349506796)]
episode: 90 R: 19.0 statistics: [('average_q', 1.561163570580645), ('average_loss', 0.23944542793854828)]
episode: 100 R: 110.0 statistics: [('average_q', 4.037710702569092), ('average_loss', 0.3620809297889803)]


* agent.act と agent.stop_episodeを用いることで、テストができる。
* epsilon greedyのような探索は用いられない。

In [11]:
# Evaluate the trained agent for 10 episodes of at most 200 steps each.
for i in range(10):
    obs = env.reset()
    done, R, t = False, 0, 0
    while t < 200 and not done:
        env.render()
        # agent.act is the test-time counterpart of act_and_train.
        action = agent.act(obs)
        obs, r, done, _ = env.step(action)
        R += r
        t += 1
    print('test episode:', i, 'R:', R)
    # Close the episode on the agent side before the next reset.
    agent.stop_episode()

test episode: 0 R: 188.0
test episode: 1 R: 185.0
test episode: 2 R: 185.0
test episode: 3 R: 182.0
test episode: 4 R: 184.0
test episode: 5 R: 184.0
test episode: 6 R: 186.0
test episode: 7 R: 180.0
test episode: 8 R: 174.0
test episode: 9 R: 181.0


* スコアはいい感じ。agentを保存し、再利用できるようにしよう。
* agent.saveを呼べばsaveでき、agent.loadで保存したagentを呼び出せる。

In [12]:
# Save an agent to the 'agent' directory
agent.save('agent')

# Uncomment to load an agent from the 'agent' directory
# agent.load('agent')

* おしまい
* でもこういうコード毎回書くの飽きるよね。utility functionsがあるよ

In [13]:
# Set up the logger to print info messages for understandability.
import logging
import sys
gym.undo_logger_setup()  # Turn off gym's default logger settings
logging.basicConfig(level=logging.INFO, stream=sys.stdout, format='')

chainerrl.experiments.train_agent_with_evaluation(
    agent, env,
    steps=2000,           # Train the agent for 2000 steps
    eval_n_runs=10,       # 10 episodes are sampled for each evaluation
    max_episode_len=200,  # Maximum length of each episodes
    eval_interval=1000,   # Evaluate the agent after every 1000 steps
    outdir='result')      # Save everything to 'result' directory

outdir:result step:179 episode:0 R:179.0
statistics:[('average_q', 20.176150098189026), ('average_loss', 0.16745357896466967)]
outdir:result step:345 episode:1 R:166.0
statistics:[('average_q', 20.161455445567192), ('average_loss', 0.214232385554759)]
outdir:result step:505 episode:2 R:160.0
statistics:[('average_q', 20.166822858899927), ('average_loss', 0.24006307060331403)]
outdir:result step:658 episode:3 R:153.0
statistics:[('average_q', 20.176389030669352), ('average_loss', 0.2738147322758794)]
outdir:result step:802 episode:4 R:144.0
statistics:[('average_q', 20.185807750922574), ('average_loss', 0.18847578293607056)]
outdir:result step:952 episode:5 R:150.0
statistics:[('average_q', 20.20061362608234), ('average_loss', 0.21948654239706275)]
outdir:result step:1117 episode:6 R:165.0
statistics:[('average_q', 20.208696837346473), ('average_loss', 0.23755985694321494)]
test episode: 0 R: 135.0
test episode: 1 R: 133.0
test episode: 2 R: 124.0
test episode: 3 R: 129.0
test episode: 