In [2]:
import pfrl
import torch
import torch.nn
import gym
import numpy as np

In [3]:
torch.cuda.is_available()

True

In [4]:
env = gym.make('CartPole-v0')
print('observation space:', env.observation_space)
print('action space:', env.action_space)

obs = env.reset()
print('initial observation:', obs)

action = env.action_space.sample()
obs, r, done, info = env.step(action)
print('next observation:', obs)
print('reward:', r)
print('done:', done)
print('info:', info)

# Uncomment to open a GUI window rendering the current state of the environment
# env.render()


observation space: Box(-3.4028234663852886e+38, 3.4028234663852886e+38, (4,), float32)
action space: Discrete(2)
initial observation: [ 0.03481615 -0.0239378  -0.04042435 -0.02445456]
next observation: [ 0.03433739 -0.21845741 -0.04091344  0.25520497]
reward: 1.0
done: False
info: {}


In [5]:
class QFunction(torch.nn.Module):

    def __init__(self, obs_size, n_actions):
        super().__init__()
        self.l1 = torch.nn.Linear(obs_size, 50)
        self.l2 = torch.nn.Linear(50, 50)
        self.l3 = torch.nn.Linear(50, n_actions)

    def forward(self, x):
        h = x
        h = torch.nn.functional.relu(self.l1(h))
        h = torch.nn.functional.relu(self.l2(h))
        h = self.l3(h)
        return pfrl.action_value.DiscreteActionValue(h)

obs_size = env.observation_space.low.size
n_actions = env.action_space.n
q_func = QFunction(obs_size, n_actions)

In [6]:
# Use Adam to optimize q_func. eps=1e-2 is for stability.
optimizer = torch.optim.Adam(q_func.parameters(), eps=1e-2)

In [7]:
gamma = 0.9

explorer = pfrl.explorers.ConstantEpsilonGreedy(epsilon=0.3, random_action_func=env.action_space.sample)

replay_buffer = pfrl.replay_buffers.ReplayBuffer(capacity=10**6)

phi = lambda x: x.astype(np.float32, copy=False)

gpu = -1

agent = pfrl.agents.DoubleDQN(
    q_func,
    optimizer,
    replay_buffer,
    gamma,
    explorer,
    replay_start_size=500,
    update_interval=1,
    target_update_interval=100,
    phi=phi,
    gpu=gpu
)

In [8]:
n_episodes = 300
max_episode_len = 200
for i in range(1, n_episodes+1):
    obs = env.reset()
    R = 0
    t = 0
    while True:
        action = agent.act(obs)
        obs, reward, done, _ = env.step(action)
        R += reward
        t += 1
        reset = t == max_episode_len
        agent.observe(obs, reward, done, reset)
        if done or reset:
            break
    if i % 10 == 0:
        print('episode:', i, 'R:', R)
    if i % 50 == 0:
        print('statics:', agent.get_statistics())

print('Finished.')

episode: 10 R: 24.0
episode: 20 R: 20.0
episode: 30 R: 21.0
episode: 40 R: 15.0
episode: 50 R: 12.0
statics: [('average_q', 2.7332926), ('average_loss', 0.11139749635942281), ('cumulative_steps', 792), ('n_updates', 293), ('rlen', 792)]
episode: 60 R: 16.0
episode: 70 R: 9.0
episode: 80 R: 9.0
episode: 90 R: 10.0
episode: 100 R: 12.0
statics: [('average_q', 6.223807), ('average_loss', 0.333179437154904), ('cumulative_steps', 1367), ('n_updates', 868), ('rlen', 1367)]
episode: 110 R: 8.0
episode: 120 R: 19.0
episode: 130 R: 14.0
episode: 140 R: 150.0
episode: 150 R: 59.0
statics: [('average_q', 9.069472), ('average_loss', 0.3329277034010738), ('cumulative_steps', 2636), ('n_updates', 2137), ('rlen', 2636)]
episode: 160 R: 44.0
episode: 170 R: 80.0
episode: 180 R: 121.0
episode: 190 R: 200.0
episode: 200 R: 169.0
statics: [('average_q', 9.914565), ('average_loss', 0.12350330051500351), ('cumulative_steps', 8073), ('n_updates', 7574), ('rlen', 8073)]
episode: 210 R: 145.0
episode: 220 R: 

In [12]:
with agent.eval_mode():
    for i in range(10):
        obs = env.reset()
        R = 0
        t = 0
        while True:
            env.render()
            action = agent.act(obs)
            obs, r, done, _ = env.step(action)
            R += r
            t += 1
            reset = t == 200
            agent.observe(obs, r, done, reset)
            if done or reset:
                break
        print('evaluatin episode:', i, 'R:', R)

evaluatin episode: 0 R: 200.0
evaluatin episode: 1 R: 200.0
evaluatin episode: 2 R: 185.0
evaluatin episode: 3 R: 184.0
evaluatin episode: 4 R: 184.0
evaluatin episode: 5 R: 194.0
evaluatin episode: 6 R: 168.0
evaluatin episode: 7 R: 168.0
evaluatin episode: 8 R: 200.0
evaluatin episode: 9 R: 200.0


In [13]:
env.close()

## SAC

In [21]:
import os
import random

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm as tqdm

import pfrl
import torch
import torch.nn
import gym

In [25]:
env = gym.make('Pendulum-v0')

print(f'obs: {env.observation_space}')
obs_init = env.reset()

ac = env.action_space.sample()
print('acitons:', ac)
obs, r, done, info = env.step(ac)
# import matplotlib.pyplot as plt
# plt.imshow(np.hstack([obs_init, obs]))
print('obsavation:', obs)
print('reward:', r)
print('done:', done)
print('info:', info)


obs: Box(-8.0, 8.0, (3,), float32)
acitons: [-0.09511738]
obsavation: [-0.93061906  0.3659893   1.27269098]
reward: -7.401462890176967
done: False
info: {}


SACは状態と行動値を受け取って行動価値を返すQ関数と
状態を受け取って行動を返す方策関数からなる Actor-Criticである。
方策関数は平均と標準偏差をネットワークで予測し、ガウス分布を返す。
Q関数は状態と行動をconcatしてMLPに通して価値を返す。

In [26]:
from torch import distributions as dists

class Qfunc(torch.nn.Module):

    def __init__(self, obs_size, n_actions):
        super().__init__()
        self.l1 = torch.nn.Linear(obs_size + n_actions, 50)
        self.l2 = torch.nn.Linear(50, 50)
        self.l3 = torch.nn.Linear(50, 1)

    def forward(self, x):
        state, action = x
        h = torch.cat([state, action], 1)
        h = torch.nn.functional.relu(self.l1(h))
        h = torch.nn.functional.relu(self.l2(h))
        h = self.l3(h)
        return h

    def __init__(self, obs_size, n_actions, log_std_max=3, log_std_min=-15):
        super().__init__()
        self.l1 = torch.nn.Linear(obs_size, 50)
        self.l2 = torch.nn.Linear(50, 50)
        self.mean = torch.nn.Linear(50, n_actions)
        self.log_std = torch.nn.Linear(50, n_actions)
        self.log_std_max = log_std_max
        self.log_std_min = log_std_min

    def forward(self, x):
        h = x
        h = torch.nn.functional.relu(self.l1(h))
        h = torch.nn.functional.relu(self.l2(h))
        mean = self.mean(h)
        log_std = self.log_std(h)
        log_std = torch.clamp(log_std, min=self.log_std_min, max=self.log_std_max)
        dist = dists.Normal(mean, log_std.exp())
        return dist


In [28]:
# https://github.com/pfnet/pfrl/blob/master/pfrl/agents/soft_actor_critic.py

from torch.nn import functional as F

class ModSAC(pfrl.agents.SoftActorCritic):

    def _batch_act_train(self, batch_obs):
        if self.burnin_action_func is not None and self.n_policy_updates == 0:
            batch_action = [self.burnin_action_func() for _ in range(len(batch_obs))]
        else:
            batch_action = self.batch_select_greedy_action(batch_obs, deterministic=self.act_deterministically)
        
        self.batch_last_obs = list(batch_obs)
        self.batch_last_action = list(batch_action)

        return batch_action

    def update_q_func(self, batch):
        """Compute loss for a given Q-function."""

        batch_next_state = batch["next_state"]
        batch_rewards = batch["reward"]
        batch_terminal = batch["is_state_terminal"]
        batch_actions = batch["action"]
        batch_discount = batch["discount"]

        with torch.no_grad(), pfrl.utils.evaluating(self.policy), pfrl.utils.evaluating(self.target_q_func1), pfrl.utils.evaluating(self.target_q_func2):
            next_action_distrib = self.policy(batch_next_state)
            next_actions = next_action_distrib.sample()
            next_log_prob = next_action_distrib.log_prob(next_actions)
            next_q1 = self.target_q_func1((batch_next_state, next_actions))
            next_q2 = self.target_q_func2((batch_next_state, next_actions))
            next_q = torch.min(next_q1, next_q2)
            entropy_term = self.temperature * next_log_prob
            assert next_q.shape == entropy_term.shape

            target_q = batch_rewards + batch_discount * (1.0 - batch_terminal) * torch.flatten(next_q - entropy_term)

        predict_q1 = torch.flatten(self.q_func1((batch_state, batch_actions)))
        predict_q2 = torch.flatten(self.q_func2((batch_state, batch_actions)))

        print(f'=====\n{batch_terminal}') 

        loss1 = 0.5 * F.mse_loss(target_q, predict_q1)
        loos2 = 0.5 * F.mse_loss(target_q, predict_q2)

        self.q1_record.extend(predict_q1.detach().cpu().numpy())
        self.q2_record.extend(predict_q2.detach().cpu().numpy())
        self.q_func1_loss_record.append(float(loss1))
        self.q_func2_loss_record.append(float(loss2))

        self.q_func1_optimizer.zero_grad()
        loss1.backward()
        if self.max_grad_norm is not None:
            clip_l2_grad_norm_(self.q_func1.parameters(), self.max_grad_norm)
        self.q_func1_optimizer.step()

        self.q_func2_optimizer.zero_grad()
        loss2.backward()
        if self.max_grad_norm is not None:
            clip_l2_grad_norm_(self.q_func2.parameters(), self.max_grad_norm)
        self.q_func2_optimizer.step()

        
        
             

SyntaxError: invalid syntax (<ipython-input-28-3f0584cab758>, line 23)

In [None]:
n_episodes = 500
gamma = 0.99
phi = lambda x: x.astype(np.float32, copy=False)
gpu = 1
num_runs = 3

def set_seed(seed):
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

res_dfs = []
results = []
for run in range(num_runs):
    env.seed(run)
    set_seed(run)
    policy_func = pfrl.policy.