In [1]:
%%capture
!pip install --upgrade pip
!pip install gymnasium
!pip install pfrl

In [2]:
import pfrl
import torch
from torch import nn
import gymnasium as gym
from gymnasium import spaces
import numpy as np
import tensorflow as tf
import random

In [3]:
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.cifar10.load_data()
assert x_train.shape == (50000, 32, 32, 3)
assert x_test.shape == (10000, 32, 32, 3)
assert y_train.shape == (50000, 1)
assert y_test.shape == (10000, 1)

  and should_run_async(code)


In [4]:
class CifarEnv(gym.Env):

    def __init__(self,):

        self.observation_space = spaces.Box(low=0, high=255, shape=(32, 32, 3), dtype=np.uint8)

        self.action_space = spaces.Discrete(10)
        self.expected_action = 0

        # assert render_mode is None or render_mode in self.metadata["render_modes"]
        self.render_mode = None
        self.x, self.y = (x_train, y_train)
        self.random = True
        self.images_per_episode = 1
        self.dataset_idx = 0

    def _get_info(self):
      return 0

    def step(self, action):
        done = False
        reward = int(action == self.expected_action)

        obs = self._next_obs()

        self.step_count += 1
        if self.step_count >= self.images_per_episode:
            done = True

        return obs, reward, done, {}

    def reset(self, seed=None, options=None):
        self.step_count = 0
        obs = self._next_obs()
        return obs

    def _next_obs(self):
        if self.random:
            next_obs_idx = random.randint(0, len(self.x) - 1)
            self.expected_action = int(self.y[next_obs_idx])
            obs = self.x[next_obs_idx]

        else:
            obs = self.x[self.dataset_idx]
            self.expected_action = int(self.y[self.dataset_idx])

            self.dataset_idx += 1
            if self.dataset_idx >= len(self.x):
                raise StopIteration()

        return obs

class CifarEnvTest(gym.Env):

    def __init__(self,):

        self.observation_space = spaces.Box(low=0, high=255, shape=(32, 32, 3), dtype=np.uint8)

        self.action_space = spaces.Discrete(10)
        self.expected_action = 0

        # assert render_mode is None or render_mode in self.metadata["render_modes"]
        self.render_mode = None
        self.x, self.y = (x_test, y_test)
        self.random = True
        self.images_per_episode = 10000
        self.dataset_idx = 0

    def _get_info(self):
      return 0

    def step(self, action):
        done = False
        reward = int(action == self.expected_action)

        obs = self._next_obs()

        self.step_count += 1
        if self.step_count >= self.images_per_episode:
            done = True

        return obs, reward, done, {}

    def reset(self, seed=None, options=None):
        self.step_count = 0
        obs = self._next_obs()
        return obs

    def _next_obs(self):
        if self.random:
            next_obs_idx = random.randint(0, len(self.x) - 1)
            self.expected_action = int(self.y[next_obs_idx])
            obs = self.x[next_obs_idx]

        else:
            obs = self.x[self.dataset_idx]
            self.expected_action = int(self.y[self.dataset_idx])

            self.dataset_idx += 1
            if self.dataset_idx >= len(self.x):
                raise StopIteration()

        return obs

In [5]:
env = CifarEnv()
test_env = CifarEnvTest()

In [6]:
class QFunction(torch.nn.Module):

    def __init__(self,):
        super().__init__()
        self.network = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2), # output: 64 x 16 x 16
            nn.BatchNorm2d(64),

            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(128, 128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2), # output: 128 x 8 x 8
            nn.BatchNorm2d(128),

            nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2), # output: 256 x 4 x 4
            nn.BatchNorm2d(256),

            nn.Flatten(),
            nn.Linear(256*4*4, 1024),
            nn.ReLU(),
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Linear(512,256),
            nn.ReLU(),
            pfrl.nn.Branched(
                nn.Sequential(nn.Linear(256, 10),
                pfrl.policies.SoftmaxCategoricalHead(),
            ),
            nn.Linear(256, 1),
            )
            )

    def forward(self, x):
        h = self.network(x)
        return h

obs_size = env.observation_space.low.size
n_actions = env.action_space.n
q_func = QFunction()

In [7]:
# optimizer = torch.optim.Adam(q_func.parameters(), eps=1e-2)
optimizer = pfrl.optimizers.RMSpropEpsInsideSqrt(
        q_func.parameters(),
        lr=7e-4,
        eps=1e-5,
        alpha=0.99,
    )
phi = lambda x:np.resize(x/255.0,(3,32,32)).astype(np.float32, copy=False)
gamma = 1
gpu = 0
tau = 0.95
agent = pfrl.agents.A2C(
        q_func,
        optimizer,
        gamma=gamma,
        gpu=gpu,
        num_processes=1,
        update_steps=1,
        phi=phi,
        use_gae=False,
        tau=tau,
        max_grad_norm=40,
    )

In [8]:
import logging
import sys
import time
start_time = time.time()
logging.basicConfig(level=logging.INFO, stream=sys.stdout, format='')
eval_ep = 10000
pfrl.experiments.train_agent_with_evaluation(
    agent,
    env,
    steps=100000,           # Train the agent for 2000 steps
    eval_n_steps=None,       # We evaluate for episodes, not time
    eval_n_episodes=1,       # 10 episodes are sampled for each evaluation
    train_max_episode_len=1,  # Maximum length of each episode
    eval_max_episode_len=10000,
    eval_interval=20000,   # Evaluate the agent after every 1000 steps
    outdir='result',      # Save everything to 'result' directory
    eval_env = test_env,
)

	addcmul_(Number value, Tensor tensor1, Tensor tensor2)
Consider using one of the following signatures instead:
	addcmul_(Tensor tensor1, Tensor tensor2, *, Number value) (Triggered internally at ../torch/csrc/utils/python_arg_parser.cpp:1485.)
  square_avg.mul_(alpha).addcmul_(1 - alpha, grad, grad)


(<pfrl.agents.a2c.A2C at 0x7a1bf775add0>,
 [{'average_actor': -0.008877603044557692,
   'average_value': 0.11102728546506845,
   'average_entropy': 0.5429342656649078,
   'eval_score': 987.0},
  {'average_actor': -0.015379263418809295,
   'average_value': 0.12774680332310334,
   'average_entropy': 0.6325116837493275,
   'eval_score': 1126.0},
  {'average_actor': -0.013288387806436775,
   'average_value': 0.1262041960344501,
   'average_entropy': 0.8231851461669987,
   'eval_score': 1457.0},
  {'average_actor': -0.005292824712974315,
   'average_value': 0.13957002177934297,
   'average_entropy': 0.5157823308716752,
   'eval_score': 1622.0},
  {'average_actor': 0.00802511109145072,
   'average_value': 0.13675678698733135,
   'average_entropy': 0.7241070657438216,
   'eval_score': 1504.0}])