# A2C Demo
This notebook focuses on training and testing the Advantage Actor-Critic (A2C) algorithm.

In [29]:
from a2c.model import ACNetwork
from a2c.agent import A2CAgent
from utils.config import Config
from utils.helper import set_device
from utils.logger import Logger

from nes_py.wrappers import JoypadSpace
import gym_super_mario_bros
from gym_super_mario_bros.actions import RIGHT_ONLY
import torch.optim as optim

In [30]:
# --- Hyperparameters --------------------------------------------------------
ENV_NAME = 'SuperMarioBros-v0'

GAMMA = 0.99
LEARNING_RATE = 0.001
EPSILON = 1e-3
ENTROPY_WEIGHT = 0.01

N_STEPS = 4  # rollout length (TD bootstrapping)
GRAD_CLIP = 0.1  # keeps gradients from growing too large
NUM_EPISODES = 5000

# --- Environment ------------------------------------------------------------
# Wrap the base environment so only the RIGHT_ONLY action set is exposed.
env = JoypadSpace(gym_super_mario_bros.make(ENV_NAME), RIGHT_ONLY)

# --- Configuration ----------------------------------------------------------
config = Config()
device = set_device()

# Core items shared through the config object; every entry is forwarded to
# config.add() as a keyword argument, in this order.
core_settings = dict(
    env=env,
    env_name=ENV_NAME,
    gamma=GAMMA,
    lr=LEARNING_RATE,
    epsilon=EPSILON,
    entropy_weight=ENTROPY_WEIGHT,
    rollout_size=N_STEPS,
    grad_clip=GRAD_CLIP,
    device=device,
    num_episodes=NUM_EPISODES,
    logger=Logger(),
)
config.add(**core_settings)

# Let the config set up its environment parameters from the registered env.
config.set_env_params()

  logger.warn(


CUDA available. Device set to GPU.


In [31]:
# Action set reference:
# https://github.com/Kautenja/gym-super-mario-bros/blob/master/gym_super_mario_bros/actions.py
# Each (label, attribute) pair is looked up on the config and printed in turn.
for label, attr_name in (
    ('Available actions:', 'action_space'),
    ('Obs space shape: ', 'input_shape'),
):
    print(label, getattr(config, attr_name))

Available actions: Discrete(5)
Obs space shape:  (240, 256, 3)


In [32]:
# Build the actor-critic network and move it to the selected device
a2c = ACNetwork(config.input_shape, config.n_actions).to(device)


def make_optimizer(params):
    """Return an Adam optimizer over `params` using the config's lr and epsilon."""
    return optim.Adam(params, lr=config.lr, eps=config.epsilon)


def get_network():
    """Return the single network instance created in this cell."""
    return a2c


# Register the optimizer and network factories on the config
config.add(optimizer_fn=make_optimizer, network_fn=get_network)

In [5]:
# Score passed to the agent's train() call as `target_score`
TARGET_SCORE = 70

# Build the agent from the shared config and start training
agent = A2CAgent(config)
agent.train(target_score=TARGET_SCORE)

Running training with N-Steps: 4
(1/5000)	Episode actions: [2, 4, 1, 1]	Avg return: -0.004	Total loss: 0.016
(100/5000)	Episode actions: [0, 1, 0, 0]	Avg return: 50.845	Total loss: -0.233
(200/5000)	Episode actions: [4, 4, 0, 3]	Avg return: 51.620	Total loss: -0.304
(300/5000)	Episode actions: [4, 4, 0, 1]	Avg return: 51.828	Total loss: -0.277
(400/5000)	Episode actions: [4, 2, 3, 2]	Avg return: 52.199	Total loss: -0.295
(500/5000)	Episode actions: [2, 0, 1, 0]	Avg return: 53.556	Total loss: -0.261
(600/5000)	Episode actions: [2, 0, 1, 0]	Avg return: 52.151	Total loss: -0.284
(700/5000)	Episode actions: [3, 1, 4, 1]	Avg return: 53.867	Total loss: -0.278
(800/5000)	Episode actions: [4, 2, 3, 2]	Avg return: 56.255	Total loss: -0.229
(900/5000)	Episode actions: [0, 2, 0, 2]	Avg return: 54.790	Total loss: -0.232
(1000/5000)	Episode actions: [2, 2, 1, 0]	Avg return: 55.225	Total loss: -0.241
(1100/5000)	Episode actions: [2, 4, 3, 2]	Avg return: 55.249	Total loss: -0.244
(1200/5000)	Episode 

In [33]:
from utils.helper import to_tensor, to_numpy
import numpy as np

# Roll the network out in a fresh environment and render every frame.
# Number of environment steps to play before closing the window.
ROLLOUT_STEPS = 2000

# Create environment
env = gym_super_mario_bros.make(ENV_NAME)
env = JoypadSpace(env, RIGHT_ONLY)

try:
    state = env.reset()
    for step in range(ROLLOUT_STEPS):
        # The first element of the network's forward output is used as the
        # action probabilities; an action index is sampled from them.
        action_probs = config.network_fn().forward(to_tensor(state).to(config.device).unsqueeze(0))[0]
        action = np.random.choice(action_probs.size(dim=1), p=to_numpy(action_probs).ravel())
        next_state, reward, done, info = env.step(action)
        env.render()

        # An episode that has ended must be reset before stepping again;
        # otherwise the next env.step() call operates on a finished episode.
        state = env.reset() if done else next_state
finally:
    # Always release the emulator / render window, even if a step fails.
    env.close()