# A2C Demo
This notebook trains and tests an Advantage Actor-Critic (A2C) agent on the `SuperMarioBros-v3` environment.

In [14]:
from a2c.agent import A2CAgent
from a2c.model import ACNetwork
from utils.wrappers import ResizeObservation, SkipFrame
from utils.config import Config
from utils.helper import set_device

import gym_super_mario_bros
from gym_super_mario_bros.actions import RIGHT_ONLY
from gym.wrappers import FrameStack, GrayScaleObservation
from nes_py.wrappers import JoypadSpace

import torch.optim as optim

In [15]:
# Configuration cell: every tunable value lives here as an UPPER_SNAKE_CASE constant,
# then the wrapped environment and the shared Config instance are built from them.

# Set hyperparameters
ENV_NAME = 'SuperMarioBros-v3'

GAMMA = 0.99             # stored on the config as `discount`
LEARNING_RATE = 0.001    # Adam learning rate (consumed when the optimizer is created)
EPSILON = 1e-3           # Adam `eps` term (consumed when the optimizer is created), not an exploration rate
ENTROPY_WEIGHT = 0.01    # stored on the config as `entropy_weight`
VALUE_LOSS_WEIGHT = 1.0  # stored on the config as `value_loss_weight`

N_STEPS = 10 # TD bootstrapping; stored on the config as `rollout_size`
GRAD_CLIP = 0.1 # Prevents gradients from being too large
NUM_EPISODES = 1000

# Observation preprocessing (named so the values are not repeated as magic numbers)
FRAME_SKIP = 4  # `skip` argument of SkipFrame
OBS_SIZE = 84   # `shape` argument of ResizeObservation -> image dim: [OBS_SIZE, OBS_SIZE]
NUM_STACK = 4   # `num_stack` argument of FrameStack -> NUM_STACK frames at a time

# Create environment and restrict the controller to the RIGHT_ONLY action set
env = gym_super_mario_bros.make(ENV_NAME)
env = JoypadSpace(env, RIGHT_ONLY)

# Apply wrappers to environment (applied in this order: skip -> grayscale -> resize -> stack)
env = SkipFrame(env, skip=FRAME_SKIP)
env = GrayScaleObservation(env, keep_dim=False) # Grayscale images
env = ResizeObservation(env, shape=OBS_SIZE) # image dim: [OBS_SIZE, OBS_SIZE]
env = FrameStack(env, num_stack=NUM_STACK) # NUM_STACK frames at a time

# Set config instance
config = Config()

# Select the compute device (set_device() reports which device it picked)
device = set_device()

# Add core items to config
config.add(
    env=env,
    env_name=ENV_NAME,
    discount=GAMMA,
    entropy_weight=ENTROPY_WEIGHT,
    value_loss_weight=VALUE_LOSS_WEIGHT,
    rollout_size=N_STEPS,
    grad_clip=GRAD_CLIP,
    device=device,
    num_episodes=NUM_EPISODES
)

# Setup environment parameters
# NOTE(review): presumably this fills in config.action_space / config.input_shape /
# config.n_actions, which later cells read — confirm in utils.config.Config.
config.set_env_params()

CUDA available. Device set to GPU.


In [16]:
# The RIGHT_ONLY action set is defined in:
# https://github.com/Kautenja/gym-super-mario-bros/blob/master/gym_super_mario_bros/actions.py
# Report the action space and observation shape stored on the config, one line each.
for label, value in (
    ('Available actions:', config.action_space),
    ('Obs space shape: ', config.input_shape),
):
    print(label, value)

Available actions: Discrete(5)
Obs space shape:  (4, 84, 84)


In [17]:
# Build the actor-critic network from the config's input shape and action count,
# then move it to the selected device before its parameters are handed to Adam.
network = ACNetwork(config.input_shape, config.n_actions)
network = network.to(device)

# Register the network and its Adam optimizer on the shared config
config.add(
    network=network,
    optimizer=optim.Adam(
        params=network.parameters(),
        lr=LEARNING_RATE,
        eps=EPSILON,
    ),
)

In [18]:
# Build the A2C agent from the config populated above (environment, hyperparameters,
# device, network and optimizer).
agent = A2CAgent(config)

In [6]:
# Train agent
# The saved log below reports a rollout length of 10 and progress out of 1000 episodes.
# NOTE(review): `target_score` presumably stops training early once the average return
# reaches it — confirm in A2CAgent.train.
agent.train(target_score=1000)

Running training with rollout length 10.
(1/1000) Episode actions: [3, 3, 3, 4, 4, 2, 3, 0, 3, 2]	Avg return: 5.192	Total loss: 25.497
(100/1000) Episode actions: [3, 3, 3, 3, 3, 3, 1, 3, 3, 3]	Avg return: 95.496	Total loss: 0.001
(200/1000) Episode actions: [1, 4, 4, 4, 1, 4, 1, 4, 4, 1]	Avg return: 98.651	Total loss: 0.043
(300/1000) Episode actions: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]	Avg return: 98.131	Total loss: 0.098
(400/1000) Episode actions: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]	Avg return: 95.048	Total loss: 0.037
(500/1000) Episode actions: [4, 4, 4, 1, 1, 4, 1, 4, 1, 1]	Avg return: 99.049	Total loss: 0.044
(600/1000) Episode actions: [4, 1, 1, 4, 4, 4, 4, 4, 4, 4]	Avg return: 96.968	Total loss: 0.035
(700/1000) Episode actions: [4, 4, 4, 4, 4, 4, 4, 4, 4, 4]	Avg return: 98.094	Total loss: 0.040
(800/1000) Episode actions: [4, 4, 4, 4, 4, 4, 4, 4, 4, 4]	Avg return: 96.493	Total loss: 0.035
(900/1000) Episode actions: [4, 4, 1, 4, 4, 4, 1, 4, 4, 4]	Avg return: 97.405	Total loss: 0.037
(

In [10]:
# Load the saved A2C model into the agent. Because this is the last expression in the
# cell, its return value (a utils.logger.Logger in the saved output) is echoed below.
# NOTE(review): the checkpoint location is chosen inside A2CAgent.load_model — confirm
# the file exists before running this on a fresh checkout.
agent.load_model()

CUDA available. Device set to GPU.
Loaded A2C model.


<utils.logger.Logger at 0x12d9728e740>

In [13]:
# Number of entries held in the logger's env_info (1000 in the saved output, which
# equals NUM_EPISODES).
len(agent.logger.env_info)

1000

In [None]:
# --- Disabled cell: watch the loaded agent play in a render window ---------------
# Rebuilds the same wrapped environment as the setup cell, loads the saved model, then
# for 1000 steps samples an action from the first output of the network and steps the
# environment, calling env.render() each step and resetting whenever `done` is true.
# NOTE(review): to re-enable, the imports below (torch, utils.helper) must be active;
# the following cell is a near-duplicate that displays frames inline instead.
# import gym
# from IPython import display
# import matplotlib
# import matplotlib.pyplot as plt
# from utils.helper import to_tensor, to_numpy, normalize_states
# import numpy as np
# import torch
# %matplotlib inline

# env = gym_super_mario_bros.make(ENV_NAME)
# env = JoypadSpace(env, RIGHT_ONLY)
# env = SkipFrame(env, skip=4)
# env = GrayScaleObservation(env, keep_dim=False) # Grayscale images
# env = ResizeObservation(env, shape=84) # image dim: [84, 84]
# env = FrameStack(env, num_stack=4) # 4 frames at a time

# agent.load_model()

# state = env.reset()
# for _ in range(1000):
#     state = normalize_states(to_tensor(state)).to(config.device)
#     action_probs = agent.config.network.forward(state.unsqueeze(0))[0]
#     action = torch.distributions.Categorical(action_probs).sample().item()
#     next_state, reward, done, _ = env.step(action)
#     env.render()
    
#     state = next_state
    
#     if done:
#         state = env.reset()

In [None]:
# --- Disabled cell: watch the loaded agent play inline in the notebook -----------
# Same rollout as the previous disabled cell, except that each step's frame is fetched
# with env.render(mode='rgb_array'), pushed into a single matplotlib image and
# redisplayed through IPython.display (clearing the previous output each time).
# NOTE(review): to re-enable, the imports below (torch, matplotlib, IPython.display,
# utils.helper) must be active.
# import gym
# from IPython import display
# import matplotlib
# import matplotlib.pyplot as plt
# from utils.helper import to_tensor, to_numpy, normalize_states
# import numpy as np
# import torch
# %matplotlib inline

# env = gym_super_mario_bros.make(ENV_NAME)
# env = JoypadSpace(env, RIGHT_ONLY)
# env = SkipFrame(env, skip=4)
# env = GrayScaleObservation(env, keep_dim=False) # Grayscale images
# env = ResizeObservation(env, shape=84) # image dim: [84, 84]
# env = FrameStack(env, num_stack=4) # 4 frames at a time

# agent.load_model()

# state = env.reset()
# img = plt.imshow(env.render(mode='rgb_array')) # only call this once
# for _ in range(1000):
#     img.set_data(env.render(mode='rgb_array')) # just update the data
#     display.display(plt.gcf())
#     display.clear_output(wait=True)
    
#     state = normalize_states(to_tensor(state)).to(config.device)
#     action_probs = agent.config.network.forward(state.unsqueeze(0))[0]
#     action = torch.distributions.Categorical(action_probs).sample().item()
#     next_state, reward, done, _ = env.step(action)
    
#     state = next_state
    
#     if done:
#         state = env.reset()