### Import Dependencies

In [5]:
import gym 
from stable_baselines3 import A2C
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_atari_env
import os

### Test Environment

In [4]:
# Smoke-test the raw (non-vectorised) environment with a random agent:
# play a few episodes, render every frame, and print the total reward of each.
environment_name = "Breakout-v0"
env = gym.make(environment_name)
episodes = 5
try:
    for episode in range(1, episodes + 1):
        state = env.reset()
        done = False
        score = 0

        while not done:
            env.render()
            # Uniformly random action: this is a baseline, no policy is involved.
            action = env.action_space.sample()
            n_state, reward, done, info = env.step(action)
            score += reward
        print('Episode:{} Score:{}'.format(episode, score))
finally:
    # Close the env even when the loop raises or the cell is interrupted,
    # so that a failed run does not skip the cleanup.
    env.close()

Episode:1 Score:1.0
Episode:2 Score:2.0
Episode:3 Score:1.0
Episode:4 Score:1.0
Episode:5 Score:3.0


### Vectorise environment and train model

In [8]:
# Four seeded Breakout environments, wrapped so that each observation
# stacks the four most recent frames.
env = VecFrameStack(
    make_atari_env('Breakout-v0', n_envs=4, seed=0),
    n_stack=4,
)

In [6]:
# Relative directory handed to the model as its TensorBoard log location.
log_dir_parts = ('Training', 'Logs')
log_path = os.path.join(*log_dir_parts)

In [7]:
# A2C agent with a CNN policy on the frame-stacked env; TensorBoard logs go to log_path.
model = A2C(
    policy="CnnPolicy",
    env=env,
    verbose=1,
    tensorboard_log=log_path,
)

Using cpu device
Wrapping the env in a VecTransposeImage.


In [8]:
# Train for 400k timesteps. Left as a bare expression so the cell still
# displays the returned model object.
model.learn(total_timesteps=400_000)

Logging to Training/Logs/A2C_1
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 271      |
|    ep_rew_mean        | 1.34     |
| time/                 |          |
|    fps                | 119      |
|    iterations         | 100      |
|    time_elapsed       | 16       |
|    total_timesteps    | 2000     |
| train/                |          |
|    entropy_loss       | -1.39    |
|    explained_variance | 0.0633   |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | 0.362    |
|    value_loss         | 0.238    |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 278      |
|    ep_rew_mean        | 1.48     |
| time/                 |          |
|    fps                | 142      |
|    iterations         | 200      |
|    time_elapsed       | 27       |
|    total_timesteps    | 4000     |
| train

<stable_baselines3.a2c.a2c.A2C at 0x7ff636e8f100>

### Save and Reload Model

In [6]:
# Relative path used by the save and load cells below.
a2c_path_parts = ('Training', 'Saved_Models', 'A2C_model')
a2c_path = os.path.join(*a2c_path_parts)

In [None]:
# Persist the trained model at a2c_path.
model.save(path=a2c_path)

In [2]:
# del model  # optional: uncomment to drop the in-memory model before reloading it in the next cell

In [9]:
# Restore the saved model from a2c_path and attach it to the current env.
model = A2C.load(path=a2c_path, env=env)

Wrapping the env in a VecTransposeImage.


### Evaluate and Test

In [10]:
# Single seeded Breakout environment for evaluation, with the same
# 4-frame stacking that was used for training.
env = VecFrameStack(
    make_atari_env('Breakout-v0', n_envs=1, seed=0),
    n_stack=4,
)

In [11]:
# Evaluate the model over 10 rendered episodes.
# The result is captured in named variables because env.close() would
# otherwise be the cell's last expression and the evaluation result
# would be discarded without being displayed.
try:
    mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=10, render=True)
finally:
    # Close the env even if evaluation is interrupted.
    env.close()

# Last expression of the cell: shown as (mean reward, std of reward).
(mean_reward, std_reward)

(11.4, 5.589275444992849)

In [29]:
# Run the model for 500 steps, rendering after every step.
try:
    obs = env.reset()
    for _ in range(500):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()
finally:
    # Close the env even when the loop raises or the cell is interrupted,
    # so that a failed run does not skip the cleanup.
    env.close()

In [12]:
# Manual cleanup cell: close the environment.
# NOTE(review): earlier cells already call env.close(); this looks redundant — confirm before removing.
env.close()

### Making GIFs of trained and untrained agents

In [19]:
import imageio
import numpy as np

# Record 500 frames of the model playing, then write every second frame to a GIF.
images = []
obs = env.reset()
img = env.render(mode="rgb_array")
for _ in range(500):
    # Store the frame rendered before this step, then advance the env.
    images.append(img)
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    img = env.render(mode="rgb_array")

# images[::2] keeps the frames at even indices (0, 2, 4, ...).
imageio.mimsave(
    "atari_breakout_trained_agent.gif",
    [np.array(frame) for frame in images[::2]],
    duration=40,
)

In [35]:
import imageio
import numpy as np

# Record 500 frames of a random agent, then write every second frame to a GIF.
images = []
obs = env.reset()
img = env.render(mode="rgb_array")
for _ in range(500):
    # Store the frame rendered before this step, then advance the env.
    images.append(img)
    action = env.action_space.sample()
    # The sampled action is wrapped in a length-1 array before being passed to step().
    n_state, reward, done, info = env.step(np.array([action]))
    img = env.render(mode="rgb_array")

# images[::2] keeps the frames at even indices (0, 2, 4, ...).
imageio.mimsave(
    "atari_breakout_untrained_agent.gif",
    [np.array(frame) for frame in images[::2]],
    duration=40,
)