In [1]:
# !sudo apt-get install -y xvfb ffmpeg
!pip install -q atari_py
!pip install -q gym[atari]
!pip install -q imageio
!pip install -q PILLOW
!pip install -q pyglet
!pip install -q typing-extensions==3.7.4.3
!pip install -q pyvirtualdisplay
!pip install -q ../../OSAR-keras/.
# !pip install -q git+https://github.com/ustyuzhaninky/OSAR-keras

In [2]:
from __future__ import absolute_import, division, print_function

import base64
import imageio
import IPython
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import PIL.Image
import pyvirtualdisplay

import tensorflow as tf

from tf_agents import agents
from tf_agents.agents.dqn import dqn_agent
from tf_agents.drivers import dynamic_step_driver
from tf_agents.environments import suite_gym
from tf_agents.environments import tf_py_environment
from tf_agents.eval import metric_utils
from tf_agents.metrics import tf_metrics
from tf_agents.networks import q_network
from tf_agents.policies import random_tf_policy
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.trajectories import trajectory
from tf_agents.utils import common

from OSAR import OSARNetwork, Runner, TrialAgent

In [3]:
# Gym Atari environment ids; one experiment config is generated per entry.
# Each id appears exactly once so that no game is trained twice per agent.
env_names = [
    'Alien-v0',
    'MontezumaRevenge-v0',
    'AirRaid-v0',
    'Solaris-v0',
    'Berzerk-v0',
    'Asteroids-v0',
    'Venture-v0',
    'BattleZone-v0',
    'Enduro-v0',
    'Tutankham-v0',
    'WizardOfWor-v0',
]

In [4]:
# Shared hyperparameters for every experiment in this notebook. They are
# forwarded through `agent_specs` and the `experiment_*_specs` dicts defined in
# later cells. The trailing `# @param` annotations are kept verbatim.
num_iterations = 10000 # @param {type:"integer"}

# Data-collection / replay-buffer settings (forwarded in the experiment specs).
initial_collect_steps = 100  # @param {type:"integer"} 
collect_steps_per_iteration = 1  # @param {type:"integer"}
replay_buffer_max_length = 1000  # @param {type:"integer"}

# Agent / network settings (forwarded to the generators via `agent_specs`).
batch_size = 1  # @param {type:"integer"}
learning_rate = 1e-3  # @param {type:"number"}
# NOTE(review): `log_interval` is not referenced by any other visible cell.
log_interval = 1  # @param {type:"integer"}
memory_len = 10 # @param {type: "integer"}
n_turns = 3 # @param {type: "integer"}
num_atoms = 51  # @param {type:"integer"}
# c51_generator uses this as the symmetric bound: min/max Q value = -/+ q_value.
q_value = 10  # @param {type:"integer"}
n_step_update = 2  # @param {type:"integer"}

# Evaluation settings (forwarded in the experiment specs).
num_eval_episodes = 20  # @param {type:"integer"}
eval_interval = 1000  # @param {type:"integer"}


In [5]:
# Passed as `fc_layer_params` to every network built by the generators below.
fc_layer_params = (64,)
from tf_agents.networks import categorical_q_network, q_network

# Keyword arguments shared by all agent generators in this notebook.
# NOTE(review): 'conv_type' is '1d' here, but osar_generator hard-codes
# conv_type='2d' and the C51/DQN generators never read it - confirm intent.
agent_specs = dict(
    batch_size=batch_size,
    memory_len=memory_len,
    n_turns=n_turns,
    fc_layer_params=fc_layer_params,
    num_atoms=num_atoms,
    conv_type='1d',
    learning_rate=learning_rate,
    q_value=q_value,
    n_step_update=n_step_update,
    boltzmann_temperature=None,
    epsilon_greedy=0.1,
    debug_summaries=True,
    summarize_grads_and_vars=True,
)

def osar_generator(
    observation_spec,
    action_spec,
    batch_size,
    memory_len,
    n_turns,
    fc_layer_params,
    num_atoms,
    conv_type,
    learning_rate,
    time_step_spec,
    n_step_update,
    train_step_counter,
    q_value=10,
    boltzmann_temperature=None,
    epsilon_greedy=0.1,
    debug_summaries=True,
    summarize_grads_and_vars=True,
    **kwargs,
    ):
    """Create and initialise a `TrialAgent` backed by an `OSARNetwork`.

    Args:
        observation_spec, action_spec, time_step_spec: environment specs
            forwarded to the network and the agent.
        batch_size, memory_len, n_turns: forwarded positionally to
            `OSARNetwork`.
        fc_layer_params: forwarded to `OSARNetwork` as `fc_layer_params`.
        learning_rate: learning rate of the Adam optimizer.
        train_step_counter: counter handed to the agent unchanged.
        boltzmann_temperature, epsilon_greedy, debug_summaries,
            summarize_grads_and_vars: forwarded to `TrialAgent`.
        num_atoms, conv_type, n_step_update, q_value, **kwargs: accepted so
            the signature lines up with the other generators, but not used
            in this function.

    Returns:
        The agent, after `initialize()` has been called on it.
    """
    # NOTE(review): the `conv_type` argument is ignored; '2d' is always used.
    network = OSARNetwork(
        observation_spec, action_spec, batch_size, memory_len, n_turns,
        fc_layer_params=fc_layer_params,
        conv_type='2d',
    )
    adam = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)

    # Collect the keyword options first, then build the agent in one call.
    agent_options = dict(
        network=network,
        optimizer=adam,
        td_errors_loss_fn=common.element_wise_squared_loss,
        train_step_counter=train_step_counter,
        boltzmann_temperature=boltzmann_temperature,
        epsilon_greedy=epsilon_greedy,
        debug_summaries=debug_summaries,
        summarize_grads_and_vars=summarize_grads_and_vars,
    )
    trial_agent = TrialAgent(time_step_spec, action_spec, **agent_options)
    trial_agent.initialize()
    return trial_agent

    

In [6]:
# Per-experiment settings for the OSAR runs; an 'env_name' entry is added to a
# copy of this dict for each environment in the next cell.
experiment_common_specs = dict(
    agent_specs=agent_specs,
    agent_generator=osar_generator,
    num_iterations=num_iterations,
    initial_collect_steps=initial_collect_steps,
    collect_steps_per_iteration=collect_steps_per_iteration,
    replay_buffer_max_length=replay_buffer_max_length,
    num_eval_episodes=num_eval_episodes,
    eval_interval=eval_interval,
    n_step_update=n_step_update,
)

In [7]:
# One config per environment: a shallow copy of the common specs plus the
# environment id (the nested `agent_specs` dict is shared, not copied).
configs = []
for name in env_names:
    config = dict(experiment_common_specs, env_name=name)
    configs.append(config)

In [8]:
# Sanity check: load one Atari environment and print its observation dtype.
# `suite_gym` (imported in the imports cell) is used instead of
# `suite_pybullet`: 'Alien-v0' is an Atari id, not a PyBullet one, and going
# through `suite_pybullet` needs the `pybullet` package, which the setup cell
# does not install - so the cell would fail on a fresh environment.
env = suite_gym.load('Alien-v0')
env.reset()
print(env.time_step_spec().observation.dtype)
# PIL.Image.fromarray(env.render())

uint8


In [None]:
%%time
logpath = ''
model_name = 'test_osar'
runner = Runner(model_name=model_name, logpath=logpath, list_configs=configs)
runner.run(progress=False, experiment_progress=True)



Instructions for updating:
rename to distribute_datasets_from_function


Instructions for updating:
rename to distribute_datasets_from_function
Episode 1000:  10%|▉         | 999/10000 [01:50<11:16, 13.30it/s, avg_return=0, train_loss=6.83]

In [None]:
def c51_generator(
    observation_spec,
    action_spec,
    batch_size,
    memory_len,
    n_turns,
    fc_layer_params,
    num_atoms,
    conv_type,
    learning_rate,
    time_step_spec,
    n_step_update,
    train_step_counter=None,
    q_value=10,
    boltzmann_temperature=None,
    epsilon_greedy=0.1,
    debug_summaries=True,
    summarize_grads_and_vars=True,
    **kwargs,
    ):
    """Create and initialise a categorical (C51) DQN agent.

    Args:
        observation_spec, action_spec, time_step_spec: environment specs
            forwarded to the network and the agent.
        fc_layer_params: forwarded to `CategoricalQNetwork`.
        num_atoms: forwarded to `CategoricalQNetwork` as `num_atoms`.
        learning_rate: learning rate of the Adam optimizer.
        n_step_update: forwarded to the agent.
        train_step_counter: counter handed to the agent. A fresh int64
            `tf.Variable(0)` is created only when this is None.
        q_value: the agent is built with `min_q_value=-q_value` and
            `max_q_value=q_value`.
        boltzmann_temperature, epsilon_greedy, debug_summaries,
            summarize_grads_and_vars: forwarded to the agent.
        batch_size, memory_len, n_turns, conv_type, **kwargs: accepted so the
            signature lines up with `osar_generator`, but not used here.

    Returns:
        The agent, after `initialize()` has been called on it.
    """
    q_net = categorical_q_network.CategoricalQNetwork(
        observation_spec,
        action_spec,
        num_atoms=num_atoms,
        fc_layer_params=fc_layer_params)

    optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)

    # Keep the caller's counter (as osar_generator does) instead of silently
    # replacing it; only fall back to a private counter when none is given.
    if train_step_counter is None:
        train_step_counter = tf.Variable(0, dtype=tf.int64)

    agent = agents.categorical_dqn.categorical_dqn_agent.CategoricalDqnAgent(
        time_step_spec,
        action_spec,
        categorical_q_network=q_net,
        optimizer=optimizer,
        td_errors_loss_fn=common.element_wise_squared_loss,
        train_step_counter=train_step_counter,
        min_q_value=-q_value,
        max_q_value=q_value,
        n_step_update=n_step_update,
        boltzmann_temperature=boltzmann_temperature,
        epsilon_greedy=epsilon_greedy,
        debug_summaries=debug_summaries,
        summarize_grads_and_vars=summarize_grads_and_vars,
    )
    agent.initialize()

    return agent

# Per-experiment settings for the C51 runs; identical to the OSAR settings
# except for the agent generator.
experiment_c51_specs = dict(
    agent_specs=agent_specs,
    agent_generator=c51_generator,
    num_iterations=num_iterations,
    initial_collect_steps=initial_collect_steps,
    collect_steps_per_iteration=collect_steps_per_iteration,
    replay_buffer_max_length=replay_buffer_max_length,
    num_eval_episodes=num_eval_episodes,
    eval_interval=eval_interval,
    n_step_update=n_step_update,
)
# Rebinds `configs`: one shallow copy of the C51 specs per environment id.
configs = []
for name in env_names:
    config = dict(experiment_c51_specs, env_name=name)
    configs.append(config)

In [None]:
%%time
logpath = ''
model_name = 'test_c51'
runner = Runner(model_name=model_name, logpath=logpath, list_configs=configs)
runner.run(progress=False, experiment_progress=True)

In [None]:
def dqn_generator(
    observation_spec,
    action_spec,
    batch_size,
    memory_len,
    n_turns,
    fc_layer_params,
    num_atoms,
    conv_type,
    learning_rate,
    time_step_spec,
    n_step_update,
    q_value=10,
    boltzmann_temperature=None,
    epsilon_greedy=0.1,
    debug_summaries=True,
    summarize_grads_and_vars=True,
    train_step_counter=None,
    **kwargs,
    ):
    """Create and initialise a plain DQN agent.

    Args:
        observation_spec, action_spec, time_step_spec: environment specs
            forwarded to the network and the agent.
        fc_layer_params: forwarded to `QNetwork`.
        learning_rate: learning rate of the Adam optimizer.
        n_step_update: forwarded to the agent.
        boltzmann_temperature, epsilon_greedy, debug_summaries,
            summarize_grads_and_vars: forwarded to the agent.
        train_step_counter: counter handed to the agent. It is accepted here
            because the sibling generators take it too; it is placed after
            the existing parameters so positional callers are unaffected. A
            fresh int64 `tf.Variable(0)` is created when it is None.
        batch_size, memory_len, n_turns, num_atoms, conv_type, q_value,
            **kwargs: accepted so the signature lines up with
            `osar_generator`, but not used here.

    Returns:
        The agent, after `initialize()` has been called on it.
    """
    q_net = q_network.QNetwork(
        observation_spec,
        action_spec,
        fc_layer_params=fc_layer_params)

    optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)

    # Use the caller's counter when one is supplied; otherwise keep the
    # previous behaviour of creating a private one.
    if train_step_counter is None:
        train_step_counter = tf.Variable(0, dtype=tf.int64)

    agent = agents.dqn.dqn_agent.DqnAgent(
        time_step_spec,
        action_spec,
        q_network=q_net,
        optimizer=optimizer,
        td_errors_loss_fn=common.element_wise_squared_loss,
        train_step_counter=train_step_counter,
        n_step_update=n_step_update,
        boltzmann_temperature=boltzmann_temperature,
        epsilon_greedy=epsilon_greedy,
        debug_summaries=debug_summaries,
        summarize_grads_and_vars=summarize_grads_and_vars,
    )
    agent.initialize()

    return agent

# Per-experiment settings for the DQN runs; identical to the OSAR settings
# except for the agent generator.
experiment_dqn_specs = dict(
    agent_specs=agent_specs,
    agent_generator=dqn_generator,
    num_iterations=num_iterations,
    initial_collect_steps=initial_collect_steps,
    collect_steps_per_iteration=collect_steps_per_iteration,
    replay_buffer_max_length=replay_buffer_max_length,
    num_eval_episodes=num_eval_episodes,
    eval_interval=eval_interval,
    n_step_update=n_step_update,
)
# Rebinds `configs`: one shallow copy of the DQN specs per environment id.
configs = []
for name in env_names:
    config = dict(experiment_dqn_specs, env_name=name)
    configs.append(config)

In [None]:
%%time
logpath = ''
model_name = 'test_dqn'
runner = Runner(model_name=model_name, logpath=logpath, list_configs=configs)
runner.run(progress=False, experiment_progress=True)