In [1]:
from tensorforce.agents import Agent
from tensorforce.execution import Runner
from tensorforce.contrib.openai_gym import OpenAIGym
import time

In [2]:
# Create the Gym environment wrapper. The action and state sizes are read from it
# in the following cells, not here.
# NOTE(review): `gym`, `np` and `Monitor` are imported here but are not referenced
# in any of the cells visible in this notebook.
import gym
import numpy as np
from gym.wrappers import Monitor
ENV_NAME = 'CartPole-v0'
# All monitor-related options are left as None; visualize=True is passed to the
# wrapper (presumably this renders the environment while it runs - confirm
# against the OpenAIGym wrapper documentation).
env = OpenAIGym(
        gym_id=ENV_NAME,
        monitor=None,
        monitor_safe=None,
        monitor_video=None,
        visualize=True
    )

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [3]:
# Value stored under the 'num_actions' key of the environment's action spec.
nb_actions = env.actions['num_actions']

In [4]:
# First entry of the 'shape' field of the environment's state spec.
nb_states = env.states['shape'][0]

In [5]:
# Specification of the DDPG agent (per the author, chosen according to the original paper).
# Every section is a plain dict; keys appear in the same order as before.
agent = dict(
    # Exploration noise applied to actions: an 'ornstein_uhlenbeck' process.
    actions_exploration=dict(
        mu=0.0,
        sigma=0.3,
        theta=0.15,
        type='ornstein_uhlenbeck',
    ),
    # The critic has its own network sizes and its own optimizer settings.
    critic_network=dict(size_t0=64, size_t1=64),
    critic_optimizer=dict(learning_rate=0.001, type='adam'),
    discount=0.99,
    entropy_regularization=None,
    execution=dict(distributed_spec=None, session_config=None, type='single'),
    # Replay memory holding up to 100000 entries.
    memory=dict(capacity=100000, include_next_states=True, type='replay'),
    # Main optimizer; its learning rate is 10x smaller than the critic's.
    optimizer=dict(learning_rate=0.0001, type='adam'),
    # No directories are configured for the saver or the summarizer.
    saver=dict(directory=None, seconds=600),
    summarizer=dict(directory=None, labels=[], seconds=120),
    target_sync_frequency=1,
    target_update_weight=0.999,
    type='ddpg_agent',
    # Update settings: batch size 64, frequency 64, unit 'timesteps'.
    update_mode=dict(batch_size=64, frequency=64, unit='timesteps'),
)

In [6]:
# Layer stack handed to the agent as `network`. Author's note: DDPG uses two networks
# (actor-critic) and the same configuration is used for both, as per the original paper.
# Layout: two (linear -> batch normalization -> relu) stages of size 64, followed by a
# size-64 dense layer with activation=None. Each entry is its own dict (no shared objects).
network = [
    # Stage 1
    dict(size=64, type='linear'),
    dict(layer='batch_normalization', type='tf_layer'),
    dict(name='relu', type='nonlinearity'),
    # Stage 2
    dict(size=64, type='linear'),
    dict(layer='batch_normalization', type='tf_layer'),
    dict(name='relu', type='nonlinearity'),
    # Output layer
    dict(activation=None, size=64, type='dense'),
]

In [7]:
# Bare expression: shows the environment wrapper's repr as the cell output.
env

<tensorforce.contrib.openai_gym.OpenAIGym at 0x7feb4004c2b0>

In [8]:
# Build the agent from the spec dict, supplying the environment's state/action
# descriptions and the layer stack defined above.
# NOTE: this rebinds `agent` from the spec dict to the constructed Agent object, so
# re-running this cell on its own would pass the Agent instance (not the spec) as `spec`.
agent = Agent.from_spec(
    spec=agent,
    kwargs={
        'states': env.states,
        'actions': env.actions,
        'network': network,
    },
)

Instructions for updating:
dim is deprecated, use axis instead
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [9]:
# Bare expression: shows the constructed agent's repr as the cell output.
agent

<tensorforce.agents.ddpg_agent.DDPGAgent at 0x7feb3ffe7588>

In [11]:
# Toggle for verbose progress reporting.
DEBUG = True

# Runner tying the agent to the environment.
runner = Runner(agent=agent, environment=env, repeat_actions=1)

# Reporting interval in episodes: every episode when debugging, every 100th otherwise.
# TODO: Timestep-based reporting
report_episodes = 1 if DEBUG else 100

print("Starting {agent} for Environment '{env}'".format(agent=agent, env=env))

def episode_finished(r, id_):
    """Print progress statistics; passed to ``runner.run`` as ``episode_finished``.

    Nothing is printed unless ``r.episode`` is a multiple of the module-level
    ``report_episodes``.

    Args:
        r: Object exposing ``episode``, ``timestep``, ``start_time``,
            ``episode_timestep``, ``episode_rewards`` and ``agent.episode``.
        id_: Not used by this function.

    Returns:
        Always ``True`` (presumably signalling the runner to keep going -
        confirm against the Runner documentation).
    """
    # Guard clause: skip episodes that are not on the reporting interval.
    if r.episode % report_episodes != 0:
        return True

    # Throughput over the whole run so far.
    elapsed = time.time() - r.start_time
    steps_per_second = r.timestep / elapsed

    rewards = r.episode_rewards
    n_rewards = len(rewards)

    print("Finished episode {:d} after {:d} timesteps. Steps Per Second {:0.2f}".format(
        r.agent.episode, r.episode_timestep, steps_per_second
    ))
    print("Episode reward: {}".format(rewards[-1]))

    # Running means; the divisor shrinks while fewer episodes than the window exist.
    mean_last_500 = sum(rewards[-500:]) / min(500, n_rewards)
    mean_last_100 = sum(rewards[-100:]) / min(100, n_rewards)
    print("Average of last 500 rewards: {:0.2f}".format(mean_last_500))
    print("Average of last 100 rewards: {:0.2f}".format(mean_last_100))
    return True


Starting DDPGAgent for Environment 'OpenAIGym(CartPole-v0)'


In [12]:
# Run the agent in the environment for 10 episodes, with `episode_finished`
# supplied as the progress callback.
# NOTE(review): num_timesteps=None and max_episode_timesteps=None presumably mean
# "no limit" - confirm against the Runner.run documentation.
runner.run(
        num_timesteps=None,
        num_episodes=10,
        max_episode_timesteps=None,
        deterministic=False,
        episode_finished=episode_finished
    )

Finished episode 1 after 10 timesteps. Steps Per Second 7.88
Episode reward: 10.0
Average of last 500 rewards: 10.00
Average of last 100 rewards: 10.00
Finished episode 2 after 14 timesteps. Steps Per Second 18.31
Episode reward: 14.0
Average of last 500 rewards: 12.00
Average of last 100 rewards: 12.00
Finished episode 3 after 12 timesteps. Steps Per Second 23.84
Episode reward: 12.0
Average of last 500 rewards: 12.00
Average of last 100 rewards: 12.00
Finished episode 4 after 9 timesteps. Steps Per Second 27.06
Episode reward: 9.0
Average of last 500 rewards: 11.25
Average of last 100 rewards: 11.25
Finished episode 5 after 11 timesteps. Steps Per Second 30.37
Episode reward: 11.0
Average of last 500 rewards: 11.20
Average of last 100 rewards: 11.20
Finished episode 6 after 9 timesteps. Steps Per Second 32.57
Episode reward: 9.0
Average of last 500 rewards: 10.83
Average of last 100 rewards: 10.83
Finished episode 7 after 9 timesteps. Steps Per Second 34.53
Episode reward: 9.0
Averag

In [13]:
# Close the runner, then report the episode count held by the agent.
runner.close()

print("Learning finished. Total episodes: {ep}".format(ep=runner.agent.episode))

Learning finished. Total episodes: 10
