In [83]:
import numpy as np
import tensorflow as tf
from collections import deque, namedtuple
import random

import gym
import pommerman
from pommerman import agents
from tqdm import tqdm
xrange = range

In [84]:
ENV_NAME = 'PommeFFACompetition-v0'
EPISODES = 100000
TEST = 10

In [140]:
from gym.core import Env

from pommerman.configs import ffa_competition_env
from pommerman.envs.v0 import Pomme
from pommerman.characters import Bomber

In [216]:
class EnvWrapper(Env):
    """The abstract environment class that is used by all agents. This class has the exact
        same API that OpenAI Gym uses so that integrating with it is trivial. In contrast to the
        OpenAI Gym implementation, this class only defines the abstract methods without any actual
        implementation.
        To implement your own environment, you need to define the following methods:
        - `step`
        - `reset`
        - `render`
        - `close`
        Refer to the [Gym documentation](https://gym.openai.com/docs/#environments).
        """
    reward_range = (-1, 1)
    action_space = None
    observation_space = None

    def __init__(self, gym, board_size):
        self.gym = gym
        self.action_space = gym.action_space
        self.observation_space = gym.observation_space
        self.reward_range = gym.reward_range
        self.board_size = board_size

    def step(self, action):
        """Run one timestep of the environment's dynamics.
        Accepts an action and returns a tuple (observation, reward, done, info).
        # Arguments
            action (object): An action provided by the environment.
        # Returns
            observation (object): Agent's observation of the current environment.
            reward (float) : Amount of reward returned after previous action.
            done (boolean): Whether the episode has ended, in which case further step() calls will return undefined results.
            info (dict): Contains auxiliary diagnostic information (helpful for debugging, and sometimes learning).
        """
        obs = self.gym.get_observations()
        all_actions = self.gym.act(obs)
        all_actions.insert(self.gym.training_agent, action)
        state, reward, terminal, info = self.gym.step(all_actions)
        agent_state = self.featurize(state[self.gym.training_agent])
        agent_reward = reward[self.gym.training_agent]
        return agent_state, agent_reward, terminal, info

    def reset(self):
        """
        Resets the state of the environment and returns an initial observation.
        # Returns
            observation (object): The initial observation of the space. Initial reward is assumed to be 0.
        """
        obs = self.gym.reset()
        agent_obs = self.featurize(obs[self.gym.training_agent])
        return agent_obs

    def render(self, mode='human', close=False):
        """Renders the environment.
        The set of supported modes varies per environment. (And some
        environments do not support rendering at all.)
        # Arguments
            mode (str): The mode to render with.
            close (bool): Close all open renderings.
        """
        self.gym.render(mode=mode, close=close)

    def close(self):
        """Override in your subclass to perform any necessary cleanup.
        Environments will automatically close() themselves when
        garbage collected or when the program exits.
        """
        self.gym.close()

    def seed(self, seed=None):
        """Sets the seed for this env's random number generator(s).
        # Returns
            Returns the list of seeds used in this env's random number generators
        """
        raise self.gym.seed(seed)

    def configure(self, *args, **kwargs):
        """Provides runtime configuration to the environment.
        This configuration should consist of data that tells your
        environment how to run (such as an address of a remote server,
        or path to your ImageNet data). It should not affect the
        semantics of the environment.
        """
        raise NotImplementedError()

    def featurize(self, obs):
        shape = (self.board_size, self.board_size, 1)

        def get_matrix(dict, key):
            res = dict[key]
            return res.reshape(shape).astype(np.float32)

        def get_map(board, item):
            map = np.zeros(shape)
            map[board == item] = 1
            return map

        board = get_matrix(obs, 'board')

        # TODO: probably not needed Passage = 0
        rigid_map = get_map(board, 1)  # Rigid = 1
        wood_map = get_map(board, 2)  # Wood = 2
        bomb_map = get_map(board, 3)  # Bomb = 3
        flames_map = get_map(board, 4)  # Flames = 4
        fog_map = get_map(board, 5)  # TODO: not used for first two stages Fog = 5
        extra_bomb_map = get_map(board, 6)  # ExtraBomb = 6
        incr_range_map = get_map(board, 7)  # IncrRange = 7
        kick_map = get_map(board, 8)  # Kick = 8
        skull_map = get_map(board, 9)  # Skull = 9

        position = obs["position"]
        my_position = np.zeros(shape)
        my_position[position[0], position[1], 0] = 1

        team_mates = get_map(board, obs["teammate"].value)  # TODO during documentation it should be an array

        enemies = np.zeros(shape)
        for enemy in obs["enemies"]:
            enemies[board == enemy.value] = 1

        bomb_blast_strength = get_matrix(obs, 'bomb_blast_strength')
        bomb_life = get_matrix(obs, 'bomb_life')

        ammo = np.full(shape, obs["ammo"])
        blast_strength = np.full(shape, obs["blast_strength"])
        can_kick = np.full(shape, int(obs["can_kick"]))

        obs = np.concatenate([my_position, enemies, team_mates, rigid_map,
                              wood_map, bomb_map, flames_map,
                              fog_map, extra_bomb_map, incr_range_map,
                              kick_map, skull_map, bomb_blast_strength,
                              bomb_life, ammo, blast_strength, can_kick], axis=2)
        return obs

    ## state_to_matrix(obs)
#     def featurize(self, obs):
#         #In this implementation I just concatenate everything in one big matrix

#         #for e in obs['enemies']:
#         #    print(e)
#         #    print(Item(e))
#         #TODO enemies
#         my_position = np.asmatrix(obs['position'])
#         bomb_life = np.asmatrix(obs['bomb_life'])
#         board = np.asmatrix(obs['board'])
#         bombs = np.asmatrix(obs['bomb_blast_strength'])
#         #enemies = np.asmatrix([Item_en(e) for e in obs['enemies']])
#         can_kick = np.asmatrix(int(1 if obs['can_kick'] else 0))
#         ammo = np.asmatrix(int(obs['ammo']))
#         blast_strength = np.asmatrix(int(obs['blast_strength']))

#         m = np.max([my_position.shape[1], bomb_life.shape[1], board.shape[1], bombs.shape[1],  can_kick.shape[1], ammo.shape[1], blast_strength.shape[1]])

#         my_position1 = np.concatenate((my_position, np.zeros(( my_position.shape[0], m - my_position.shape[1]))), axis=1)
#         bomb_life1 = np.concatenate((bomb_life, np.zeros((bomb_life.shape[0], m - bomb_life.shape[1]))), axis=1)
#         board1 = np.concatenate((board, np.zeros((board.shape[0], m - board.shape[1]))), axis=1)
#         bombs1 = np.concatenate((bombs, np.zeros((bombs.shape[0], m - bombs.shape[1]))), axis=1)
#         #enemies1 = np.concatenate((enemies, np.zeros((enemies.shape[0], m - enemies.shape[1]))), axis=1)
#         can_kick1 = np.concatenate((can_kick, np.zeros((can_kick.shape[0], m - can_kick.shape[1]))), axis=1)
#         ammo1 = np.concatenate((ammo, np.zeros((ammo.shape[0], m - ammo.shape[1]))), axis=1)
#         blast_strength1 = np.concatenate((blast_strength, np.zeros((blast_strength.shape[0], m - blast_strength.shape[1]))), axis=1)

#         result = np.concatenate((my_position1, bomb_life1, board1, bombs1, can_kick1, ammo1, blast_strength1), axis=0)
#         return np.asmatrix(result)

    def __del__(self):
        self.close()

    def __str__(self):
        return '<{} instance>'.format(type(self).__name__)

In [217]:
# Create a set of agents (exactly four)
agent_list = [
    agents.SimpleAgent(),
    agents.SimpleAgent(),
    agents.RandomAgent(),
    agents.RandomAgent(),
]
env = pommerman.make(ENV_NAME, agent_list)

[2018-07-18 13:17:30,744] Making new env: PommeFFACompetition-v0


In [218]:
env.set_agents(agent_list)
env.set_training_agent(agent_list[-1].agent_id)

In [219]:
print(env.training_agent)

3


In [220]:
BOARD_SIZE = 11
env_wrapper = EnvWrapper(env, BOARD_SIZE)

In [221]:
env = env_wrapper

In [222]:
r_sum = np.zeros(1)

from gym.wrappers import Monitor
import datetime

ACTOR_LEARNING_RATE = 0.0001
CRITIC_LEARNING_RATE = 0.001
MAX_EPISODES = 100000
MAX_STEPS_EPISODE = 50000
WARMUP_STEPS = 10000
EXPLORATION_EPISODES = 10000
GAMMA = 0.99
TAU = 0.001
BUFFER_SIZE = 1000000
OU_THETA = 0.15
OU_MU = 0.
OU_SIGMA = 0.3
MIN_EPSILON = 0.1
MAX_EPSILON = 1
EVAL_PERIODS = 100
EVAL_EPISODES = 10
MINI_BATCH = 64
RANDOM_SEED = 123
DATETIME = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
SUMMARY_DIR = './results/{}/{}/tf_ddpg'.format(ENV_NAME, DATETIME)

gpu_options = tf.GPUOptions(allow_growth=True)

In [223]:
def state_to_matrix(obs):
    #In this implementation I just concatenate everything in one big matrix

    #for e in obs['enemies']:
    #    print(e)
    #    print(Item(e))
    #TODO enemies
    my_position = np.asmatrix(obs['position'])
    bomb_life = np.asmatrix(obs['bomb_life'])
    board = np.asmatrix(obs['board'])
    bombs = np.asmatrix(obs['bomb_blast_strength'])
    #enemies = np.asmatrix([Item_en(e) for e in obs['enemies']])
    can_kick = np.asmatrix(int(1 if obs['can_kick'] else 0))
    ammo = np.asmatrix(int(obs['ammo']))
    blast_strength = np.asmatrix(int(obs['blast_strength']))

    m = np.max([my_position.shape[1], bomb_life.shape[1], board.shape[1], bombs.shape[1],  can_kick.shape[1], ammo.shape[1], blast_strength.shape[1]])

    my_position1 = np.concatenate((my_position, np.zeros(( my_position.shape[0], m - my_position.shape[1]))), axis=1)
    bomb_life1 = np.concatenate((bomb_life, np.zeros((bomb_life.shape[0], m - bomb_life.shape[1]))), axis=1)
    board1 = np.concatenate((board, np.zeros((board.shape[0], m - board.shape[1]))), axis=1)
    bombs1 = np.concatenate((bombs, np.zeros((bombs.shape[0], m - bombs.shape[1]))), axis=1)
    #enemies1 = np.concatenate((enemies, np.zeros((enemies.shape[0], m - enemies.shape[1]))), axis=1)
    can_kick1 = np.concatenate((can_kick, np.zeros((can_kick.shape[0], m - can_kick.shape[1]))), axis=1)
    ammo1 = np.concatenate((ammo, np.zeros((ammo.shape[0], m - ammo.shape[1]))), axis=1)
    blast_strength1 = np.concatenate((blast_strength, np.zeros((blast_strength.shape[0], m - blast_strength.shape[1]))), axis=1)

    result = np.concatenate((my_position1, bomb_life1, board1, bombs1, can_kick1, ammo1, blast_strength1), axis=0)
    return np.asmatrix(result)

def fully_connected(inputs, output_size, activation_fn=None, weights_initializer=tf.truncated_normal_initializer(),\
        weights_regularizer=tf.contrib.layers.l2_regularizer(0.001), biases_initializer=tf.constant_initializer(0.0)):
    return tf.contrib.layers.fully_connected(inputs, output_size, activation_fn=activation_fn, \
            weights_initializer=weights_initializer, weights_regularizer=weights_regularizer, biases_initializer=biases_initializer)

def batch_norm(inputs, phase):
    return tf.contrib.layers.batch_norm(inputs, center=True, scale=True, is_training=phase)

Transition = namedtuple('Transition', ['state', 'action', 'reward', 'terminal', 'next_state'])

class ReplayBuffer(object):

    def __init__(self, buffer_size, random_seed=1234):
        self.buffer_size = buffer_size
        self.count = 0
        # Right side of deque contains newest experience
        self.buffer = deque()
        random.seed(random_seed)

    def add(self, state, action, reward, terminal, next_state):
        experience = Transition(state, action, reward, terminal, next_state)
        if self.count < self.buffer_size:
            self.buffer.append(experience)
            self.count += 1
        else:
            self.buffer.popleft()
            self.buffer.append(experience)

    def size(self):
        return self.count

    def sample_batch(self, batch_size):
        batch = []

        if self.count < batch_size:
            batch = random.sample(self.buffer, self.count)
        else:
            batch = random.sample(self.buffer, batch_size)

        return map(np.array, zip(*batch))

    def clear(self):
        self.buffer.clear()
        self.count = 0
        
def discretize(value, num_actions):
    discretization = tf.round(value)
    discretization = tf.minimum(tf.constant(num_actions-1, dtype=tf.float32), tf.maximum(tf.constant(0, dtype=tf.float32), tf.to_float(discretization)))
    return tf.to_int32(discretization)

class OrnsteinUhlenbeckProcess(object):
    def __init__(self, theta, mu=0, sigma=1, x0=0, dt=1e-2, n_steps_annealing=100, size=1):
        self.theta = theta
        self.sigma = sigma
        self.n_steps_annealing = n_steps_annealing
        self.sigma_step = - self.sigma / float(self.n_steps_annealing)
        self.x0 = x0
        self.mu = mu
        self.dt = dt
        self.size = size

    def generate(self, step):
        sigma = max(0, self.sigma_step * step + self.sigma)
        x = self.x0 + self.theta * (self.mu - self.x0) * self.dt + sigma * np.sqrt(self.dt) * np.random.normal(size=self.size)
        self.x0 = x
        return x

class GreedyPolicy(object):
    def __init__(self, action_dim, n_steps_annealing, min_epsilon, max_epsilon):
        self.epsilon = max_epsilon
        self.min_epsilon = min_epsilon
        self.action_dim = action_dim
        self.n_steps_annealing = n_steps_annealing
        self.epsilon_step = - (self.epsilon - self.min_epsilon) / float(self.n_steps_annealing)

    def generate(self, action, step):
        epsilon = max(self.min_epsilon, self.epsilon_step * step + self.epsilon)
        if random.random() < epsilon:
            return random.choice(range(self.action_dim))
        else:
            return action

In [224]:
class BaseNetwork(object):
    def __init__(self, sess, state_dim, action_dim, learning_rate, tau):
        """
        base network for actor and critic network.
        Args:
            sess: tf.Session()
            state_dim: env.observation_space.shape
            action_dim: env.action_space.shape[0]
            learning_rate: learning rate for training
            tau: update parameter for target.
        """
        self.sess = sess
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.learning_rate = learning_rate
        self.tau = tau

    def build_network(self):
        """
        build network.
        """
        raise NotImplementedError("build newtork first!")

    def train(self, *args):
        raise NotImplementedError("train network!")

    def predict(self, *args):
        raise NotImplementedError("predict output for network!")

    def predict_target(self, *args):
        raise NotImplementedError("predict output for target network!")

    def update_target_network(self):
        raise NotImplementedError("update target network!")

    def get_num_trainable_vars(self):
        raise NotImplementedError("update target network!")

In [225]:
class ActorNetwork(BaseNetwork):
    def __init__(self, sess, state_dim, action_dim, action_bound, learning_rate, tau):
        super(ActorNetwork, self).__init__(sess, state_dim, action_dim, learning_rate, tau)
        self.action_bound = action_bound
#         self.input_dim = input_dim

        # Actor network
        self.inputs, self.phase, self.outputs, self.scaled_outputs = self.build_network()
        self.net_params = tf.trainable_variables()
#         print('tf.trainable_variables()', tf.trainable_variables())
#         print('self.inputs.shape', self.inputs.shape) # [?, 407]

        # Target network
        self.target_inputs, self.target_phase, self.target_outputs, self.target_scaled_outputs = self.build_network()
        self.target_net_params = tf.trainable_variables()[len(self.net_params):]
#         print('self.target_inputs.shape', self.target_inputs.shape) # [?, 407]

        # Op for periodically updating target network with online network weights
        self.update_target_net_params = \
            [self.target_net_params[i].assign(tf.multiply(self.net_params[i], self.tau) +
                                              tf.multiply(self.target_net_params[i], 1. - self.tau))
             for i in range(len(self.target_net_params))]

        # Combine dnetScaledOut/dnetParams with criticToActionGradient to get actorGradient
        # Temporary placeholder action gradient
        self.action_gradients = tf.placeholder(tf.float32, [None, 1])

        self.actor_gradients = tf.gradients(self.outputs, self.net_params, -self.action_gradients)

        # Optimization Op
        self.optimize = tf.train.AdamOptimizer(self.learning_rate).\
            apply_gradients(zip(self.actor_gradients, self.net_params))

        self.num_trainable_vars = len(self.net_params) + len(self.target_net_params)

    def build_network(self):
        inputs = tf.placeholder(tf.float32, shape=(None,) + self.state_dim)
        phase = tf.placeholder(tf.bool)
        net = fully_connected(inputs, 400, activation_fn=tf.nn.relu)
        net = fully_connected(net, 300, activation_fn=tf.nn.relu)
        # Final layer weight are initialized to Uniform[-3e-3, 3e-3]
        outputs = fully_connected(net, 1, weights_initializer=tf.random_uniform_initializer(-3e-3, 3e-3))
        scaled_outputs = discretize(outputs, self.action_dim)
        print('build_network')

        return inputs, phase, outputs, scaled_outputs

    def train(self, *args):
        # args [inputs, action_gradients, phase]
        return self.sess.run(self.optimize, feed_dict={
            self.inputs: args[0],
            self.action_gradients: args[1],
            self.phase: True
        })

    def predict(self, *args):
        return self.sess.run(self.scaled_outputs, feed_dict={
            self.inputs: args[0],
            self.phase: False
        })

    def predict_target(self, *args):
        return self.sess.run(self.target_scaled_outputs, feed_dict={
            self.target_inputs: args[0],
            self.target_phase: False,
        })

    def update_target_network(self):
        self.sess.run(self.update_target_net_params)

    def get_num_trainable_vars(self):
        return self.num_trainable_vars


class CriticNetwork(BaseNetwork):

    def __init__(self, sess, state_dim, action_dim, action_bound, learning_rate, tau, num_actor_vars):
        super(CriticNetwork, self).__init__(sess, state_dim, action_dim, learning_rate, tau)
        self.action_bound = action_bound

        # Critic network
        self.inputs, self.phase, self.action, self.outputs = self.build_network()
        self.net_params = tf.trainable_variables()[num_actor_vars:]

        # Target network
        self.target_inputs, self.target_phase, self.target_action, self.target_outputs = self.build_network()
        self.target_net_params = tf.trainable_variables()[len(self.net_params) + num_actor_vars:]

        # Op for periodically updating target network with online network weights
        self.update_target_net_params = \
            [self.target_net_params[i].assign(tf.multiply(self.net_params[i], self.tau) +
                                              tf.multiply(self.target_net_params[i], 1. - self.tau))
             for i in range(len(self.target_net_params))]

        self.update_target_bn_params = \
            [self.target_net_params[i].assign(self.net_params[i]) for i in range(len(self.target_net_params)) if self.target_net_params[i].name.startswith('BatchNorm')]

        # Network target (y_i)
        # Obtained from the target networks
        self.predicted_q_value = tf.placeholder(tf.float32, [None, 1])

        # Define loss and optimization Op
        self.loss = tf.reduce_mean(tf.squared_difference(self.predicted_q_value, self.outputs))
        self.optimize = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)

        # Get the gradient of the critic w.r.t. the action
        self.action_grads = tf.gradients(self.outputs, self.action)

    def build_network(self):
        inputs = tf.placeholder(tf.float32, shape=(None,) + self.state_dim)
        phase = tf.placeholder(tf.bool)
        action = tf.placeholder(tf.float32, [None, 1])
        net = fully_connected(inputs, 400, activation_fn=tf.nn.relu)
        net = fully_connected(tf.concat([net, action], 1), 300, activation_fn=tf.nn.relu)
        outputs = fully_connected(net, 1, weights_initializer=tf.random_uniform_initializer(-3e-3, 3e-3))

        return inputs, phase, action, outputs

    def train(self, *args):
        # args (inputs, action, predicted_q_value, phase)
        return self.sess.run([self.outputs, self.optimize], feed_dict={
            self.inputs: args[0],
            self.action: args[1],
            self.predicted_q_value: args[2],
            self.phase: True
        })

    def predict(self, *args):
        # args  (inputs, action, phase)
        return self.sess.run(self.outputs, feed_dict={
            self.inputs: args[0],
            self.action: args[1],
            self.phase: False
        })

    def predict_target(self, *args):
        # args  (inputs, action, phase)
        return self.sess.run(self.target_outputs, feed_dict={
            self.target_inputs: args[0],
            self.target_action: args[1],
            self.target_phase: False
        })

    def action_gradients(self, inputs, action):
        return self.sess.run(self.action_grads, feed_dict={
            self.inputs: inputs,
            self.action: action,
            self.phase: False
        })

    def update_target_network(self):
        self.sess.run(self.update_target_net_params)

In [233]:
class BaseAgent(object):
    def __init__(self, sess, env, replay_buffer, noise=None, exploration_episodes=10000, max_episodes=10000, max_steps_episode=10000, warmup_steps=5000,\
            mini_batch=32, eval_episodes=10, eval_periods=100, env_render=False, summary_dir=None):
        """
        Base agent, provide basic functions. 
        Args:
            sess: tf.Session(). 
            env: openai gym environment. could be a wrapper.
            replay_buffer: replay_buffer for sampling. 
            noise: noise added to action for exploration. 
            exploration_episodes: maximum episodes for training with noise.
            max_episodes: maximum episodes for training.
            max_steps_episode: maximum steps per episode.
            mini_batch: mini batch size in the training.
            eval_episodes: number of episodes to evaluate current model.
            eval_periods: periods to evaluate model.
            env_render: whether display observation.
            summary_dir: folder to store summaries of algorithm.
        """
        self.sess = sess
        self.env = env
        self.replay_buffer = replay_buffer
        self.noise = noise
        self.exploration_episodes = exploration_episodes
        self.max_episodes = max_episodes
        self.max_steps_episode = max_steps_episode
        self.warmup_steps = warmup_steps
        self.mini_batch = mini_batch
        self.eval_episodes = eval_episodes
        self.eval_periods = eval_periods
        self.env_render = env_render
        self.summary_dir = summary_dir

        # Initialize Tensorflow variables
        self.sess.run(tf.global_variables_initializer())

        self.writer = tf.summary.FileWriter(self.summary_dir, sess.graph)


    def train(self):
        """
        Train the model. 
        """
        raise NotImplementedError("train() method should be implemented")


    def evaluate(self, cur_episode):
        """
        evaluate the model.
        """
        raise NotImplementedError("evaluate() method should be implemented")
        
class DDPGAgent(BaseAgent):
    def __init__(self, sess, actor, critic, gamma, env, replay_buffer, noise=None, exploration_episodes=10000, max_episodes=10000, max_steps_episode=10000,\
            warmup_steps=5000, mini_batch=32, eval_episodes=10, eval_periods=100, env_render=False, summary_dir=None):
        """
        Deep Deterministic Policy Gradient Agent.
        Args:
            actor: actor network.
            critic: critic network.
            gamma: discount factor.
        """
        super(DDPGAgent, self).__init__(sess, env, replay_buffer, noise=noise, exploration_episodes=exploration_episodes, max_episodes=max_episodes, max_steps_episode=max_steps_episode,\
                warmup_steps=warmup_steps, mini_batch=mini_batch, eval_episodes=eval_episodes, eval_periods=eval_periods, env_render=env_render, summary_dir=summary_dir)

        self.actor = actor
        self.critic = critic
        self.gamma = gamma


    def train(self):
        # Initialize target network weights
        self.actor.update_target_network()
        self.critic.update_target_network()

        for cur_episode in tqdm(xrange(self.max_episodes)):

            # evaluate here. 
            if cur_episode % self.eval_periods == 0:
                self.evaluate(cur_episode)

            state = self.env.reset()
#             state = state_to_matrix(state[0])

            episode_reward = 0
            episode_ave_max_q = 0

            for cur_step in xrange(self.max_steps_episode):

                if self.env_render:
                    self.env.render()

                # Add exploratory noise according to Ornstein-Uhlenbeck process to action
                if self.replay_buffer.size() < self.warmup_steps:
                    action= self.env.action_space.sample()
#                     action = action[0]
                else: 
                    action = self.noise.generate(self.actor.predict(np.expand_dims(state, 0))[0,0], cur_episode)
#                     actoin = action[0]

                next_state, reward, terminal, info = self.env.step(action)
#                 next_state = state_to_matrix(next_state[0])

                self.replay_buffer.add(state, action, reward, terminal, next_state)

                # Keep adding experience to the memory until there are at least minibatch size samples
                if self.replay_buffer.size() > self.warmup_steps:
                    state_batch, action_batch, reward_batch, terminal_batch, next_state_batch = \
                        self.replay_buffer.sample_batch(self.mini_batch)

                    # Calculate targets
                    target_q = self.critic.predict_target(next_state_batch, self.actor.predict_target(next_state_batch))

                    y_i = np.reshape(reward_batch, (self.mini_batch, 1)) + (1 \
                            - np.reshape(terminal_batch, (self.mini_batch, 1)).astype(float))\
                            * self.gamma * np.reshape(target_q, (self.mini_batch, 1))

                    # Update the critic given the targets
                    action_batch = np.reshape(action_batch, [self.mini_batch, 1])

                    episode_ave_max_q += np.amax(predicted_q_value)

                    # Update the actor policy using the sampled gradient
                    a_outs = self.actor.predict(state_batch)
                    a_grads = self.critic.action_gradients(state_batch, a_outs)
                    self.actor.train(state_batch, a_grads[0])


                    # Update target networks
                    self.actor.update_target_network()
                    self.critic.update_target_network()

                state = next_state
                episode_reward += reward

                if terminal or cur_step == self.max_steps_episode-1:
                    train_episode_summary = tf.Summary() 
                    train_episode_summary.value.add(simple_value=episode_reward, tag="train/episode_reward")
                    train_episode_summary.value.add(simple_value=episode_ave_max_q/float(cur_step), tag="train/episode_ave_max_q")
                    self.writer.add_summary(train_episode_summary, cur_episode)
                    self.writer.flush()

                    print('Reward: %.2i' % int(episode_reward), ' | Episode', cur_episode, \
                          '| Qmax: %.4f' % (episode_ave_max_q / float(cur_step)))

                    break


    def evaluate(self, cur_episode):
        # evaluate here. 
        total_episode_reward = 0 
        for eval_i in xrange(self.eval_episodes):
            state = self.env.reset()
            state = state.flatten()
#             state = state_to_matrix(state[0])
            
            terminal = False
            while not terminal:
                action = self.actor.predict(np.expand_dims(state, 0))[0,0]
#                 print('action', action)
                
                state, reward, terminal, info = self.env.step(action)
#                 print('state', state)
                total_episode_reward += reward
        ave_episode_reward = total_episode_reward / float(self.eval_episodes)
        print("\nAverage reward {}\n".format(ave_episode_reward))
        # Add ave reward to Tensorboard
        eval_episode_summary = tf.Summary()
        eval_episode_summary.value.add(simple_value=ave_episode_reward, tag="eval/reward")
        self.writer.add_summary(eval_episode_summary, cur_episode)

In [234]:
def state_to_matrix_2(obs):
    #In this implementation I just concatenate everything in one big matrix

    #for e in obs['enemies']:
    #    print(e)
    #    print(Item(e))
    #TODO enemies
    my_position = np.asmatrix(obs['position'])
    bomb_life = np.asmatrix(obs['bomb_life'])
    board = np.asmatrix(obs['board'])
    bombs = np.asmatrix(obs['bomb_blast_strength'])
    #enemies = np.asmatrix([Item_en(e) for e in obs['enemies']])
    can_kick = np.asmatrix(int(1 if obs['can_kick'] else 0))
    ammo = np.asmatrix(int(obs['ammo']))
    blast_strength = np.asmatrix(int(obs['blast_strength']))

    m = np.max([my_position.shape[1], bomb_life.shape[1], board.shape[1], bombs.shape[1],  can_kick.shape[1], ammo.shape[1], blast_strength.shape[1]])

    my_position1 = np.concatenate((my_position, np.zeros(( my_position.shape[0], m - my_position.shape[1]))), axis=1)
    bomb_life1 = np.concatenate((bomb_life, np.zeros((bomb_life.shape[0], m - bomb_life.shape[1]))), axis=1)
    board1 = np.concatenate((board, np.zeros((board.shape[0], m - board.shape[1]))), axis=1)
    bombs1 = np.concatenate((bombs, np.zeros((bombs.shape[0], m - bombs.shape[1]))), axis=1)
    #enemies1 = np.concatenate((enemies, np.zeros((enemies.shape[0], m - enemies.shape[1]))), axis=1)
    can_kick1 = np.concatenate((can_kick, np.zeros((can_kick.shape[0], m - can_kick.shape[1]))), axis=1)
    ammo1 = np.concatenate((ammo, np.zeros((ammo.shape[0], m - ammo.shape[1]))), axis=1)
    blast_strength1 = np.concatenate((blast_strength, np.zeros((blast_strength.shape[0], m - blast_strength.shape[1]))), axis=1)

    result = np.concatenate((my_position1, bomb_life1, board1, bombs1, can_kick1, ammo1, blast_strength1), axis=0)
    return np.asmatrix(result)

In [235]:
# ## Trying to concatenate without extra padding (but has to be same dimension -- failed)

# state = env.reset()
# print(state_to_matrix(state[0]).shape)
# my_position1 = np.asmatrix(state[0]['position'])
# bomb_life1=np.asmatrix(state[0]['bomb_life'])
# board1=np.asmatrix(state[0]['board'])
# bombs1=np.asmatrix(state[0]['bomb_blast_strength'])
# can_kick1=np.asmatrix(int(1 if state[0]['can_kick'] else 0))
# ammo1=np.asmatrix(state[0]['ammo'])
# blast_strength1=np.asmatrix(state[0]['blast_strength'])
# result = np.concatenate((my_position1, bomb_life1, board1, bombs1, can_kick1, ammo1, blast_strength1), axis=0)

In [236]:
sess = tf.InteractiveSession()

state_dim = env.observation_space.shape
action_dim = env.action_space.n
action_bound = None



In [229]:
# env = env_wrapper

state = env.reset()
# state = state_to_matrix(state[0])
# input_dim = state.flatten().shape
# input_dim = (input_dim[-1],)

In [230]:
actor = ActorNetwork(sess, state_dim, action_dim, action_bound,
                     ACTOR_LEARNING_RATE, TAU)
critic = CriticNetwork(sess, state_dim, action_dim, action_bound,
                       CRITIC_LEARNING_RATE, TAU, actor.get_num_trainable_vars())

replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)
noise = GreedyPolicy(action_dim, EXPLORATION_EPISODES, MIN_EPSILON, MAX_EPSILON)

build_network
build_network


In [238]:
agent = DDPGAgent(sess, actor, critic, GAMMA, env, replay_buffer, noise=noise, \
                  exploration_episodes=EXPLORATION_EPISODES,max_episodes=MAX_EPISODES, \
                  max_steps_episode=MAX_STEPS_EPISODE, warmup_steps=WARMUP_STEPS,mini_batch=MINI_BATCH,\
                  eval_episodes=EVAL_EPISODES, eval_periods=EVAL_PERIODS, env_render=False, summary_dir=SUMMARY_DIR)

In [239]:
agent.train()

  0%|          | 0/100000 [00:00<?, ?it/s]


ValueError: Cannot feed value of shape (1, 2057) for Tensor 'Placeholder_115:0', which has shape '(?, 372)'

# COMMENT

### * I think the dimension of input type should be (?, 372).
### * In the networks, the required parameters are (?, 372); the input parameters are (1, 37, 11) -- state_to_matrix()

# TRY TO RUN LINE BY LINE

## ACTOR INIT

In [64]:
def build_network(state_dim):
    inputs = tf.placeholder(tf.float32, shape=(None,) + state_dim)
    phase = tf.placeholder(tf.bool)
    net = fully_connected(inputs, 400, activation_fn=tf.nn.relu)
    net = fully_connected(net, 300, activation_fn=tf.nn.relu)
    # Final layer weight are initialized to Uniform[-3e-3, 3e-3]
    outputs = fully_connected(net, 1, weights_initializer=tf.random_uniform_initializer(-3e-3, 3e-3))
    scaled_outputs = discretize(outputs, action_dim)
    
    return inputs, phase, outputs, scaled_outputs

In [65]:
ainputs, aphase, aoutputs, ascaled_outputs = build_network(state_dim)

In [66]:
anet_params = tf.trainable_variables()

In [67]:
atarget_inputs, atarget_phase, atarget_outputs, atarget_scaled_outputs = build_network(state_dim)

In [68]:
atarget_net_params = tf.trainable_variables()[len(anet_params):]

In [69]:
aupdate_target_net_params = [atarget_net_params[i].assign(tf.multiply(anet_params[i], TAU) + 
                                                        tf.multiply(atarget_net_params[i], 1. - TAU))
                            for i in range(len(atarget_net_params))]

In [70]:
action_gradients = tf.placeholder(tf.float32, [None, 1])

In [71]:
actor_gradients = tf.gradients(outputs, anet_params, -action_gradients)

In [72]:
aoptimize = tf.train.AdamOptimizer(ACTOR_LEARNING_RATE).\
            apply_gradients(zip(actor_gradients, anet_params))

In [73]:
num_trainable_vars = len(anet_params) + len(atarget_net_params)

## CRITIC INIT

In [74]:
cinputs, aphase, caction, cscaled_outputs = build_network(state_dim)
ctarget_inputs, ctarget_phase, ctarget_action, ctarget_scaled_outputs = build_network(state_dim)

In [75]:
cnet_params = tf.trainable_variables()[num_trainable_vars:]
ctarget_net_params = tf.trainable_variables()[len(cnet_params) + num_trainable_vars:]

In [76]:
cupdate_target_net_params = [ctarget_net_params[i].assign(tf.multiply(cnet_params[i], TAU) +
                                              tf.multiply(ctarget_net_params[i], 1. - TAU))
             for i in range(len(ctarget_net_params))]

In [77]:
predicted_q_value = tf.placeholder(tf.float32, [None, 1])

In [78]:
loss = tf.reduce_mean(tf.squared_difference(predicted_q_value, outputs))
coptimize = tf.train.AdamOptimizer(CRITIC_LEARNING_RATE).minimize(loss)

In [79]:
action_grads = tf.gradients(coutputs, caction)

In [80]:
replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)
noise = GreedyPolicy(action_dim, EXPLORATION_EPISODES, MIN_EPSILON, MAX_EPSILON)

## AGENT TRAIN

In [82]:
sess.run(aupdate_target_net_params)

FailedPreconditionError: Attempting to use uninitialized value fully_connected_1/weights
	 [[Node: fully_connected_1/weights/read = Identity[T=DT_FLOAT, _class=["loc:@fully_connected_1/weights"], _device="/job:localhost/replica:0/task:0/device:CPU:0"](fully_connected_1/weights)]]

Caused by op 'fully_connected_1/weights/read', defined at:
  File "/Users/teggsung/anaconda3/envs/tf36/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/Users/teggsung/anaconda3/envs/tf36/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/Users/teggsung/anaconda3/envs/tf36/lib/python3.6/site-packages/ipykernel/__main__.py", line 3, in <module>
    app.launch_new_instance()
  File "/Users/teggsung/anaconda3/envs/tf36/lib/python3.6/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/Users/teggsung/anaconda3/envs/tf36/lib/python3.6/site-packages/ipykernel/kernelapp.py", line 486, in start
    self.io_loop.start()
  File "/Users/teggsung/anaconda3/envs/tf36/lib/python3.6/site-packages/tornado/platform/asyncio.py", line 127, in start
    self.asyncio_loop.run_forever()
  File "/Users/teggsung/anaconda3/envs/tf36/lib/python3.6/asyncio/base_events.py", line 422, in run_forever
    self._run_once()
  File "/Users/teggsung/anaconda3/envs/tf36/lib/python3.6/asyncio/base_events.py", line 1432, in _run_once
    handle._run()
  File "/Users/teggsung/anaconda3/envs/tf36/lib/python3.6/asyncio/events.py", line 145, in _run
    self._callback(*self._args)
  File "/Users/teggsung/anaconda3/envs/tf36/lib/python3.6/site-packages/tornado/platform/asyncio.py", line 117, in _handle_events
    handler_func(fileobj, events)
  File "/Users/teggsung/anaconda3/envs/tf36/lib/python3.6/site-packages/tornado/stack_context.py", line 276, in null_wrapper
    return fn(*args, **kwargs)
  File "/Users/teggsung/anaconda3/envs/tf36/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 450, in _handle_events
    self._handle_recv()
  File "/Users/teggsung/anaconda3/envs/tf36/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 480, in _handle_recv
    self._run_callback(callback, msg)
  File "/Users/teggsung/anaconda3/envs/tf36/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 432, in _run_callback
    callback(*args, **kwargs)
  File "/Users/teggsung/anaconda3/envs/tf36/lib/python3.6/site-packages/tornado/stack_context.py", line 276, in null_wrapper
    return fn(*args, **kwargs)
  File "/Users/teggsung/anaconda3/envs/tf36/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/Users/teggsung/anaconda3/envs/tf36/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 233, in dispatch_shell
    handler(stream, idents, msg)
  File "/Users/teggsung/anaconda3/envs/tf36/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "/Users/teggsung/anaconda3/envs/tf36/lib/python3.6/site-packages/ipykernel/ipkernel.py", line 208, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/Users/teggsung/anaconda3/envs/tf36/lib/python3.6/site-packages/ipykernel/zmqshell.py", line 537, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/Users/teggsung/anaconda3/envs/tf36/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2662, in run_cell
    raw_cell, store_history, silent, shell_futures)
  File "/Users/teggsung/anaconda3/envs/tf36/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2785, in _run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/Users/teggsung/anaconda3/envs/tf36/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2903, in run_ast_nodes
    if self.run_code(code, result):
  File "/Users/teggsung/anaconda3/envs/tf36/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2963, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-13-b85f0a18e16c>", line 4, in <module>
    net = fully_connected(net, 300, activation_fn=tf.nn.relu)
  File "<ipython-input-5-84b2a229ab53>", line 32, in fully_connected
    return tf.contrib.layers.fully_connected(inputs, output_size, activation_fn=activation_fn,             weights_initializer=weights_initializer, weights_regularizer=weights_regularizer, biases_initializer=biases_initializer)
  File "/Users/teggsung/anaconda3/envs/tf36/lib/python3.6/site-packages/tensorflow/contrib/framework/python/ops/arg_scope.py", line 183, in func_with_args
    return func(*args, **current_args)
  File "/Users/teggsung/anaconda3/envs/tf36/lib/python3.6/site-packages/tensorflow/contrib/layers/python/layers/layers.py", line 1716, in fully_connected
    outputs = layer.apply(inputs)
  File "/Users/teggsung/anaconda3/envs/tf36/lib/python3.6/site-packages/tensorflow/python/layers/base.py", line 828, in apply
    return self.__call__(inputs, *args, **kwargs)
  File "/Users/teggsung/anaconda3/envs/tf36/lib/python3.6/site-packages/tensorflow/python/layers/base.py", line 699, in __call__
    self.build(input_shapes)
  File "/Users/teggsung/anaconda3/envs/tf36/lib/python3.6/site-packages/tensorflow/python/layers/core.py", line 138, in build
    trainable=True)
  File "/Users/teggsung/anaconda3/envs/tf36/lib/python3.6/site-packages/tensorflow/python/layers/base.py", line 546, in add_variable
    partitioner=partitioner)
  File "/Users/teggsung/anaconda3/envs/tf36/lib/python3.6/site-packages/tensorflow/python/training/checkpointable.py", line 436, in _add_variable_with_custom_getter
    **kwargs_for_getter)
  File "/Users/teggsung/anaconda3/envs/tf36/lib/python3.6/site-packages/tensorflow/python/ops/variable_scope.py", line 1317, in get_variable
    constraint=constraint)
  File "/Users/teggsung/anaconda3/envs/tf36/lib/python3.6/site-packages/tensorflow/python/ops/variable_scope.py", line 1079, in get_variable
    constraint=constraint)
  File "/Users/teggsung/anaconda3/envs/tf36/lib/python3.6/site-packages/tensorflow/python/ops/variable_scope.py", line 417, in get_variable
    return custom_getter(**custom_getter_kwargs)
  File "/Users/teggsung/anaconda3/envs/tf36/lib/python3.6/site-packages/tensorflow/contrib/layers/python/layers/layers.py", line 1611, in layer_variable_getter
    return _model_variable_getter(getter, *args, **kwargs)
  File "/Users/teggsung/anaconda3/envs/tf36/lib/python3.6/site-packages/tensorflow/contrib/layers/python/layers/layers.py", line 1602, in _model_variable_getter
    use_resource=use_resource)
  File "/Users/teggsung/anaconda3/envs/tf36/lib/python3.6/site-packages/tensorflow/contrib/framework/python/ops/arg_scope.py", line 183, in func_with_args
    return func(*args, **current_args)
  File "/Users/teggsung/anaconda3/envs/tf36/lib/python3.6/site-packages/tensorflow/contrib/framework/python/ops/variables.py", line 291, in model_variable
    use_resource=use_resource)
  File "/Users/teggsung/anaconda3/envs/tf36/lib/python3.6/site-packages/tensorflow/contrib/framework/python/ops/arg_scope.py", line 183, in func_with_args
    return func(*args, **current_args)
  File "/Users/teggsung/anaconda3/envs/tf36/lib/python3.6/site-packages/tensorflow/contrib/framework/python/ops/variables.py", line 246, in variable
    use_resource=use_resource)
  File "/Users/teggsung/anaconda3/envs/tf36/lib/python3.6/site-packages/tensorflow/python/ops/variable_scope.py", line 394, in _true_getter
    use_resource=use_resource, constraint=constraint)
  File "/Users/teggsung/anaconda3/envs/tf36/lib/python3.6/site-packages/tensorflow/python/ops/variable_scope.py", line 786, in _get_single_variable
    use_resource=use_resource)
  File "/Users/teggsung/anaconda3/envs/tf36/lib/python3.6/site-packages/tensorflow/python/ops/variable_scope.py", line 2220, in variable
    use_resource=use_resource)
  File "/Users/teggsung/anaconda3/envs/tf36/lib/python3.6/site-packages/tensorflow/python/ops/variable_scope.py", line 2210, in <lambda>
    previous_getter = lambda **kwargs: default_variable_creator(None, **kwargs)
  File "/Users/teggsung/anaconda3/envs/tf36/lib/python3.6/site-packages/tensorflow/python/ops/variable_scope.py", line 2193, in default_variable_creator
    constraint=constraint)
  File "/Users/teggsung/anaconda3/envs/tf36/lib/python3.6/site-packages/tensorflow/python/ops/variables.py", line 235, in __init__
    constraint=constraint)
  File "/Users/teggsung/anaconda3/envs/tf36/lib/python3.6/site-packages/tensorflow/python/ops/variables.py", line 397, in _init_from_args
    self._snapshot = array_ops.identity(self._variable, name="read")
  File "/Users/teggsung/anaconda3/envs/tf36/lib/python3.6/site-packages/tensorflow/python/ops/array_ops.py", line 142, in identity
    return gen_array_ops.identity(input, name=name)
  File "/Users/teggsung/anaconda3/envs/tf36/lib/python3.6/site-packages/tensorflow/python/ops/gen_array_ops.py", line 3187, in identity
    "Identity", input=input, name=name)
  File "/Users/teggsung/anaconda3/envs/tf36/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "/Users/teggsung/anaconda3/envs/tf36/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 3392, in create_op
    op_def=op_def)
  File "/Users/teggsung/anaconda3/envs/tf36/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 1718, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

FailedPreconditionError (see above for traceback): Attempting to use uninitialized value fully_connected_1/weights
	 [[Node: fully_connected_1/weights/read = Identity[T=DT_FLOAT, _class=["loc:@fully_connected_1/weights"], _device="/job:localhost/replica:0/task:0/device:CPU:0"](fully_connected_1/weights)]]
