In [1]:
import os
import numpy as np
import tensorflow as tf


class ActorCriticNetwork:
    """
    Actor-critic network built with the TensorFlow 1.x graph API.

    The graph is: a fully connected encoder applied to the ``state_size x 1`` input,
    a (possibly stacked) LSTM run over the encoded sequence, and two dense heads fed
    with the LSTM output of the last time step:

    * ``critic_output`` - a single linear unit (value estimate)
    * ``actor_output``  - a softmax over ``num_of_actions`` (action distribution)
    """

    def __init__(self, *args, **kwargs):
        """
        Build the graph under ``scope_name`` and restore or initialise its variables.

        Positional arguments are accepted but ignored. Recognised keyword arguments:

        :param scope_name: variable scope the graph is built in (default ``"global"``)
        :param encoded_board_size: width of the dense encoder (default 400)
        :param num_of_actions: size of the policy output (default 7)
        :param lstm_layers: number of stacked LSTM layers (default 1)
        :param lstm_size: units per LSTM layer (default 512)
        :param state_size: length of the flattened game state (default 17)
        :param param_file: checkpoint prefix; restored when ``<param_file>.meta`` exists
        :param inputs, dropout_prob, critic_output, actor_output: read for backward
            compatibility only - ``initialize_scope`` overwrites all four
        :param init_graph: stored as given; replaced by the initializer op when no
            checkpoint is restored
        """
        self.scope_name = kwargs.get("scope_name", "global")
        self.scope = tf.variable_scope(self.scope_name, reuse=tf.AUTO_REUSE)

        # parse all the keyword args into the instance variables
        self.num_outputs = kwargs.get("encoded_board_size", 400)
        self.num_of_actions = kwargs.get("num_of_actions", 7)
        self.lstm_layers = kwargs.get("lstm_layers", 1)
        self.lstm_size = kwargs.get("lstm_size", 512)
        self.inputs = kwargs.get("inputs")
        self.dropout_prob = kwargs.get("dropout_prob")
        self.critic_output = kwargs.get("critic_output")
        self.actor_output = kwargs.get("actor_output")
        self.init_graph = kwargs.get("init_graph")
        self.state_size = kwargs.get("state_size", 17)
        self.model_name = kwargs.get("param_file")
        # Passed to dynamic_rnn as the initial state (None -> zero state) and then
        # replaced by the tensor holding the final LSTM state.
        self.state = None

        self.sess = tf.Session()

        # initialize the computational graph
        try:
            with self.scope:
                self.initialize_scope(self.scope)
                if self.model_name and os.path.isfile(f"{self.model_name}.meta"):
                    # Restoring assigns every saved variable, so nothing else has to
                    # be run afterwards (a bare ``sess.run()`` raises TypeError).
                    saver = tf.train.Saver()
                    saver.restore(self.sess, self.model_name)
                else:
                    self.init_graph = tf.global_variables_initializer()
                    self.sess.run(self.init_graph)
        except Exception:
            # do not leak the session when the graph cannot be built or restored
            self.sess.close()
            raise

    def initialize_scope(self, graph):
        """
        Create the placeholders, encoder, LSTM stack and the two output heads.

        :param graph: unused by this implementation; kept for subclasses and callers
        """
        # Dimensions of this will be state_size X 1 (17 X 1 by default)
        self.inputs = tf.placeholder(tf.float32, [self.state_size, 1], name='inputs')
        # Fed to DropoutWrapper as ``output_keep_prob``, i.e. the probability of
        # KEEPING an output, despite the attribute name.
        self.dropout_prob = tf.placeholder(tf.float32, name='keep_prob')

        # fully_connected uses ReLU and creates its own weights by default
        # https://www.tensorflow.org/api_docs/python/tf/contrib/layers/fully_connected
        encoded_inputs = tf.contrib.layers.fully_connected(self.inputs, self.num_outputs)
        rnn_inputs = tf.reshape(encoded_inputs, (1, self.state_size, self.num_outputs))

        # One cell object per layer: repeating a single instance would make every
        # layer share the same cell (and its weights) inside MultiRNNCell.
        layers = [
            tf.contrib.rnn.DropoutWrapper(
                tf.contrib.rnn.BasicLSTMCell(self.lstm_size),
                output_keep_prob=self.dropout_prob,
            )
            for _ in range(self.lstm_layers)
        ]
        cell = tf.contrib.rnn.MultiRNNCell(layers)

        # Get the outputs and the final_state, which need to be fed in the Value and Policy network
        # final state comprises of c and h
        outputs, self.state = tf.nn.dynamic_rnn(
            cell, rnn_inputs, dtype=tf.float32, initial_state=self.state
        )

        # Output of the last time step; the sequence length equals state_size, so
        # the index follows the configured size instead of a fixed 16.
        final_output = tf.expand_dims(outputs[0, self.state_size - 1, :], [0])

        # calculate the value of move
        self.critic_output = tf.contrib.layers.fully_connected(
            final_output, 1, activation_fn=None
        )

        # calculate the distribution of actions
        self.actor_output = tf.contrib.layers.fully_connected(
            final_output, self.num_of_actions, activation_fn=tf.nn.softmax
        )

    def __call__(self, game_board, *args, **kwargs) -> tuple:
        """
        Run one forward pass.

        :param game_board: array whose transpose is fed to the ``inputs`` placeholder
        :param dropout_prob: optional keyword, value fed as keep probability (default 0.2)
        :return: ``(critic_output, actor_output)`` as evaluated by the session
        """
        # NOTE(review): the default keeps only 20% of the LSTM outputs even at
        # inference time - confirm this is intended (1.0 disables dropout).
        feed = {
            self.inputs: game_board.T,
            self.dropout_prob: kwargs.get("dropout_prob", 0.2),
        }
        critic_output, actor_output = self.sess.run([self.critic_output, self.actor_output], feed_dict=feed)

        return critic_output, actor_output

    def exit(self):
        """
        Closes session if open,
        tensorflow already has a isOpen check
        :return:
        """
        self.sess.close()


In [2]:
from enum import Enum


class Side(Enum):
    """The two players; each value is that player's row in the 2 x 8 board."""
    NORTH = 0
    SOUTH = 1

    def opposite(self):
        """Return the other player."""
        # the two values are 0 and 1, so the other side's value is 1 - value
        return Side(1 - self.value)

In [3]:
MANKALAH = 7  # index of each side's store (mankalah) within its 8-slot board row


def play_hole(hole, board_copy, agent_side) -> bool:
    """
    Sow the seeds of ``hole`` for ``agent_side``, mutating ``board_copy`` in place.

    Seeds are dropped one per slot through the mover's remaining holes, the mover's
    mankalah and the opponent's holes; the opponent's mankalah is always skipped.
    If the last seed lands in an empty hole on the mover's own side while the hole
    directly opposite holds seeds, those seeds plus the last seed are moved to the
    mover's mankalah.

    Playing an empty hole is treated as a no-op: the board is left untouched and
    ``False`` is returned, so no seed is created out of nothing.

    :param hole: index (0-6) of the hole to play on the mover's row
    :param board_copy: 2 x 8 board indexed as ``board_copy[side.value][slot]``
    :param agent_side: side making the move; must expose ``value`` and ``opposite()``
    :return: True when the last seed lands in the mover's own mankalah (the mover
        plays again), otherwise False
    """
    own_row = agent_side.value
    seeds = int(board_copy[own_row][hole])
    if seeds <= 0:
        return False
    board_copy[own_row][hole] = 0

    def next_slot(side, index):
        # Step to the following slot in sowing order, never stopping on the
        # opponent's mankalah.
        index += 1
        if index > MANKALAH:
            side, index = side.opposite(), 0
        if side != agent_side and index == MANKALAH:
            side, index = side.opposite(), 0
        return side, index

    # Sow all seeds but the last; the last one needs the capture / extra-turn checks.
    current_side, cur_hole = agent_side, hole
    for _ in range(seeds - 1):
        current_side, cur_hole = next_slot(current_side, cur_hole)
        board_copy[current_side.value][cur_hole] += 1
    current_side, cur_hole = next_slot(current_side, cur_hole)

    # check if we can capture opponents pieces
    if current_side == agent_side and cur_hole != MANKALAH:
        # the hole facing the LANDING hole, not the hole the move started from
        opposite_hole = MANKALAH - 1 - cur_hole
        opponent_row = agent_side.opposite().value
        if board_copy[own_row][cur_hole] == 0 and board_copy[opponent_row][opposite_hole] > 0:
            captured_seeds = board_copy[opponent_row][opposite_hole]
            board_copy[opponent_row][opposite_hole] = 0
            board_copy[own_row][MANKALAH] += captured_seeds + 1  # current seed
            return False

    board_copy[current_side.value][cur_hole] += 1
    return current_side == agent_side and cur_hole == MANKALAH


In [4]:
def reset():
    """
    Build the starting game state as a fresh 1 x 17 array.

    Layout: two rows of seven holes holding 7 seeds followed by an empty store,
    then a final entry of 1 for the side to move.
    """
    side_row = [7] * 7 + [0]
    return np.array([side_row + side_row + [1]])


def score(init_game_state, new_state):
    """
    Returns the change in the mover's mankalah given a move, i.e. the number of
    seeds the mover gained (new count minus old count).

    :param init_game_state: flat state before the move; its last entry is the side
        that makes the move
    :param new_state: flat state after the move
    :return: seeds gained by the mover's mankalah
    """
    # index 15 is the mankalah of side 1, index 7 the mankalah of side 0
    mankalah_index = 15 if init_game_state[-1] == 1 else 7
    return new_state[mankalah_index] - init_game_state[mankalah_index]


def flatten_game(game_board, side):
    """
    Turn a board and the side to move into a single-row state array.

    :param game_board: board array; it is flattened in row order
    :param side: object whose ``value`` is appended after the board entries
    :return: new array of shape (1, board size + 1) with the side value last
    """
    flat_state = np.append(game_board.flatten(), side.value)
    return flat_state.reshape(1, -1)


def game_over(board):
    """
    Tell whether the game has ended: true as soon as either side has no seeds left
    in its holes (the last slot of each row, the mankalah, is not counted).
    """
    north_empty = np.sum(board[Side.NORTH.value][:-1]) == 0
    if north_empty:
        return north_empty
    return np.sum(board[Side.SOUTH.value][:-1]) == 0


def step(init_game_state, action):
    """
    Takes a game state transforms it into a format that playhole
    can understand, executes playhole then returns the next game state,
    reward and whether or not the game is over.

    The caller's ``init_game_state`` is left untouched: the move is played on a
    private copy of the board, so the reward really compares the position before
    the move with the position after it.

    :param init_game_state: 1 x 17 state (16 board slots followed by the side to move)
    :param action: hole to play, passed straight to ``play_hole``
    :return: tuple ``(new_state, reward, game_is_over)``
    """
    flat_state = init_game_state[0]
    # last index is side
    side = Side(flat_state[-1])
    # np.array copies; reshaping the slice directly would hand play_hole a view
    # onto the caller's array and silently rewrite the "before" state.
    board = np.array(flat_state[:-1]).reshape(2, 8)
    repeat_go = play_hole(action, board, side)
    next_side = side if repeat_go else side.opposite()
    new_state = flatten_game(board, next_side)

    # calculate reward
    reward = score(flat_state, new_state[0])

    return new_state, reward, game_over(board)


In [5]:
import logging
# Copies one set of variables to another.
# Used to set worker network parameters to those of global network.
def update_target_graph(from_scope, to_scope):
    """
    Build the ops that copy trainable variables from one scope to another.

    Variables of both scopes are paired in collection order; pairing stops at the
    shorter of the two lists.

    :param from_scope: scope whose trainable variables are read
    :param to_scope: scope whose trainable variables are overwritten
    :return: list of assign ops, one per variable pair
    """
    source_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, from_scope)
    target_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope)
    return [target.assign(source) for source, target in zip(source_vars, target_vars)]


class Worker(ActorCriticNetwork):
    """
    Training worker: a local copy of the actor-critic network that plays games,
    computes gradients of its own loss and applies them to the variables of the
    ``'global'`` scope.
    """

    def __init__(self, *args, **kwargs):
        """
        Build the worker graph and attach the worker to a session.

        Keyword arguments in addition to those of ``ActorCriticNetwork``:

        :param model_path: directory prefix used by ``work`` when saving checkpoints
        :param lr: learning rate of the default optimizer (default 0.001)
        :param trainer: optimizer to use; an Adam optimizer with ``lr`` when omitted
        :param sess: session shared with the global network; when omitted the
            session created by the base class is kept
        """
        self.model_path = kwargs.get("model_path")
        self.lr = kwargs.get("lr", 0.001)
        # build the default optimizer only when the caller did not supply one
        trainer = kwargs.get("trainer")
        self.trainer = trainer if trainer is not None else tf.train.AdamOptimizer(learning_rate=self.lr)
        super().__init__(*args, **kwargs)

        shared_sess = kwargs.get("sess")
        if shared_sess is not None and shared_sess is not self.sess:
            # The base class opened (and initialised) a private session; release it
            # instead of leaking it, then make sure everything this worker added to
            # the graph also has a value in the shared session.
            self.sess.close()
            self.sess = shared_sess
            self._initialize_missing_variables()

    def _initialize_missing_variables(self):
        """Initialise, in ``self.sess``, every graph variable that has no value there yet."""
        variables = tf.global_variables()
        if not variables:
            return
        ready_flags = self.sess.run([tf.is_variable_initialized(v) for v in variables])
        pending = [v for v, ready in zip(variables, ready_flags) if not ready]
        if pending:
            self.sess.run(tf.variables_initializer(pending))

    def initialize_scope(self, graph):
        """
        Extend the base graph with the sync ops, the losses, the local gradients
        and the op applying those gradients to the ``'global'`` variables.

        :param graph: forwarded to the base implementation
        """
        super().initialize_scope(graph)
        # sync with global model
        self.update_local_ops = update_target_graph('global', self.scope_name)

        self.actions = tf.placeholder(dtype=tf.int32)
        self.target_v = tf.placeholder(dtype=tf.float32)
        # self.advantages = tf.placeholder(dtype=tf.float32)
        self.generalized_advantage = tf.placeholder(dtype=tf.float32)

        # Clip before taking the log so a probability of exactly 0 cannot turn the
        # losses (and then every gradient) into NaN / inf.
        log_policy = tf.log(tf.clip_by_value(self.actor_output, 1e-10, 1.0))
        # Log-probability of the action that was actually played; without this
        # selection the ``actions`` placeholder would never influence the loss.
        actions_onehot = tf.one_hot(tf.reshape(self.actions, [-1]), self.num_of_actions, dtype=tf.float32)
        chosen_log_prob = tf.reduce_sum(log_policy * actions_onehot, axis=1)

        # Loss functions
        self.value_loss = 0.5 * tf.reduce_sum(tf.square(self.target_v - tf.reshape(self.critic_output, [-1])))
        self.entropy = - tf.reduce_sum(self.actor_output * log_policy)
        self.policy_loss = -tf.reduce_sum(chosen_log_prob * tf.reshape(self.generalized_advantage, [-1]))
        self.loss = 0.5 * self.value_loss + self.policy_loss - self.entropy * 0.01

        # Get gradients from local network using local losses
        local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope_name)
        self.gradients = tf.gradients(self.loss, local_vars)
        self.var_norms = tf.global_norm(local_vars)
        grads, self.grad_norms = tf.clip_by_global_norm(self.gradients, 40.0)

        # Apply local gradients to global network
        global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'global')
        self.apply_grads = self.trainer.apply_gradients(zip(grads, global_vars))

    def work(self, max_episode_length, gamma, saver):
        """
        Play ``max_episode_length`` games, updating the global network as it goes.

        Each episode starts by copying the global parameters into the local
        network. Transitions are buffered; a full buffer (30 transitions) triggers
        an update in the middle of a game and the remainder is used when the game
        ends.

        :param max_episode_length: number of episodes to play
        :param gamma: discount factor forwarded to ``update_params``
        :param saver: ``tf.train.Saver`` used for checkpoints; ``None`` disables saving
        """
        sess = self.sess
        with sess.as_default(), sess.graph.as_default():
            episode_count = 0
            for _ in range(max_episode_length):
                # copy the local params from the global network
                sess.run(self.update_local_ops)
                episode_buffer = []
                episode_step_count = 0
                done = False

                init_game_state = reset()
                while not done:
                    # get an action distribution and estimate value from policy
                    action_distribution, estimated_value = sess.run(
                        [self.actor_output, self.critic_output],
                        feed_dict={
                            self.inputs: init_game_state.T,
                            self.dropout_prob: 0.2,
                        })

                    # select action
                    # NOTE(review): greedy choice, no exploration and no check that
                    # the chosen hole is a legal move - confirm this is intended.
                    action = int(np.argmax(action_distribution))

                    # play action on game
                    next_game_state, reward, done = step(init_game_state, action)

                    # save game transition and estimated value
                    episode_buffer.append(
                        [
                            init_game_state,
                            action,
                            reward,
                            next_game_state,
                            done,
                            estimated_value
                        ])

                    # update state
                    init_game_state = next_game_state
                    episode_step_count += 1

                    # If the episode hasn't ended, but the experience buffer is full, then we
                    # make an update step using that experience rollout.
                    if len(episode_buffer) == 30 and not done and episode_step_count != max_episode_length - 1:
                        self.update_params(
                            episode_buffer, sess, gamma
                        )
                        episode_buffer = []
                        sess.run(self.update_local_ops)

                # Update the network using the episode buffer at the end of the episode.
                if len(episode_buffer) != 0:
                    value_loss, policy_loss, _entropy, _gradients, _var_norm = self.update_params(
                        episode_buffer, sess, gamma)
                    logging.warning(
                        'For episode 1: %s we have value loss: %s '
                        'and policy loss: %s', episode_count, value_loss, policy_loss
                    )

                # Save the model parameters after every episode except the first one.
                if episode_count != 0 and saver is not None and self.model_path is not None:
                    saver.save(sess, self.model_path + '/model-' + str(episode_count) + '.ckpt')
                    print("Saved Model")

                episode_count += 1

    def update_params(self, episode_buffer, sess, gamma):
        """
        Run one training step per buffered transition except the last one (which
        has no successor value to bootstrap from) and apply the resulting
        gradients to the global network.

        :param episode_buffer: list of ``[state, action, reward, next_state, done, value]``
        :param sess: session to run the training ops in
        :param gamma: discount factor
        :return: ``(value_loss, policy_loss, entropy, gradients, var_norm)`` of the
            last step run, or five zeros when fewer than two transitions are given
        """
        # NOTE(review): ``total_reward`` is the discounted reward accumulated from
        # the START of the buffer up to step i and is used as the value target of
        # step i; the advantage is accumulated forwards as well. Confirm this is
        # the intended return / advantage definition.
        total_reward = 0
        total_generalized_advantage = 0
        value_loss = 0
        policy_loss = 0
        entropy_loss = 0
        gradients = 0
        variance = 0
        for i, episode in enumerate(episode_buffer[:-1]):
            state, action, reward, value = episode[0], episode[1], episode[2], episode[5]
            total_reward += reward * (gamma ** i)
            delta_t = reward + gamma * episode_buffer[i + 1][5] - value
            total_generalized_advantage = total_generalized_advantage * gamma + delta_t

            # ``apply_grads`` is what actually trains the global network; fetching
            # only the losses and gradients would leave every weight unchanged.
            value_loss, policy_loss, entropy_loss, gradients, variance, _ = sess.run([
                self.value_loss, self.policy_loss, self.entropy, self.gradients, self.var_norms,
                self.apply_grads], feed_dict={
                self.inputs: state.T,
                self.dropout_prob: 0.2,
                self.actions: action,
                self.target_v: total_reward,
                self.generalized_advantage: total_generalized_advantage
            })
        return value_loss, policy_loss, entropy_loss, gradients, variance


In [7]:
# Training configuration
path_to_be_stored = "."
max_episodes = 200
gamma = 0.3
workers = []

# The master network owns the session every worker trains in.
master_network = ActorCriticNetwork()

# Built before the worker exists, so the Saver covers the variables present in the
# graph at this point. No additional tf.Session is opened here: the worker runs in
# master_network.sess, and a second session would only hold resources unused.
saver = tf.train.Saver()
worker = Worker(0, scope_name="worker_0", state_size=17, saver=saver,
                sess=master_network.sess, model_path=path_to_be_stored)
workers.append(worker)
worker.work(max_episodes, gamma=gamma, saver=saver)



Saved Model




Saved Model




Saved Model




Saved Model




Saved Model




Saved Model




Saved Model




Saved Model




Saved Model




Saved Model




Saved Model




Saved Model


KeyboardInterrupt: 