# Turtle runs - RL with deep Q-learning

In [1]:
import sys
import numpy as np
from collections import namedtuple
from time import time

import tensorflow as tf
import keras
import keras.layers as L
# Clear the default TF1 graph so re-running this cell starts from an empty graph.
tf.reset_default_graph()
# Create the session used by the explicit `sess.run(...)` training calls further down.
sess = tf.InteractiveSession()
# Make Keras use the same session, so `network.predict` and `sess.run` share variables.
keras.backend.set_session(sess)

Using TensorFlow backend.


In [2]:
# Action labels, indexed by the integer action id passed to `SimpleHalite.step`:
# four movement directions plus 'O' (stay in place and mine).
actions = ['N', 'E', 'S', 'W', 'O']

In [3]:
class Turtle():
    """Plain record for the agent's unit: its board position and carried halite."""

    def __init__(self, position, halite):
        # Both fields are stored as given; no validation or copying is done.
        self.position, self.halite = position, halite

In [45]:
class GameState():
    """Snapshot of the game: the halite map, the turtle position and its cargo.

    Attributes:
        game_map: 2-D array of halite amounts, indexed as ``game_map[position]``.
        position: ``(i, j)`` index of the turtle into ``game_map``.
        halite: halite currently carried by the turtle.

    Instances compare by value and are hashable, so they can serve as dict keys.
    """

    def __init__(self, game_map, position, halite):
        self.game_map = game_map
        self.position = position
        self.halite = halite

    def __eq__(self, other):
        """Return True when map contents, position and cargo all match."""
        if not isinstance(other, GameState):
            # Let Python fall back to its default handling for foreign types
            # instead of failing on a missing attribute.
            return NotImplemented
        # array_equal also copes with maps of different shapes (plain False).
        return bool(
            np.array_equal(self.game_map, other.game_map)
            and tuple(self.position) == tuple(other.position)
            and self.halite == other.halite
        )

    def __contains__(self, key):
        # NOTE(review): `numbers` is never assigned by this class, so `key in state`
        # raises AttributeError unless a caller attaches that attribute. Left
        # unchanged because the intended membership semantics are not visible here.
        return key in self.numbers

    def __bytes__(self):
        """Serialize the state to a fixed-width byte string used for hashing.

        Every component is encoded as 64-bit integers. The previous
        ``bytes(self.halite)`` produced ``halite`` zero bytes rather than the
        value, and ``bytes(self.position)`` rejected coordinates above 255.
        Casting the map to one dtype keeps equal maps byte-identical even when
        they were created with different integer widths.
        """
        map_bytes = np.asarray(self.game_map, dtype=np.int64).tobytes()
        position_bytes = np.asarray(self.position, dtype=np.int64).tobytes()
        halite_bytes = np.asarray(self.halite, dtype=np.int64).tobytes()
        return map_bytes + position_bytes + halite_bytes

    def __hash__(self):
        """Hash of the serialized state; equal states always hash equally.

        Uses the built-in ``hash`` on the byte string, so no third-party hashing
        package is needed. The value is only stable within one Python process.
        """
        return hash(self.__bytes__())

    def get_nn_repr(self):
        """Encode the state as a flat float vector for the Q-network.

        Layout: scaled halite map (row-major), a one-hot indicator of the turtle
        cell, then the scaled cargo. The length is ``2 * game_map.size + 1``
        (51 for a 5x5 map).
        """
        grid = np.asarray(self.game_map)
        hal_std_scaled = (grid.reshape(-1) - 500) / 1000
        # One-hot over the flattened map; the flat index follows the same
        # row-major order as reshape(-1), so it works for any 2-D map size.
        pos_indicator = np.zeros(grid.size)
        n_cols = grid.shape[1]
        pos_indicator[self.position[0] * n_cols + self.position[1]] = 1
        return np.concatenate([hal_std_scaled, pos_indicator, [self.halite / 1000]])

In [88]:
class SimpleHalite():
    """Single-turtle halite-collection game on a wrap-around grid.

    The map is a ``(height, width)`` array of halite amounts generated with a
    fixed seed. A turtle starts on the base cell, mines halite into its cargo,
    and is rewarded with that cargo whenever it ends a step on the base.

    Args:
        height: number of rows in the map (axis 0).
        width: number of columns in the map (axis 1).
        start_pos: ``(i, j)`` index of the base cell, also the turtle's start.
    """

    # Offset added to the turtle position for each movement action label.
    _MOVES = {'N': (0, 1), 'E': (1, 0), 'S': (0, -1), 'W': (-1, 0)}

    def __init__(self, height, width, start_pos):
        # Coerce to a tuple: a list would be treated as a fancy index by NumPy
        # (zeroing whole rows below) and would never equal the tuple positions
        # produced in `step`.
        start_pos = tuple(start_pos)
        # Seeds the global NumPy RNG so the same map is generated every time.
        np.random.seed(42)
        self.game_map = np.random.randint(1, 1000, size=(height, width))
        self.game_map[start_pos] = 0
        self.orig_map = self.game_map.copy()
        self.turtle = Turtle(start_pos, 0)
        self.turn = 1
        self.max_turns = 50
        self.halite = 0
        self.base = start_pos
        self.height = height
        self.width = width
        self.actions = ['N', 'E', 'S', 'W', 'O']

    def get_state(self):
        """Return a GameState snapshot holding a copy of the current map."""
        return GameState(self.game_map.copy(), self.turtle.position, self.turtle.halite)

    def get_possible_actions(self):
        """Return the list of action labels; `step` takes an index into it."""
        return self.actions

    def reset(self):
        """Restore the original map, turtle, turn counter and score.

        Returns:
            The GameState of the freshly reset game.
        """
        self.game_map = self.orig_map.copy()
        self.turtle = Turtle(self.base, 0)
        self.turn = 1
        self.halite = 0
        return self.get_state()

    def _mine(self):
        """Move a quarter (floor division) of the current cell's halite into the cargo."""
        mined_halite = self.game_map[self.turtle.position] // 4
        self.game_map[self.turtle.position] -= mined_halite
        # NOTE(review): this caps the amount mined in one turn, not the cargo total.
        # If a 1000-halite cargo limit was intended it is not enforced here - confirm.
        self.turtle.halite += min(1000, mined_halite)

    def step(self, action):
        """Advance the game by one turn.

        Args:
            action: integer index into ``self.actions``.

        Returns:
            ``(state, reward, done)``. ``reward`` is the cargo delivered this
            turn (0 unless the turtle ends the turn on the base), and ``done``
            is True once the turn counter has reached ``max_turns``.
        """
        # Use the instance's own action list rather than a module-level global.
        action = self.actions[action]
        reward = 0
        move = self._MOVES.get(action)
        if move is None:
            # 'O': stay in place and mine.
            self._mine()
        else:
            # Leaving a cell costs a tenth of its halite, paid from the cargo.
            cost_halite = self.game_map[self.turtle.position] // 10
            if cost_halite <= self.turtle.halite:
                new_pos = tuple(sum(x) for x in zip(self.turtle.position, move))
                self.turtle = Turtle(new_pos, self.turtle.halite - cost_halite)
            else:
                # The move is unaffordable, so the turtle mines instead.
                self._mine()
        # Wrap around the edges. position[0] indexes axis 0 (height) and
        # position[1] indexes axis 1 (width), which matters on non-square maps.
        self.turtle.position = (self.turtle.position[0] % self.height,
                                self.turtle.position[1] % self.width)
        if self.turtle.position == self.base:
            # Deliver the cargo: it becomes both the reward and part of the score.
            self.halite += self.turtle.halite
            reward = self.turtle.halite
            self.turtle.halite = 0
        self.turn += 1
        # `>=` keeps reporting done if the caller steps past the final turn.
        return self.get_state(), reward, self.turn >= self.max_turns

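Note: `start_pos` should be passed as a tuple rather than a list, because it is used directly as a NumPy index (`game_map[start_pos]`); a list would be interpreted as fancy indexing and select whole rows instead of a single cell.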

In [89]:
# 5x5 board with the base (and the turtle's start cell) at index (2, 2).
game = SimpleHalite(5, 5, (2, 2))

In [90]:
# Restore the initial map and turtle; the returned GameState is echoed as the cell output.
game.reset()

<__main__.GameState at 0x198dfdbde80>

## Q Learning 

In [91]:
# Network input width: length of the flat encoding of the initial (reset) state.
initial_encoding = game.reset().get_nn_repr()
state_dim = (len(initial_encoding),)
# One Q-value output per available action.
n_actions = len(game.get_possible_actions())

In [92]:
# Q-network: state encoding -> two ReLU hidden layers -> one linear output per action.
network = keras.models.Sequential()
network.add(L.InputLayer(input_shape=state_dim))
for hidden_units in (128, 128):
    network.add(L.Dense(hidden_units, activation='relu'))
# No activation on the output layer: Q-values are unbounded real numbers.
network.add(L.Dense(len(game.get_possible_actions())))

In [93]:
# Placeholders for a batch of (s, a, r, s', done) transitions; the leading None is the batch axis.
# Encoded current states, one row per transition.
states_ph = keras.backend.placeholder(dtype='float32', shape=(None,) + state_dim)
# Integer ids of the actions taken.
actions_ph = keras.backend.placeholder(dtype='int32', shape=[None])
# Rewards received for those actions.
rewards_ph = keras.backend.placeholder(dtype='float32', shape=[None])
# Encoded states reached after the actions.
next_states_ph = keras.backend.placeholder(dtype='float32', shape=(None,) + state_dim)
# True where the transition ended the episode.
is_done_ph = keras.backend.placeholder(dtype='bool', shape=[None])

In [94]:
# Q-value estimates for every action in each state of the batch.
predicted_qvalues = network(states_ph)

# Keep only the estimate of the action actually taken: a one-hot mask zeroes
# the other columns before summing across each row.
taken_action_mask = tf.one_hot(actions_ph, n_actions)
predicted_qvalues_for_actions = tf.reduce_sum(predicted_qvalues * taken_action_mask, axis=1)

In [95]:
# Discount factor applied to the value of the next state.
gamma = 0.99
# The same network scores the next states; a state's value is its best action's Q-value.
predicted_next_qvalues = network(next_states_ph)
next_state_values = tf.reduce_max(predicted_next_qvalues, axis=1)
# One-step TD target: r + gamma * max_a' Q(s', a').
bootstrapped_targets = rewards_ph + gamma * next_state_values
# On the final step of an episode there is no next state, so the target is the reward alone.
target_qvalues_for_actions = tf.where(is_done_ph, rewards_ph, bootstrapped_targets)

In [96]:
# TD error; stop_gradient treats the target as a constant so gradients flow
# only through the prediction for the current state.
td_error = predicted_qvalues_for_actions - tf.stop_gradient(target_qvalues_for_actions)
# Mean squared TD error over the batch.
loss = tf.reduce_mean(td_error ** 2)

train_step = tf.train.AdamOptimizer(1e-4).minimize(loss)

In [99]:
def get_action(state, epsilon=0):
    """
    Choose an action id with an epsilon-greedy policy.

    state: 1-D encoded state; it is reshaped to a single-row batch for the network.
    epsilon: probability threshold for picking a uniformly random action.
    Returns the index of the chosen action.
    """
    # The network is queried on every call, including exploratory ones.
    q_values = network.predict(state.reshape(1, -1))

    act_greedily = np.random.uniform() > epsilon
    if not act_greedily:
        return np.random.choice(n_actions)
    return np.argmax(q_values)

In [106]:
def generate_session(game, epsilon=0, train=False):
    """Play one full episode with the Q-learning agent, optionally training online.

    game: environment exposing reset() and step(action) -> (state, reward, done),
        where states provide get_nn_repr().
    epsilon: exploration rate forwarded to get_action.
    train: when True, run one optimizer step on every transition.
    Returns the sum of rewards collected during the episode.
    """
    total_reward = 0
    s = game.reset().get_nn_repr()

    done = False
    while not done:
        a = get_action(s, epsilon=epsilon)
        next_state, r, done = game.step(a)
        # Encode the next state once; it feeds both the training step and the next iteration.
        next_s = next_state.get_nn_repr()

        if train:
            # Single-transition batch: every placeholder receives a list of length one.
            sess.run(train_step, {
                states_ph: [s], actions_ph: [a], rewards_ph: [r],
                next_states_ph: [next_s], is_done_ph: [done]
            })

        total_reward += r
        s = next_s
    return total_reward

In [107]:
def train_qlearning_agent(game, epsilon, n_iter, n_sessions):
    """Train the agent for n_iter iterations of n_sessions episodes each.

    Prints the wall-clock time and mean episode reward after every iteration.
    epsilon is multiplied by 0.99 after each iteration while it is above 0.01.
    """
    for iteration in range(n_iter):
        started = time()
        session_rewards = []
        for _ in range(n_sessions):
            session_rewards.append(generate_session(game=game, epsilon=epsilon, train=True))
        elapsed = time() - started
        print("Iteration took {:.0f} seconds".format(elapsed))
        print("Mean reward on iteration {}: {}".format(iteration, np.mean(session_rewards)))
        # Decay exploration once per iteration, not once per episode.
        if epsilon > 0.01:
            epsilon *= 0.99

In [None]:
# Train with initial epsilon 0.5 for 5 iterations of 100 episodes each.
train_qlearning_agent(game, 0.5, 5, 100)

Iteration took 6 seconds
Mean reward on iteration 0: 243.17
Iteration took 6 seconds
Mean reward on iteration 1: 207.04
Iteration took 6 seconds
Mean reward on iteration 2: 252.44


In [72]:
# Print the layer-by-layer architecture and parameter counts of the Q-network.
network.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_4 (Dense)              (None, 128)               6656      
_________________________________________________________________
dense_5 (Dense)              (None, 128)               16512     
_________________________________________________________________
dense_6 (Dense)              (None, 5)                 645       
Total params: 23,813
Trainable params: 23,813
Non-trainable params: 0
_________________________________________________________________
