# Turtle runs - RL with the smoothed Crossentropy Method

In [None]:
import sys
import numpy as np
from collections import namedtuple
from time import time

import xxhash

In [None]:
# Action labels: 'N', 'E', 'S', 'W' are the four moves and 'O' mines the
# current cell (see SimpleHalite.step). SimpleHalite keeps its own copy of
# this list in `self.actions`.
actions = ['N', 'E', 'S', 'W', 'O']

In [None]:
class Turtle:
    """The single collecting unit: where it is and how much halite it carries."""

    def __init__(self, position, halite):
        # position: index into the game map; halite: cargo currently on board.
        self.position, self.halite = position, halite

In [None]:
class GameState():
    """Snapshot of the game: halite map, turtle position and turtle cargo.

    Instances define ``__eq__`` and ``__hash__`` together so they can be used
    as dictionary keys (e.g. in a tabular Q-function).
    """

    def __init__(self, game_map, position, halite):
        """Store the map (2-D array), the turtle's (axis0, axis1) index and its cargo."""
        self.game_map = game_map
        self.position = position
        self.halite = halite

    def __eq__(self, other):
        """Two states are equal when map contents, position and cargo all match."""
        if not isinstance(other, GameState):
            return NotImplemented
        # Cheap scalar comparisons first; array_equal also copes with maps of
        # different shapes (it returns False instead of raising).
        return bool(self.position == other.position
                    and self.halite == other.halite
                    and np.array_equal(self.game_map, other.game_map))

    def __bytes__(self):
        """Serialise the state to a fixed-width byte string (input of ``__hash__``)."""
        # bytes(<int>) would build a buffer of that many zero bytes and
        # bytes(<tuple>) only accepts values in 0..255, so position and cargo
        # are packed as 64-bit integers instead.
        scalars = np.array([*self.position, self.halite], dtype=np.int64)
        return np.asarray(self.game_map).tobytes() + scalars.tobytes()

    def __hash__(self):
        xxh64 = xxhash.xxh64(self.__bytes__())
        return xxh64.intdigest()

    def get_nn_repr(self):
        """Return the state as a flat feature list.

        Layout: one scaled halite value per map cell ((value - 500) / 1000,
        row-major), then a one-hot indicator of the turtle's cell, then the
        turtle's cargo divided by 1000.
        """
        game_map = np.asarray(self.game_map)
        hal_std_scaled = (game_map.reshape(-1) - 500) / 1000
        # Same row-major flattening as reshape(-1) above, for any map size.
        n_cols = game_map.shape[1]
        pos_indicator = [0] * game_map.size
        pos_indicator[self.position[0] * n_cols + self.position[1]] = 1
        return list(hal_std_scaled) + pos_indicator + [self.halite / 1000]

In [None]:
class SimpleHalite():
    """Single-turtle halite collection game on a wrap-around grid.

    The map is filled with random halite amounts (fixed seed, so every
    instance of the same size gets the same map). The turtle starts on the
    base cell; whenever it stands on the base its cargo is banked and returned
    as the step reward.
    """

    # Offsets (axis 0, axis 1) added to the turtle position for each move.
    _MOVES = {'N': (0, 1), 'E': (1, 0), 'S': (0, -1), 'W': (-1, 0)}

    def __init__(self, height, width, start_pos):
        """Create a height x width map with the base and turtle at start_pos.

        start_pos is coerced to a tuple: a list would be treated by NumPy as
        a fancy index (zeroing whole rows) and would never compare equal to
        the turtle's tuple position.
        """
        start_pos = tuple(start_pos)
        # NOTE: this reseeds NumPy's global random generator.
        np.random.seed(42)
        self.game_map = np.random.randint(1, 1000, size=(height, width))
        self.game_map[start_pos] = 0
        self.orig_map = self.game_map.copy()
        self.turtle = Turtle(start_pos, 0)
        self.turn = 1
        self.max_turns = 50
        self.halite = 0
        self.base = start_pos
        self.height = height
        self.width = width
        self.actions = ['N', 'E', 'S', 'W', 'O']

    def get_state(self):
        """Return a GameState snapshot (the map is copied, so it stays frozen)."""
        return GameState(self.game_map.copy(), self.turtle.position, self.turtle.halite)

    def get_possible_actions(self):
        """Return the list of action labels accepted by step()."""
        return self.actions

    def reset(self):
        """Restore the original map, put an empty turtle on the base and return the state."""
        self.game_map = self.orig_map.copy()
        self.turtle = Turtle(self.base, 0)
        self.turn = 1
        self.halite = 0
        return self.get_state()

    def _mine(self):
        """Move a quarter (floor division) of the current cell's halite into the cargo."""
        position = self.turtle.position
        mined_halite = self.game_map[position] // 4
        self.game_map[position] -= mined_halite
        # NOTE(review): this caps the amount gained per turn, not the cargo
        # total, and cells hold < 1000 so it never triggers - confirm whether
        # a cargo capacity of 1000 was intended.
        self.turtle.halite += min(1000, mined_halite)

    def step(self, action):
        """Apply one action and advance the turn counter.

        'O' mines the current cell. 'N'/'E'/'S'/'W' move one cell (wrapping
        around the map edges) at a cost of a tenth of the current cell's
        halite; if the turtle cannot pay, it mines instead.

        Returns:
            (state, reward, done): the new GameState, the cargo banked this
            step (0 unless the turtle is on the base) and whether max_turns
            has been reached.

        Raises:
            ValueError: if action is not one of the known labels.
        """
        position = self.turtle.position
        if action == 'O':
            self._mine()
        elif action in self._MOVES:
            cost_halite = self.game_map[position] // 10
            if cost_halite <= self.turtle.halite:
                d0, d1 = self._MOVES[action]
                # Axis 0 of the map has `height` entries and axis 1 has
                # `width`, so each coordinate wraps on its own axis length.
                new_pos = ((position[0] + d0) % self.height,
                           (position[1] + d1) % self.width)
                self.turtle = Turtle(new_pos, self.turtle.halite - cost_halite)
            else:
                self._mine()
        else:
            raise ValueError("Unknown action: {!r}".format(action))

        reward = 0
        if self.turtle.position == self.base:
            # Bank the cargo; the banked amount is the reward for this step.
            self.halite += self.turtle.halite
            reward = self.turtle.halite
            self.turtle.halite = 0
        self.turn += 1
        # >= so the episode stays finished if step() is called past the end.
        return self.get_state(), reward, self.turn >= self.max_turns

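`start_pos` needs to be a tuple rather than a list, because it is used directly as an index into the NumPy game map and is compared with the turtle's tuple position.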

In [None]:
# 5x5 map with the base (and turtle start) on cell (2, 2).
game = SimpleHalite(5, 5, (2, 2))

In [None]:
# Restore the initial map and turtle; returns the starting GameState.
game.reset()

## Q Learning 

In [None]:
class QLearningAgent:
    def __init__(self, alpha, epsilon, discount, possible_actions):
        """
        Tabular Q-Learning Agent.

        alpha: learning rate used in update().
        epsilon: probability of taking a random action in get_action().
        discount: discount factor (gamma) applied to the next state's value.
        possible_actions: sequence of actions available in every state.
        """
        self.possible_actions = possible_actions
        # Maps state -> {action: Q-value}; states must be hashable.
        self._qvalues = {}
        self.alpha = alpha
        self.epsilon = epsilon
        self.discount = discount

    def _state_qvalues(self, state):
        """Return the {action: Q} dict of state, creating a zero-filled one if the state is new."""
        try:
            return self._qvalues[state]
        except KeyError:
            fresh = self._qvalues[state] = dict.fromkeys(self.possible_actions, 0)
            return fresh

    def get_qvalue(self, state, action):
        """ Returns Q(state,action); unseen states/actions are registered with value 0 """
        return self._state_qvalues(state).setdefault(action, 0)

    def set_qvalue(self,state,action,value):
        """ Sets the Qvalue for [state,action] to the given value """
        if state not in self._qvalues:
            self._qvalues[state] = {}
        self._qvalues[state][action] = value

    def get_value(self, state):
        """
        V(s) = max_over_action Q(state,action) over possible actions.
        """
        possible_actions = self.possible_actions

        #If there are no legal actions, return 0.0
        if not possible_actions:
            return 0.0

        # Look the state up once: hashing a state can be expensive.
        action_values = self._state_qvalues(state)
        return max(action_values.setdefault(a, 0) for a in possible_actions)

    def update(self, state, action, reward, next_state):
        """
        Q(s,a) := (1 - alpha) * Q(s,a) + alpha * (r + gamma * V(s'))
        """
        gamma = self.discount
        learning_rate = self.alpha
        old_q_val = self.get_qvalue(state, action)
        target = reward + gamma * self.get_value(next_state)
        new_q_val = (1 - learning_rate) * old_q_val + learning_rate * target
        self.set_qvalue(state, action, new_q_val)

    def get_best_action(self, state):
        """
        Compute the best action to take in a state (using current q-values).
        Ties go to the action listed first; returns None if there are no actions.
        """
        possible_actions = self.possible_actions
        if not possible_actions:
            return None
        action_values = self._state_qvalues(state)
        # max() keeps the first maximal element, like np.argmax.
        return max(possible_actions, key=lambda a: action_values.setdefault(a, 0))

    def get_action(self, state):
        """
        Compute the action to take in the current state, including exploration.
        With probability self.epsilon, take a random action.
            Otherwise - the best policy action (self.get_best_action).
        Returns None if there are no actions.
        """
        if not self.possible_actions:
            return None
        if np.random.uniform() < self.epsilon:
            return np.random.choice(self.possible_actions)
        return self.get_best_action(state)

In [None]:
def train_qlearning_agent(game, agent, n_iter, n_sessions,
                          epsilon_decay=0.005, min_epsilon=0.01):
    """Train agent on game and report progress after every iteration.

    Each iteration plays n_sessions full episodes. Within an episode the
    agent picks an action, the game is stepped, and the agent is updated
    with the observed transition until the game reports it is done.

    After each iteration the elapsed time, the number of states in the
    agent's Q-table and the mean episode reward are printed, and
    agent.epsilon is lowered by epsilon_decay as long as it is still above
    min_epsilon.

    Returns:
        List with the mean episode reward of every iteration.
    """
    mean_rewards = []
    for j in range(n_iter):
        now = time()
        final_rewards = []
        for _ in range(n_sessions):
            game_state = game.reset()
            done = False
            total_reward = 0
            while not done:
                action = agent.get_action(game_state)
                next_game_state, reward, done = game.step(action)
                agent.update(game_state, action, reward, next_game_state)
                game_state = next_game_state
                total_reward += reward
            final_rewards.append(total_reward)
        mean_reward = np.mean(final_rewards)
        print("Iteration took {:.0f} seconds".format(time() - now))
        print("Agent mapped {} states".format(len(agent._qvalues)))
        print("Mean reward on iteration {}: {}".format(j, mean_reward))
        mean_rewards.append(float(mean_reward))
        if agent.epsilon > min_epsilon:
            agent.epsilon -= epsilon_decay
    return mean_rewards

In [None]:
# Agent over the game's action list: learning rate 0.5, initial exploration
# probability 0.35, discount factor 0.98.
agent = QLearningAgent(alpha=0.5, epsilon=0.35, discount=0.98, 
                       possible_actions=game.get_possible_actions())

In [None]:
# 100 iterations of 1000 sessions (episodes) each.
train_qlearning_agent(game, agent, 100, 1000)