# Turtle runs - RL with the smoothed Crossentropy Method

In [1]:
import sys
import numpy as np
from collections import namedtuple
from time import time

In [2]:
actions = ['N', 'E', 'S', 'W', 'O']

In [3]:
class Turtle():
    def __init__(self, position, halite):
        self.position = position
        self.halite = halite

In [4]:
class GameState():
    def __init__(self, game_map, position, halite):
        self.game_map = game_map
        self.position = position
        self.halite = halite
        
    def __eq__(self, other):
        return (self.game_map == other.game_map).all() and self.position == other.position and self.halite == other.halite

    def __contains__(self, key):
        return key in self.numbers


In [6]:
class SimpleHalite():
    def __init__(self, height, width, start_pos):
        np.random.seed(42)
        self.game_map = np.random.randint(1, 1000, size=(height, width))
        self.game_map[start_pos] = 0
        self.orig_map = self.game_map.copy()
        self.turtle = Turtle(start_pos, 0)
        self.turn = 1
        self.max_turns = 50
        self.halite = 0
        self.base = start_pos
        self.height = height
        self.width = width
    
    def get_state(self):
        game_state = GameState(self.game_map.copy(), self.turtle.position, self.turtle.halite)
        return game_state, self.turn, self.halite, self.turn == self.max_turns
    
    def reset(self):
        self.game_map = self.orig_map.copy()
        self.turtle = Turtle(self.base, 0)
        self.turn = 1
        self.halite = 0
        
    def step(self, action):
        reward = 0
        if action == 'O':
            mined_halite = self.game_map[self.turtle.position] // 4
            self.game_map[self.turtle.position] -= mined_halite
            self.turtle.halite += min(1000, mined_halite)
        else:
            if action == 'N':
                cost_halite = self.game_map[self.turtle.position] // 10
                new_pos = tuple([sum(x) for x in zip(self.turtle.position, (0, 1))])
            elif action == 'E':
                cost_halite = self.game_map[self.turtle.position] // 10
                new_pos = tuple([sum(x) for x in zip(self.turtle.position, (1, 0))])
            elif action == 'S':
                cost_halite = self.game_map[self.turtle.position] // 10
                new_pos = tuple([sum(x) for x in zip(self.turtle.position, (0, -1))])
            elif action == 'W':
                cost_halite = self.game_map[self.turtle.position] // 10
                new_pos = tuple([sum(x) for x in zip(self.turtle.position, (-1, 0))])
            #print(cost_halite, self.turtle.halite)
            if cost_halite <= self.turtle.halite:
                #print("moving turtle to {}".format(new_pos))
                self.turtle = Turtle(new_pos, self.turtle.halite - cost_halite)
            else:
                mined_halite = self.game_map[self.turtle.position] // 4
                self.game_map[self.turtle.position] -= mined_halite
                self.turtle.halite += min(1000, mined_halite)                
        self.turtle.position = (self.turtle.position[0] % self.width, 
                                self.turtle.position[1] % self.height)
        if self.turtle.position == self.base:
            self.halite += self.turtle.halite
            reward = self.turtle.halite
            self.turtle.halite = 0
        self.turn += 1
        return self.get_state()

In [7]:
def policy(game_state, turn, policy_mapping=None):
    if policy_mapping is not None and (game_state.position, game_state.halite) in policy_mapping:
        for map_action in policy_mapping[(game_state.position, game_state.halite)]:
            if (map_action[0] == game_state.game_map).all():
                return np.random.choice(actions, p=map_action[1])
    return np.random.choice(actions)

Needs to be a tuple rather than a list as start_pos

In [8]:
game = SimpleHalite(5, 5, (2, 2))

In [9]:
N = 200
M = 50
alpha = 0.5

In [10]:
def add_to_policy_mapping(policy_mapping, state, action_probs):
    if (state.position, state.halite) not in policy_mapping:
            policy_mapping[(state.position, state.halite)] = []
    policy_mapping[(state.position, state.halite)].append((state.game_map, action_probs))
    return

In [11]:
import warnings
warnings.filterwarnings("error")

In [None]:
policy_mapping = {}
for j in range(M):
    now = time()
    state_action_maps = []
    final_rewards = []
    for i in range(N):
        state_action_map = []
        game.reset()
        turn_count = 0
        game_state, turn, reward, done = game.get_state()
        while not done:
            action = policy(game_state, turn, policy_mapping)
            state_action_map.append((game_state, action))
            turn_count += 1
            game_state, turn, reward, done = game.step(action)
        state_action_maps.append(state_action_map)
        final_rewards.append(reward)
    print(j, np.mean(final_rewards))
    elite_games = np.argsort(final_rewards)[N//5:]
    maps_to_keep = np.array(state_action_maps)[elite_games]
    full_map = [item for sublist in maps_to_keep for item in sublist]
    done_policies = []
    print("Entries in policy mapping:", sum(len(x) for x in policy_mapping.values()))
    print("Number of keys:", len(policy_mapping.keys()))
    updated, not_updated = 0, 0
    for (state, _) in full_map:
        if state not in done_policies:
            done_policies.append(state)
            subset = [a for (s, a) in full_map if s == state]
            action_probs = []
            action_probs_raw = []
            for action in actions:
                action_probs.append((subset.count(action) + 1) / (len(subset) + 5))
                action_probs_raw.append(subset.count(action) / len(subset))
            if (state.position, state.halite) in policy_mapping: 
                for ix, (game_map, probs) in enumerate(policy_mapping[state.position, state.halite]):
                    if (game_map == state.game_map).all():
                        policy_mapping[(state.position, state.halite)][ix] = \
                            (state.game_map, alpha * np.array(action_probs_raw) + (1 - alpha) * np.array(probs))
                        updated += 1
                        break
                not_updated += 1
                add_to_policy_mapping(policy_mapping, state, action_probs)
            else:
                not_updated += 1
                add_to_policy_mapping(policy_mapping, state, action_probs)
    print("Updated: {}, not updated: {}".format(updated, not_updated))
    print("Iteration took {:.0f} seconds".format(time() - now))

0 220.985
Entries in policy mapping: 0
Number of keys: 0
Updated: 0, not updated: 6878
Iteration took 267 seconds
1 231.015
Entries in policy mapping: 6878
Number of keys: 4315
Updated: 321, not updated: 6849
Iteration took 265 seconds
2 243.99
Entries in policy mapping: 13727
Number of keys: 6740
Updated: 377, not updated: 6790
Iteration took 266 seconds
3 238.88
Entries in policy mapping: 20517
Number of keys: 8244
Updated: 493, not updated: 6741
Iteration took 265 seconds
4 235.185
Entries in policy mapping: 27258
Number of keys: 9198
Updated: 553, not updated: 6643
Iteration took 260 seconds
5 259.62
Entries in policy mapping: 33901
Number of keys: 9887
Updated: 578, not updated: 6629
Iteration took 260 seconds
6 297.55
Entries in policy mapping: 40530
Number of keys: 10611
Updated: 654, not updated: 6454
Iteration took 253 seconds
7 301.155
Entries in policy mapping: 46984
Number of keys: 11118


In [None]:
[x for x in policy_mapping if max(x[1]) > 0.5]

We can now simulate the game loop. We now need to 
- fix the game map (otherwise the state mappong will become too unwieldy) - DONE
- keep track of state-action pairs (memory intensive, but maybe we can manage) - DONE
- use state-action pairs from elite runs to update policy - DONE
- add smoothing - DONE
- improve policy mapping data structure to keep turn times roughly constant - DONE