# Turtle runs - RL with the smoothed Cross-Entropy Method

In [1]:
import sys
import numpy as np
from collections import namedtuple

                the kernel may be left running.  Please let us know
                about your system (bitness, Python, etc.) at
                ipython-dev@scipy.org
  ipython-dev@scipy.org""")


In [2]:
# Action vocabulary sampled by the policy: 'N', 'E', 'S', 'W' request a move,
# 'O' makes the turtle stay put and mine the cell it is standing on.
actions = list('NESWO')

In [3]:
class Turtle:
    """Mutable record describing the single unit in the game.

    Attributes are plain instance fields:
      * ``position`` -- wherever the caller says the turtle currently is.
      * ``halite``   -- amount of halite the turtle is carrying.
    """

    def __init__(self, position, halite):
        # Stored as-is; no validation or copying is performed.
        self.position, self.halite = position, halite

In [4]:
class GameState():
    """Observation of the game: halite map, turtle position and turtle cargo.

    Two states compare equal when all three components match. Because the
    class defines ``__eq__`` without ``__hash__``, instances are unhashable
    and can only be looked up by linear search (``in`` on a list).
    """

    def __init__(self, game_map, position, halite):
        self.game_map = game_map
        self.position = position
        self.halite = halite

    def __eq__(self, other):
        """Return True when ``other`` is a GameState with identical contents.

        Returns ``NotImplemented`` for foreign types so that comparisons such
        as ``state in [(state, probs), ...]`` evaluate to False instead of
        raising AttributeError.
        """
        if not isinstance(other, GameState):
            return NotImplemented
        # Cheap scalar/tuple checks first; np.array_equal also copes with maps
        # of different shapes (it simply reports False).
        return bool(
            self.position == other.position
            and self.halite == other.halite
            and np.array_equal(self.game_map, other.game_map)
        )

In [5]:
class SimpleHalite():
    """Single-turtle toy Halite environment on a fixed, wrap-around map.

    The map is a ``(height, width)`` integer array; positions are tuples used
    directly as indices into that array, so ``position[0]`` runs over the
    ``height`` axis and ``position[1]`` over the ``width`` axis. The cell at
    ``start_pos`` is the base: whenever the turtle stands on it, its cargo is
    added to the banked total ``self.halite``.
    """

    # Offsets added to the turtle position for each movement action.
    _MOVES = {'N': (0, 1), 'E': (1, 0), 'S': (0, -1), 'W': (-1, 0)}

    def __init__(self, height, width, start_pos):
        """Create the map and place an empty turtle on ``start_pos``.

        ``start_pos`` is coerced to a tuple: a list would be interpreted by
        NumPy as fancy indexing and would never compare equal to the tuple
        positions produced by ``step``.
        """
        start_pos = tuple(start_pos)
        # Reseeds NumPy's *global* RNG so the generated map is the same for
        # every instance; later np.random calls are affected as well.
        np.random.seed(42)
        self.game_map = np.random.randint(1, 1000, size=(height, width))
        self.game_map[start_pos] = 0
        self.orig_map = self.game_map.copy()
        self.turtle = Turtle(start_pos, 0)
        self.turn = 1
        self.max_turns = 100
        self.halite = 0
        self.base = start_pos
        self.height = height
        self.width = width

    def get_state(self):
        """Return ``(GameState, turn, banked_halite, done)``.

        The GameState holds a *copy* of the map, so states recorded earlier in
        an episode are not altered by later mining. ``done`` becomes True once
        ``turn`` reaches ``max_turns``.
        """
        snapshot = GameState(self.game_map.copy(),
                             self.turtle.position,
                             self.turtle.halite)
        return snapshot, self.turn, self.halite, self.turn >= self.max_turns

    def reset(self):
        """Restore the original map, an empty turtle on the base, and turn 1."""
        self.game_map = self.orig_map.copy()
        self.turtle = Turtle(self.base, 0)
        self.turn = 1
        self.halite = 0

    def _mine(self):
        """Move a quarter (integer division) of the current cell into the turtle."""
        position = self.turtle.position
        mined_halite = self.game_map[position] // 4
        self.game_map[position] -= mined_halite
        # NOTE(review): this caps the per-turn gain at 1000, not the turtle's
        # total cargo -- confirm whether a cargo capacity was intended.
        self.turtle.halite += min(1000, mined_halite)

    def step(self, action):
        """Apply ``action`` and return the new ``get_state()`` tuple.

        ``'O'`` mines the current cell. ``'N'``/``'E'``/``'S'``/``'W'`` move the
        turtle at a cost of a tenth (integer division) of the current cell's
        halite, paid from its cargo; if the turtle cannot afford the move it
        mines instead. Positions wrap around the map edges.

        Raises:
            ValueError: if ``action`` is not one of the five known actions.
        """
        if action != 'O' and action not in self._MOVES:
            raise ValueError("unknown action: {!r}".format(action))

        position = self.turtle.position
        move_cost = self.game_map[position] // 10
        if action == 'O' or move_cost > self.turtle.halite:
            self._mine()
        else:
            d0, d1 = self._MOVES[action]
            # Axis 0 of the map has `height` entries and axis 1 has `width`,
            # so each coordinate wraps with the size of its own axis.
            new_pos = ((position[0] + d0) % self.height,
                       (position[1] + d1) % self.width)
            self.turtle = Turtle(new_pos, self.turtle.halite - move_cost)

        # Standing on the base banks the whole cargo.
        if self.turtle.position == self.base:
            self.halite += self.turtle.halite
            self.turtle.halite = 0
        self.turn += 1
        return self.get_state()

In [6]:
def policy(game_state, turn, policy_mapping=[]):
    """Sample an action for ``game_state``.

    ``policy_mapping`` is a sequence of ``(state, probs)`` pairs. The first
    pair whose state equals ``game_state`` supplies the probabilities used to
    draw from ``actions``; when no pair matches, the draw is uniform.
    ``turn`` is accepted but not used. The default list is only read, never
    modified.
    """
    learned = next(
        (probs for known, probs in policy_mapping if known == game_state),
        None,
    )
    if learned is None:
        return np.random.choice(actions)
    return np.random.choice(actions, p=learned)

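Note: `start_pos` needs to be a tuple rather than a list. It is used directly as an index into the NumPy game map (a list would trigger fancy indexing and select whole rows), and it is compared against the turtle's position, which is always a tuple.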

In [7]:
# 5 x 5 board with the base (and the turtle's starting cell) at (2, 2).
game = SimpleHalite(height=5, width=5, start_pos=(2, 2))

In [8]:
# Hyper-parameters of the training loop below.
M = 40   # number of policy-improvement iterations (outer loop)
N = 200  # games played per iteration; the best N // 10 are kept as elite runs

In [None]:
# Smoothed cross-entropy method.
# Each of the M iterations plays N games with the current policy, keeps the
# state-action pairs of the best tenth of the games (ranked by the third value
# returned by game.step(), i.e. the halite banked at the base), and moves the
# per-state action probabilities towards the action frequencies seen there.

# Weight of the freshly estimated probabilities when a state already has a
# policy entry: new = alpha * fresh + (1 - alpha) * old.
# NOTE(review): the notebook never defined `alpha`; 0.7 is an assumed value.
alpha = 0.7

policy_mapping = []
for j in range(M):
    state_action_maps = []
    final_rewards = []
    for i in range(N):
        state_action_map = []
        game.reset()
        game_state, turn, reward, done = game.get_state()
        while not done:
            action = policy(game_state, turn, policy_mapping)
            state_action_map.append((game_state, action))
            game_state, turn, reward, done = game.step(action)
        state_action_maps.append(state_action_map)
        final_rewards.append(reward)
    print(j, np.mean(final_rewards))

    # Indices of the top 10% of games (at least one), best last.
    n_elite = max(1, N // 10)
    elite_games = np.argsort(final_rewards)[-n_elite:]
    # Plain list indexing keeps the (state, action) tuples intact and works
    # even if episodes ever differ in length.
    maps_to_keep = [state_action_maps[k] for k in elite_games]
    full_map = [pair for episode in maps_to_keep for pair in episode]

    done_policies = []
    for (state, _) in full_map:
        if state in done_policies:
            continue
        done_policies.append(state)

        # Actions the elite games took in *this* state.
        subset = [a for (s, a) in full_map if s == state]
        # Additive smoothing with a pseudo-count of 2 per action, so no action
        # ever ends up with probability zero.
        action_probs = np.array([
            (subset.count(candidate) + 2) / (len(subset) + 2 * len(actions))
            for candidate in actions
        ])

        # Blend with the existing entry for this state, or add a new one.
        # The loop variables are deliberately not called `policy`, which would
        # shadow the policy() function used above.
        for ix, (known_state, old_probs) in enumerate(policy_mapping):
            if known_state == state:
                blended = alpha * action_probs + (1 - alpha) * np.asarray(old_probs)
                policy_mapping[ix] = (state, blended)
                break
        else:
            policy_mapping.append((state, action_probs))

0 422.235


In [53]:
# Rebuild the policy from scratch using only the most recent elite
# state-action pairs in `full_map` (no blending with earlier estimates).
policy_mapping = []
seen_states = []  # states that already received an entry
for (state, _) in full_map:
    # policy_mapping holds (state, probs) tuples, so membership has to be
    # tracked in a separate list of bare states.
    if state in seen_states:
        continue
    seen_states.append(state)
    # States are GameState objects: compare them with ==, not by subscripting.
    subset = [a for (s, a) in full_map if s == state]
    # Additive smoothing with a pseudo-count of 2 per action.
    action_probs = [
        (subset.count(candidate) + 2) / (len(subset) + 2 * len(actions))
        for candidate in actions
    ]
    policy_mapping.append((state, action_probs))

We can now simulate the game loop. What remains to be done:
- fix the game map (otherwise the state mapping will become too unwieldy) - DONE
- keep track of state-action pairs (memory intensive, but maybe we can manage) - DONE
- use state-action pairs from elite runs to update the policy
- add smoothing

In [23]:
# Quick check of how np.random.choice draws one item from `elements` when
# given an explicit probability for each item.
elements, probabilities = [1.1, 2.2, 3.3], [0.2, 0.5, 0.3]
np.random.choice(a=elements, p=probabilities)

3.3