# Turtle runs - RL with the smoothed Crossentropy Method

In [1]:
import sys
import numpy as np
from collections import namedtuple

In [2]:
actions = ['N', 'E', 'S', 'W', 'O']

In [3]:
class Turtle():
    def __init__(self, position, halite):
        self.position = position
        self.halite = halite

In [68]:
class GameState():
    def __init__(self, game_map, position, halite):
        self.game_map = game_map
        self.position = position
        self.halite = halite
        
    def __eq__(self, other):
        return (self.game_map == other.game_map).all() and self.position == other.position and self.halite == other.halite



In [85]:
class SimpleHalite():
    def __init__(self, height, width, start_pos):
        np.random.seed(42)
        self.game_map = np.random.randint(1, 1000, size=(height, width))
        self.game_map[start_pos] = 0
        self.orig_map = self.game_map.copy()
        self.turtle = Turtle(start_pos, 0)
        self.turn = 1
        self.max_turns = 50
        self.halite = 0
        self.base = start_pos
        self.height = height
        self.width = width
    
    def get_state(self):
        game_state = GameState(self.game_map.copy(), self.turtle.position, self.turtle.halite)
        return game_state, self.turn, self.halite, self.turn == self.max_turns
    
    def reset(self):
        self.game_map = self.orig_map.copy()
        self.turtle = Turtle(self.base, 0)
        self.turn = 1
        self.halite = 0
        
    def step(self, action):
        reward = 0
        if action == 'O':
            mined_halite = self.game_map[self.turtle.position] // 4
            self.game_map[self.turtle.position] -= mined_halite
            self.turtle.halite += min(1000, mined_halite)
        else:
            if action == 'N':
                cost_halite = self.game_map[self.turtle.position] // 10
                new_pos = tuple([sum(x) for x in zip(self.turtle.position, (0, 1))])
            elif action == 'E':
                cost_halite = self.game_map[self.turtle.position] // 10
                new_pos = tuple([sum(x) for x in zip(self.turtle.position, (1, 0))])
            elif action == 'S':
                cost_halite = self.game_map[self.turtle.position] // 10
                new_pos = tuple([sum(x) for x in zip(self.turtle.position, (0, -1))])
            elif action == 'W':
                cost_halite = self.game_map[self.turtle.position] // 10
                new_pos = tuple([sum(x) for x in zip(self.turtle.position, (-1, 0))])
            #print(cost_halite, self.turtle.halite)
            if cost_halite <= self.turtle.halite:
                #print("moving turtle to {}".format(new_pos))
                self.turtle = Turtle(new_pos, self.turtle.halite - cost_halite)
            else:
                mined_halite = self.game_map[self.turtle.position] // 4
                self.game_map[self.turtle.position] -= mined_halite
                self.turtle.halite += min(1000, mined_halite)                
        self.turtle.position = (self.turtle.position[0] % self.width, 
                                self.turtle.position[1] % self.height)
        if self.turtle.position == self.base:
            self.halite += self.turtle.halite
            reward = self.turtle.halite
            self.turtle.halite = 0
        self.turn += 1
        return self.get_state()

In [86]:
def policy(game_state, turn, policy_mapping=None):
    if policy_mapping is not None:
        for (state, probs) in policy_mapping:
            if state == game_state:
                return np.random.choice(actions, p=probs)
    return np.random.choice(actions)

Needs to be a tuple rather than a list as start_pos

In [87]:
game = SimpleHalite(5, 5, (2, 2))

In [93]:
N = 100
M = 50
alpha = 0.5

In [None]:
policy_mapping = []
for j in range(M):
    state_action_maps = []
    final_rewards = []
    for i in range(N):
        state_action_map = []
        game.reset()
        turn_count = 0
        game_state, turn, reward, done = game.get_state()
        while not done:
#             if turn_count == 0:
#                 print(game_state.game_map, game_state.halite, game_state.position)
#                 if len(policy_mapping) > 0:
#                     print("Should be equal: ", game_state == policy_mapping[0][0])
#                     raise ValueError()
            action = policy(game_state, turn, policy_mapping)
            state_action_map.append((game_state, action))
            #print(action, turtle.position)
            turn_count += 1
            game_state, turn, reward, done = game.step(action)
        state_action_maps.append(state_action_map)
        # print(reward, turtle.position)
        final_rewards.append(reward)
    print(j, np.mean(final_rewards))
    elite_games = np.argsort(final_rewards)[:N//5]
    maps_to_keep = np.array(state_action_maps)[elite_games]
    full_map = [item for sublist in maps_to_keep for item in sublist]
    done_policies = []
    policy_states = [x[0] for x in policy_mapping]
    print(len(policy_states))
    updated, not_updated = 0, 0
    for (state, _) in full_map:
        if state not in done_policies:
            done_policies.append(state)
            subset = [a for (s, a) in full_map if s == state]
            action_probs = []
            action_probs_raw = []
            for action in actions:
                action_probs.append((subset.count(action) + 1) / (len(subset) + 5))
                action_probs_raw.append(subset.count(action) / len(subset))
            if state in policy_states:
                for ix, (s, probs) in enumerate(policy_mapping):
                    if s == state:
                        policy_mapping[ix] = (state, alpha * np.array(action_probs_raw) \
                                              + (1 - alpha) * np.array(probs))
                        updated += 1
                        break
            else:
                not_updated += 1
                policy_mapping.append((state, action_probs))
    print("Updated: {}, not updated: {}".format(updated, not_updated))
    # print("1st pol", policy_mapping[0][0].game_map, policy_mapping[0][0].halite, policy_mapping[0][0].position, policy_mapping[0][1])

0 214.66
0
Updated: 0, not updated: 903
1 233.34
903
Updated: 32, not updated: 876
2 238.54
1779
Updated: 51, not updated: 853
3 178.34
2632
Updated: 58, not updated: 829
4 190.81
3461
Updated: 74, not updated: 819
5 195.41
4280
Updated: 74, not updated: 816
6 169.76
5096
Updated: 76, not updated: 781
7 208.18
5877
Updated: 93, not updated: 783
8 184.97
6660
Updated: 85, not updated: 760
9 175.28
7420
Updated: 95, not updated: 779
10 200.37
8199
Updated: 93, not updated: 783
11 177.02
8982
Updated: 102, not updated: 757
12 180.1
9739
Updated: 90, not updated: 781
13 157.53
10520
Updated: 105, not updated: 754
14 173.63
11274
Updated: 113, not updated: 722
15 128.94
11996
Updated: 110, not updated: 716


In [None]:
[x for x in policy_mapping if max(x[1]) > 0.5]

We can now simulate the game loop. We now need to 
- fix the game map (otherwise the state mappong will become too unwieldy) - DONE
- keep track of state-action pairs (memory intensive, but maybe we can manage) - DONE
- use state-action pairs from elite runs to update policy
- add smoothing