# Turtle runs - RL with the smoothed Cross-Entropy Method

In [1]:
import sys
import numpy as np
from collections import namedtuple

In [2]:
# Every action a policy may choose, in a fixed order.
actions = [
    'N',  # move by (0, 1)
    'E',  # move by (1, 0)
    'S',  # move by (0, -1)
    'W',  # move by (-1, 0)
    'O',  # stay on the current cell and mine it
]

In [3]:
class Turtle:
    """Mutable record describing the single unit that moves around the map.

    Attributes:
        position: current location, stored exactly as passed in.
        halite: amount of halite the turtle is currently carrying.
    """

    def __init__(self, position, halite):
        self.position, self.halite = position, halite

In [64]:
class SimpleHalite():
    """Single-turtle toy version of Halite played on a wrapping grid.

    The map is a NumPy integer array of shape ``(height, width)``.  The
    turtle's position is used directly as an index into that array, so
    ``position[0]`` runs over ``height`` and ``position[1]`` over ``width``.
    Halite carried by the turtle is banked (added to ``self.halite``) every
    time the turtle ends a step on the base cell.

    An episode is over once ``self.turn`` reaches ``self.max_turns``.
    """

    # Offset added to the turtle position for each movement action.
    _MOVES = {'N': (0, 1), 'E': (1, 0), 'S': (0, -1), 'W': (-1, 0)}

    def __init__(self, height, width, start_pos):
        """Create a random map and place the turtle on the base.

        Args:
            height: size of the first map axis.
            width: size of the second map axis.
            start_pos: two-element index of the base / starting cell.
        """
        # A list here would trigger NumPy fancy indexing below (zeroing whole
        # rows) and would never compare equal to the tuple-valued turtle
        # position, so normalise to a tuple up front.
        start_pos = tuple(start_pos)
        self.game_map = np.random.randint(1, 1000, size=(height, width))
        self.game_map[start_pos] = 0
        # Pristine copy so that reset() can replay the very same map.
        self.orig_map = self.game_map.copy()
        self.turtle = Turtle(start_pos, 0)
        self.turn = 1
        self.max_turns = 100
        self.halite = 0
        self.base = start_pos
        self.height = height
        self.width = width

    def _observation(self):
        """Return ``([game_map, turtle, turn], banked_halite, done)``."""
        done = self.turn >= self.max_turns
        return [self.game_map, self.turtle, self.turn], self.halite, done

    def _mine(self):
        """Move a quarter of the current cell's halite into the turtle."""
        pos = self.turtle.position
        mined_halite = self.game_map[pos] // 4
        self.game_map[pos] -= mined_halite
        # NOTE(review): cells generated by __init__ hold at most 999, so the
        # mined amount never reaches 1000 and this cap does not bind.
        # Presumably a cap on the turtle's total cargo was intended - confirm.
        self.turtle.halite += min(1000, mined_halite)

    def get_state(self):
        """Return the current observation without advancing the game.

        Returns:
            A tuple ``([game_map, turtle, turn], halite, done)`` where
            ``halite`` is the total banked so far and ``done`` tells whether
            ``max_turns`` has been reached.
        """
        return self._observation()

    def reset(self):
        """Restore the original map and put a fresh turtle on the base."""
        self.game_map = self.orig_map.copy()
        self.turtle = Turtle(self.base, 0)
        self.turn = 1
        self.halite = 0

    def step(self, action):
        """Apply one action and advance the game by one turn.

        ``'O'`` mines the current cell.  ``'N'``, ``'E'``, ``'S'`` and ``'W'``
        move the turtle if it carries at least a tenth of the current cell's
        halite (which is then paid); otherwise the turtle mines instead.

        Args:
            action: one of ``'N'``, ``'E'``, ``'S'``, ``'W'``, ``'O'``.

        Returns:
            A tuple ``([game_map, turtle, turn], halite, done)``.  ``halite``
            is the cumulative amount banked at the base, not a per-step
            reward.

        Raises:
            ValueError: if ``action`` is not one of the known actions.
        """
        if action != 'O' and action not in self._MOVES:
            raise ValueError("unknown action: {!r}".format(action))

        if action == 'O':
            self._mine()
        else:
            cost_halite = self.game_map[self.turtle.position] // 10
            if cost_halite <= self.turtle.halite:
                new_pos = tuple(
                    coord + delta
                    for coord, delta in zip(self.turtle.position,
                                            self._MOVES[action])
                )
                self.turtle = Turtle(new_pos, self.turtle.halite - cost_halite)
            else:
                # Cannot afford the move: the turn is spent mining instead.
                self._mine()

        # Wrap around the map edges.  The position indexes game_map, whose
        # shape is (height, width), so each coordinate wraps on its own axis.
        self.turtle.position = (self.turtle.position[0] % self.height,
                                self.turtle.position[1] % self.width)

        # Standing on the base banks everything the turtle carries.
        if self.turtle.position == self.base:
            self.halite += self.turtle.halite
            self.turtle.halite = 0

        self.turn += 1
        return self._observation()

In [65]:
def policy(game_map, turtle, turn):
    """Baseline policy: pick a random entry of ``actions``.

    The three arguments mirror the state list returned by the game
    (``game_map``, ``turtle``, ``turn``) but are not inspected here.

    Returns:
        One element of the module-level ``actions`` list, drawn with
        ``np.random.choice``.
    """
    return np.random.choice(actions)

Note: `start_pos` should be passed as a tuple rather than a list.

In [66]:
# 5x5 map with the base (and the turtle's starting cell) in the centre.
game = SimpleHalite(height=5, width=5, start_pos=(2, 2))

In [68]:
# Play 100 independent episodes on the same map with the random policy.
for i in range(100):
    game.reset()
    (game_map, turtle, turn), reward, done = game.get_state()
    turn_count = 0
    while not done:
        action = policy(game_map, turtle, turn)
        (game_map, turtle, turn), reward, done = game.step(action)
        turn_count += 1
    # `reward` is the second value returned by the game, i.e. its `halite`
    # total; also show where the turtle ended up and how many steps it took.
    print(reward, turtle.position, turn_count)

18 (3, 4) 99
0 (3, 1) 99
131 (3, 3) 99
754 (2, 1) 99
0 (3, 0) 99
474 (2, 3) 99
276 (2, 2) 99
1100 (2, 1) 99
258 (0, 0) 99
809 (0, 1) 99
560 (2, 0) 99
0 (0, 4) 99
467 (2, 0) 99
382 (2, 4) 99
425 (2, 4) 99
930 (0, 2) 99
18 (1, 2) 99
222 (0, 1) 99
495 (4, 3) 99
1299 (4, 4) 99
185 (2, 0) 99
589 (2, 1) 99
798 (2, 3) 99
791 (1, 2) 99
633 (4, 3) 99
689 (2, 3) 99
1841 (2, 1) 99
416 (0, 3) 99
292 (0, 0) 99
271 (2, 3) 99
0 (4, 4) 99
335 (4, 4) 99
368 (2, 0) 99
821 (0, 2) 99
137 (0, 0) 99
767 (2, 4) 99
399 (3, 1) 99
882 (3, 2) 99
356 (4, 2) 99
426 (1, 2) 99
917 (2, 4) 99
894 (3, 0) 99
609 (4, 4) 99
960 (2, 0) 99
145 (4, 1) 99
729 (3, 2) 99
627 (3, 2) 99
0 (3, 2) 99
80 (0, 2) 99
422 (2, 4) 99
996 (2, 3) 99
258 (4, 3) 99
1310 (1, 3) 99
544 (4, 3) 99
579 (2, 2) 99
305 (0, 0) 99
212 (3, 4) 99
670 (2, 1) 99
606 (0, 1) 99
506 (0, 1) 99
263 (2, 4) 99
286 (3, 1) 99
538 (4, 0) 99
825 (1, 1) 99
431 (1, 3) 99
623 (0, 3) 99
315 (3, 1) 99
1088 (0, 4) 99
764 (2, 2) 99
525 (2, 1) 99
804 (4, 1) 99
1353 (0, 0) 99

We can now simulate the game loop. Next, we need to:
- fix the game map (otherwise the state mapping will become too unwieldy)
- keep track of state-action pairs (memory intensive, but maybe we can manage)
- use state-action pairs from elite runs to update the policy
- add smoothing