## 2 Learning a policy via MC - Policy Iteration
For the following, work with last week's implementation of your own gridworld! You may revise/change pieces of it, or of course ask other groups for access to their implementation.

* Implement tabular MC-estimate Policy Iteration
* Measure average Return-per-Episode and plot it against (1) episodes sampled, and (2) wallclock-time

For an outstanding submission:
* Visualize the State-Action Values in your gridworld during training at regular intervals, and provide a visualization of them (e.g. a series of images, best combine them into a short video clip)

### GridWorld Code (refactored) From HW 1:

#### Global Variables, Static Functions

In [1]:
import random
# The action set A: each direction is encoded as a small integer constant.
UP, LEFT, DOWN, RIGHT = range(4)
# Every action, in the order in which neighbouring tiles are enumerated.
DIRECTIONS = [UP, LEFT, DOWN, RIGHT]

def distance_between_states(state_1, state_2):
    """Return the Manhattan distance between two (row, col) states.

    The agent's policy uses this, and so does the placement of walls and
    warps, which must stay away from the start and end states.
    """
    row_gap = abs(state_2[0] - state_1[0])
    col_gap = abs(state_2[1] - state_1[1])
    return row_gap + col_gap
    
def direction_arithmetic(curr_pos, direction):
    """Return the (row, col) reached by stepping once from ``curr_pos``.

    Rows grow downwards and columns grow to the right, so UP decreases the
    row and LEFT decreases the column. No bounds checking happens here.

    Raises:
        Exception: if ``direction`` is not one of UP, LEFT, DOWN, RIGHT.
    """
    row, col = curr_pos
    # (direction, (row offset, col offset)) pairs, scanned in order.
    steps = (
        (UP, (-1, 0)),
        (LEFT, (0, -1)),
        (DOWN, (1, 0)),
        (RIGHT, (0, 1)),
    )
    for known_direction, (row_step, col_step) in steps:
        if direction == known_direction:
            return (row + row_step, col + col_step)
    raise Exception(f"Unrecognized direction: {direction}")

def average(l):
    """Return the arithmetic mean of the numbers in ``l``.

    Raises:
        ZeroDivisionError: if ``l`` is empty.
    """
    total = sum(l)
    count = len(l)
    return total / count

def print_matrix(m):
    """Print a 2-D matrix of numbers, one matrix row per output line.

    Every value is formatted with three decimal places and followed by a
    tab character (including the last value of each row).
    """
    for row in m:
        print("".join(f"{value:.3f}\t" for value in row))

#### Agent Classes

In [2]:
class Agent:
    """Base agent that picks uniformly at random among its available actions.

    Attributes:
        state: the agent's current (row, col) position.
        available_actions: the actions currently open to the agent. The agent
            never computes these itself; they are assigned from outside.
        available_next_states: the state each available action leads to,
            index-aligned with ``available_actions``.
    """

    def __init__(self, state=(0, 0)):
        self.state = state
        self.available_actions = []
        self.available_next_states = []

    def get_action_from_policy(self):
        """Return one of ``available_actions``, chosen uniformly at random.

        Raises:
            IndexError: if ``available_actions`` is empty.
        """
        return random.choice(self.available_actions)

    def take_action(self, action):
        """Move to the next state paired with ``action``.

        Raises:
            ValueError: if ``action`` is not in ``available_actions``.
        """
        # Fixed: this used to read ``self.available_action`` (missing "s"),
        # which raised AttributeError on every call.
        action_index = self.available_actions.index(action)
        self.state = self.available_next_states[action_index]

    def reset(self, state=(0, 0)):
        """Put the agent back at ``state`` and forget its available moves."""
        self.state = state
        self.available_actions = []
        self.available_next_states = []
        
class MagneticAgent(Agent):
    """Agent that prefers actions bringing it closer to the win state,
       regardless of what obstacles are in the way.

       NOTE: ``win_state`` defaults to (0, 0). The agent only knows the target
       it was constructed with, so pass the grid's real terminal state."""

    def __init__(self, win_state=(0, 0), start_state=(0, 0)):
        # Reuse the base initializer rather than duplicating its assignments.
        super().__init__(state=start_state)
        self.win_state = win_state

    def get_action_from_policy(self):
        """Pick one available action at random, weighted towards the win state.

           Each action's weight is 1 / (d + 1), where d is the Manhattan
           distance from the resulting state to ``win_state``. The +1 keeps
           the weight finite when an action lands exactly on the win state.
           ``random.choices`` accepts relative weights, so they are not
           normalized here.

           Raises:
               IndexError: if there are no available actions."""
        weights = [
            1 / (distance_between_states(next_state, self.win_state) + 1)
            for next_state in self.available_next_states
        ]
        return random.choices(self.available_actions, weights=weights)[0]


#### GridWorld Class

In [3]:
class GridWorld:
    """Rectangular grid environment with one agent, a win tile and obstacles.

    Each cell of ``grid`` holds a display character: " " (open), "A" (agent),
    "W" (win tile), "█" (wall) or "*" (warp).
    """

    # How many walls / warps a complex grid receives on reset().
    _NUM_WALLS = 2
    _NUM_WARPS = 2

    def __init__(self, height, width, complex=False):
        """Initialize the grid with properties we expect to not change
           during the game.

           Args:
               height: number of rows.
               width: number of columns.
               complex: when true, reset() also places walls and warps."""
        self.height = height
        self.width = width
        self.complex = complex
        self.walls = []
        self.warps = []
        self.grid = [[0 for _ in range(width)] for _ in range(height)]
        self.agent = Agent()

        self.reset()

    def set_agent(self, agent):
        """Replace the grid's agent.

           Only the reference is swapped: the new agent's available actions
           and the "A" marker on the grid are not refreshed until reset()
           (or a successful move) runs."""
        self.agent = agent

    def random_position(self, rng=None):
        """Pick out a random tile.

           Args:
               rng: optional random.Random-like object to draw from. When
                   omitted, the module-level ``random`` generator is used."""
        source = random if rng is None else rng
        rand_row = source.randint(0, self.height - 1)
        rand_col = source.randint(0, self.width - 1)
        return (rand_row, rand_col)

    def tile_is_open(self, tile):
        """Return True if the tile currently shows the open marker " "."""
        return self.grid[tile[0]][tile[1]] == " "

    def _can_hold_complexity(self, tile):
        """Return True if a wall or warp may be placed on ``tile``: it must be
           open and more than one step away from both the win state and the
           agent."""
        return (self.tile_is_open(tile) and
                distance_between_states(tile, self.win_state) > 1 and
                distance_between_states(tile, self.agent.state) > 1)

    def spawn_complexity_randomly(self, complexity, seed=None):
        """Try once to place a wall or warp on a randomly chosen tile.

           The tile is drawn from a private generator seeded with ``seed``, so
           the same seed always proposes the same tile and the module-level
           ``random`` state is left untouched. Nothing is placed if the
           proposed tile is not eligible.

           Args:
               complexity: "wall" or "warp".
               seed: seed for the tile draw; None seeds from system entropy.

           Raises:
               Exception: if the tile is eligible but ``complexity`` is
                   neither "wall" nor "warp"."""
        # random.Random(seed) yields the same stream that random.seed(seed)
        # would give the global generator, so tile choices are unchanged.
        rng = random.Random(seed)
        tile = self.random_position(rng)
        if not self._can_hold_complexity(tile):
            return
        if complexity == "wall":
            self.walls.append(tile)
            self.grid[tile[0]][tile[1]] = "█"
        elif complexity == "warp":
            self.warps.append(tile)
            self.grid[tile[0]][tile[1]] = "*"
        else:
            raise Exception(f"Unrecognized complexity: {complexity}!")

    def reset(self):
        """Reset the GridWorld. Send the agent back to the corner. Set up
           walls and warps.

           Raises:
               ValueError: if the grid is complex but has too few eligible
                   tiles to hold all walls and warps."""
        for row in self.grid:
            for col in range(len(row)):
                row[col] = " "
        self.win_state = (self.height - 1, self.width - 1)
        self.grid[self.win_state[0]][self.win_state[1]] = "W"

        self.agent.reset()
        self.grid[self.agent.state[0]][self.agent.state[1]] = "A"
        self.update_valid_next_actions_and_states()

        # Add complexities (walls and warps). Placement walks through the
        # seeds 0, 1, 2, ... so it is random-looking but consistent for a
        # given grid size. This helps make the value function more specific
        # to one grid.
        self.walls = []
        self.warps = []
        if self.complex:
            needed = self._NUM_WALLS + self._NUM_WARPS
            eligible = sum(
                1
                for row in range(self.height)
                for col in range(self.width)
                if self._can_hold_complexity((row, col))
            )
            if eligible < needed:
                # Fail fast: the placement loops below could never finish.
                raise ValueError(
                    f"A {self.height}x{self.width} grid has only {eligible} "
                    f"tile(s) that can hold a wall or warp; {needed} are "
                    f"required."
                )
            iteration = 0
            while len(self.walls) < self._NUM_WALLS:
                self.spawn_complexity_randomly("wall", iteration)
                iteration += 1
            while len(self.warps) < self._NUM_WARPS:
                self.spawn_complexity_randomly("warp", iteration)
                iteration += 1

    def valid(self, state):
        """Checks to see if a state lies within the bounds of the grid."""
        row, col = state
        return (row >= 0 and row < self.height) and (col >= 0 and col < self.width)

    def update_valid_next_actions_and_states(self):
        """From the agent's state, look around and see what directions are
           possible, then store them on the agent.

           Only the grid bounds are checked: a direction that leads into a
           wall or a warp is still listed as available."""
        valid_actions = []
        valid_states = []
        for direction in DIRECTIONS:
            target_state = direction_arithmetic(self.agent.state, direction)
            if self.valid(target_state):
                valid_actions.append(direction)
                valid_states.append(target_state)
        self.agent.available_actions = valid_actions
        self.agent.available_next_states = valid_states

    def reward_from_state(self, state, direction):
        """Reward function given state and action. Penalizes warps more than walls.
           No penalty for simply moving to an open space.

           Returns:
               1 for stepping onto the win state, -0.25 for a wall,
               -0.5 for a warp, and 0 otherwise."""
        target_state = direction_arithmetic(state, direction)
        if target_state == self.win_state:
            return 1
        if target_state in self.walls:
            return -0.25
        if target_state in self.warps:
            return -0.5
        return 0

    def reward(self, direction):
        """Same as reward_from_state, but from the agent's state."""
        return self.reward_from_state(self.agent.state, direction)

    def move(self, direction):
        """Try to move in a given direction. Hitting a wall (or the edge of
           the grid) will leave the agent where it is. Hitting a warp will
           send the agent back to the starting corner."""
        target_state = direction_arithmetic(self.agent.state, direction)
        if self.valid(target_state) and target_state not in self.walls:
            self.grid[self.agent.state[0]][self.agent.state[1]] = " "

            # go back to the beginning if you hit a warp tile
            if target_state in self.warps:
                self.agent.state = (0, 0)
            else:
                self.agent.state = target_state
            self.grid[self.agent.state[0]][self.agent.state[1]] = "A"
            self.update_valid_next_actions_and_states()

    def __repr__(self):
        """For printing but mainly for debugging"""
        border = "==" * self.width + "="
        lines = []
        for row in range(self.height):
            lines.append(border)
            cells = "".join(f"|{self.grid[row][col]}" for col in range(self.width))
            lines.append(cells + "|")
        lines.append(border)
        return "\n".join(lines)

### Run Episodes

In [4]:
"""GAMMA = 0.95
HEIGHT = 5
WIDTH = 5
V = [[0 for _ in range(WIDTH)] for _ in range(HEIGHT)]
returns = [[ [] for _ in range(WIDTH)] for _ in range(HEIGHT)]
g = GridWorld(HEIGHT, WIDTH, complex=True)
a = MagneticAgent()
g.set_agent(a)"""

'GAMMA = 0.95\nHEIGHT = 5\nWIDTH = 5\nV = [[0 for _ in range(WIDTH)] for _ in range(HEIGHT)]\nreturns = [[ [] for _ in range(WIDTH)] for _ in range(HEIGHT)]\ng = GridWorld(HEIGHT, WIDTH, complex=True)\na = MagneticAgent()\ng.set_agent(a)'

In [5]:
def monte_carlo_simulation(height, width, gamma, agent, num_episodes):
    """Estimate state values with every-visit Monte Carlo under the agent's policy.

    A complex GridWorld of the given size is built, ``agent`` is placed in it,
    and ``num_episodes`` episodes are played from reset until the agent
    reaches the win state. After each episode the discounted return observed
    from every visited state is recorded, and V is the running average of
    those returns.

    Args:
        height: number of grid rows.
        width: number of grid columns.
        gamma: discount factor applied per time step.
        agent: the agent whose policy is evaluated.
        num_episodes: number of episodes to sample.

    Returns:
        A height x width nested list with the estimated value of each state
        (0 for states that were never visited).
    """
    V = [[0 for _ in range(width)] for _ in range(height)]
    returns = [[[] for _ in range(width)] for _ in range(height)]
    g = GridWorld(height, width, complex=True)
    g.set_agent(agent)

    # Fixed: the loop bound used to be a hard-coded 500, ignoring num_episodes.
    for completed_episodes in range(1, num_episodes + 1):
        g.reset()
        episode = []  # one (state, immediate reward) pair per time step

        # The agent acts as long as it hasn't reached the terminal state.
        while g.agent.state != g.win_state:
            selected_action = g.agent.get_action_from_policy()
            # The reward must be read before moving: it depends on the state
            # the action is taken from.
            reward_from_action = g.reward(selected_action)
            episode.append((g.agent.state, reward_from_action))
            g.move(selected_action)

        # Backward pass: G_t = r_t + gamma * G_{t+1}. This replaces the old
        # per-step propagation, whose index ``-1*i`` started at 0 and so hit
        # the FIRST visited state instead of the most recent one.
        episode_returns = []
        discounted_return = 0
        for _, reward in reversed(episode):
            discounted_return = reward + gamma * discounted_return
            episode_returns.append(discounted_return)
        episode_returns.reverse()  # back to chronological order

        # Record every visit, then refresh V once per distinct visited state.
        touched_states = set()
        for (state, _), state_return in zip(episode, episode_returns):
            returns[state[0]][state[1]].append(state_return)
            touched_states.add(state)
        for row, col in touched_states:
            V[row][col] = average(returns[row][col])

        if completed_episodes % 10 == 0:
            print(f"Completed episodes: {completed_episodes}")
    return V

In [6]:
#V = monte_carlo_simulation(5, 5, 0.95, Agent(), 500)
#print_matrix(V)

In [None]:
# MagneticAgent defaults to win_state=(0, 0), but a 5x5 GridWorld puts its
# terminal tile at (height-1, width-1) = (4, 4). Pass the real target so the
# agent is pulled towards the goal instead of back to the start corner.
V = monte_carlo_simulation(5, 5, 0.95, MagneticAgent(win_state=(4, 4)), 500)
print_matrix(V)

Completed episodes: 10
Completed episodes: 20
Completed episodes: 30
Completed episodes: 40
Completed episodes: 50
Completed episodes: 60
Completed episodes: 70
Completed episodes: 80
Completed episodes: 90
Completed episodes: 100
Completed episodes: 110
Completed episodes: 120
Completed episodes: 130
Completed episodes: 140
Completed episodes: 150
Completed episodes: 160
Completed episodes: 170
Completed episodes: 180
Completed episodes: 190
Completed episodes: 200
Completed episodes: 210
