In [2]:
import sys
import numpy as np
import random
import matplotlib.pyplot as plt
import string

In [109]:
class Board(object):
    """Rectangular grid of numbered cells with plain-text rendering.

    Cells are numbered 1..width*height in row-major order, so state 1 is the
    first cell of the first row.  ``edges`` maps every direction code to the
    list of states lying on the border that blocks a move in that direction.
    In this file's convention 'e' is the first column and 'w' the last one;
    diagonal entries are the concatenation of their two component borders.
    """

    def __init__(self,width,height,terminal_states,reward_states,teleport_starts,teleport_ends):
        self.valid_states = [n for n in range(1, width*height + 1)]

        first_col = [(row*width)+1 for row in range(height)]
        first_row = [col+1 for col in range(width)]
        last_col = [(row+1)*width for row in range(height)]
        last_row = [(col+((height-1)*width)+1) for col in range(width)]

        borders = {"e": first_col, "n": first_row, "w": last_col, "s": last_row}
        # Diagonals: horizontal border first, then the vertical one.
        for vertical in ('n', 's'):
            for horizontal in ('e', 'w'):
                borders[vertical + horizontal] = borders[horizontal] + borders[vertical]
        self.edges = borders

        self.width = width
        self.height = height
        self.terminal_states = terminal_states
        self.reward_states = reward_states
        self.teleport_starts = teleport_starts
        self.teleport_ends = teleport_ends

    def _blank_grid(self):
        """Return a height x width list of lists filled with '_'."""
        return [['_' for _col in range(self.width)] for _row in range(self.height)]

    def _mark(self, grid, state, symbol):
        """Write ``symbol`` into the grid cell that holds 1-based ``state``."""
        offset = state - 1
        grid[int(offset/self.width)][offset%self.width] = symbol

    def _show(self, grid):
        """Print the grid, one row per line, each cell padded to width 3."""
        lines = []
        for row in grid:
            lines.append(''.join('{:3}'.format(cell) for cell in row))
        print('\n'.join(lines))

    def draw(self,agent_state):
        """Print the board with terminals, rewards, teleports and the agent.

        Terminal cells show 'T'.  Reward cells are lettered A, B, ... in
        iteration order; each teleport pair then takes the next letter, with
        the destination shown as that letter followed by an apostrophe.  The
        agent ('*') is drawn last and overrides whatever is in its cell.
        """
        grid = self._blank_grid()

        for state in self.terminal_states:
            self._mark(grid, state, 'T')

        letters = list(string.ascii_uppercase)
        used = 0
        for state in self.reward_states:
            self._mark(grid, state, letters[used])
            used += 1

        for j, start in enumerate(self.teleport_starts):
            label = letters[used]
            self._mark(grid, start, label)
            self._mark(grid, self.teleport_ends[j], label + "'")
            used += 1

        self._mark(grid, agent_state, "*")
        self._show(grid)

    def draw_policy(self,policy_vals):
        """Print one glyph per cell for a row-major list of direction codes.

        Entries that are not a known direction code leave the cell as '_'.
        """
        val_chars = {
            'e': '<',
            'ne': '\\',
            'n': '^',
            'nw': "/",
            'w': '>',
            'sw': "\\.",
            's': 'v',
            'se': "./",
        }
        grid = self._blank_grid()
        for position, code in enumerate(policy_vals):
            if code in val_chars:
                # Position is 0-based, states are 1-based.
                self._mark(grid, position + 1, val_chars[code])

        self._show(grid)
    

class GridEnvironment(object):
    """Deterministic grid world with optional reward/teleport cells.

    States are the integers 1..width*height in row-major order.  Direction
    codes follow this file's convention: 'e' moves to ``state - 1`` (towards
    the first column), 'w' to ``state + 1``, 'n' to ``state - width`` and
    's' to ``state + width``; diagonals combine the two offsets.

    Args:
        width, height: board dimensions in cells.
        movement_reward: reward for an ordinary move.
        edge_reward: reward for a move that is blocked by the border
            (the agent stays where it is).
        reward_locations: dict ``{start_state: [reward]}`` or
            ``{start_state: [reward, end_state]}``.  With an ``end_state``
            any move out of ``start_state`` teleports there; without it the
            agent moves normally but still collects ``reward``.
        terminal_locations: states that end an episode.
        terminal_reward: reward for a move that lands on a terminal state.
        starting_points: candidate start states; empty means "any state".
        allow_diag: also allow 'ne', 'nw', 'sw' and 'se'.
    """

    def __init__(self,width,height,movement_reward,edge_reward,reward_locations,terminal_locations,terminal_reward,starting_points,allow_diag = False):
        self.movement_reward = float(movement_reward)
        self.edge_reward = float(edge_reward)
        self.valid_states = [x+1 for x in range(width*height)]
        self.valid_actions = ['e','n','w','s']
        if allow_diag:
            self.valid_actions = ['e','ne','n','nw','w','sw','s','se']

        self.reward_locations = reward_locations
        # Entries carrying an end_state are teleports; collect both ends in
        # one pass so the two lists are guaranteed to line up.
        teleports = [(start, spec[1]) for start, spec in reward_locations.items() if len(spec) == 2]
        self.teleporting_starts = [start for start, _end in teleports]
        self.teleporting_ends = [end for _start, end in teleports]

        self.board = Board(width,height,terminal_locations,list(reward_locations),self.teleporting_starts,self.teleporting_ends)
        self.terminal_locations = terminal_locations
        self.terminal_reward = float(terminal_reward)
        self.starting_points = starting_points

    def initialize_agent(self):
        """Return a start state: a random starting point, or any board state."""
        if len(self.starting_points) == 0:
            # randint is inclusive on both ends, so the upper bound is the
            # last valid state (not one past it).
            return random.randint(1, self.board.width*self.board.height)
        return random.choice(self.starting_points)

    def is_terminal_state(self,state):
        """Return True when ``state`` is one of the terminal locations."""
        return state in self.terminal_locations

    def move_result(self,old_state,direction):
        """Return the state reached by moving ``direction`` from ``old_state``.

        Teleport cells override the direction; a move into the border leaves
        the state unchanged.

        Raises:
            ValueError: if ``direction`` is not one of ``valid_actions``.
        """
        if direction not in self.valid_actions:
            raise ValueError("Invalid direction {}".format(direction))

        if old_state in self.teleporting_starts:
            return self.reward_locations[old_state][1]
        if old_state in self.board.edges[direction]:
            return old_state

        width = self.board.width
        offsets = {
            'e': -1,
            'ne': -1 - width,
            'n': -width,
            'nw': -width + 1,
            'w': 1,
            'sw': width + 1,
            's': width,
            'se': width - 1,
        }
        return old_state + offsets[direction]

    def transition_probability(self,old_state,new_state,direction):
        """Return 1.0 if the move deterministically leads to ``new_state``, else 0.0.

        Nothing can be reached from a terminal state.
        """
        if old_state in self.terminal_locations:
            return 0.0
        if new_state == self.move_result(old_state,direction):
            return 1.0
        return 0.0

    def reward(self,old_state,new_state):
        """Return the reward for the transition ``old_state -> new_state``.

        Precedence: a reward cell at ``old_state``, then landing on a
        terminal state, then a blocked move (state unchanged), then the
        ordinary movement reward.
        """
        if old_state in self.reward_locations:
            return self.reward_locations[old_state][0]
        if new_state in self.terminal_locations:
            return self.terminal_reward
        if new_state == old_state:
            # The only way to stay put outside a reward cell is to bump
            # into the border.
            return self.edge_reward
        return self.movement_reward
        

In [185]:
class EGreedyPolicy(object):
    """Epsilon-greedy action selection over a ``"<state>_<action>"`` value table."""

    def __init__(self,epsilon):
        # Probability of picking a uniformly random action instead of the greedy one.
        self.e = epsilon

    def action(self,state,valid_actions,vals,just_val=False):
        """Choose an action for ``state``.

        Args:
            state: current state identifier (used to build value-table keys).
            valid_actions: sequence of action codes to choose from.
            vals: dict mapping ``"{state}_{action}"`` to a learned value.
            just_val: when True, never explore; return the greedy action, or
                the string "NA" if no action of this state has a value yet.

        Returns:
            An element of ``valid_actions`` (or "NA", see ``just_val``).
            Greedy ties go to the action listed first in ``valid_actions``.
        """
        if not just_val and random.random() < self.e:
            return random.choice(valid_actions)

        # Only actions that already have a learned value compete for the
        # arg-max; keeping the action next to its value avoids indexing one
        # list with positions from another.
        known = [a for a in valid_actions if "{}_{}".format(state,a) in vals]
        if not known:
            if just_val:
                return "NA"
            return random.choice(valid_actions)

        return max(known, key=lambda a: vals["{}_{}".format(state,a)])
    

class RLAgent(object):
    """Base class for tabular agents acting in a grid environment.

    Learned values live in ``self.vals``, keyed by ``"{state}_{action}"``.
    Subclasses implement ``move`` to take one step and update ``self.vals``.

    Args:
        env: environment exposing ``valid_actions``, ``valid_states``,
            ``initialize_agent``, ``is_terminal_state`` and a ``board``.
        discount_factor: discount applied to the next state's value.
        alpha: learning rate.
        policy: object with ``action(state, valid_actions, vals, just_val)``.
    """

    def __init__(self,env,discount_factor,alpha,policy):
        self.y = discount_factor
        self.policy = policy
        self.a = alpha
        self.env = env
        self.valid_actions = self.env.valid_actions
        self.valid_states = self.env.valid_states
        self.initialize_vals()

    def initialize_vals(self):
        """Forget everything learned so far."""
        self.vals = {}

    def initialize_episode(self):
        """Place the agent on a start state chosen by the environment."""
        self.curr_state = self.env.initialize_agent()

    def draw_policy(self):
        """Print the greedy action of every state on the environment's board."""
        policy_vals = [self.policy.action(state,self.valid_actions,self.vals,True) for state in self.valid_states]
        self.env.board.draw_policy(policy_vals)

    def print_vals(self):
        """Print the learned value of every state/action pair ("NA" if unseen)."""
        for state in self.valid_states:
            print("{}:".format(state))
            for action in self.valid_actions:
                key = "{}_{}".format(state,action)
                print("\t{}: {}".format(action, self.vals.get(key, "NA")))

    def move(self):
        """Take one step from ``self.curr_state``; subclasses must override.

        Raises:
            NotImplementedError: always, in the base class.  A silent no-op
                here would make ``play_episode`` spin forever.
        """
        raise NotImplementedError("subclasses of RLAgent must implement move()")

    def play_episode(self,draw_board_interval=None,max_iter=None):
        """Run one episode until a terminal state or ``max_iter`` steps.

        Args:
            draw_board_interval: draw the board after every N-th step
                (starting with the first); None or 0 disables drawing.
            max_iter: optional cap on the number of steps.
        """
        self.initialize_episode()

        i = 0
        while not self.env.is_terminal_state(self.curr_state) and (max_iter is None or i < max_iter):
            self.move()
            if draw_board_interval and i%draw_board_interval == 0:
                print("Step #{}".format(i+1))
                self.env.board.draw(self.curr_state)
                # Two blank lines separate consecutive board drawings.
                print("")
                print("")
            i += 1

        return
    
class SarsaAgent(RLAgent):
    """One-step SARSA: Q(s,a) += alpha * (r + gamma * Q(s',a') - Q(s,a)).

    The action a' sampled for the update is remembered and executed on the
    next step, so the value being bootstrapped from belongs to the action
    the agent really takes (the on-policy requirement of SARSA).
    """

    def initialize_episode(self):
        """Start a new episode and drop any action planned in the previous one."""
        super(SarsaAgent, self).initialize_episode()
        self._planned = None

    def move(self):
        """Take one step from ``self.curr_state`` and update ``self.vals``."""
        # (state, action) chosen as a' during the previous update, if any.
        planned = getattr(self, "_planned", None)
        if planned is not None and planned[0] == self.curr_state:
            action = planned[1]
        else:
            action = self.policy.action(self.curr_state,self.valid_actions,self.vals)

        key = "{}_{}".format(self.curr_state,action)
        old_val = self.vals.setdefault(key, 0.0)

        new_state = self.env.move_result(self.curr_state,action)
        reward = self.env.reward(self.curr_state,new_state)

        new_action = self.policy.action(new_state,self.valid_actions,self.vals)
        new_key = "{}_{}".format(new_state,new_action)
        # Unseen state/action pairs are valued at 0.0.
        new_state_val = self.vals.get(new_key, 0.0)

        self.vals[key] = old_val + self.a * (reward + self.y*new_state_val - old_val)
        self.curr_state = new_state
        self._planned = (new_state, new_action)

        return
    
class QLearningAgent(RLAgent):
    """One-step Q-learning: bootstrap from the best value of the next state."""

    def move(self):
        """Take one policy-chosen step and apply the Q-learning update.

        State/action pairs that have no entry in ``self.vals`` count as 0.0.
        """
        state = self.curr_state
        chosen = self.policy.action(state,self.valid_actions,self.vals)

        q_key = "{}_{}".format(state,chosen)
        current = self.vals.setdefault(q_key, 0.0)

        next_state = self.env.move_result(state,chosen)
        step_reward = self.env.reward(state,next_state)

        # Off-policy target: the largest value available from the next state.
        best_next = max(
            self.vals.get("{}_{}".format(next_state,candidate), 0.0)
            for candidate in self.valid_actions
        )

        self.vals[q_key] = current + self.a * (step_reward + self.y*best_next - current)
        self.curr_state = next_state

        return

class TDLamAgent(RLAgent):
    """SARSA-style TD(lambda) agent with accumulating eligibility traces.

    Every step computes delta = r + gamma * Q(s',a') - Q(s,a), adds
    alpha * delta * trace to every traced state/action pair, and then decays
    each trace by gamma * lambda.  Traces that fall below ``elig_cutoff``
    are dropped to keep the table small.

    Args:
        env, discount_factor, alpha, policy: see ``RLAgent``.
        lam: trace decay rate (lambda).
        elig_cutoff: traces smaller than this are discarded.
    """

    def __init__(self,env,discount_factor,alpha,policy,lam,elig_cutoff = 0.01):
        self.l = lam
        self.elig_cutoff = elig_cutoff
        super(TDLamAgent, self).__init__(env,discount_factor,alpha,policy)

    def initialize_vals(self):
        """Forget all learned values, traces and any planned action."""
        self.vals = {}
        self.elig_traces = {}
        self._planned = None

    def initialize_episode(self):
        """Start a new episode with empty traces.

        Traces describe how recently a pair was visited *within* an episode;
        carrying them over would credit the previous episode's moves with
        the new episode's TD errors.
        """
        super(TDLamAgent, self).initialize_episode()
        self.elig_traces = {}
        self._planned = None

    def move(self):
        """Take one step from ``self.curr_state`` and update values and traces."""
        # Execute the action that was sampled as a' on the previous step so
        # the update stays on-policy; sample afresh at the start of an episode.
        planned = getattr(self, "_planned", None)
        if planned is not None and planned[0] == self.curr_state:
            action = planned[1]
        else:
            action = self.policy.action(self.curr_state,self.valid_actions,self.vals)

        key = "{}_{}".format(self.curr_state,action)
        old_val = self.vals.setdefault(key, 0.0)

        new_state = self.env.move_result(self.curr_state,action)
        reward = self.env.reward(self.curr_state,new_state)

        new_action = self.policy.action(new_state,self.valid_actions,self.vals)
        new_key = "{}_{}".format(new_state,new_action)
        new_state_val = self.vals.get(new_key, 0.0)
        delta = reward + self.y*new_state_val - old_val

        # Accumulating trace for the pair just visited.
        self.elig_traces[key] = self.elig_traces.get(key, 0) + 1

        decay = self.y * self.l
        # Iterate over a snapshot because entries are deleted inside the loop.
        for k, trace in list(self.elig_traces.items()):
            self.vals[k] = self.vals.get(k, 0.0) + self.a * delta * trace
            decayed = decay * trace
            if decayed < self.elig_cutoff:
                del self.elig_traces[k]
            else:
                self.elig_traces[k] = decayed

        self.curr_state = new_state
        self._planned = (new_state, new_action)

        return

class TDQAgent(TDLamAgent):
    """Q-learning target combined with eligibility traces.

    The TD error bootstraps from the best value of the next state, and is
    then spread over all traced state/action pairs.  Traces are decayed
    after every step regardless of whether the executed action was greedy
    (they are not cut on exploratory moves).
    """

    def move(self):
        """Take one policy-chosen step and update values and traces."""
        action = self.policy.action(self.curr_state,self.valid_actions,self.vals)

        key = "{}_{}".format(self.curr_state,action)
        old_val = self.vals.setdefault(key, 0.0)

        new_state = self.env.move_result(self.curr_state,action)
        reward = self.env.reward(self.curr_state,new_state)

        # Unseen state/action pairs are valued at 0.0.
        max_val = max(
            self.vals.get("{}_{}".format(new_state,a), 0.0)
            for a in self.valid_actions
        )
        delta = reward + self.y*max_val - old_val

        # Accumulating trace for the pair just visited.
        self.elig_traces[key] = self.elig_traces.get(key, 0) + 1

        decay = self.y * self.l
        # dict.items() exists on both Python 2 and 3 (iteritems does not);
        # the list() snapshot makes deleting inside the loop safe.
        for k, trace in list(self.elig_traces.items()):
            self.vals[k] = self.vals.get(k, 0.0) + self.a * delta * trace
            decayed = decay * trace
            if decayed < self.elig_cutoff:
                del self.elig_traces[k]
            else:
                self.elig_traces[k] = decayed

        self.curr_state = new_state

        return


In [191]:
# 5x6 board, -1 per move (and per edge bump), no special reward cells,
# terminal states in two opposite corners, random start, no diagonal moves.
grid = GridEnvironment(
    width=5,
    height=6,
    movement_reward=-1,
    edge_reward=-1,
    reward_locations={},
    terminal_locations=[1,30],
    terminal_reward=0,
    starting_points=[],
    allow_diag=False,
)
policy = EGreedyPolicy(epsilon=0.05)
agent = TDQAgent(
    env=grid,
    discount_factor=0.9,
    alpha=0.35,
    policy=policy,
    lam=0.01,
    elig_cutoff=0.10,
)

In [194]:
# Train for 1000 episodes. Learned values stay on `agent`, so re-running this
# cell continues training from where the previous run stopped.
# Uncomment the next line to discard the learned values before training:
# agent.initialize_vals()
for x in range(1000):
    agent.play_episode()


In [195]:
# Print the greedy action learned for every cell, one glyph per state;
# cells without any learned value stay '_'.
agent.draw_policy()

_  <  <  <  <  
^  <  ^  ^  v  
^  ^  ^  >  v  
^  <  >  v  v  
^  v  v  >  v  
>  >  >  >  _  
