In [19]:
import numpy as np
import matplotlib.pyplot as plt
import random
from four_rooms import FourRoomsEnvironment
from option import Option

""" Simple agent planning using Q-learning """
class SMDPQLearningAgent(object):
    """Simple agent planning using Q-learning over options (SMDP Q-learning).

    The agent owns a fixed list of options, a table ``Q`` of option values
    indexed by ``[state index, option index]`` and a table ``N`` counting how
    many times each (state, option) entry has been updated. While an option
    is running, the agent accumulates the discounted reward it collects; when
    the option ends, the value of that option in the state where it was
    started is moved towards the observed return.
    """

    # Side length used to flatten a two-component state into a table row.
    # Shared by the table allocation in __init__ and by _sIdx.
    GRID_SIZE = 13

    def __init__(self, gamma=0.9):
        """Build the option set and the zero-initialised tables.

        Args:
            gamma: Discount factor, 0.9 by default as in paper.
        """
        option_names = [
            "left", "up", "right", "down",
            "topleft->topright", "topleft->botleft",
            "topright->topleft", "topright->botright",
            "botleft->topleft", "botleft->botright",
            "botright->botleft", "botright->topright",
        ]
        self.options = [Option(name) for name in option_names]

        self.gamma = gamma
        self.current_option = None
        self.starting_state = None  # Starting state of current option
        self.k = 0  # Number of time steps elapsed in current option
        self.cumulative_reward = 0  # Discounted reward of current option

        # Option value table and occurrence counts table
        n_states = self.GRID_SIZE * self.GRID_SIZE
        n_options = len(self.options)
        self.Q = np.zeros((n_states, n_options))
        self.N = np.zeros((n_states, n_options))

    def epsilonGreedyPolicy(self, state, epsilon=0.1):
        """Return the action to take in ``state``.

        If no option is currently being followed, a new one is chosen
        epsilon-greedily first. The action is then the one proposed by the
        current option's ``pickAction``.

        Args:
            state: Current environment state.
            epsilon: Probability of choosing a random available option when
                a new option has to be picked.

        Returns:
            The first element of ``current_option.pickAction(state)``.
        """
        if self.current_option is None:
            self._pickNewOptionEpsilonGreedily(state, epsilon)

        action, _ = self.current_option.pickAction(state)
        return action

    def recordTransition(self, state, reward, next_state, done=False):
        """Account for one environment step taken under the current option.

        Args:
            state: Unused; the update is applied to the starting state of
                the finishing option, which is recorded in the agent.
            reward: Reward received for the step.
            next_state: State reached after the step.
            done: Pass True when the episode ended with this step. The
                current option is then finished even if its ``beta`` does
                not terminate in ``next_state``, so that it cannot leak
                into the next episode. Defaults to False, which keeps the
                previous behaviour.
        """
        # Discount by gamma^k first, then advance the option's step counter
        self.cumulative_reward += (self.gamma ** self.k) * reward
        self.k += 1

        if done or self.current_option.beta[next_state] == 1:
            self._updateQValue(next_state, terminal=done)
            self._resetCurrentOption()

    def _updateQValue(self, next_state, terminal=False):
        """Update Q for (starting state, current option) with a 1/N step size.

        Args:
            next_state: State in which the current option ended.
            terminal: When True the episode is over, so nothing is
                bootstrapped from ``next_state``.
        """
        s1 = self._sIdx(self.starting_state)
        o = self._oIdx(self.current_option)

        self.N[s1, o] += 1
        alpha = 1. / self.N[s1, o]

        if terminal:
            bootstrap = 0.0
        else:
            # NOTE(review): the max ranges over every option, including
            # ones whose initiation set excludes next_state -- confirm this
            # is acceptable for the reward scheme in use.
            bootstrap = np.max(self.Q[self._sIdx(next_state)])

        # k steps have elapsed, hence the gamma^k factor on the bootstrap
        target = self.cumulative_reward + (self.gamma ** self.k) * bootstrap
        self.Q[s1, o] += alpha * (target - self.Q[s1, o])

    def _pickNewOptionEpsilonGreedily(self, state, epsilon):
        """Choose a new current option among those available in ``state``.

        With probability (1 - epsilon) the available option with the highest
        Q value is taken (the earliest one in ``self.options`` on ties);
        otherwise an available option is drawn uniformly at random. The
        state is recorded as the option's starting state.

        Raises:
            ValueError: If no option has ``I[state] == 1``.
        """
        s = self._sIdx(state)
        available = [i for i, option in enumerate(self.options)
                     if option.I[state] == 1]
        if not available:
            raise ValueError(
                "No option is available in state {}".format(state))

        if random.uniform(0, 1) > epsilon:
            # Only available options compete for the greedy choice; max()
            # returns the first maximal element, so ties go to the lowest
            # index.
            chosen = max(available, key=lambda i: self.Q[s, i])
        else:
            chosen = random.choice(available)

        self.current_option = self.options[chosen]
        self.starting_state = state

    def _sIdx(self, state):
        """Flatten a two-component state into its row index in Q and N."""
        return state[0] * self.GRID_SIZE + state[1]

    def _oIdx(self, option):
        """Return the column index of ``option`` in Q and N."""
        return self.options.index(option)

    def _resetCurrentOption(self):
        """Forget the current option and its per-option bookkeeping."""
        self.k = 0
        self.cumulative_reward = 0
        self.current_option = None
        self.starting_state = None

In [20]:
# Seed both global RNGs so that training and the printed Q-values can be
# reproduced on Restart & Run All.
# NOTE(review): assumes FourRoomsEnvironment / Option draw their randomness
# from the global `random` or `numpy.random` generators -- confirm.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)

# Environment and agent shared by run_episode and the cells below
env = FourRoomsEnvironment()
agent = SMDPQLearningAgent()

def run_episode(verbose=False):
    """Run one episode of the global ``agent`` in the global ``env``.

    Each step asks the agent for an action, applies it to the environment
    and reports the transition back to the agent, until the environment
    signals that the episode is done.

    Args:
        verbose: When True, print the state, the agent's current option and
            the chosen action before every step.
    """
    state = env.reset()
    done = False
    while not done:
        action = agent.epsilonGreedyPolicy(state)
        if verbose:
            print("State = {}, Option = {}, Action = {}".format(
                state, agent.current_option, action))
        next_state, reward, done = env.step(action)
        agent.recordTransition(state, reward, next_state)
        state = next_state

    # The episode can end while the current option has not terminated on
    # its own. Without this, the option, its starting state, its step count
    # and its accumulated reward would carry over into the next episode,
    # which starts from env.reset(). Flush the pending update and clear the
    # option instead.
    # NOTE(review): the update bootstraps from Q at the final state, which
    # presumably stays at 0 because no option is started there -- confirm.
    if agent.current_option is not None:
        agent._updateQValue(state)
        agent._resetCurrentOption()
            
# Number of training episodes to run before inspecting the learned values
N_TRAINING_EPISODES = 10000

# `range` works on both Python 2 and Python 3 (`xrange` exists only on 2)
for _ in range(N_TRAINING_EPISODES):
    run_episode()

In [21]:
# Run one more episode, printing the state, option and action at every step
run_episode(verbose=True)
# Blank separator line. A bare `print` only prints on Python 2; on Python 3
# it is an expression that does nothing. print("") behaves the same on both.
print("")
# Learned option values (one entry per option, in agent.options order) for
# two states of interest
print(agent.Q[agent._sIdx((1,1))])
print(agent.Q[agent._sIdx((3,6))])

State = (1, 1), Option = topleft->topright, Action = right
State = (2, 1), Option = topleft->topright, Action = right
State = (3, 1), Option = topleft->topright, Action = right
State = (3, 1), Option = topleft->topright, Action = right
State = (2, 1), Option = topleft->topright, Action = right
State = (2, 2), Option = topleft->topright, Action = right
State = (2, 3), Option = topleft->topright, Action = right
State = (2, 2), Option = topleft->topright, Action = right
State = (2, 3), Option = topleft->topright, Action = right
State = (2, 4), Option = topleft->topright, Action = right
State = (2, 5), Option = topleft->topright, Action = down
State = (3, 5), Option = topleft->topright, Action = right
State = (3, 6), Option = topright->botright, Action = right
State = (3, 6), Option = topright->botright, Action = right
State = (3, 5), Option = right, Action = right
State = (3, 6), Option = topright->botright, Action = right
State = (3, 6), Option = topright->botright, Action = right
State 