In [2]:
import random

## Model-based vs Model-free

Model-based reinforcement learning needs to **know the model**; in particular, we have access to $P_a(s' \mid s)$ and $r(s,a,s')$.

In model-free reinforcement learning, by contrast, we do not know the transitions or the rewards. Instead, we **learn through experience** by trying actions and seeing what the results are, which makes this a machine learning problem.

In [None]:
class MDP:
    """Interface for a Markov decision process, plus simulation helpers.

    The query methods (``get_states`` ... ``get_goal_states``) are
    placeholders that return ``None``; concrete MDPs override them.
    ``execute`` and ``execute_policy`` are implemented purely in terms of
    those query methods.
    """

    def get_states(self):
        """Return all states of this MDP."""

    def get_actions(self, state):
        """Return all actions with non-zero probability from this state."""

    def get_transitions(self, state, action):
        """Return all non-zero probability transitions for this action
        from this state, as a list of (state, probability) pairs.
        """

    def get_reward(self, state, action, next_state):
        """Return the reward for transitioning from state to next_state
        via action.
        """

    def is_terminal(self, state):
        """Return true if and only if state is a terminal state of this MDP."""

    def get_discount_factor(self):
        """Return the discount factor for this MDP."""

    def get_initial_state(self):
        """Return the initial state of this MDP."""

    def get_goal_states(self):
        """Return all goal states of this MDP."""

    def _outcome(self, state, action, new_state):
        """Build the (new_state, reward, is_terminal) triple for a transition."""
        reward = self.get_reward(state, action, new_state)
        return (new_state, reward, self.is_terminal(new_state))

    def execute(self, state, action):
        """Sample one outcome of executing ``action`` in ``state``.

        The outcome is drawn from the distribution returned by
        ``get_transitions(state, action)``. This can be used for model-free
        learning methods, but requires a model to operate. Override for
        simulation-based learning.

        Returns:
            A tuple ``(new_state, reward, is_terminal)``.

        Raises:
            RuntimeError: if the transitions do not cover the sampled point,
                i.e. there are none or their probabilities sum to clearly
                less than 1.
        """
        rand = random.random()  # uniform in [0.0, 1.0)
        cumulative_probability = 0.0
        last_possible = None  # 1-tuple, so that a state equal to None still works
        for new_state, probability in self.get_transitions(state, action):
            cumulative_probability += probability
            if rand < cumulative_probability:
                return self._outcome(state, action, new_state)
            if probability > 0.0:
                last_possible = (new_state,)

        # Probabilities such as ten 0.1s sum to just under 1.0 in floating
        # point; a draw landing in that sliver belongs to the last outcome.
        if last_possible is not None and abs(cumulative_probability - 1.0) <= 1e-9:
            return self._outcome(state, action, last_possible[0])

        raise RuntimeError(
            f"No outcome state in simulation for action {action} from {state}"
        )

    def execute_policy(self, policy, episodes=100, max_step=100):
        """Execute a policy on this mdp for a number of episodes.

        Args:
            policy: object exposing ``select_action(state, actions)``.
            episodes: number of episodes to simulate.
            max_step: maximum number of steps simulated per episode.

        Returns:
            A list with one discounted cumulative reward per episode.
        """
        # Use the interface accessor: this class defines no
        # ``discount_factor`` attribute of its own.
        discount_factor = self.get_discount_factor()
        cumulative_rewards = []
        for _ in range(episodes):
            cumulative_reward = 0.0
            state = self.get_initial_state()
            step = 0
            while step < max_step and not self.is_terminal(state):
                actions = self.get_actions(state)
                action = policy.select_action(state, actions)
                state, reward, _done = self.execute(state, action)
                cumulative_reward += reward * (discount_factor ** step)
                step += 1
            cumulative_rewards.append(cumulative_reward)
        return cumulative_rewards

## Q-learning: Off-policy Temporal-difference Learning

In [1]:
class ModelFreeLearner:
    """Base class for model-free learners."""

    def execute(self, eposodes=2000):
        """Run the learner; this base implementation does nothing.

        NOTE(review): the parameter is spelled ``eposodes`` here while the
        override in ``TempralDifferenceLearner`` uses ``episodes``. The
        spelling is kept so existing keyword callers keep working.
        """


class TempralDifferenceLearner(ModelFreeLearner):
    """Temporal-difference learner driven by an MDP simulator.

    Collaborators, as used by this class:
        mdp: provides ``get_initial_state()``, ``get_actions(state)``,
            ``is_terminal(state)``, ``execute(state, action)`` and
            ``get_discount_factor()``.
        bandit: provides ``select(state, actions, qfunction)``.
        qfunction: provides ``get_q_value(state, action)`` and
            ``update(state, action, delta)``.

    Subclasses supply ``state_value`` to choose the bootstrap target.
    """

    def __init__(self, mdp, bandit, qfunction):
        self.mdp = mdp
        self.bandit = bandit
        self.qfunction = qfunction

    def execute(self, episodes=2000):
        """Run ``episodes`` learning episodes.

        Returns:
            A list with the discounted cumulative reward of each episode.
        """
        discount_factor = self.mdp.get_discount_factor()
        rewards = []
        for _ in range(episodes):
            state = self.mdp.get_initial_state()
            actions = self.mdp.get_actions(state)
            action = self.bandit.select(state, actions, self.qfunction)

            episode_reward = 0.0
            step = 0
            while not self.mdp.is_terminal(state):
                next_state, reward, _done = self.mdp.execute(state, action)
                actions = self.mdp.get_actions(next_state)
                next_action = self.bandit.select(next_state, actions, self.qfunction)

                delta = self.get_delta(reward, state, action, next_state, next_action)
                self.qfunction.update(state, action, delta)

                state = next_state
                action = next_action
                episode_reward += reward * (discount_factor ** step)
                step += 1

            rewards.append(episode_reward)

        return rewards

    def state_value(self, state, action):
        """Return the value of ``state`` used as the bootstrap target.

        ``get_delta`` calls this with the next state and the next selected
        action. Subclasses must override it.
        """
        raise NotImplementedError(
            "subclasses of TempralDifferenceLearner must implement state_value"
        )

    def get_delta(self, reward, state, action, next_state, next_action):
        """Calculate the delta for the update.

        Returns:
            ``reward + gamma * state_value(next_state, next_action)
            - Q(state, action)``, with ``gamma`` taken from the MDP.
        """
        q_value = self.qfunction.get_q_value(state, action)
        next_state_value = self.state_value(next_state, next_action)
        discount_factor = self.mdp.get_discount_factor()
        return reward + discount_factor * next_state_value - q_value
        
