# Notebook of Model-Free Reinforcement Learning
This notebook implements two model-free reinforcement learning algorithms, SARSA and Q-learning. They are evaluated on the [Cliff-Walking task](https://www.gymlibrary.dev/environments/toy_text/cliff_walking/) provided by the [OpenAI gym](https://github.com/openai/gym). There are 3x12 + 1 possible states and 4 discrete deterministic actions. 

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import gym
%matplotlib inline

# Cliff-Walking environment shared (as a module-level global) by run() and
# evaluate() below. The notebook unpacks env.step() as a 4-tuple
# (state, reward, done, info), which is why new_step_api=False is requested.
env = gym.make('CliffWalking-v0', new_step_api=False)

## Implement the $\varepsilon$-greedy policy

In [None]:
def epsilon_greedy_policy(Q, epsilon=0.1):
    """Build the epsilon-greedy action distribution for one state.

    Every action receives probability ``epsilon / n_actions``; the action
    with the largest value additionally receives ``1 - epsilon``. Ties are
    resolved by ``argmax``, i.e. the first maximal action wins.

    Parameters
    ----------
    Q : ndarray
        action values of a single state (1D array, one entry per action).
    epsilon : float
        hyperparameter to control the tradeoff between exploration and
        exploitation.

    Returns
    -------
    policy : ndarray
        probability of selecting each action (same length as ``Q``).
    """
    num_actions = Q.shape[0]
    greedy_action = Q.argmax(0)

    # Start from the uniform exploration mass ...
    probs = np.ones(num_actions) * (epsilon / num_actions)
    # ... and give the remaining mass to the greedy action.
    probs[greedy_action] += 1 - epsilon

    return probs


def sample_epsilon_greedy(Q, epsilon=0.1):
    """Draw one action from the epsilon-greedy policy.

    Parameters
    ----------
    Q : ndarray
        action values of a single state (1D array, one entry per action).
    epsilon : float
        hyperparameter to control the tradeoff between exploration and
        exploitation.

    Returns
    -------
    action : int
        index of the sampled action, drawn with ``np.random.choice``.
    """
    action_probs = epsilon_greedy_policy(Q, epsilon)

    return np.random.choice(Q.shape[0], p=action_probs)

# Update rules of SARSA and Q-learning
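Let $(s_p, a_p)$ denote the previous state-action pair, $r_t$ the reward received after taking $a_p$, and $(s_t, a_t)$ the resulting state and the action selected there. When $s_t$ is a terminal state, the TD error is computed by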
\begin{equation}
  \delta = r_t - Q(s_p, a_p).
\end{equation}
Otherwise, the TD error of SARSA is computed by
\begin{equation}
  \delta = r_t + \gamma Q(s_t, a_t) - Q(s_p, a_p),
\end{equation}
while the TD error of Q-learning is given by
\begin{equation}
  \delta = r_t + \gamma \max_a Q(s_t, a) - Q(s_p, a_p).
\end{equation}
Then, $Q(s_p, a_p)$ is updated by
\begin{equation}
  Q(s_p, a_p) = Q(s_p, a_p) + \alpha \delta,
\end{equation}
where $\alpha \in [0, 1]$ is a learning rate.

In [None]:
def calc_TD(method, Q, s_p, a_p, r_t, s_t, a_t, is_terminal, gamma=1.0,
            epsilon=0.1):
    """Calculate the TD error of one transition.

    Parameters
    ----------
    method : str
        'sarsa', 'expsarsa' (Expected SARSA) or 'qlearn'. Any other value
        falls back to the Q-learning target.
    Q : ndarray
        action-value table indexed as ``Q[state, action]``.
    s_p : int
        previous state.
    a_p : int
        action taken in ``s_p``.
    r_t : float
        reward received after taking ``a_p`` in ``s_p``.
    s_t : int
        current state (the successor of ``s_p``).
    a_t : int
        action selected in ``s_t``; only used by 'sarsa'.
    is_terminal : bool
        True when ``s_t`` is terminal, in which case no bootstrap term is
        added regardless of ``method``.
    gamma : float
        discount factor.
    epsilon : float
        exploration rate of the epsilon-greedy policy whose expectation
        'expsarsa' bootstraps from. It should match the epsilon used to
        select actions; callers that do not pass it get the default.

    Return
    ------
    td_error : float
        TD error.
    """
    q_prev = Q[s_p, a_p]

    if is_terminal:
        # A terminal successor has no future value to bootstrap from.
        return r_t - q_prev

    if method == 'sarsa':
        bootstrap = Q[s_t, a_t]
    elif method == 'expsarsa':
        # Expected value of Q(s_t, .) under the epsilon-greedy policy.
        # (Previously read an undefined global EPSILON and raised NameError.)
        prob = epsilon_greedy_policy(Q[s_t, :], epsilon=epsilon)
        bootstrap = np.dot(prob, Q[s_t, :])
    else:
        # 'qlearn' and, for backward compatibility, any unrecognised method.
        bootstrap = np.max(Q[s_t, :])

    return r_t + gamma*bootstrap - q_prev

# Simulation of the Cliff-Walking task


In [None]:
def run(method, alpha=0.2, epsilon=0.1, num_episodes=1000):
    """Train a tabular agent on the global ``env`` for ``num_episodes``.

    Actions are always selected with the epsilon-greedy policy; ``method``
    only determines which TD error ``calc_TD`` computes for the update.

    Parameters
    ----------
    method : str
        name of the update rule, forwarded to ``calc_TD``.
    alpha : float
        learning rate applied to the TD error.
    epsilon : float
        exploration rate of the epsilon-greedy behaviour policy.
    num_episodes : int
        number of episodes to simulate.

    Returns
    -------
    Q : ndarray
        learned action-value table of shape (n_states, n_actions).
    total_reward : ndarray
        sum of rewards collected in each episode (length ``num_episodes``).
    """
    n_states, n_actions = env.observation_space.n, env.action_space.n
    Q = np.zeros((n_states, n_actions))
    total_reward = np.zeros(num_episodes)

    for episode in range(num_episodes):
        state = env.reset()
        done = False
        action = sample_epsilon_greedy(Q[state, :], epsilon=epsilon)

        while not done:
            next_state, reward, done, _ = env.step(action)
            # The next action is sampled before the update because SARSA
            # needs it for its bootstrap term.
            next_action = sample_epsilon_greedy(Q[next_state, :],
                                                epsilon=epsilon)

            delta = calc_TD(method, Q, state, action, reward,
                            next_state, next_action, done)
            Q[state, action] += alpha*delta

            state, action = next_state, next_action

            total_reward[episode] += reward

    return Q, total_reward


def plot_rewards(r_sarsa=None, r_qlearn=None):
    """Plot the learning curves.

    Each supplied result array is averaged over axis 0 and drawn as a single
    line (SARSA in blue, Q-learning in red). Only the mean is plotted; no
    variance band is drawn. The y-axis is fixed to [-100, 0].

    Parameters
    ----------
    r_sarsa : ndarray or None
        per-episode returns of SARSA; skipped when None.
    r_qlearn : ndarray or None
        per-episode returns of Q-learning; skipped when None.
    """
    curves = (
        (r_sarsa, 'SARSA', 'b'),
        (r_qlearn, 'Q-learning', 'r'),
    )

    for result, algorithm_name, color in curves:
        if result is None:
            continue
        mean_return = np.mean(result, axis=0)
        episodes = np.arange(len(mean_return))
        plt.plot(episodes, mean_return, '-', color=color,
                 label=algorithm_name)

    plt.legend()
    plt.xlabel('episodes')
    plt.ylabel('sum of rewards')
    plt.gca().set_ylim([-100, 0])

    plt.show()


def evaluate(Q):
    """Roll out the greedy policy of ``Q`` on the global ``env`` and print it.

    Actions are chosen with ``epsilon=0``, i.e. always the action with the
    largest value. The environment is rendered in 'ansi' mode before every
    step and once more after the terminal state is reached. The rollout has
    no step limit: it continues until the environment reports termination.

    Parameters
    ----------
    Q : ndarray
        action-value table indexed as ``Q[state, action]``.
    """
    state = env.reset()
    action = sample_epsilon_greedy(Q[state, :], epsilon=0)  # greedy action
    done = False

    while True:
        print(env.render(mode='ansi'))
        if done:
            # The frame just printed was the terminal one.
            break
        state, _reward, done, _info = env.step(action)
        action = sample_epsilon_greedy(Q[state, :], epsilon=0)

## Simulation results of SARSA

In [None]:
# Experiment settings; these globals are reused by the Q-learning cell below.
alpha = 0.2  # learning rate
epsilon = 0.1  # exploration rate of the epsilon-greedy policy
num_episodes = 1000  # episodes per run
num_runs = 10  # independent runs averaged in the learning curve
# One row of per-episode returns for each run.
r_sarsa = np.zeros((num_runs, num_episodes))
for r in range(num_runs):
    # q_sarsa is overwritten every iteration, so it ends up holding the
    # Q-table of the last run only.
    q_sarsa, r_sarsa[r, :] = run('sarsa', alpha, epsilon, num_episodes)
plot_rewards(r_sarsa=r_sarsa)

## Simulation results of Q-learning

In [None]:
# Same settings (alpha, epsilon, num_episodes, num_runs) as the SARSA cell.
r_qlearn = np.zeros((num_runs, num_episodes))
for r in range(num_runs):
    # q_qlearn is overwritten every iteration, so it ends up holding the
    # Q-table of the last run only.
    q_qlearn, r_qlearn[r, :] = run('qlearn', alpha, epsilon, num_episodes)
# Plot both learning curves together for comparison.
plot_rewards(r_sarsa=r_sarsa, r_qlearn=r_qlearn)

## Optimal policy of SARSA

In [None]:
# Print a greedy rollout of q_sarsa, the Q-table from the last SARSA run.
evaluate(q_sarsa)

## Optimal policy of Q-learning

In [None]:
# Print a greedy rollout of q_qlearn, the Q-table from the last Q-learning run.
evaluate(q_qlearn)