# SARSA Algorithm on the Lunar Lander V2 Problem (OpenAI Gym)

In [None]:
!pip install gym[box2d]==0.17

import gym
import numpy as np
import pandas as pd
import random

In [None]:
"""
Constants for SARSA Agent
"""

n_episodes_sarsa = 10_000   # episodes run inside each experiment
n_experiments = 5           # independent experiments, each starting with a fresh agent

In [None]:
class SARSAAgent():
  """
  Tabular temporal-difference agent over a discretized 8-variable state.

  The first 6 state variables are continuous and are each mapped to one of
  `num_bins` intervals; the last 2 are treated as binary flags. Every
  combination is flattened to one row of the table `Q`, which has one column
  per action.
  """
  def __init__(self, n_actions):
    """
    Args:
      n_actions: number of discrete actions (columns of the Q table).
    """
    self.gamma = 0.99                               # discount

    self.num_bins = 3                               # number of bins for discretizing continuous state variables
    self.bins2D = []                                # num_bins bins for each of the 6 continuous variables
    self.init_bins()

    self.n_actions = n_actions                               # number of actions
    self.dim_state = (self.num_bins ** 6) * (2 ** 2)         # 6 continuous, 2 binary
    self.Q = np.random.rand(self.dim_state, n_actions)       # initialize Q(s,a) table with uniform [0, 1) values

  def get_action(self, state, epsilon):
    """
    Epsilon-greedy action selection.

    With probability `epsilon` return a uniformly random action, otherwise
    return the action with the largest Q value for `state`.
    """
    if random.uniform(0, 1) < epsilon:
      return np.random.choice(range(self.n_actions))
    return np.argmax(self.Q[self.get_state_index(state)])

  def q_update(self, prev_state, action, reward, new_state, alpha, new_action=None, done=False):
    """
    Apply one temporal-difference update to Q(prev_state, action).

    Args:
      prev_state: state in which `action` was taken.
      action: index of the action taken.
      reward: reward received for the transition.
      new_state: state reached after the transition.
      alpha: learning rate.
      new_action: action selected in `new_state`. When given, the target
        bootstraps from Q(new_state, new_action), which is the on-policy
        SARSA target. When omitted, the target bootstraps from the maximum
        Q value of `new_state` (the greedy, Q-learning style target); this
        is the behavior callers get if they pass only the first five
        arguments.
      done: set to True when `new_state` is terminal; the target is then the
        reward alone, with no bootstrapped term.
    """
    prev_state_idx = self.get_state_index(prev_state)
    if done:
      bootstrap = 0.0
    else:
      next_q = self.Q[self.get_state_index(new_state)]
      bootstrap = np.max(next_q) if new_action is None else next_q[new_action]
    td_error = reward + self.gamma * bootstrap - self.Q[prev_state_idx][action]
    self.Q[prev_state_idx][action] += alpha * td_error

  def init_bins(self):
    """
    (Re)build the interval bins for the 6 continuous state variables.

    Each variable gets three intervals: [-inf, -0.05), [-0.05, 0.05) and
    [0.05, inf). Intervals are closed on the left so that a value exactly
    equal to a boundary still falls in a bin; with open intervals such a
    value matches nothing and the lookup raises KeyError.
    """
    mid_bound = 0.05
    left_bounds = [-float("inf"), -mid_bound, mid_bound]
    right_bounds = [-mid_bound, mid_bound, float("inf")]
    # Assign a fresh list (rather than appending) so calling this twice
    # does not leave more than 6 entries in bins2D.
    self.bins2D = [
      pd.IntervalIndex.from_arrays(left_bounds, right_bounds, closed="left")
      for _ in range(6)
    ]

  def get_state_representation(self, state):
    """
    Return the discretized state as a list of 8 integers.

    The first 6 entries are bin positions (0 .. num_bins-1) of the continuous
    variables, the last 2 are the binary variables cast to int.

    Raises:
      KeyError: if a continuous value falls in no bin (NaN or +inf).
    """
    representation = [int(self.bins2D[i].get_loc(state[i])) for i in range(6)]
    representation.append(int(state[6]))
    representation.append(int(state[7]))
    return representation

  def get_state_index(self, state):
    """
    Return the row of `Q` corresponding to `state`.

    The discretized state is read as a mixed-radix number: 6 digits of base
    `num_bins` followed by 2 digits of base 2, most significant digit first.
    """
    bases = [self.num_bins] * 6 + [2, 2]
    index = 0
    for digit, base in zip(self.get_state_representation(state), bases):
      index = index * base + digit
    return index

In [None]:
"""
Main experiment loop for SARSA Agent
""" 

def get_epsilon(episode_number):
    """Return the exploration rate for the given episode number.

    The rate is piecewise constant: each entry of the schedule pairs an
    exclusive upper bound on the episode number with the rate used below that
    bound. Episodes at or beyond the last bound get a rate of 0.
    """
    schedule = (
        (200, 0.5),
        (1000, 0.2),
        (1500, 0.1),
        (8000, 0.01),
        (9000, 0.001),
    )
    for upper_bound, rate in schedule:
        if episode_number < upper_bound:
            return rate
    return 0

env = gym.make('LunarLander-v2')

# Per-experiment lists of per-episode results; epsilons is recorded once
# (during the first experiment) because the schedule depends only on the
# episode number.
cum_rewards, timesteps, epsilons = [], [], []

# try/finally so the environment is closed even if the run raises or is
# interrupted part-way through.
try:
  for experiment in range(n_experiments):
    # Each experiment starts from a freshly initialized agent.
    agent = SARSAAgent(env.action_space.n)
    cum_rewards_experiment = []
    timesteps_experiment = []
    for episode in range(n_episodes_sarsa):
      state = env.reset()
      t = 0                         # number of environment steps taken this episode
      cum_reward_episode = 0
      # Learning rate decays linearly from 1 toward 0 over the episodes.
      alpha = (n_episodes_sarsa - episode) / n_episodes_sarsa
      epsilon = get_epsilon(episode)
      done = False
      while not done:
        prev_state = state
        action = agent.get_action(prev_state, epsilon)
        state, reward, done, _info = env.step(action)
        cum_reward_episode += reward
        # NOTE(review): called with these five arguments, q_update
        # bootstraps from the maximum Q value of the new state rather than
        # from the action actually chosen next, i.e. a greedy target; an
        # on-policy SARSA update would need the next action as well.
        agent.q_update(prev_state, action, reward, state, alpha)
        t += 1

      print(f"Episode {episode+1} - {t} timesteps, cum_reward = {cum_reward_episode} \n")
      timesteps_experiment.append(t)
      cum_rewards_experiment.append(cum_reward_episode)
      if experiment == 0:
        epsilons.append(epsilon)

    cum_rewards.append(cum_rewards_experiment)
    timesteps.append(timesteps_experiment)
    print(f"Experiment {experiment+1} finished.\n")
finally:
  env.close()