<a href="https://colab.research.google.com/github/vaghyjuli/RL/blob/main/Final_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
## based on https://github.com/DanielPalaio/LunarLander-v2_DeepRL

!pip install gym[box2d]==0.17

import gym
import numpy as np
import pandas as pd
import random

In [None]:
"""
Constants for DQN Agent
"""

# Starting exploration rate: DQNAgent copies it into self.epsilon and takes a
# random action whenever a uniform draw falls below it.
epsilon = 1 # initial amount of exploration
# Discount applied to the target network's best next-state Q-value.
gamma = .99 # temporal discount
# Step size handed to the Adam optimizer when a DQN is compiled.
learning_rate = 0.001 # alpha
# Soft-update weight: target <- (1 - tau) * target + tau * main after every update.
tau = 0.01 # update rate of target network

# Number of slots MemoryBuffer pre-allocates; the oldest entries are overwritten once full.
memory_buffer_size= 500000 # number of experiences possibly stored at once
batch_size = 64 # number of samples in each batch
n_actions = 4 # width of the DQN output layer and range of random actions
state_dim = 8 # input width of the first DQN layer
n_episodes = 500 # episodes per DQN experiment (the SARSA cell later rebinds this name)

from enum import Enum
class DecayType(Enum):
    """Selects how DQNAgent.update() shrinks epsilon after a training step."""
    EXPONENTIAL = 0  # epsilon is multiplied by exp_epsilon_decay_param
    LINEAR = 1  # lin_epsilon_decay_param is subtracted from epsilon

# Decay rule DQNAgent.update() applies; it is evaluated once per update call,
# i.e. once per environment step in the DQN experiment loop.
epsilon_decay_type = DecayType.LINEAR

lin_epsilon_decay_param = 0.001 # amount subtracted from epsilon per update (LINEAR)
exp_epsilon_decay_param = 0.99 # factor epsilon is multiplied by per update (EXPONENTIAL)
min_eps = 0.01 # decay is skipped once epsilon is no longer greater than this value

In [None]:
class MemoryBuffer:
    """
    Fixed-capacity ring buffer of experiences (state, action, reward, new_state, done).

    Experiences live in pre-allocated NumPy arrays; once the buffer is full the
    oldest entries are overwritten. Batches are drawn uniformly at random (with
    replacement) so that consecutive, correlated transitions are mixed up,
    supporting the i.i.d. assumption during learning.

    Args:
        capacity: Maximum number of experiences kept. Defaults to the
            module-level ``memory_buffer_size``.
        n_state_features: Length of one state vector. Defaults to the
            module-level ``state_dim``.
        sample_size: Number of experiences returned by ``sample``. Defaults to
            the module-level ``batch_size``.
    """
    def __init__(self, capacity=None, n_state_features=None, sample_size=None):
        # Fall back to the notebook-wide constants so `MemoryBuffer()` keeps working.
        self.capacity = memory_buffer_size if capacity is None else capacity
        self.sample_size = batch_size if sample_size is None else sample_size
        n_features = state_dim if n_state_features is None else n_state_features

        self.states = np.zeros((self.capacity, n_features), np.float64)
        self.actions = np.zeros(self.capacity, np.intc)
        self.rewards = np.zeros(self.capacity, np.float64)
        self.new_states = np.zeros((self.capacity, n_features), np.float64)
        self.dones = np.zeros(self.capacity, np.bool_)

        # Total number of experiences ever written (not wrapped).
        self.head = 0

    def write(self, state, action, reward, new_state, done):
        """Store one experience, overwriting the oldest slot when the buffer is full."""
        index = self.head % self.capacity

        self.states[index] = state
        self.actions[index] = action
        self.rewards[index] = reward
        self.new_states[index] = new_state
        self.dones[index] = done

        self.head += 1

    def sample(self):
        """
        Draw ``sample_size`` stored experiences uniformly at random, with replacement.

        Returns:
            Tuple ``(states, actions, rewards, new_states, dones)`` of arrays
            whose first dimension is ``sample_size``.

        Raises:
            ValueError: If nothing has been written yet.
        """
        n_stored = min(self.head, self.capacity)
        if n_stored == 0:
            raise ValueError("cannot sample from an empty MemoryBuffer")

        # Only the filled part of the arrays is eligible for sampling.
        chosen = np.random.choice(n_stored, self.sample_size)

        return (self.states[chosen],
                self.actions[chosen],
                self.rewards[chosen],
                self.new_states[chosen],
                self.dones[chosen])

In [None]:
import tensorflow as tf
from tensorflow import keras
from keras.activations import relu, linear
from keras import Sequential

class DQN(Sequential):
    """
    Q-network: maps a state vector to one Q-value per action.

    Two ReLU hidden layers of 256 units feed a linear output layer with
    ``n_actions`` units. The model is compiled with mean-squared-error loss
    and an Adam optimizer using the module-level ``learning_rate``.
    """
    def __init__(self):
        super().__init__()
        hidden_units = 256
        stack = (
            keras.layers.Dense(hidden_units, input_dim=state_dim, activation=relu),
            keras.layers.Dense(hidden_units, activation=relu),
            # linear head: Q-values are unbounded regression outputs
            keras.layers.Dense(n_actions, activation=linear),
        )
        for layer in stack:
            self.add(layer)
        optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
        self.compile(loss="mse", optimizer=optimizer)

In [None]:
class DQNAgent:
    """
    Agent that plays and learns.
    It has access to the memory buffer and stores the DQN and DQN target.

    Args:
        buffer: Replay memory providing ``sample()``, which returns
            ``(states, actions, rewards, new_states, dones)`` batches.
    """
    def __init__(self, buffer):
        self.epsilon = epsilon
        self.buffer = buffer
        self.dqn = DQN()
        self.dqn_target = DQN()
        # Start the target as an exact copy of the online network; otherwise the
        # first bootstrapped targets come from an unrelated random network and
        # the soft update (tau) needs many steps to wash that out.
        self.dqn_target.set_weights(self.dqn.get_weights())

    def get_action(self, state):
        """Pick an action epsilon-greedily with respect to the online network."""
        if np.random.random() < self.epsilon:
            return np.random.choice(n_actions)
        # the network expects a batch dimension
        qs = self.dqn(np.array([state]))
        return np.argmax(qs)

    def update_target(self):
        """Soft-update the target network: target <- (1 - tau) * target + tau * main."""
        blended = [
            (1 - tau) * target_weight + tau * main_weight
            for target_weight, main_weight in zip(self.dqn_target.get_weights(),
                                                  self.dqn.get_weights())
        ]
        self.dqn_target.set_weights(blended)

    def update(self):
        """
        Run one training step on a sampled batch, then soft-update the target
        network and decay epsilon.

        Raises:
            ValueError: If ``epsilon_decay_type`` is not a known ``DecayType``.
        """
        states, actions, rewards, new_states, dones = self.buffer.sample()
        # we change y only in the (s, a) positions where experiences are available,
        # so the loss on all other actions is zero
        y = np.copy(self.dqn(states))
        next_qs = self.dqn_target(new_states)
        max_next_qs = np.amax(next_qs, axis=1)

        # (1 - done) removes the bootstrap term for terminal transitions
        not_done = 1.0 - np.asarray(dones, dtype=np.float64)
        targets = rewards + not_done * gamma * max_next_qs
        y[np.arange(len(actions)), actions] = targets

        self.dqn.train_on_batch(states, y)
        self.update_target()
        self._decay_epsilon()

    def _decay_epsilon(self):
        """Apply one decay step to epsilon, never letting it drop below min_eps."""
        if self.epsilon <= min_eps:
            return
        if epsilon_decay_type == DecayType.LINEAR:
            decayed = self.epsilon - lin_epsilon_decay_param
        elif epsilon_decay_type == DecayType.EXPONENTIAL:
            decayed = self.epsilon * exp_epsilon_decay_param
        else:
            # raising keeps the notebook kernel alive, unlike exit()
            raise ValueError(
                f"please choose decay type: unknown epsilon_decay_type {epsilon_decay_type!r}")
        # clamp: a decay step taken just above min_eps would otherwise undershoot it
        self.epsilon = max(min_eps, decayed)

In [None]:
class SARSAAgent():
    """
    Tabular temporal-difference agent over a discretized 8-dimensional state.

    The first 6 state variables are continuous and are each mapped to one of
    3 bins (below -0.05, between -0.05 and 0.05, at or above 0.05); the last
    2 are binary flags. Q-values are stored in a table with one row per
    discrete state and one column per action.

    Args:
        n_actions: Number of available actions (columns of the Q-table).
    """
    def __init__(self, n_actions):
        self.gamma = 0.99                               # discount

        self.num_bins = 3                               # number of bins for discretizing continuous state variables
        self.bins2D = []                                # num_bins bins for each of the 6 continuous variables
        self.init_bins()

        self.n_actions = n_actions                               # number of actions
        self.dim_state = (self.num_bins ** 6) * (2 ** 2)         # 6 continuous, 2 binary
        self.Q = np.random.rand(self.dim_state, n_actions)       # initialize Q(s,a) table

    def get_action(self, state, epsilon):
        """Return a random action with probability ``epsilon``, else the greedy one."""
        if random.uniform(0, 1) < epsilon:
            return np.random.choice(range(self.n_actions))
        return np.argmax(self.Q[self.get_state_index(state)])

    def q_update(self, prev_state, action, reward, new_state, alpha, next_action=None, done=False):
        """
        Apply one temporal-difference update to Q(prev_state, action).

        Args:
            prev_state: State in which ``action`` was taken.
            action: Action index that was taken.
            reward: Reward received for the transition.
            new_state: State reached after the transition.
            alpha: Learning rate for this update.
            next_action: Action chosen in ``new_state``. When given, the
                on-policy SARSA target ``Q(new_state, next_action)`` is used.
                When omitted, the greedy value ``max_a Q(new_state, a)`` is
                used instead, which is the Q-learning target.
            done: Whether ``new_state`` is terminal. Terminal transitions do
                not bootstrap from ``new_state``.
        """
        prev_state_idx = self.get_state_index(prev_state)
        if done:
            # no future return after a terminal state
            bootstrap = 0.0
        else:
            next_qs = self.Q[self.get_state_index(new_state)]
            bootstrap = np.max(next_qs) if next_action is None else next_qs[next_action]
        td_error = reward + self.gamma * bootstrap - self.Q[prev_state_idx][action]
        self.Q[prev_state_idx][action] += alpha * td_error

    def init_bins(self):
        """(Re)build the 3 intervals used to discretize each continuous variable."""
        mid_bound = 0.05
        left_bounds = [-float("inf"), -mid_bound, mid_bound]
        right_bounds = [-mid_bound, mid_bound, float("inf")]
        # closed="left" so a value exactly on a boundary still falls in a bin;
        # with open intervals get_loc raises KeyError for exactly +/-mid_bound.
        self.bins2D = [
            pd.IntervalIndex.from_arrays(left_bounds, right_bounds, closed="left")
            for _ in range(6)
        ]

    def get_state_representation(self, state):
        """Return the discrete state as 6 bin indices followed by the 2 binary flags."""
        representation = [self.bins2D[i].get_loc(state[i]) for i in range(6)]
        representation.append(int(state[6]))
        representation.append(int(state[7]))
        return representation

    def get_state_index(self, state):
        """Map a state to its row in the Q-table (mixed-radix encoding of the representation)."""
        bases = [self.num_bins]*6 + [2, 2]
        index = 0
        for digit, base in zip(self.get_state_representation(state), bases):
            index = index * base + digit
        return index

In [None]:
from numpy.lib.twodim_base import triu_indices_from
"""
Main experiment loop for DQN Agent
""" 

# Train a fresh DQN agent in each of several independent experiments and
# record, per episode, the cumulative reward and the number of timesteps.
env = gym.make("LunarLander-v2")

n_experiments = 5
cum_rewards = []   # one list of per-episode cumulative rewards per experiment
timesteps = []     # one list of per-episode lengths per experiment

for experiment in range(n_experiments):

  # every experiment starts with empty replay memory and new networks
  buffer = MemoryBuffer()
  agent = DQNAgent(buffer)

  cum_rewards_experiment, timesteps_experiment = [], []

  for episode in range(n_episodes):
    state = env.reset()
    cum_reward_episode = 0
    t = 0
    done = False
    while not done:
      prev_state = state
      action = agent.get_action(prev_state)
      state, reward, done, _ = env.step(action)
      cum_reward_episode += reward
      # store the transition, then learn from a sampled batch on every step
      buffer.write(prev_state, action, reward, state, done)
      agent.update()
      if not done:
        t += 1

    # t is zero-based, so the episode lasted t+1 steps
    print(f"Episode {episode+1} - {t+1} timesteps, cum_reward = {cum_reward_episode} \n")
    timesteps_experiment.append(t+1)
    cum_rewards_experiment.append(cum_reward_episode)

  cum_rewards.append(cum_rewards_experiment)
  timesteps.append(timesteps_experiment)
  print(f"Experiment {experiment+1} finished.\n")

env.close()

In [None]:
"""
Main experiment loop for SARSA Agent
""" 

def get_epsilon(episode_number):
    """
    Return the exploration rate scheduled for ``episode_number``.

    The rate is piecewise constant: 0.5 before episode 200, 0.2 before 1000,
    0.1 before 1500, 0.01 before 8000, 0.001 before 9000, and 0 afterwards.
    """
    # (exclusive upper episode bound, epsilon) pairs in ascending order
    schedule = (
        (200, 0.5),
        (1000, 0.2),
        (1500, 0.1),
        (8000, 0.01),
        (9000, 0.001),
    )
    for upper_bound, rate in schedule:
        if episode_number < upper_bound:
            return rate
    return 0

# Train a fresh tabular agent in each of several independent experiments and
# record, per episode, the cumulative reward and the number of timesteps.
env = gym.make('LunarLander-v2')

n_experiments = 5
# NOTE: `n_episodes` (here) and `epsilon` (below) rebind the module-level names
# that the DQN cells read; re-run the DQN constants cell before re-running DQN.
n_episodes = 10000
cum_rewards = []
timesteps = []

for experiment in range(n_experiments):
  agent = SARSAAgent(env.action_space.n)
  cum_rewards_experiment = []
  timesteps_experiment = []
  for episode in range(n_episodes):
      state = env.reset()
      t = 0
      cum_reward_episode = 0
      # learning rate decays linearly from 1 towards 0 over the run
      alpha = (n_episodes - episode) / n_episodes
      epsilon = get_epsilon(episode)
      while True:
        prev_state = state
        action = agent.get_action(state, epsilon)
        state, reward, done, _ = env.step(action)
        cum_reward_episode += reward
        # q_update discretizes both states itself, so no state indices are
        # computed here (each lookup costs several interval searches)
        agent.q_update(prev_state, action, reward, state, alpha)
        if done:
            print(f"Episode {episode+1} - {t+1} timesteps, cum_reward = {cum_reward_episode} \n")
            timesteps_experiment.append(t+1)
            cum_rewards_experiment.append(cum_reward_episode)
            break
        t += 1

  cum_rewards.append(cum_rewards_experiment)
  timesteps.append(timesteps_experiment)
  print(f"Experiment {experiment+1} finished.\n")

env.close()