<h1><center><b>REINFORCEMENT LEARNING</b></center></h1>
<h1><center><b>Final Project - Part 1.1</b></center></h1>

<h3><center>Lunar Lander v2</center></h3>

---


<h3><center>DQN, DDQN, and D3QN</center></h3>

Names:  
<font color='red'>
Zuriya Ansbacher ID. 208532515  
Akiva Bruno Melka ID. 332629393  
</font>

https://github.com/zuriyaAnsbacher/Reinforcement_Project

# **1. Install Environment**

In [None]:
# Install the system and Python packages used for headless rendering, then
# start a virtual display before the Box2D extras of gym are installed.
# NOTE(review): this cell looks written for a hosted Linux runtime (apt, sudo) — confirm.
!sudo apt-get update
# NOTE(review): gym is installed unpinned, while later cells rely on
# `gym.wrappers.Monitor`, `env.seed(...)` and a 4-tuple `env.step(...)` result;
# presumably an older gym release is required — confirm and pin the version.
!pip install gym
!apt-get install python-opengl -y
!apt install xvfb -y
!pip install pyvirtualdisplay
# NOTE(review): `piglet` is probably a typo for `pyglet` — confirm before changing.
!pip install piglet
from pyvirtualdisplay import Display
# Invisible (visible=0) virtual screen of 1400x900 pixels.
display = Display(visible=0, size=(1400, 900))
display.start()
!pip install gym[box2d]

In [None]:
# This code creates a virtual display to draw game images on.
# If you are running locally, just ignore it.
# The branch only runs when the DISPLAY environment variable is unset or empty.
# NOTE(review): the previous cell already starts a pyvirtualdisplay `Display`,
# which presumably sets DISPLAY, so this branch may never run there — confirm.
import os
if type(os.environ.get("DISPLAY")) is not str or len(os.environ.get("DISPLAY"))==0:
    # `../xvfb` is a helper script that is not part of this notebook; the path
    # is relative to the current working directory.
    !bash ../xvfb start
    %env DISPLAY=:1

# **2. Import Libraries**

In [None]:
import gym
from gym import logger as gymlogger
from gym.wrappers import Monitor
gymlogger.set_level(40) # error only
from gym.wrappers.monitoring import video_recorder

import os
import copy
import random
import argparse
import numpy as np
from statistics import mean
from collections import deque
from dataclasses import dataclass
from itertools import product

import torch
import torch.nn as nn
import torch.nn.functional as F
import tensorflow as tf
import torch.optim as optim
from torch.optim import Adam

import matplotlib
import matplotlib.pyplot as plt

%matplotlib inline
import math
import glob
import io
import base64
from IPython.display import HTML
from IPython import display as ipythondisplay

# **3. Utility functions**

### **3.1 Miscellaneous**

In [None]:
# Create the seed
def set_seed(env, seed=0):
    """Seed every random number generator the notebook relies on.

    Args:
        env: environment exposing a ``seed(seed)`` method.
        seed: integer seed applied to NumPy, the environment, PyTorch and
            Python's ``random`` module (default 0).
    """
    for seed_fn in (np.random.seed, env.seed, torch.manual_seed, random.seed):
        seed_fn(seed)

    # The CUDA generators are only touched when a GPU is actually present.
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

# Discretization of the action space
def quantize_space(actions_range, bins):
    """Discretise a continuous action space into a grid of action tuples.

    Args:
        actions_range: iterable of ``(start, end)`` pairs, one per action dimension.
        bins: iterable with the number of evenly spaced values per dimension.

    Returns:
        A list with the Cartesian product of the per-dimension values, each
        entry being one tuple with a value for every dimension.
    """
    axes = []
    for (low, high), count in zip(actions_range, bins):
        axes.append(np.linspace(low, high, num=count))
    return list(product(*axes))

"""
Utility functions to enable video recording of the gym environment and to display it.
To enable video, just do "env = wrap_env(env)"
"""

def show_video(file):
    """Embed the first video matching ``file`` in the notebook output.

    Args:
        file: path or glob pattern of the mp4 file to show.

    Prints "Could not find video" when nothing matches the pattern.
    """
    mp4list = glob.glob(file)
    if not mp4list:
        print("Could not find video")
        return

    # Read-only access inside a context manager: the handle is closed right
    # after reading instead of being left open in read/write mode.
    with open(mp4list[0], 'rb') as video_file:
        encoded = base64.b64encode(video_file.read())

    ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))

    
def create_plot(scores, model_name, title, avg_lag):
    """Plot per-episode scores, their trailing moving average and the goal line.

    Args:
        scores: sequence of per-episode scores; must not be empty.
        model_name: not used by the plot; kept so existing callers keep working.
        title: text shown before the overall average score in the figure title.
        avg_lag: window length, in episodes, of the trailing moving average;
            must be at least 1.

    Raises:
        ValueError: if ``scores`` is empty or ``avg_lag`` is smaller than 1.
    """
    if len(scores) == 0:
        raise ValueError('create_plot needs at least one score')
    if avg_lag < 1:
        raise ValueError('avg_lag must be at least 1')

    # Trailing window: the first `avg_lag - 1` points average over what is available.
    avg = [mean(scores[max(0, i - avg_lag + 1):i + 1]) for i in range(len(scores))]

    fig, axis = plt.subplots()
    axis.plot(scores, 'c', label='Score', alpha=0.7)
    axis.plot(avg, 'orange', label='Average score')
    axis.axhline(200, c='gray', label='Goal', alpha=0.7)
    axis.set_xlabel('Episodes')
    axis.set_ylabel('Scores')
    axis.legend(loc='lower right')

    # Keep the caller's title and the summary apart ("DQN Test. Average score: ...").
    separator = '' if (not title or title[-1].isspace()) else ' '
    axis.set_title(f'{title}{separator}Average score: {mean(scores):.02f}')
    plt.show()


### **3.2 Memory replay class**

In [None]:
@dataclass
class Sample:
    """One transition stored in the replay memory."""
    state: np.ndarray
    # `int or np.ndarray` evaluates to plain `int`; the string form records both
    # accepted kinds without needing an extra import.
    action: "int | np.ndarray"
    reward: float
    next_state: np.ndarray
    done: bool


class SamplesMemory:
    """Fixed-capacity replay memory of ``Sample`` transitions."""

    def __init__(self, max_size, device):
        self.max_size = max_size
        self.device = device
        # A bounded deque drops the oldest sample once `max_size` is reached.
        self.memory_buffer = deque(maxlen=max_size)

    def add_sample(self, state, action, reward, next_state, done):
        """Append one transition to the memory."""
        self.memory_buffer.append(Sample(state, action, reward, next_state, done))

    def _to_tensor(self, values, dtype):
        """Stack ``values`` row-wise and move them to ``self.device`` as ``dtype``."""
        return torch.tensor(np.vstack(values), device=self.device, dtype=dtype)

    def get_batch(self, batch_size, continuous_action=False):
        """Draw ``batch_size`` distinct samples uniformly at random.

        Args:
            batch_size: number of samples to draw.
            continuous_action: when True the actions are returned as floats,
                otherwise as long integers.

        Returns:
            Tuple ``(states, actions, rewards, next_states, dones)`` of tensors
            built by stacking the sampled fields row-wise.
        """
        batch = random.sample(self.memory_buffer, batch_size)
        action_dtype = torch.float if continuous_action else torch.long
        columns = (
            ('state', torch.float),
            ('action', action_dtype),
            ('reward', torch.float),
            ('next_state', torch.float),
            ('done', torch.float),
        )
        return tuple(
            self._to_tensor([getattr(sample, field) for sample in batch], dtype)
            for field, dtype in columns
        )



# **4. Agents**

### **4.1 Networks architecture**

In [None]:
"""Models"""
class DQN(nn.Module):
    """Fully connected Q-network: two ReLU hidden layers and a linear output."""

    def __init__(self, input_size, output_size, hidd1_size, hidd2_size):
        super().__init__()

        # Attribute names and creation order are kept as-is: they define the
        # state_dict keys and the order in which random initial weights are drawn.
        self.fc1 = nn.Sequential(nn.Linear(input_size, hidd1_size), nn.ReLU())
        self.fc2 = nn.Sequential(nn.Linear(hidd1_size, hidd2_size), nn.ReLU())
        self.fc3 = nn.Linear(hidd2_size, output_size)

    def forward(self, x):
        """Return one value per output unit for the input ``x``."""
        for layer in (self.fc1, self.fc2, self.fc3):
            x = layer(x)
        return x


class DuelingDQN(nn.Module):
    """Dueling Q-network: a shared layer feeding separate value and advantage heads.

    The heads are combined as ``Q = V + (A - mean_over_actions(A))``.
    """

    def __init__(self, input_size, output_size, linear_hid_size, adv_hid_size, val_hid_size):
        super(DuelingDQN, self).__init__()

        # common linear layer
        self.linear1 = nn.Linear(input_size, linear_hid_size)

        # 2 linear layers for advantage calculation
        self.linear_adv_1 = nn.Linear(linear_hid_size, adv_hid_size)
        self.linear_adv_2 = nn.Linear(adv_hid_size, output_size)

        # 2 linear layers for value calculation
        self.linear_val_1 = nn.Linear(linear_hid_size, val_hid_size)
        self.linear_val_2 = nn.Linear(val_hid_size, 1)

    def forward(self, state):
        """Return the Q-values for ``state``.

        Args:
            state: a single state (1-D tensor) or a batch of states with the
                features on the last dimension.

        Returns:
            Tensor with one Q-value per action on the last dimension.
        """
        x = F.relu(self.linear1(state))
        adv = self.linear_adv_2(F.relu(self.linear_adv_1(x)))
        val = self.linear_val_2(F.relu(self.linear_val_1(x)))

        # Centre the advantages per state (over the action dimension only).
        # A global mean would mix all samples of a batch, so the Q-values of one
        # state would depend on whatever else happens to be in the batch.
        return val + (adv - adv.mean(dim=-1, keepdim=True))


### **4.2 Super Agent class**

In [None]:
""" Agent """
class Agent:
    """Base class shared by the DQN, Double-DQN and Dueling-DDQN agents.

    It owns the replay memory, the epsilon-greedy exploration state and the
    training / evaluation loops. Sub-classes must assign ``policy_net``,
    ``target_net``, ``optimizer`` and ``model_name`` and set ``full_target``
    (``True`` selects the double-DQN target in :meth:`learn`).
    """

    # Mean score at or above which a run counts as solved.
    SOLVED_SCORE = 200

    def __init__(self, action_space, output_size, batch_size, gamma, memory_size,
                 max_eps, min_eps, eps_decay, target_update, device):
        """
        Args:
            action_space: sequence of hashable actions; index ``i`` of the
                network output corresponds to ``action_space[i]``.
            output_size: number of network outputs (number of actions).
            batch_size: number of samples drawn from memory per learning step.
            gamma: discount factor.
            memory_size: capacity of the replay memory.
            max_eps: initial exploration rate.
            min_eps: lower bound of the exploration rate.
            eps_decay: multiplicative decay applied to the exploration rate.
            target_update: number of learning steps between target-network syncs.
            device: torch device used for the sampled batches and the states.
        """
        self.output_size = output_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.device = device

        # epsilon parameters
        self.min_eps = min_eps
        self.eps_decay = eps_decay
        self.cur_eps = max_eps

        # updated parameters
        self.target_update = target_update
        self.update = 0
        self.full_target = False

        # will be defined in children classes
        self.policy_net = None
        self.target_net = None
        self.optimizer = None
        self.model_name = None
        self.memory = SamplesMemory(memory_size, device)

        # Two-way mapping between network output indices and environment actions.
        self.idx2action = {i: action for i, action in enumerate(action_space)}
        self.action2idx = {action: i for i, action in enumerate(action_space)}

    def get_action(self, state, just_greedy=False):
        """Pick an action for ``state`` with an epsilon-greedy policy.

        Args:
            state: observation accepted by ``torch.tensor``.
            just_greedy: when True, never explore.

        Returns:
            The environment action (an entry of ``action_space``), not its index.
        """
        if not just_greedy and np.random.random() < self.cur_eps:  # exploration
            return self.idx2action[np.random.choice(self.output_size)]

        # exploitation
        with torch.no_grad():
            state = torch.tensor(state, dtype=torch.float, device=self.device)
            return self.idx2action[self.policy_net(state).argmax().item()]

    def decrement_epsilon(self):
        """Return the decayed exploration rate, never lower than ``min_eps``.

        ``self.cur_eps`` is not modified here; the caller stores the result.
        """
        return max(self.cur_eps * self.eps_decay, self.min_eps)

    def learn(self):
        """Run one optimisation step on a batch sampled from the replay memory.

        Returns:
            The loss of the step as a Python float.
        """
        states, actions, rewards, next_states, dones = self.memory.get_batch(self.batch_size)
        q_val = self.policy_net(states).gather(1, actions)

        # The bootstrap target is a constant for the optimiser, so it is built
        # entirely without tracking gradients.
        with torch.no_grad():
            if self.full_target:  # ddqn, d3qn: policy net selects, target net evaluates
                best_next = self.policy_net(next_states).argmax(1, keepdim=True)
                next_q_val = self.target_net(next_states).gather(1, best_next)
            else:  # dqn
                next_q_val = self.target_net(next_states).max(1, keepdim=True)[0]

            # (1 - dones) removes the bootstrap term for terminal transitions.
            target = rewards + self.gamma * next_q_val * (1 - dones)

        # loss and optimizer
        self.optimizer.zero_grad()
        loss = F.smooth_l1_loss(q_val, target)
        loss.backward()
        self.optimizer.step()
        self.update += 1

        # every target_update num, load weights from policy_net to target_net
        if self.update % self.target_update == 0:
            self.target_net.load_state_dict(self.policy_net.state_dict())

        return loss.item()

    def set_eval(self):
        """Switch the policy network to evaluation mode."""
        self.policy_net.eval()

    def save(self, file_path):
        """Save the policy network weights to ``file_path``."""
        torch.save(self.policy_net.state_dict(), file_path)

    def load(self, file_path):
        """Load policy network weights from ``file_path``.

        The target network, when present, is synchronised with the loaded
        weights so that it does not keep bootstrapping from stale parameters.
        """
        self.policy_net.load_state_dict(torch.load(file_path, map_location=self.device))
        if self.target_net is not None:
            self.target_net.load_state_dict(self.policy_net.state_dict())

    def test(self, env, model_name, record, record_freq, episode_num=100, steps_num=1000):
        """Evaluate the greedy policy and report whether the goal was reached.

        Args:
            env: environment to evaluate on.
            model_name: not used; kept for compatibility with existing callers.
            record: not used; kept for compatibility with existing callers.
            record_freq: not used; kept for compatibility with existing callers.
            episode_num: number of evaluation episodes.
            steps_num: maximum number of steps per episode.

        Returns:
            True when the mean score over all episodes is at least
            ``SOLVED_SCORE``. The policy network is left in evaluation mode.
        """
        print("Start evaluation...")

        self.set_eval()
        scores_list = []

        for episode in range(1, episode_num + 1):
            state = env.reset()
            score = 0

            for step in range(1, steps_num + 1):
                action = self.get_action(state, just_greedy=True)
                next_state, reward, done, _ = env.step(action)
                score += reward
                state = next_state
                if done:
                    break

            scores_list.append(score)
            print(f'Episode: {episode}, Test score: {score:.02f}')

        success = mean(scores_list) >= self.SOLVED_SCORE
        if success:
            create_plot(scores_list, self.model_name, f'{self.model_name} Test.', episode_num)
        else:
            print('The evaluation has not achieved the goal yet. The model will be training more..')

        return success

    def train(self, env, episodes_num, steps_num, learn_freq,
              record, record_freq):
        """Train the agent, evaluating it whenever recent scores look good enough.

        Args:
            env: environment to train on; it is closed when training ends.
            episodes_num: maximum number of training episodes.
            steps_num: maximum number of steps per episode.
            learn_freq: a learning step is run every ``learn_freq`` steps.
            record: whether frames are captured to ``./video.mp4``.
            record_freq: frames are captured every ``record_freq`` episodes.
        """
        scores_list = []
        count_test = 1

        vid_path = "./video.mp4"
        video = video_recorder.VideoRecorder(env, path=vid_path)
        video.enabled = False

        try:
            self.policy_net.train()
            for episode in range(1, episodes_num + 1):
                print(f'Episode: {episode}', end='')
                state = env.reset()
                score = 0

                # `record` is checked first so record_freq is never used as a
                # divisor while recording is switched off.
                record_episode = record and (episode - 1) % record_freq == 0

                for step in range(1, steps_num + 1):

                    if record_episode:
                        env.render()
                        video.enabled = True
                        video.capture_frame()

                    action = self.get_action(state)
                    next_state, reward, done, _ = env.step(action)
                    # The memory stores the action index, which is what learn() gathers on.
                    action = self.action2idx[action]

                    self.memory.add_sample(state, action, reward, next_state, done)
                    if len(self.memory.memory_buffer) > self.batch_size and step % learn_freq == 0:
                        self.learn()

                    score += reward
                    state = next_state

                    if done:
                        # NOTE(review): epsilon only decays when the environment reports
                        # `done`; episodes cut off by `steps_num` skip the decay — confirm
                        # this is intended before changing it.
                        self.cur_eps = self.decrement_epsilon()
                        break

                print(f'| Score: {score}')
                scores_list.append(score)

                # Evaluate once at least 50 episodes were played, the last 10 average
                # at least the goal and 10 episodes passed since the previous evaluation.
                if (len(scores_list) >= 50 and mean(scores_list[-10:]) >= self.SOLVED_SCORE
                        and episode >= count_test + 10):
                    count_test = episode
                    success = self.test(env, self.model_name, record, record_freq)
                    if success:
                        print(f'Mission accomplished in episode {episode}!')
                        create_plot(scores_list, self.model_name, f'{self.model_name} Train.', 10)
                        break
                    # test() left the network in evaluation mode; restore training mode.
                    self.policy_net.train()
        finally:
            # Release the recorder and the environment even if training is interrupted.
            video.close()
            env.close()

        show_video(vid_path)



### **4.3 Models sub-classes**

In [None]:
class DQNAgent(Agent):
    """Agent using the plain DQN target with a ``DQN`` policy / target network pair."""

    def __init__(self, input_size, output_size, action_space, batch_size, lr, gamma, eps_decay,
                 target_update, hidden_layers_size, memory_size, max_eps, min_eps, device):
        super().__init__(action_space=action_space, output_size=output_size, batch_size=batch_size,
                         gamma=gamma, memory_size=memory_size, max_eps=max_eps, min_eps=min_eps,
                         eps_decay=eps_decay, target_update=target_update, device=device)

        self.model_name = 'DQN'
        # The first two entries of hidden_layers_size are the hidden layer widths.
        self.hidd1, self.hidd2 = hidden_layers_size[0], hidden_layers_size[1]

        def build_net():
            return DQN(input_size, self.output_size, self.hidd1, self.hidd2).to(device)

        self.policy_net = build_net()
        # The target network starts as an exact copy of the policy network and
        # stays in evaluation mode.
        self.target_net = build_net()
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()
        self.full_target = False  # plain DQN target

        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=lr)


class DoubleDQNAgent(DQNAgent):
    """Same networks as ``DQNAgent`` but with ``full_target`` switched on."""

    def __init__(self, input_size, output_size, action_space, batch_size, lr, gamma, eps_decay,
                 target_update, hidden_layers_size, memory_size, max_eps, min_eps, device):
        super().__init__(
            input_size=input_size, output_size=output_size, action_space=action_space,
            batch_size=batch_size, lr=lr, gamma=gamma, eps_decay=eps_decay,
            target_update=target_update, hidden_layers_size=hidden_layers_size,
            memory_size=memory_size, max_eps=max_eps, min_eps=min_eps, device=device,
        )

        # Override what the parent constructor set.
        self.model_name = 'DoubleDQN'
        self.full_target = True


class DuelingDDQNAgent(Agent):
    """Agent pairing ``DuelingDQN`` networks with ``full_target`` switched on."""

    def __init__(self, input_size, output_size, action_space, batch_size, lr, gamma, eps_decay,
                 target_update, hidden_layers_size, memory_size, max_eps, min_eps, device):
        super().__init__(action_space=action_space, output_size=output_size, batch_size=batch_size,
                         gamma=gamma, memory_size=memory_size, max_eps=max_eps, min_eps=min_eps,
                         eps_decay=eps_decay, target_update=target_update, device=device)

        self.model_name = 'DuelingDDQN'
        # hidden_layers_size holds, in order: shared layer, advantage head, value head.
        self.hid_size_linear = hidden_layers_size[0]
        self.hid_size_adv = hidden_layers_size[1]
        self.hid_size_val = hidden_layers_size[2]

        def build_net():
            return DuelingDQN(input_size, self.output_size, self.hid_size_linear,
                              self.hid_size_adv, self.hid_size_val).to(device)

        self.policy_net = build_net()
        # The target network starts as an exact copy of the policy network and
        # stays in evaluation mode.
        self.target_net = build_net()
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()
        self.full_target = True

        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=lr)

### **4.4 Model hyperparameters optimized with NNI**

In [None]:
# Hyper-parameter sets, one per model. `main` reads every key below:
# `max_steps` and `learn_freq` drive the training loop, the rest configure the agent.

# DQN
best_params_dqn = dict(
    batch_size=256,
    gamma=0.9791633725985145,
    lr=0.0015830847211986054,
    target_update=400,
    learn_freq=3,
    eps_decay=0.9590128678975974,
    max_steps=900,
    hidden_layers_size=[256, 64],  # two hidden layers
)

# DDQN
best_params_ddqn = dict(
    batch_size=128,
    gamma=0.9856368308835148,
    lr=0.002066460106748642,
    target_update=100,
    learn_freq=8,
    eps_decay=0.9826503107232751,
    max_steps=500,
    hidden_layers_size=[64, 64],  # two hidden layers
)

# D3QN
best_params_d3qn = dict(
    batch_size=64,
    gamma=0.9828312425604563,
    lr=0.001920259375173547,
    target_update=100,
    learn_freq=4,
    eps_decay=0.9519693635850672,
    max_steps=1000,
    hidden_layers_size=[256, 32, 128],  # shared layer, advantage head, value head
)

# **5. Main**

In [None]:
def main():
    """Build the selected agent and train it on a discretised LunarLanderContinuous-v2.

    The options are declared with argparse but parsed from an empty argument
    list, so the defaults below are what actually runs inside the notebook.
    """

    def str2bool(value):
        """Parse a textual boolean; ``type=bool`` would turn "False" into True."""
        if isinstance(value, bool):
            return value
        text = str(value).strip().lower()
        if text in ('true', 't', 'yes', 'y', '1'):
            return True
        if text in ('false', 'f', 'no', 'n', '0'):
            return False
        raise argparse.ArgumentTypeError(f'expected a boolean value, got {value!r}')

    parser = argparse.ArgumentParser()

    # args that change each run
    parser.add_argument('--model', choices=['dqn', 'ddqn', 'd3qn'], default='d3qn')
    parser.add_argument('--use_nni_params', type=str2bool, default=True,
                        help='if true, get params from json file')
    parser.add_argument('--set_num', type=str, default='2')

    # args that usually stay fixed
    parser.add_argument('--memory_size', type=int, default=100000)
    parser.add_argument('--episodes', type=int, default=800, help='number of episodes in train')
    parser.add_argument('--cuda_device', type=int, default=0)
    parser.add_argument('--max_eps', type=float, default=1.0)
    parser.add_argument('--min_eps', type=float, default=0.01)
    parser.add_argument('--record', type=str2bool, default=True)
    parser.add_argument('--record_freq', type=int, default=10)

    args = parser.parse_args(args=[])

    # NOTE(review): --cuda_device, --use_nni_params and --set_num are parsed but
    # not used below — confirm whether they should be wired in or dropped.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(device)

    # One lookup replaces the two parallel if/elif chains over the model name.
    models = {
        'dqn': (best_params_dqn, DQNAgent),
        'ddqn': (best_params_ddqn, DoubleDQNAgent),
        'd3qn': (best_params_d3qn, DuelingDDQNAgent),
    }
    best_params, agent_cls = models[args.model]

    env = gym.make('LunarLanderContinuous-v2')
    set_seed(env)
    env.reset()

    state_size = env.observation_space.shape[0]
    # The two continuous action dimensions are split into 5 levels each (25 actions).
    action_space = quantize_space(actions_range=[(-1, 1), (-1, 1)], bins=[5, 5])

    agent = agent_cls(
        input_size=state_size,
        output_size=len(action_space),
        action_space=action_space,
        batch_size=best_params['batch_size'],
        lr=best_params['lr'],
        gamma=best_params['gamma'],
        eps_decay=best_params['eps_decay'],
        target_update=best_params['target_update'],
        hidden_layers_size=best_params['hidden_layers_size'],
        memory_size=args.memory_size,
        max_eps=args.max_eps,
        min_eps=args.min_eps,
        device=device,
    )

    agent.train(env, args.episodes, best_params['max_steps'], best_params['learn_freq'],
                args.record, args.record_freq)
    print("program is over")


if __name__ == '__main__':
    main()

