<a href="https://colab.research.google.com/github/vaghyjuli/RL/blob/main/Continuous2_Lunar_Lander.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%bash

# Install the system packages needed for a headless X display (Xvfb) so the
# environment can be rendered without a physical screen.
apt-get install -y xvfb x11-utils

# Install the required Python dependencies (additional gym extras may be
# needed depending on the environment used).
# Each requirement specifier is quoted so the shell cannot treat "[box2d]" or
# ".*" as a glob pattern and expand it against files in the working directory.
pip install "gym[box2d]==0.17.*" "pyvirtualdisplay==0.2.*" "PyOpenGL==3.1.*" "PyOpenGL-accelerate==3.1.*"

Reading package lists...
Building dependency tree...
Reading state information...
x11-utils is already the newest version (7.7+3build1).
xvfb is already the newest version (2:1.19.6-1ubuntu4.10).
0 upgraded, 0 newly installed, 0 to remove and 37 not upgraded.


In [None]:
import pyvirtualdisplay

# Start a virtual display of 1400x900; `visible=False` is the setting to use
# with Xvfb. The result of start() is bound to `_` so the cell shows no output.
_display = pyvirtualdisplay.Display(size=(1400, 900), visible=False)
_ = _display.start()

In [None]:
import os
import random

import gym
import numpy as np
import pandas as pd
import tensorflow as tf
from keras.activations import relu, linear
from tensorflow import keras

In [None]:
# --- Hyperparameters shared by the model and the training loop ---

# Optimisation
learning_rate = 0.001  # step size handed to the Adam optimizer
batch_size = 64        # number of transitions sampled per replay step

# Return estimation
gamma = 0.99           # discount applied to the best next-state Q-value

# Exploration (epsilon-greedy)
epsilon = 1            # initial probability of picking a random action
min_eps = 0.01         # epsilon stops decaying once it is no longer above this

In [None]:
class Model(tf.keras.Sequential):
    """Fully connected Q-network trained from replayed transitions.

    Architecture: 8 inputs -> Dense(64, relu) -> Dense(64, relu) -> Dense(4, linear),
    compiled with mean-squared-error loss and an Adam optimizer that uses the
    module-level ``learning_rate``.

    The exploration rate lives in the module-level ``epsilon`` variable, which
    ``replay_experiences`` decays; ``self.epsilon`` mirrors its latest value.
    """

    def __init__(self):
        super().__init__()
        self.epsilon = epsilon
        self.add(keras.layers.Dense(64, input_dim=8, activation=relu))
        self.add(keras.layers.Dense(64, activation=relu))
        self.add(keras.layers.Dense(4, activation=linear))
        self.compile(loss="mse", optimizer=keras.optimizers.Adam(learning_rate=learning_rate))

    def get_action(self, state):
        """Choose an action for ``state`` with an epsilon-greedy rule.

        Args:
            state: a single observation with 8 values (any shape that can be
                reshaped to ``(1, 8)``).

        Returns:
            An action index in ``[0, 4)``: uniformly random with probability
            ``epsilon`` (the current module-level value), otherwise the index
            of the largest predicted Q-value.
        """
        if np.random.random() <= epsilon:
            return np.random.choice(4)
        q_values = self.predict_on_batch(np.reshape(state, (1, 8)))
        return np.argmax(q_values[0])

    def replay_experiences(self, memory):
        """Run one training step on a random mini-batch drawn from ``memory``.

        Does nothing until ``memory`` holds at least ``batch_size`` transitions.
        Otherwise it samples ``batch_size`` transitions (with replacement),
        builds Q-learning targets, fits the network on them, and then decays
        the module-level ``epsilon`` by a factor of 0.996 while it is still
        above ``min_eps``.

        Args:
            memory: indexable sequence of
                ``(state, action, next_state, reward, finished)`` tuples.
        """
        global epsilon

        if len(memory) < batch_size:
            return

        # Sample indices straight from the length. Converting the whole memory
        # to a NumPy array first is O(len(memory)) on every step and, because
        # the tuples mix arrays and scalars, produces a ragged array that newer
        # NumPy versions refuse to build.
        mini_batch_index = np.random.choice(len(memory), batch_size)
        batch = [memory[index] for index in mini_batch_index]

        # Explicit reshape instead of squeeze: squeeze would also drop the
        # batch axis when batch_size == 1.
        states = np.array([transition[0] for transition in batch]).reshape(batch_size, -1)
        actions = np.array([transition[1] for transition in batch])
        next_states = np.array([transition[2] for transition in batch]).reshape(batch_size, -1)
        rewards = np.array([transition[3] for transition in batch])
        finishes = np.array([transition[4] for transition in batch], dtype=np.float32)

        # np.array(...) guarantees writable NumPy copies of the predictions.
        q_vals_next_state = np.array(self.predict_on_batch(next_states))
        q_vals_target = np.array(self.predict_on_batch(states))

        # Only the Q-value of the action actually taken is moved towards the
        # target; the (1 - finishes) factor removes the bootstrap term for
        # transitions that ended the episode.
        max_q_values_next_state = np.amax(q_vals_next_state, axis=1)
        q_vals_target[np.arange(batch_size), actions] = (
            rewards + gamma * max_q_values_next_state * (1 - finishes)
        )
        self.fit(states, q_vals_target, verbose=0)

        if epsilon > min_eps:
            epsilon *= 0.996
        # Keep the instance attribute in step with the decayed global value.
        self.epsilon = epsilon

In [None]:
# Train the agent: every environment step is stored in `memory` and followed
# by one replay/training step on the model.
memory = []
model = Model()

env = gym.make('LunarLander-v2')
# env.seed(0)
num_episodes = 400
np.random.seed(0)
scores = []

# Checkpoints go into a sub-directory built with os.path.join so the path is
# valid on every platform (the previous ".\saved_models\..." literal relied on
# backslash separators and contained invalid escape sequences).
save_dir = "saved_models"
os.makedirs(save_dir, exist_ok=True)

for i in range(num_episodes + 1):
    score = 0
    state = np.reshape(env.reset(), (1, 8))
    finished = False

    # Save a checkpoint after every 50 completed episodes.
    if i != 0 and i % 50 == 0:
        model.save(os.path.join(save_dir, "model_" + str(i) + "_episodes.h5"))

    for j in range(3000):
        # Epsilon-greedy action selection using the module-level `epsilon`,
        # which model.replay_experiences decays as training proceeds.
        if np.random.random() <= epsilon:
            action = np.random.choice(4)
        else:
            # predict_on_batch avoids the per-call overhead of predict() for a
            # single-row input.
            action_values = model.predict_on_batch(state)
            action = np.argmax(action_values[0])

        #env.render()
        next_state, reward, finished, metadata = env.step(action)
        next_state = np.reshape(next_state, (1, 8))
        memory.append((state, action, next_state, reward, finished))
        model.replay_experiences(memory)
        score += reward
        state = next_state
        if finished:
            break

    # Record the score even if the step cap was reached without `finished`,
    # so every episode contributes to the running average.
    scores.append(score)
    print("Episode = {}, Score = {}, Avg_Score = {}".format(i, score, np.mean(scores[-100:])))


  from ipykernel import kernelapp as app


Episode = 0, Score = -90.24569945676006, Avg_Score = -90.24569945676006
Episode = 1, Score = -202.3822027228816, Avg_Score = -146.31395108982082
Episode = 2, Score = -212.05975184757034, Avg_Score = -168.22921800907065
Episode = 3, Score = -494.71907397606475, Avg_Score = -249.85168200081918
Episode = 4, Score = -360.72299574761183, Avg_Score = -272.0259447501777
Episode = 5, Score = -224.13026553395676, Avg_Score = -264.0433315474742
Episode = 6, Score = -108.73899158696996, Avg_Score = -241.8569972674022
Episode = 7, Score = -192.82670503482765, Avg_Score = -235.72821073833035
Episode = 8, Score = -253.95123929364232, Avg_Score = -237.75299168892056
Episode = 9, Score = -206.02370723673442, Avg_Score = -234.58006324370194
Episode = 10, Score = -173.54685324987793, Avg_Score = -229.03158960789978
Episode = 11, Score = -320.4812640058092, Avg_Score = -236.65239580772558
Episode = 12, Score = -108.63570696493953, Avg_Score = -226.8049582044343
Episode = 13, Score = -262.07047703340106, 

In [None]:
import matplotlib.pyplot as plt

# Plot a moving average of the per-episode scores collected in `scores`.
# The window is clamped to the number of recorded episodes: with mode="valid"
# a kernel longer than the data would silently swap the roles of the two
# inputs instead of producing a moving average.
kernel_size = min(200, len(scores))
kernel = np.ones(kernel_size) / kernel_size
y = np.convolve(scores, kernel, mode="valid")

plt.plot(y, color="red")
plt.xlabel("Window index")
plt.ylabel("Mean score over {} episodes".format(kernel_size))
plt.title("Moving average of episode scores")

plt.show()