In [1]:
!pip install gymnasium gymnasium[box2d] tensorflow-addons


Collecting gymnasium
  Downloading gymnasium-0.27.1-py3-none-any.whl (883 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Collecting jax-jumpy>=0.2.0
  Downloading jax_jumpy-0.2.0-py3-none-any.whl (11 kB)
Collecting typing-extensions>=4.3.0
  Downloading typing_extensions-4.4.0-py3-none-any.whl (26 kB)
Collecting gymnasium-notices>=0.0.1
  Downloading gymnasium_notices-0.0.1-py3-none-any.whl (2.8 kB)
Collecting pygame==2.1.3.dev8
  Downloading pygame-2.1.3.dev8-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.7/13.7 MB[0m [31m28.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting swig==4.*
  Downloading swig-4.1.1-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m68.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting 

In [2]:
import os
import json
import wandb
import numpy as np
import imageio
from PIL import Image
import PIL.ImageDraw as ImageDraw
import gymnasium as gym

from time import time
from tensorflow_addons.layers import NoisyDense
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.optimizers import Adam

In [3]:
class EpisodeSaver:
    """Write the rendered frames of one episode to a captioned GIF.

    The GIF is stored at ``./gifs/<algo>/episode_<episode_number>.gif``. The
    output directory (including any intermediate directories implied by
    ``algo``) is created when the saver is constructed.

    Args:
        env: Environment the frames came from (stored, not used by this class).
        frames: Iterable of image arrays accepted by ``PIL.Image.fromarray``.
        algo: Sub-directory name under ``./gifs`` used to group the output.
        episode_number: Episode number used in the caption and the file name.
    """

    def __init__(self, env, frames, algo, episode_number):
        self.env = env
        self.frames = frames
        self.dir = f'./gifs/{algo}/'
        self.episode_number = episode_number
        self.fname = f'episode_{self.episode_number}.gif'

        # One call creates './gifs' and the algorithm sub-directory, tolerates
        # directories that already exist, and works for nested `algo` values.
        os.makedirs(self.dir, exist_ok=True)

    def label_frames(self):
        """Return copies of the frames with the episode number drawn on them.

        Returns:
            list: One ``numpy`` array per input frame, in the same order.
        """
        caption = f'Episode: {self.episode_number}'
        labeled_frames = []

        for frame in self.frames:
            img = Image.fromarray(frame)
            # White caption in the top-left corner of every frame.
            ImageDraw.Draw(img).text((10, 10), caption, fill=(255, 255, 255))
            labeled_frames.append(np.array(img))

        return labeled_frames

    def save(self):
        """Caption the frames and write them to the GIF file."""
        imageio.mimsave(os.path.join(self.dir, self.fname), self.label_frames(), fps=60)
        

class ReplayBuffer:
    """Fixed-capacity ring buffer of transitions backed by numpy arrays.

    Once ``max_length`` transitions have been stored, new ones overwrite the
    oldest slots. Actions are kept one-hot encoded, and ``done_memory`` holds
    ``1 - done`` (1.0 while the episode continues, 0.0 on the final step).
    """

    def __init__(self, max_length, state_size, action_size):
        self.max_length = max_length
        self.memory_counter = 0  # total number of transitions ever appended

        state_shape = (max_length, state_size)
        self.state_memory = np.zeros(state_shape)
        self.new_state_memory = np.zeros(state_shape)
        self.action_memory = np.zeros((max_length, action_size), dtype=np.int8)
        self.reward_memory = np.zeros(max_length)
        self.done_memory = np.zeros(max_length, dtype=np.float32)

    def append(self, state, action, reward, new_state, done):
        """Store one transition, overwriting the oldest slot when full."""
        slot = self.memory_counter % self.max_length

        self.state_memory[slot] = state

        # One-hot encode the chosen action before storing it.
        one_hot = np.zeros(self.action_memory.shape[1])
        one_hot[action] = 1.0
        self.action_memory[slot] = one_hot

        self.new_state_memory[slot] = new_state
        self.reward_memory[slot] = reward
        self.done_memory[slot] = 1 - done
        self.memory_counter += 1

    def sample(self, batch_size):
        """Draw ``batch_size`` stored transitions uniformly, with replacement.

        Returns:
            tuple: ``(states, actions, rewards, new_states, dones)`` where
            ``actions`` is one-hot and ``dones`` is the stored ``1 - done``.
        """
        filled = min(self.memory_counter, self.max_length)
        picks = np.random.choice(filled, batch_size)

        columns = (
            self.state_memory,
            self.action_memory,
            self.reward_memory,
            self.new_state_memory,
            self.done_memory,
        )
        return tuple(column[picks] for column in columns)

In [4]:
class DQL:
    """Deep Q-Learning agent with experience replay for discrete action spaces.

    A single Q-network (no separate target network) built from ``NoisyDense``
    layers is fitted on minibatches sampled from a ``ReplayBuffer``. Actions
    are selected with an epsilon-greedy policy.

    Args:
        env: Environment following the Gymnasium API, with a discrete action
            space (``action_space.n``) and a 1-D observation space.
        alpha: Learning rate passed to the Adam optimizer.
        gamma: Discount factor applied to the bootstrapped future value.
        epsilon: Initial probability of taking a random action.
        epsilon_decay: Factor epsilon is multiplied by after every learning
            update (i.e. per environment step, not per episode).
        epsilon_min: Lower bound for epsilon.
        batch_size: Number of transitions sampled per learning update.
    """

    def __init__(self, env, alpha, gamma, epsilon, epsilon_decay=0.99, epsilon_min=0.01, batch_size=64):
        self.env = env
        self.action_size = self.env.action_space.n
        self.action_space = list(range(self.action_size))
        self.state_size = self.env.observation_space.shape[0]
        self.alpha = alpha  # learning rate
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.batch_size = batch_size
        self.memory = ReplayBuffer(10_000, self.state_size, self.action_size)
        self.qnet = self.create_qnet('qnet')

    def create_qnet(self, name):
        """Build and compile the Q-network: state vector in, one Q-value per action out."""
        model = Sequential([
            NoisyDense(units=256, activation='relu', input_shape=(self.state_size,)),
            NoisyDense(units=256, activation='relu'),
            NoisyDense(units=int(self.action_size), activation='linear')
        ], name=name)

        model.compile(loss='mse', optimizer=Adam(learning_rate=self.alpha))
        return model

    def remember(self, state, action, reward, new_state, done):
        """Store one transition in the replay buffer."""
        self.memory.append(state, action, reward, new_state, done)

    def act(self, state):
        """Choose an action with an epsilon-greedy policy.

        With probability ``epsilon`` a uniformly random action is returned;
        otherwise the action with the highest predicted Q-value is returned.
        """
        if np.random.random() < self.epsilon:
            return np.random.choice(self.action_space)

        # The network expects a batch dimension, so a single state becomes (1, state_size).
        state = np.reshape(state, [1, self.state_size])
        return np.argmax(self.qnet.predict(state, verbose=0))

    def update(self):
        """Fit the Q-network on one sampled minibatch and decay epsilon.

        Only the Q-value of the action actually taken is moved, towards

            R + gamma * max_a' Q(S', a')        (just R when S' is terminal)

        Nothing happens until the buffer holds more than ``batch_size``
        transitions.
        """
        if self.memory.memory_counter <= self.batch_size:
            return

        # `not_done` is the buffer's stored (1 - done) mask: 0.0 for terminal transitions.
        state, action, reward, new_state, not_done = self.memory.sample(self.batch_size)

        # Actions are stored one-hot; recover the index of the action taken in each row.
        action_idx = np.argmax(action, axis=1)

        q_current = self.qnet.predict(state, verbose=0)
        q_future = self.qnet.predict(new_state, verbose=0)

        # Start from the current predictions so untouched actions contribute zero error.
        q_target = q_current.copy()
        batch_idx = np.arange(self.batch_size, dtype=np.int32)
        q_target[batch_idx, action_idx] = reward + self.gamma * np.max(q_future, axis=1) * not_done

        self.qnet.fit(x=state, y=q_target, verbose=0)
        self.epsilon = max(self.epsilon * self.epsilon_decay, self.epsilon_min)

    def train(self, n_episodes, max_steps=1000, log_wandb=False,
                update=True, save_episodes=False, save_interval=10):
        """Run episodes in the environment, optionally learning from them.

        Per step: pick an action epsilon-greedily, execute it, store the
        transition, then sample a minibatch and take a fitting step (Deep
        Q-Learning with experience replay). An episode ends when the
        environment reports termination or truncation, or after ``max_steps``.

        Args:
            n_episodes: Number of episodes to run.
            max_steps: Maximum number of steps per episode.
            log_wandb: Log reward, steps and epsilon to Weights & Biases after
                each episode and finish the run at the end.
            update: Store transitions and train the network. When False the
                agent only acts.
            save_episodes: Save a GIF of the first episode and of every
                ``save_interval``-th episode.
            save_interval: Episode interval between saved GIFs.

        Returns:
            dict: ``'reward'``, ``'avg_reward_100'`` (mean reward over the last
            up to 100 episodes) and ``'steps'``, each a list with one entry per
            episode.

        Side effects:
            Closes the environment and writes the network to
            ``./assets/dql.h5`` when finished.
        """
        history = {'reward': [], 'avg_reward_100': [], 'steps': []}

        for episode in range(n_episodes):
            start_time = time()
            state, _ = self.env.reset()
            episode_reward = 0
            episode_steps = 0

            # Rendering and keeping every frame is costly, so only do it for
            # episodes that will actually be written to disk.
            record = save_episodes and ((episode + 1) % save_interval == 0 or episode == 0)
            frames = []

            for _step in range(max_steps):
                action = self.act(state)
                new_state, reward, terminated, truncated, _info = self.env.step(action)

                if record:
                    frames.append(self.env.render())

                if update:
                    # Only true termination disables bootstrapping; a truncated
                    # episode still has a meaningful next-state value.
                    self.remember(state, action, reward, new_state, terminated)
                    self.update()

                state = new_state
                episode_reward += reward
                episode_steps += 1

                # The environment must be reset after either signal.
                if terminated or truncated:
                    break

            if log_wandb:
                wandb.log({
                    'reward': episode_reward,
                    'steps': episode_steps,
                    'epsilon': self.epsilon
                })

            if record:
                EpisodeSaver(self.env, frames, 'DQL', episode + 1).save()

            print(f'[EP {episode + 1}/{n_episodes}] - Reward: {episode_reward:.4f} - Steps: {episode_steps} - Eps: {self.epsilon:.4f} - Time: {time() - start_time:.2f}s')

            # Plain Python floats keep the history JSON-serialisable.
            history['reward'].append(float(episode_reward))
            history['avg_reward_100'].append(float(np.mean(history['reward'][-100:])))
            history['steps'].append(episode_steps)

        self.env.close()

        if log_wandb:
            wandb.finish()

        self.save('dql.h5')

        return history

    def save(self, fname):
        """Save the Q-network to ``./assets/<fname>``, creating the directory if needed."""
        os.makedirs('./assets', exist_ok=True)
        self.qnet.save(f'./assets/{fname}')

    def load(self, fname):
        """Replace the Q-network with the model stored at ``./assets/<fname>``."""
        self.qnet = load_model(f'./assets/{fname}')

In [5]:
# Never commit an API key to the notebook. wandb.login() picks the key up from
# the WANDB_API_KEY environment variable (or existing credentials) and prompts
# for it otherwise. NOTE(review): the key that used to be hardcoded here has
# been exposed and should be revoked in the W&B account settings.
wandb.login()

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [6]:
# Hyperparameters for this run; the same values are recorded in the W&B run config.
learning_rate = 0.0001
discount_factor = 0.79
exploration_rate = 1.0
episodes = 500
max_steps = 1000

# Discrete-action LunarLander, created with render_mode='rgb_array'.
env = gym.make('LunarLander-v2', render_mode='rgb_array', continuous=False)

wandb.init(
    entity='timothyckl',
    project='lunar-lander-rl',
    config=dict(
        learning_rate=learning_rate,
        discount_factor=discount_factor,
        exploration_rate=exploration_rate,
        episodes=episodes,
        max_steps=max_steps,
    ),
)

[34m[1mwandb[0m: Currently logged in as: [33mtimothyckl[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [7]:
# Build the agent from the configured hyperparameters and train it, logging to
# W&B and saving a GIF of the first and of every 100th episode.
agent = DQL(
    env=env,
    alpha=learning_rate,
    gamma=discount_factor,
    epsilon=exploration_rate,
)
history = agent.train(
    n_episodes=episodes,
    max_steps=max_steps,
    log_wandb=True,
    save_episodes=True,
    save_interval=100,
)

[EP 1/500] - Reward: -199.2271 - Steps: 104 - Eps: 0.6690 - Time: 12.13s
[EP 2/500] - Reward: -114.8536 - Steps: 65 - Eps: 0.3481 - Time: 12.75s
[EP 3/500] - Reward: -210.0770 - Steps: 90 - Eps: 0.1409 - Time: 18.21s
[EP 4/500] - Reward: -620.7632 - Steps: 81 - Eps: 0.0624 - Time: 12.49s
[EP 5/500] - Reward: -113.1948 - Steps: 58 - Eps: 0.0348 - Time: 8.71s
[EP 6/500] - Reward: -341.0526 - Steps: 82 - Eps: 0.0153 - Time: 12.14s
[EP 7/500] - Reward: -99.6255 - Steps: 74 - Eps: 0.0100 - Time: 12.03s
[EP 8/500] - Reward: -126.1074 - Steps: 91 - Eps: 0.0100 - Time: 13.79s
[EP 9/500] - Reward: 5.9744 - Steps: 69 - Eps: 0.0100 - Time: 10.94s
[EP 10/500] - Reward: -270.9009 - Steps: 98 - Eps: 0.0100 - Time: 15.40s
[EP 11/500] - Reward: -231.1958 - Steps: 94 - Eps: 0.0100 - Time: 14.53s
[EP 12/500] - Reward: -296.9996 - Steps: 88 - Eps: 0.0100 - Time: 13.19s
[EP 13/500] - Reward: -148.8141 - Steps: 51 - Eps: 0.0100 - Time: 7.77s
[EP 14/500] - Reward: -524.8676 - Steps: 94 - Eps: 0.0100 - Time:

2023-01-28 05:04:23.156330: W tensorflow/core/data/root_dataset.cc:167] Optimization loop failed: Cancelled: Operation was cancelled


[EP 404/500] - Reward: -35.6143 - Steps: 1000 - Eps: 0.0100 - Time: 157.13s
[EP 405/500] - Reward: 239.3493 - Steps: 225 - Eps: 0.0100 - Time: 35.71s
[EP 406/500] - Reward: 257.4426 - Steps: 207 - Eps: 0.0100 - Time: 31.90s
[EP 407/500] - Reward: 289.8971 - Steps: 251 - Eps: 0.0100 - Time: 39.07s
[EP 408/500] - Reward: 227.9770 - Steps: 211 - Eps: 0.0100 - Time: 32.41s
[EP 409/500] - Reward: 290.6784 - Steps: 228 - Eps: 0.0100 - Time: 35.86s
[EP 410/500] - Reward: -139.9821 - Steps: 169 - Eps: 0.0100 - Time: 27.31s
[EP 411/500] - Reward: -324.4651 - Steps: 85 - Eps: 0.0100 - Time: 12.98s
[EP 412/500] - Reward: -67.9969 - Steps: 117 - Eps: 0.0100 - Time: 18.36s
[EP 413/500] - Reward: 275.5010 - Steps: 184 - Eps: 0.0100 - Time: 29.23s
[EP 414/500] - Reward: 6.9703 - Steps: 117 - Eps: 0.0100 - Time: 17.55s
[EP 415/500] - Reward: -214.8524 - Steps: 125 - Eps: 0.0100 - Time: 20.03s
[EP 416/500] - Reward: 296.1515 - Steps: 232 - Eps: 0.0100 - Time: 36.20s
[EP 417/500] - Reward: 267.4782 - St

VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epsilon,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
reward,▁▄▄▄▅▅▅▅▄▅▅▆▆▇▆▅█▆▇▅███▆█▇▅██▆▇▇▇▅█████▇
steps,▁▁▅▂███▁█▁▃██▄█▂▂█▃▁▃▂▃▂▂█▁▂▂▁█▂▂▁▄▂▂▃▇▂

0,1
epsilon,0.01
reward,-37.88553
steps,113.0


In [8]:
# Persist the per-episode training history as JSON.
# makedirs(exist_ok=True) replaces the separate exists-check + mkdir.
os.makedirs('./history', exist_ok=True)

with open('./history/dqn_history.json', 'w') as file:
    json.dump(history, file)