In [1]:
# !pip install torch
# !pip install torchsummary
# !pip install gymnasium
# !pip install gymnasium[box2d]
# !pip install pygame
# !pip install matplotlib

# Import Libraries

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
from tqdm.notebook import tqdm_notebook as tqdm
# from tqdm import tqdm
from torchsummary import summary

import gym_agent as ga

import utils

# Hyperparameters

In [3]:
# Identifier of the environment; passed to ga.make_vec / ga.make below.
env_id = 'LunarLander-v2'

# Value handed to agent.fit(n_games=...).
n_games = 500
# Value handed to ga.make_vec(max_episode_steps=...).
max_episode_steps = 500

# Directory used for saving and loading agent checkpoints.
chkpt_dir = "checkpoints/LunarLander-v2/DQN"

### Define Model

In [4]:
class DeepQNetwork(nn.Module):
    """Fully connected network: ``Linear -> ReLU`` per hidden width, then a final ``Linear``.

    Args:
        n_inp: Size of the input feature vector.
        n_out: Number of outputs produced by the last linear layer.
        features: Hidden-layer widths, in order. Any iterable of ints is accepted
            (list, tuple, ...); an empty one yields a single ``Linear(n_inp, n_out)``.
    """

    def __init__(self, n_inp: int, n_out: int, features: "list[int] | tuple[int, ...]" = (256, 256)):
        super().__init__()

        # Build a fresh list so tuples/iterables work and the caller's object is never
        # aliased; the default is an immutable tuple rather than a shared mutable list.
        widths = [n_inp, *features, n_out]
        last_linear = len(widths) - 2

        modules = []
        for idx, (fan_in, fan_out) in enumerate(zip(widths[:-1], widths[1:])):
            modules.append(nn.Linear(fan_in, fan_out))
            # No activation after the output layer.
            if idx < last_linear:
                modules.append(nn.ReLU())

        # Attribute name and module order are unchanged, so state_dict keys
        # ("net.0.weight", "net.2.weight", ...) stay compatible with saved checkpoints.
        self.net = nn.Sequential(*modules)

    def forward(self, X):
        """Apply the layer stack to ``X`` and return its output."""
        return self.net(X)

In [None]:
# Instantiate a network with 8 inputs and 4 outputs and print its
# per-layer summary for a single 8-feature input, on CPU.
model = DeepQNetwork(n_inp=8, n_out=4)
summary(model, (8,), device='cpu')

In [6]:
class Policy(nn.Module):
    """Pair of identically shaped networks: an online network and a target copy.

    The target starts as an exact copy of the online network and is afterwards
    moved towards it only through :meth:`soft_update`.

    Args:
        n_inp: Input size forwarded to both ``DeepQNetwork`` instances.
        n_out: Output size forwarded to both ``DeepQNetwork`` instances.
        features: Hidden-layer widths forwarded to both networks (any iterable of ints).
    """

    def __init__(self, n_inp: int, n_out: int, features: "list[int] | tuple[int, ...]" = (256, 256)):
        super().__init__()

        # Materialise once as a list: accepts any iterable, avoids a shared mutable
        # default, and hands DeepQNetwork the list type it concatenates with.
        features = list(features)

        self.network = DeepQNetwork(n_inp, n_out, features)
        self.target_network = DeepQNetwork(n_inp, n_out, features)

        # Target weights are written by soft_update only, so they need no gradients.
        self.target_network.requires_grad_(False)

        # tau = 1 overwrites the target with the online weights (hard copy).
        self.soft_update(1.0)

    def forward(self, X: torch.Tensor):
        """Return the online network's output for ``X``."""
        return self.network(X)

    def target(self, X: torch.Tensor):
        """Return the target network's output for ``X``."""
        return self.target_network(X)

    @torch.no_grad()
    def soft_update(self, tau: float):
        """Blend the online weights into the target network in place.

        Computes ``target <- tau * online + (1 - tau) * target`` for every parameter.

        Args:
            tau: Interpolation factor in ``[0, 1]``; ``1`` copies the online weights.

        Raises:
            ValueError: If ``tau`` lies outside ``[0, 1]``.
        """
        if not 0.0 <= tau <= 1.0:
            raise ValueError(f"tau must be in [0, 1], got {tau}")

        for target_param, param in zip(self.target_network.parameters(), self.network.parameters()):
            # In-place update under no_grad instead of writing through `.data`.
            target_param.mul_(1.0 - tau).add_(param, alpha=tau)

### Define Agent

In [7]:
class DQN(ga.OffPolicyAgent):
    """Deep Q-learning agent with epsilon-greedy action selection and a soft-updated target network.

    Args:
        policy: ``Policy`` holding the online and target networks.
        env: Environment, forwarded to ``ga.OffPolicyAgent``.
        action_space: Candidate actions sampled uniformly when exploring.
        lr: Learning rate of the Adam optimizer over the online network.
        gamma: Discount factor applied to the bootstrapped next-state value.
        tau: Interpolation factor passed to ``policy.soft_update`` after each learn step.
        eps_start: Initial exploration rate.
        eps_decay: Multiplicative factor applied to epsilon on every ``reset()``.
        eps_min: Lower bound for epsilon.
        batch_size: Forwarded to ``ga.OffPolicyAgent``.
    """

    policy: Policy

    def __init__(
            self,
            policy,
            env,
            action_space: list,
            lr=1e-3,
            gamma = 0.99,
            tau = 1e-3,
            eps_start = 1.0,
            eps_decay = 0.99,
            eps_min = 0.01,
            batch_size = 64,
        ):
        super().__init__(policy, env, batch_size=batch_size)

        self.gamma = gamma
        self.tau = tau
        self.eps_start = eps_start
        self.eps_decay = eps_decay
        self.eps_min = eps_min

        # Current exploration rate; decayed by reset().
        self.eps = eps_start

        self.action_space = action_space

        # Only the online network is optimized; the target follows via soft_update.
        self.optimizer = optim.Adam(self.policy.network.parameters(), lr=lr)

    def reset(self):
        """Decay epsilon by one factor of ``eps_decay``, clamped at ``eps_min``.

        NOTE(review): presumably invoked by the base class between episodes — confirm in gym_agent.
        """
        self.eps = max(self.eps_min, self.eps * self.eps_decay)

    @torch.no_grad()
    def predict(self, observations: np.ndarray, deterministic = True):
        """Choose one action per row of ``observations``.

        Args:
            observations: Batch of observations; the first axis is the batch.
            deterministic: When true, always act greedily. Otherwise, with probability
                ``self.eps`` return uniformly random actions from ``action_space``.

        Returns:
            A NumPy array holding one action per observation row.
        """
        # Short-circuit on `deterministic` so greedy mode can never explore. The old
        # `random() > 0` test was False when the draw was exactly 0.0.
        # NOTE(review): a single draw decides explore/exploit for the whole batch.
        if not deterministic and np.random.random() <= self.eps:
            return np.random.choice(self.action_space, observations.shape[0])

        # self.device is not set in this class — presumably provided by ga.OffPolicyAgent.
        tensor_observations = torch.as_tensor(observations, dtype=torch.float32, device=self.device)
        actions_value: torch.Tensor = self.policy(tensor_observations)

        return np.argmax(actions_value.cpu().numpy(), axis=1)

    def learn(self, observations: torch.Tensor, actions: torch.Tensor, rewards: torch.Tensor, next_observations: torch.Tensor, terminals: torch.Tensor):
        """Run one gradient step on a batch of transitions, then soft-update the target network.

        The regression target is ``rewards + gamma * max_a Q_target(next_observations, a)``,
        with the bootstrap term zeroed where ``terminals`` is set. The loss is smooth L1
        between that target and the online Q-value of the taken action.
        """
        # Target values need no gradients: compute them without building a graph.
        with torch.no_grad():
            q_next_max = self.policy.target(next_observations).max(dim=1)[0]
            # logical_not is correct for bool and for numeric 0/1 flags; `~` is a bitwise
            # NOT on integer tensors and raises on float tensors.
            not_terminal = terminals.logical_not()
            q_target = rewards + self.gamma * q_next_max * not_terminal

        # Q-value of the action actually taken in each transition.
        q_expected = self.policy(observations).gather(1, actions.unsqueeze(1).long()).squeeze(1)

        loss = F.smooth_l1_loss(q_expected, q_target)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.policy.soft_update(self.tau)

In [8]:
# Training environment; each episode is capped at max_episode_steps steps.
env = ga.make_vec(env_id, max_episode_steps=max_episode_steps)

# Network with 8 inputs and 4 outputs, paired with the four actions 0..3.
agent = DQN(
    policy=Policy(n_inp=8, n_out=4, features=[256, 256]),
    env=env,
    action_space=list(range(4)),
    lr=1e-3,
    batch_size=64,
)

### Training

In [None]:
# Train for n_games games, saving the best checkpoint and one every 100 into chkpt_dir.
agent.fit(
    n_games=n_games,
    save_best=True,
    save_every=100,
    save_dir=chkpt_dir,
    progress_bar=tqdm,
)

In [None]:
# Load the checkpoint named "best" from chkpt_dir
# (presumably the one written by fit(save_best=True) — confirm in gym_agent).
agent.load(chkpt_dir, "best")
# Plot the scores held by the agent.
agent.plot_scores()

In [None]:
# Single environment that renders frames as RGB arrays, used for playback.
# Note: this rebinds `env`, which previously held the training environment.
env = ga.make(env_id, render_mode='rgb_array')

# Reload the "best" checkpoint so this cell does not depend on the previous one having run.
agent.load(chkpt_dir, "best")

# Run the agent in the playback environment with a fixed seed and show it in the notebook.
agent.play_jupyter(env, seed=0)

### TODO:
- Use Deep Q-Learning to solve LunarLander when the observation is an image of shape (96, 96, 3), using the environment defined below.

In [18]:
import gymnasium as gym
class ImageAsObs(gym.ObservationWrapper):
    """Observation wrapper that replaces each observation with the rendered frame, resized to 96x96x3.

    The wrapped environment must render to an array (e.g. ``render_mode='rgb_array'``).
    """

    def __init__(self, env):
        super().__init__(env)
        self.observation_space = gym.spaces.Box(low=0, high=255, shape=(96, 96, 3), dtype=np.uint8)

    def observation(self, observation):
        """Ignore ``observation`` and return the current rendered frame as a ``(96, 96, 3)`` uint8 array.

        Raises:
            RuntimeError: If the environment's ``render()`` returns ``None``.
        """
        frame = self.env.unwrapped.render()
        if frame is None:
            raise RuntimeError(
                "ImageAsObs needs an environment that renders to an array (render_mode='rgb_array')."
            )

        # Resize with torch, which this notebook already imports; the previous code used a
        # PIL `Image` name that is never imported in this file and failed on a fresh kernel.
        # Assumes render() returns an (H, W, 3) array, matching the declared observation space.
        pixels = torch.from_numpy(np.ascontiguousarray(frame)).permute(2, 0, 1).unsqueeze(0).float()
        resized = F.interpolate(pixels, size=(96, 96), mode='bilinear', align_corners=False, antialias=True)

        # Back to channel-last uint8, clamped to the declared [0, 255] range.
        image = resized.squeeze(0).permute(1, 2, 0).round().clamp(0, 255).to(torch.uint8)
        return image.contiguous().numpy()

In [20]:
# Recreate the RGB-array environment and attach ImageAsObs so observations become rendered frames.
# add_wrapper receives the wrapper class itself, not an instance.
env = ga.make(env_id, render_mode='rgb_array')
env.add_wrapper(ImageAsObs)