In [None]:
from ale_py import ALEInterface
import gymnasium as gym
import ale_py
import matplotlib.pyplot as plt
import torch.nn as nn
import torch
from collections import namedtuple, deque
import torch.optim as optim

import random
import cv2
import numpy as np
import os
import json



In [None]:
def make_env(env_name, **kwargs):
    """Create a Gymnasium environment wrapped for Atari DQN training.

    Args:
        env_name: Environment id handed to ``gym.make``.
        **kwargs: Extra keyword arguments forwarded unchanged to ``gym.make``.

    Returns:
        The environment wrapped first in ``AtariPreprocessing`` (``noop_max=30``,
        ``frame_skip=4``, 84-pixel grayscale screen, ``scale_obs=True``) and
        then in ``FrameStackObservation`` with a stack size of 4.
    """
    preprocessing_options = dict(
        noop_max=30,
        frame_skip=4,
        screen_size=84,
        grayscale_obs=True,
        scale_obs=True,
    )
    raw_env = gym.make(env_name, **kwargs)
    preprocessed_env = gym.wrappers.AtariPreprocessing(env=raw_env, **preprocessing_options)
    return gym.wrappers.FrameStackObservation(preprocessed_env, 4)
class ReplayBuffer:
    """Bounded FIFO store of transitions with uniform random mini-batch sampling."""

    def __init__(self, max_length = 50000, batch_size = 32, device = torch.device("cuda" if torch.cuda.is_available() else "cpu")):
        """Set up an empty buffer.

        Args:
            max_length: Maximum number of transitions kept; once full, the
                oldest entries are dropped by the underlying ``deque``.
            batch_size: Number of transitions returned by each ``sample`` call.
            device: Torch device on which sampled tensors are created.
        """
        self.device = device
        self.batch_size = batch_size
        self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
        self.memory = deque(maxlen=max_length)

        print("Replay Buffer started")
        print("Memory buffer=", max_length, ", batch size=", self.batch_size)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

    def add(self, state, action, reward, next_state, done):
        """Append one transition to the buffer."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Draw ``batch_size`` distinct transitions uniformly at random.

        Returns:
            A tuple ``(states, actions, rewards, next_states, dones)`` of
            tensors on ``self.device``. ``actions`` is ``torch.long``; all
            others are ``torch.float32`` (``dones`` holds 0.0 / 1.0).
        """
        batch = random.sample(self.memory, k=self.batch_size)

        def column(field, torch_dtype, np_dtype=None):
            # Gather one field across the batch and move it onto the target device.
            values = np.array([getattr(item, field) for item in batch], dtype=np_dtype)
            return torch.tensor(values, dtype=torch_dtype, device=self.device)

        return (
            column("state", torch.float32),
            column("action", torch.long),
            column("reward", torch.float32),
            column("next_state", torch.float32),
            # Booleans are first cast to uint8 so they become 0/1 before the float conversion.
            column("done", torch.float32, np_dtype=np.uint8),
        )

# --- Runtime setup and hyper-parameters ---------------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed every RNG this notebook draws from so that a Restart & Run All is
# repeatable: Python's `random` (replay sampling, epsilon-greedy coin flip),
# NumPy, and torch (network weight initialisation).
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

env = make_env("PongNoFrameskip-v4")
# Random actions drawn through `env.action_space.sample()` use the space's own RNG.
env.action_space.seed(SEED)
# NOTE: the environment's internal RNG is only seeded through
# `env.reset(seed=...)`; that is not done here.

GAMMA = 0.99                       # discount factor applied to the bootstrapped next-state value
EPSILON_START = 1.0                # exploration rate at step 0
EPSILON_END = 0.01                 # exploration rate once the decay has finished
EPSILON_DECAY_STEPS = 1_000_000    # environment steps over which epsilon is linearly annealed
LEARNING_RATE = 1e-4               # Adam learning rate
MEMORY_SIZE = 100_000              # replay buffer capacity (transitions)
BATCH_SIZE = 32                    # transitions per gradient step
TARGET_UPDATE_FREQ = 1_000         # environment steps between target-network syncs
MAX_EPISODES = 5000                # number of training episodes
REPLAY_START_SIZE = 10_000         # learning starts only once the buffer holds more than this
TRAIN_FREQ = 4                     # environment steps between gradient steps
class QNetwork(nn.Module):
    """Convolutional Q-value network: three conv layers followed by two linear layers.

    The sub-modules are exposed as ``conv`` and ``fc`` (both ``nn.Sequential``),
    which fixes the parameter names used in saved ``state_dict`` checkpoints.
    """

    def __init__(self, action_size, input_shape=(4, 84, 84)):
        """Build the network.

        Args:
            action_size: Number of outputs (one Q-value per action).
            input_shape: ``(channels, height, width)`` of a single observation.
        """
        super().__init__()

        in_channels = input_shape[0]
        conv_layers = [
            nn.Conv2d(in_channels, 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU(),
        ]
        self.conv = nn.Sequential(*conv_layers)

        # Push a single all-zero observation through the conv stack to discover
        # how many features the first linear layer must accept.
        with torch.no_grad():
            probe = torch.zeros(1, *input_shape)
            feature_size = self.conv(probe).flatten(start_dim=1).shape[1]

        head_layers = [
            nn.Linear(feature_size, 512),
            nn.ReLU(),
            nn.Linear(512, action_size),
        ]
        self.fc = nn.Sequential(*head_layers)

    def forward(self, x):
        """Return Q-values of shape ``(batch, action_size)`` for a batch of observations."""
        features = self.conv(x)
        flat = features.reshape(features.shape[0], -1)
        return self.fc(flat)
class Agent:
    """DQN / Double-DQN agent: online network, target network, optimizer and replay buffer.

    Reads the module-level configuration ``device``, ``LEARNING_RATE``,
    ``MEMORY_SIZE``, ``BATCH_SIZE``, ``EPSILON_START``, ``EPSILON_END``,
    ``EPSILON_DECAY_STEPS``, ``REPLAY_START_SIZE`` and ``GAMMA``.
    """

    def __init__(self, action_size, ddqn = False):
        """Create the networks, optimizer and replay buffer.

        Args:
            action_size: Number of discrete actions (size of the Q-value output).
            ddqn: When True, ``update`` uses the Double-DQN target; otherwise
                the standard DQN target.
        """
        self.action_size = action_size
        self.ddqn = ddqn

        self.model = QNetwork(action_size).to(device)
        self.target_model = QNetwork(action_size).to(device)
        # NOTE(review): weight decay on a DQN optimizer is unusual; confirm it is intentional.
        self.optimizer = optim.Adam(self.model.parameters(), lr=LEARNING_RATE, weight_decay=0.00015)
        self.memory = ReplayBuffer(MEMORY_SIZE, BATCH_SIZE, device)
        self.epsilon = EPSILON_START

        # Start with the target network equal to the online network; the target
        # network is never trained directly, so it stays in eval mode.
        self.update_target()
        self.target_model.eval()

    def remember(self, state, action, reward, next_state, done):
        """Store one transition in the replay buffer."""
        self.memory.add(state, action, reward, next_state, done)

    def act(self, state, is_evaluating=False):
        """Choose an action with an epsilon-greedy policy.

        Args:
            state: A single observation (without batch dimension).
            is_evaluating: When True, exploration is disabled and the greedy
                action is always returned.

        Returns:
            The chosen action index as a Python ``int``.
        """
        if not is_evaluating and random.random() < self.epsilon:
            # Sample from the agent's own action count instead of the global
            # `env`, so the agent keeps working after that env is closed or
            # when it is driven by a different environment instance.
            return random.randrange(self.action_size)

        state_tensor = torch.as_tensor(
            np.asarray(state), dtype=torch.float32, device=device
        ).unsqueeze(0)
        self.model.eval()
        with torch.no_grad():
            q_values = self.model(state_tensor)
        self.model.train()
        return int(q_values.argmax(dim=1).item())

    def update_target(self):
        """Copy the online network's weights into the target network."""
        self.target_model.load_state_dict(self.model.state_dict())

    def update(self):
        """Run one gradient step on a sampled mini-batch.

        Returns:
            The scalar loss as a Python ``float``, or ``None`` when the replay
            buffer does not yet hold more than ``REPLAY_START_SIZE`` transitions.
        """
        if len(self.memory) <= REPLAY_START_SIZE:
            return None

        states, actions, rewards, next_states, dones = self.memory.sample()
        # Q(s, a) for the actions that were actually taken.
        q_values = self.model(states).gather(1, actions.unsqueeze(1)).squeeze(1)

        with torch.no_grad():
            if self.ddqn:
                # Double DQN: the online network picks the action, the target network scores it.
                next_actions = self.model(next_states).argmax(1)
                next_q_values = self.target_model(next_states).gather(1, next_actions.unsqueeze(1)).squeeze(1)
            else:
                # Standard DQN: the target network both picks and scores the action.
                next_q_values = self.target_model(next_states).max(1)[0]
            # (1 - dones) removes the bootstrap term for transitions flagged as done.
            target_q_values = rewards + GAMMA * next_q_values * (1 - dones)

        loss = nn.MSELoss()(q_values, target_q_values)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss.item()

    def decay_epsilon(self, current_step):
        """Linearly anneal epsilon from ``EPSILON_START`` to ``EPSILON_END``.

        Args:
            current_step: Global step count; the decay completes once it
                reaches ``EPSILON_DECAY_STEPS`` and epsilon stays constant after.
        """
        fraction = min(1.0, current_step / EPSILON_DECAY_STEPS)
        self.epsilon = EPSILON_START + fraction * (EPSILON_END - EPSILON_START)

# --- Training run ---------------------------------------------------------------
is_ddqn = False
standard_dqn_agent = Agent(env.action_space.n, is_ddqn)
steps = []
total_steps = 0
episode_rewards = []
average_losses = []
epsilon_trend = []
# Checkpoints go to ./episodes/dqn or ./episodes/ddqn depending on the variant.
checkpoint_dir = f'./episodes/{"ddqn" if is_ddqn else "dqn"}'
for e in range(MAX_EPISODES):
    state, info = env.reset()
    done = False
    total_reward = 0
    episode_step = 0
    losses = []
    while not done:
        episode_step += 1
        action = standard_dqn_agent.act(state)
        # Gymnasium's step returns (obs, reward, terminated, truncated, info).
        next_state, reward, terminated, truncated, _ = env.step(action)
        # The episode is over on either flag; ignoring `truncated` would keep
        # stepping an environment that must be reset.
        done = terminated or truncated
        # Only a true terminal state should cut off bootstrapping, so the
        # replay buffer stores `terminated`, not `done`.
        standard_dqn_agent.remember(state, action, reward, next_state, terminated)
        state = next_state
        total_reward += reward
        total_steps += 1
        standard_dqn_agent.decay_epsilon(total_steps)
        if total_steps % TRAIN_FREQ == 0:
            loss = standard_dqn_agent.update()
            # `update` returns None until the replay buffer is warm.
            if loss is not None:
                losses.append(loss)
        if total_steps % TARGET_UPDATE_FREQ == 0:
            standard_dqn_agent.update_target()

    # Per-episode metrics.
    episode_rewards.append(total_reward)
    average_losses.append(np.mean(losses) if losses else 0)
    epsilon_trend.append(standard_dqn_agent.epsilon)

    if e % 100 == 0:
        # Every 100 episodes: hard-sync the target network and write a checkpoint.
        standard_dqn_agent.update_target()
        os.makedirs(checkpoint_dir, exist_ok=True)
        torch.save(
            {
                'episode': e,
                'model_state_dict': standard_dqn_agent.model.state_dict(),
                'target_model_state_dict': standard_dqn_agent.target_model.state_dict(),
                'optimizer_state_dict': standard_dqn_agent.optimizer.state_dict(),
                'epsilon': standard_dqn_agent.epsilon,
            },
            f'{checkpoint_dir}/pong_model_episode_{e}.pth'
        )
        print(f"*** Model saved at episode {e} ***")

    print(f"Episode {e+1}/{MAX_EPISODES} | Reward: {total_reward:.2f} | "
        f"Loss: {average_losses[-1]:.5f} | Epsilon: {standard_dqn_agent.epsilon:.3f}")
env.close()

Replay Buffer started
Memory buffer= 100000 , batch size= 32
*** Model saved at episode 0 ***
Episode 1/5000 | Reward: -21.00 | Loss: 0.00000 | Epsilon: 0.999
Episode 2/5000 | Reward: -18.00 | Loss: 0.00000 | Epsilon: 0.998
Episode 3/5000 | Reward: -21.00 | Loss: 0.00000 | Epsilon: 0.997
Episode 4/5000 | Reward: -21.00 | Loss: 0.00000 | Epsilon: 0.996
Episode 5/5000 | Reward: -20.00 | Loss: 0.00000 | Epsilon: 0.995
Episode 6/5000 | Reward: -20.00 | Loss: 0.00000 | Epsilon: 0.995
Episode 7/5000 | Reward: -21.00 | Loss: 0.00000 | Epsilon: 0.994
Episode 8/5000 | Reward: -21.00 | Loss: 0.00000 | Epsilon: 0.993
Episode 9/5000 | Reward: -21.00 | Loss: 0.00000 | Epsilon: 0.992
Episode 10/5000 | Reward: -20.00 | Loss: 0.00000 | Epsilon: 0.991
Episode 11/5000 | Reward: -21.00 | Loss: 0.00000 | Epsilon: 0.990
Episode 12/5000 | Reward: -21.00 | Loss: 0.02320 | Epsilon: 0.989
Episode 13/5000 | Reward: -20.00 | Loss: 0.02407 | Epsilon: 0.988
Episode 14/5000 | Reward: -21.00 | Loss: 0.02385 | Epsilo

In [None]:

# env = make_env("PongNoFrameskip-v4")
# def preprocess_frame(screen, output):
#     screen = cv2.cvtColor(screen, cv2.COLOR_RGB2GRAY)
#     screen = np.ascontiguousarray(screen, dtype=np.float32) / 255
#     screen = screen[35:195]        
#     screen = cv2.resize(screen, (output, output), interpolation = cv2.INTER_AREA)
#     return screen
# state, info = env.reset()
# for i in range (50):
#     next_frame, reward, done, _, _ = env.step(env.action_space.sample())
# # next_frame = preprocess_frame(next_frame, 84)
# plt.imshow(state[-1], cmap='gray')
# plt.title("Preprocessed Frame")
# plt.axis('off')
# plt.show()
# next_frame
# def stack_frames(frames, state, is_new_episode=False):
#     frame = preprocess_frame(state, None, 84)
#     if is_new_episode:
#         frames = np.stack(arrays=[frame, frame, frame, frame])
#     else:
#         frames[0] = frames[1]
#         frames[1] = frames[2]
#         frames[2] = frames[3]
#         frames[3] = frame
#     return frames
# def stack_frames(frames, state, is_new_episode=False):
#     frame = preprocess_frame(state, 84)
#     if is_new_episode:
#         frames = np.stack(arrays=[frame, frame, frame, frame])
#     else:
#         frames[0] = frames[1]
#         frames[1] = frames[2]
#         frames[2] = frames[3]
#         frames[3] = frame
#     return frames

In [106]:

# Persist the per-episode training curves as JSON; the file name is
# "dqn_res.json" or "ddqn_res.json" depending on `is_ddqn`.
data_dict = dict(
    episode_rewards=episode_rewards,
    average_losses=average_losses,
    epsilon_trend=epsilon_trend,
)
with open(f'{"ddqn" if is_ddqn else "dqn"}_res.json', mode='w') as json_file:
    json.dump(data_dict, json_file)

In [107]:
# from gymnasium.wrappers import RecordVideo
# def record_gameplay(agent, episode_to_load):
#     print(f"\n--- Recording gameplay for episode {episode_to_load} model ---")
    
#     model_path = f'./episodes/{"ddqn" if is_ddqn else "dqn"}/pong_model_episode_{episode_to_load}.pth'
#     agent.model.load_state_dict(torch.load(model_path))
#     agent.model.eval()
#     os.makedirs('./videos', exist_ok=True)
#     record_env = gym.make("ALE/Pong-v5", render_mode="rgb_array")
    
#     record_env = RecordVideo(record_env, video_folder="./videos", name_prefix=f"pong-episode-{episode_to_load}")

#     state, _ = record_env.reset()
#     state= stack_frames(None, state, True)
#     done = False
#     total_reward = 0
    
#     while not done:
#         action = agent.act(state) 
        
#         next_frame, reward, done, _, _ = record_env.step(action)
    
#         next_state= stack_frames(state, next_frame, False)
#         state = next_state
#         total_reward += reward

#     print(f"Recording finished. Final score: {total_reward}")
#     record_env.close()

    
# record_gameplay(standard_dqn_agent, episode_to_load=10)