In [2]:
import random
import numpy as np
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque
import wandb

In [3]:
# Configure device (GPU if available) and CUDA settings for faster training
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
# Enable some CUDA optimizations when using GPU
if device.type == 'cuda':
    torch.backends.cudnn.benchmark = True
    torch.cuda.empty_cache()
    torch.cuda.manual_seed_all(42)

Using device: cuda


In [5]:
class DQN(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(DQN, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(state_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, action_dim)
        )

    def forward(self, x):
        return self.net(x)


In [6]:
class DQNAgent:
    def __init__(self, state_dim, action_dim, gamma=0.99, lr=1e-3, epsilon_start=1.0, epsilon_end=0.01, epsilon_decay=0.995, enable_wandb=True):
        self.action_dim = action_dim
        self.gamma = gamma # discount factor: how much future rewards are valued
        self.epsilon = epsilon_start # exploration rate: probability of choosing a random action
        self.epsilon_min = epsilon_end # minimum exploration rate
        self.epsilon_decay = epsilon_decay # rate of decay for exploration probability

        # Use the notebook/global device if available, otherwise fall back to torch detection
        try:
            # if `device` was defined in a prior cell (GPU config), use it; else detect
            self.device = device
        except NameError:
            self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        self.model = DQN(state_dim, action_dim).to(self.device)
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.loss_fn = nn.MSELoss()
        
        # Only watch with WandB if enabled (during training)
        if enable_wandb:
            wandb.watch(self.model, log="all", log_freq=10)

    def select_action(self, state):
        if random.random() < self.epsilon:  # exploration
            return random.randrange(self.action_dim)
        # create tensor directly on the right device to avoid copies
        state_t = torch.tensor(state, dtype=torch.float32, device=self.device).unsqueeze(0)
        q_values = self.model(state_t)
        return torch.argmax(q_values).item()  # exploitation

    def train_step(self, memory, batch_size):
        if len(memory) < batch_size:
            return 0.0

        states, actions, rewards, next_states, dones = memory.sample(batch_size)

        # Create tensors directly on the agent device (avoids extra .to() copies)
        states = torch.tensor(states, dtype=torch.float32, device=self.device)
        next_states = torch.tensor(next_states, dtype=torch.float32, device=self.device)
        actions = torch.tensor(actions, dtype=torch.long, device=self.device).unsqueeze(1)
        rewards = torch.tensor(rewards, dtype=torch.float32, device=self.device).unsqueeze(1)
        dones = torch.tensor(dones, dtype=torch.float32, device=self.device).unsqueeze(1)

        # Q(s, a)
        q_values = self.model(states).gather(1, actions)

        # Target: r + Î³ * max_a' Q(next_state, a')
        next_q_values = self.model(next_states).max(1)[0].unsqueeze(1)
        target_q = rewards + (1 - dones) * self.gamma * next_q_values

        loss = self.loss_fn(q_values, target_q.detach())

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # decay epsilon
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
            
        return loss.item()

In [7]:

class ReplayBuffer:
    def __init__(self, capacity=10000):
        self.memory = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        batch = random.sample(self.memory, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        return (
            np.array(states),
            np.array(actions),
            np.array(rewards),
            np.array(next_states),
            np.array(dones)
        )

    def __len__(self):
        return len(self.memory)


In [None]:
with open("key.txt", "r") as f:
    api_key = f.read().strip()
wandb.login(key=api_key)

def train_dqn():

    wandb.init(project="dqn-cartpole", name="dqn_experiment_1")
    
    env = gym.make("CartPole-v1")
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.n

    # Corrected print statement
    print(f"state dimension: {state_dim}, action dimension: {action_dim}")

    agent = DQNAgent(state_dim, action_dim, lr=5e-4, epsilon_decay=0.99)
    memory = ReplayBuffer(10000)
    episodes = 500      
    batch_size = 64


    print("Starting training...")

    for ep in range(episodes):
        state, _ = env.reset()
        total_reward = 0
        episode_loss = 0
        train_steps = 0

        for t in range(500):
            action = agent.select_action(state)
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated

            memory.push(state, action, reward, next_state, done)
            loss = agent.train_step(memory, batch_size)  # Now returns loss
            
            if loss > 0:  # Only count when training happened
                episode_loss += loss
                train_steps += 1

            state = next_state
            total_reward += reward

            if done:
                break

        # Calculate average loss
        avg_loss = episode_loss / train_steps if train_steps > 0 else 0
        
        # ADD THESE LINES for WandB logging:
        wandb.log({
            "episode": ep + 1,
            "total_reward": total_reward,
            "epsilon": agent.epsilon,
            "avg_loss": avg_loss,
            "buffer_size": len(memory)
        })

        print(f"Episode {ep+1}, Reward: {total_reward}, Epsilon: {agent.epsilon:.3f}, Avg Loss: {avg_loss:.4f}")

        # ADD THESE LINES before torch.save:
    wandb.config.update({
        "state_dim": state_dim,
        "action_dim": action_dim,
        "gamma": agent.gamma,
        "learning_rate": 5e-4,
        "epsilon_start": 1.0,
        "epsilon_end": 0.01,
        "epsilon_decay": 0.99,
        "batch_size": batch_size,
        "buffer_size": 10000,
        "episodes": episodes
    })

    torch.save(agent.model.state_dict(), "dqn_model.pth")
    wandb.save("dqn_model.pth")  # <-- ADD THIS to save model to WandB
    wandb.finish()  # <-- ADD THIS at the very end


if __name__ == "__main__":
    train_dqn()


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: C:\Users\ayaha\_netrc
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: C:\Users\ayaha\_netrc
[34m[1mwandb[0m: Currently logged in as: [33mayahayman[0m ([33mayahayman-cairo-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Currently logged in as: [33mayahayman[0m ([33mayahayman-cairo-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


state dimension: 4, action dimension: 2
Starting training...
Episode 1, Reward: 25.0, Epsilon: 1.000, Avg Loss: 0.0000
Episode 2, Reward: 18.0, Epsilon: 1.000, Avg Loss: 0.0000
Starting training...
Episode 1, Reward: 25.0, Epsilon: 1.000, Avg Loss: 0.0000
Episode 2, Reward: 18.0, Epsilon: 1.000, Avg Loss: 0.0000
Episode 3, Reward: 35.0, Epsilon: 0.860, Avg Loss: 1.0551
Episode 3, Reward: 35.0, Epsilon: 0.860, Avg Loss: 1.0551
Episode 4, Reward: 26.0, Epsilon: 0.662, Avg Loss: 1.2963
Episode 5, Reward: 12.0, Epsilon: 0.587, Avg Loss: 1.7328
Episode 4, Reward: 26.0, Epsilon: 0.662, Avg Loss: 1.2963
Episode 5, Reward: 12.0, Epsilon: 0.587, Avg Loss: 1.7328
Episode 6, Reward: 11.0, Epsilon: 0.526, Avg Loss: 2.3530
Episode 6, Reward: 11.0, Epsilon: 0.526, Avg Loss: 2.3530
Episode 7, Reward: 19.0, Epsilon: 0.434, Avg Loss: 4.3615
Episode 8, Reward: 11.0, Epsilon: 0.389, Avg Loss: 10.2971
Episode 9, Reward: 11.0, Epsilon: 0.348, Avg Loss: 15.7984
Episode 7, Reward: 19.0, Epsilon: 0.434, Avg L

OSError: [WinError 1314] A required privilege is not held by the client: 'c:\\Users\\ayaha\\Desktop\\University\\Semester 9\\Reinforcement Learning\\Assignments\\Assignment 2\\Reinforcement-Learning-Ass2\\dqn_model.pth' -> 'c:\\Users\\ayaha\\Desktop\\University\\Semester 9\\Reinforcement Learning\\Assignments\\Assignment 2\\Reinforcement-Learning-Ass2\\wandb\\run-20251112_131913-tm7gmwdu\\files\\dqn_model.pth'

In [None]:
import os
from datetime import datetime
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

# Create a unique folder for videos
video_folder = f"videos/run_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
os.makedirs(video_folder, exist_ok=True)

env = gym.make("CartPole-v1", render_mode="rgb_array")
env = gym.wrappers.TimeLimit(env, max_episode_steps=1000)
env = gym.wrappers.RecordVideo(env, video_folder, episode_trigger=lambda e: True)

# Load the trained model WITH WandB DISABLED
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n
agent = DQNAgent(state_dim, action_dim, enable_wandb=False)  # â ADD THIS PARAMETER
agent.model.load_state_dict(torch.load("dqn_model.pth"))
agent.epsilon = 0  # Disable exploration

# Record multiple episodes
for episode in range(3):
    state, _ = env.reset()
    done = False
    while not done:
        action = agent.select_action(state)
        state, _, terminated, truncated, _ = env.step(action)
        print("terminated", terminated, "truncated", truncated)
        done = terminated or truncated

env.close()

terminated False truncated False
terminated False truncated False
terminated False truncated False
terminated False truncated False
terminated False truncated False
terminated False truncated False
terminated False truncated False
terminated False truncated False
terminated False truncated False
terminated False truncated False
terminated False truncated False
terminated False truncated False
terminated False truncated False
terminated False truncated False
terminated False truncated False
terminated False truncated False
terminated False truncated False
terminated False truncated False
terminated False truncated False
terminated False truncated False
terminated False truncated False
terminated False truncated False
terminated False truncated False
terminated False truncated False
terminated False truncated False
terminated False truncated False
terminated False truncated False
terminated False truncated False
terminated False truncated False
terminated False truncated False
terminated