<a href="https://colab.research.google.com/github/yasamankfd/2-functions-With-Thread/blob/master/petting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install 'pettingzoo[atari]'
!pip install supersuit
!pip install autorom
!AutoROM --accept-license
!pip install stable-baselines3[extra] pettingzoo supersuit


Collecting pettingzoo[atari]
  Downloading pettingzoo-1.24.3-py3-none-any.whl.metadata (8.5 kB)
Collecting multi-agent-ale-py==0.1.11 (from pettingzoo[atari])
  Downloading multi-agent-ale-py-0.1.11.tar.gz (551 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m552.0/552.0 kB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pygame==2.3.0 (from pettingzoo[atari])
  Downloading pygame-2.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Downloading pygame-2.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pettingzoo-1.24.3-py3-none-any.whl (847 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m847.8/847.8 kB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: multi-age

In [None]:
# Mount Google Drive at /content/drive; SAVE_PATH below points inside that mount.
# NOTE(review): the recorded output of this cell ends with
# "MessageError: Error: credential propagation was unsuccessful" — the mount
# appears to have failed on the last run; confirm Drive authorisation before rerunning.
from google.colab import drive
drive.mount('/content/drive')

# Directory that receives one checkpoint file per training episode.
SAVE_PATH = "/content/drive/My Drive/Pong_DQN_Models/"

import os
# Create the checkpoint directory up front; exist_ok avoids an error on reruns.
os.makedirs(SAVE_PATH, exist_ok=True)

import numpy as np
import random
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import deque
import supersuit as ss
from pettingzoo.atari import pong_v3

# 🚀 Environment Preprocessing Function
def customized_pong_env(color_reduction_mode="B", resize=(84, 84), frame_skip=1, frame_stack=3):
    """Build a PettingZoo Pong environment wrapped with SuperSuit preprocessing.

    The wrappers are applied in this order: colour reduction, resize,
    frame skip, frame stack.

    Args:
        color_reduction_mode: Value passed as ``mode`` to ``ss.color_reduction_v0``.
        resize: Two-item sequence; ``resize[0]`` and ``resize[1]`` are passed as
            the two size arguments of ``ss.resize_v1``. The default is an
            immutable tuple so it cannot be mutated between calls.
        frame_skip: Value passed to ``ss.frame_skip_v0``.
        frame_stack: Value passed to ``ss.frame_stack_v1``.

    Returns:
        The wrapped environment. It has not been reset yet.
    """
    env = pong_v3.env(render_mode='rgb_array')
    env = ss.color_reduction_v0(env, mode=color_reduction_mode)
    env = ss.resize_v1(env, resize[0], resize[1])
    env = ss.frame_skip_v0(env, frame_skip)
    env = ss.frame_stack_v1(env, frame_stack)
    return env

# 🚀 Initialize Environment
# Build the wrapped Pong environment with the default preprocessing and reset it once.
env = customized_pong_env()
env.reset()

# 🚀 Hyperparameters
BATCH_SIZE = 64  # Transitions sampled from replay memory per train() call
GAMMA = 0.99  # Discount factor applied to the bootstrapped next-state value
LR = 1e-4  # Initial learning rate handed to Adam
EPSILON_START = 1.0  # Exploration rate before any step has been taken
EPSILON_END = 0.05  # Floor the exploration rate is clamped to
EPSILON_DECAY = 5000000  # Divisor of the linear schedule: epsilon = EPSILON_START - steps_done / EPSILON_DECAY
MEMORY_SIZE = 100000  # Maximum number of transitions kept in the replay deque
TARGET_UPDATE = 10000  # Number of steps between copies of dqn weights into target_dqn
NUM_EPISODES = 2000  # Number of episodes the training loop runs

# 🚀 Device Setup
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 🚀 CNN-based DQN Model
class DQN(nn.Module):
    """Convolutional Q-network: three conv layers followed by two linear layers.

    Args:
        input_shape: ``(channels, height, width)`` of one observation.
        num_actions: Length of the output vector (one value per action).
    """

    def __init__(self, input_shape, num_actions):
        super().__init__()
        in_channels = input_shape[0]
        feature_layers = [
            nn.Conv2d(in_channels, 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU(),
        ]
        self.conv = nn.Sequential(*feature_layers)

        # The width of the first linear layer depends on the conv output size,
        # which is measured by pushing a dummy input through the conv stack.
        flat_features = self._get_conv_out(input_shape)
        self.fc = nn.Sequential(
            nn.Linear(flat_features, 512),
            nn.ReLU(),
            nn.Linear(512, num_actions),
        )

    def _get_conv_out(self, shape):
        """Return the number of conv output elements for a single input of ``shape``."""
        probe = torch.zeros(1, *shape)
        return int(self.conv(probe).numel())

    def forward(self, x):
        """Map a batch of channel-first observations to per-action values."""
        features = self.conv(x)
        batch_size = features.size(0)
        return self.fc(features.reshape(batch_size, -1))



# 🚀 Initialize Networks
# Channel-first shape fed to the network; observations are permuted from
# height-width-channel to this layout before every forward pass.
input_shape = (3, 84, 84)
num_actions = env.action_space("first_0").n
dqn = DQN(input_shape, num_actions).to(device)

LOAD_PATH = "/content/drive/My Drive/dqn/pong_dqn_episode_445.pth"  # Path to saved model

# Resume from a previous checkpoint when one exists, otherwise start from random weights.
if os.path.exists(LOAD_PATH):
    # weights_only=True limits unpickling to tensors and plain containers, which is
    # all a state_dict needs, and avoids executing arbitrary pickled code.
    dqn.load_state_dict(torch.load(LOAD_PATH, map_location=device, weights_only=True))
    print(f"✅ Loaded model from {LOAD_PATH}")
else:
    print("⚠️ No saved model found, training from scratch.")


# The target network starts as an exact copy of the online network.
target_dqn = DQN(input_shape, num_actions).to(device)
target_dqn.load_state_dict(dqn.state_dict())



# 🚀 Optimizer & Scheduler
# A single Adam instance: the scheduler must wrap the optimizer that train() steps.
optimizer = optim.Adam(dqn.parameters(), lr=LR)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1000, gamma=0.95)  # Learning rate decay

# 🚀 Replay Memory
# Bounded deque: once MEMORY_SIZE is reached the oldest transitions are dropped.
memory = deque(maxlen=MEMORY_SIZE)

# 🚀 Epsilon-Greedy Policy
def select_action(state, epsilon):
    """Choose an action with an epsilon-greedy policy over the online network.

    Args:
        state: One observation in height-width-channel layout; it is permuted
            to channel-first and given a batch dimension before the forward pass.
        epsilon: Probability of returning a uniformly random action instead
            of the network's greedy choice.

    Returns:
        An integer action index.
    """
    if random.random() < epsilon:
        return random.randrange(num_actions)

    with torch.no_grad():
        obs = torch.FloatTensor(state).permute(2, 0, 1).unsqueeze(0).to(device)
        # Apply the same /255.0 scaling that train() applies to its batches;
        # without it the network is queried on inputs 255x larger than the
        # ones it is optimised on.
        obs = obs / 255.0
        return dqn(obs).argmax().item()

# 🚀 Training Function
def train():
    """Run one optimisation step of the online network on a replay minibatch.

    Returns immediately, without touching the network, while the replay
    memory holds fewer than 5000 transitions. Otherwise samples BATCH_SIZE
    transitions, builds one-step targets from the target network, minimises
    the Huber loss, and advances the learning-rate scheduler by one step.
    """
    # Warm-up guard: wait until enough experience has been collected.
    if len(memory) < 5000:
        return

    transitions = random.sample(memory, BATCH_SIZE)
    states, actions, rewards, next_states, dones = zip(*transitions)

    def _as_image_batch(frames):
        # Scale by 255, then move the channel axis in front of height/width.
        scaled = np.array(frames) / 255.0
        return torch.FloatTensor(scaled).permute(0, 3, 1, 2).to(device)

    state_batch = _as_image_batch(states)
    next_state_batch = _as_image_batch(next_states)

    action_batch = torch.LongTensor(actions).to(device)
    reward_batch = torch.FloatTensor(rewards).to(device)
    done_batch = torch.FloatTensor(dones).to(device)

    # Value predicted by the online network for the action actually taken.
    predicted = dqn(state_batch)
    q_values = predicted.gather(1, action_batch.unsqueeze(1)).squeeze(1)

    # Bootstrapped target; the (1 - done) mask removes the future term for done transitions.
    best_next = target_dqn(next_state_batch).max(1)[0].detach()
    not_done = 1 - done_batch
    target_q_values = reward_batch + GAMMA * best_next * not_done

    loss = F.smooth_l1_loss(q_values, target_q_values)  # Huber loss
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    scheduler.step()

# 🚀 Training Loop
epsilon = EPSILON_START
steps_done = 0

MAX_STEPS = 2500  # Limit the number of steps per episode

# NOTE(review): env.last() is called after every env.step() with no per-agent
# bookkeeping. Under PettingZoo's AEC API this presumably means consecutive
# observations/rewards alternate between 'first_0' and 'second_0' and both are
# driven by the same network — confirm this mixing is intended.
for episode in range(NUM_EPISODES):
    env.reset()
    # Only the initial observation is needed here; the other fields are re-read
    # after the first step.
    state, _, _, _, _ = env.last()
    done = False
    total_reward = 0
    step_count = 0  # Track the number of steps in the episode

    while not done:
        step_count += 1  # Increment step counter

        action = select_action(state, epsilon)
        env.step(action)
        next_state, reward, terminated, truncated, _ = env.last()
        # End the episode on truncation as well as termination; the truncation
        # flag used to be discarded, so a truncated episode kept stepping.
        done = terminated or truncated

        # Reward Shaping: amplify scored/conceded points by 0.5 and apply a
        # small penalty when action 0 is chosen on a zero-reward step.
        if reward > 0:
            reward += 0.5
        elif reward < 0:
            reward -= 0.5
        elif action == 0:
            reward -= 0.1

        # Store the termination flag (not truncation) so the bootstrapping mask
        # used by train() is the same as before.
        memory.append((state, action, reward, next_state, terminated))
        state = next_state
        total_reward += reward
        train()
        steps_done += 1
        # Linear annealing from EPSILON_START down to the EPSILON_END floor.
        epsilon = max(EPSILON_END, EPSILON_START - (steps_done / EPSILON_DECAY))

        # Periodically sync the target network with the online network.
        if steps_done % TARGET_UPDATE == 0:
            target_dqn.load_state_dict(dqn.state_dict())

        if step_count >= MAX_STEPS:  # Stop the episode if max_steps is reached
            break

    # Checkpoint after every episode so training can be resumed.
    model_filename = f"pong_dqn_episode_{episode}.pth"
    torch.save(dqn.state_dict(), os.path.join(SAVE_PATH, model_filename))
    print(f"Model saved to {SAVE_PATH}{model_filename}")
    print(f"Episode {episode}, Reward: {total_reward:.2f}, Steps: {step_count}, Epsilon: {epsilon:.4f}")


# 🚀 Save Final Model
torch.save(dqn.state_dict(), "pong_dqn_pettingzoo.pth")


MessageError: Error: credential propagation was unsuccessful

In [None]:
!pip install init

Collecting init
  Downloading init-0.1.0.tar.gz (29 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting terminal (from init)
  Downloading terminal-0.4.0.tar.gz (11 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: init, terminal
  Building wheel for init (setup.py) ... [?25l[?25hdone
  Created wheel for init: filename=init-0.1.0-py3-none-any.whl size=36331 sha256=3e7941656b990d0a147fa8ba6a5b042e4a985099d080f7c4a816ef881449a375
  Stored in directory: /root/.cache/pip/wheels/f4/0d/98/f00d85bf31ce348a7b45fa330d807c1dba9f0e2d5c045aba50
  Building wheel for terminal (setup.py) ... [?25l[?25hdone
  Created wheel for terminal: filename=terminal-0.4.0-py3-none-any.whl size=13146 sha256=f24456c306444d6887aaba80bda77ba0f3868a261f3a81ec5b9989395c757fbe
  Stored in directory: /root/.cache/pip/wheels/80/d4/04/8561fe46804330b0f11536467cb88d5a66a4a6d9338ef62b82
Successfully built init terminal
Installing collected packages: terminal,

In [None]:
import torch
import numpy as np
import random
import cv2
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import deque
import supersuit as ss
from pettingzoo.atari import pong_v3
import torch.nn as nn
import torch.nn.init as init


def generate_pong_video(env=None, policy_left=None, policy_right=None, video_path='pong.mp4', max_frames=10_000):
    """Simulate a Pong game with the given policies and record it to a video file.

    If a policy is ``None`` the corresponding player acts randomly.

    Args:
        env: Game environment. When ``None``, a ``pong_v3`` environment with
            ``render_mode='rgb_array'`` is created. The environment is reset
            with seed 42 and closed before the function returns.
        policy_left: Callable mapping an observation to an action for agent
            ``'second_0'``.
        policy_right: Callable mapping an observation to an action for agent
            ``'first_0'``.
        video_path: Path the video is written to.
        max_frames: Maximum number of frames to record.
    """
    if env is None:
        env = pong_v3.env(render_mode='rgb_array')

    policies = {'second_0': policy_left, 'first_0': policy_right}
    video = None
    try:
        env.reset(seed=42)

        # The first rendered frame fixes the video dimensions.
        height, width, _ = env.render().shape
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        video = cv2.VideoWriter(video_path, fourcc, 30, (width, height))

        frame_count = 0
        for agent in env.agent_iter():
            observation, reward, termination, truncation, info = env.last()

            if termination or truncation:
                # A finished agent is stepped with None.
                action = None
            else:
                # Agents without a policy (or with an unexpected name) act randomly,
                # so `action` is always bound.
                policy = policies.get(agent)
                action = policy(observation) if policy else env.action_space(agent).sample()

            env.step(action)
            frame = env.render()
            # The frame is converted from RGB to BGR channel order before writing.
            video.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))

            frame_count += 1
            if frame_count >= max_frames:
                break
    finally:
        # Release resources even if a policy or render call raises, so the
        # video file is finalised and the environment is not leaked.
        if video is not None:
            video.release()
        env.close()


def customized_pong_env(color_reduction_mode="B", resize=(84, 84), frame_skip=1, frame_stack=3):
    """Create the Pong environment and apply the SuperSuit preprocessing wrappers.

    Wrapper order: colour reduction, resize, frame skip, frame stack.

    Args:
        color_reduction_mode: Passed as ``mode`` to ``ss.color_reduction_v0``.
        resize: Two-item sequence whose items are passed as the two size
            arguments of ``ss.resize_v1``. The default is a tuple rather than
            a list so the shared default cannot be mutated.
        frame_skip: Passed to ``ss.frame_skip_v0``.
        frame_stack: Passed to ``ss.frame_stack_v1``.

    Returns:
        The wrapped, not-yet-reset environment.
    """
    env = pong_v3.env(render_mode='rgb_array')
    env = ss.color_reduction_v0(env, mode=color_reduction_mode)
    env = ss.resize_v1(env, resize[0], resize[1])
    env = ss.frame_skip_v0(env, frame_skip)
    env = ss.frame_stack_v1(env, frame_stack)
    return env
class DQN(nn.Module):
    """Convolutional Q-network with instance normalisation after each conv layer.

    Args:
        input_shape: ``(channels, height, width)`` of one observation.
        num_actions: Length of the output vector (one value per action).
    """

    def __init__(self, input_shape, num_actions):
        super().__init__()

        # Conv layers sit at indices 0, 3 and 6 of self.conv; each is followed
        # by InstanceNorm2d and ReLU.
        conv1 = nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4)
        conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
        self.conv = nn.Sequential(
            conv1, nn.InstanceNorm2d(32), nn.ReLU(),
            conv2, nn.InstanceNorm2d(64), nn.ReLU(),
            conv3, nn.InstanceNorm2d(64), nn.ReLU(),
        )

        # No normalisation is used in the fully connected head.
        hidden = nn.Linear(self._get_conv_out(input_shape), 512)
        self.fc = nn.Sequential(
            hidden,
            nn.ReLU(),
            nn.Linear(512, num_actions),
        )

        # He (Kaiming) initialisation for the three conv layers and the first
        # linear layer; the output layer keeps its default initialisation.
        for layer in (conv1, conv2, conv3, hidden):
            init.kaiming_normal_(layer.weight, mode='fan_out', nonlinearity='relu')

    def _get_conv_out(self, shape):
        """Return the number of conv output elements for a single input of ``shape``."""
        probe = torch.zeros(1, *shape)
        return int(self.conv(probe).numel())

    def forward(self, x):
        """Map a batch of channel-first observations to per-action values."""
        features = self.conv(x)
        batch_size = features.size(0)
        return self.fc(features.reshape(batch_size, -1))


# Load trained DQN model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dqn = DQN((3, 84, 84), 6).to(device)  # 6 actions in Pong
# weights_only=True limits unpickling to tensors and plain containers, which is
# all a state_dict needs; it also makes the load behaviour explicit instead of
# relying on the library default.
dqn.load_state_dict(torch.load("pong_dqn_episode_200.pth", map_location=device, weights_only=True))
# Inference only from here on.
dqn.eval()

# Define policy function using the trained model
def dqn_policy(observation):
    """Return the loaded network's greedy action for a single observation.

    The observation is converted to a float tensor, permuted from
    height-width-channel to channel-first, given a batch dimension, moved to
    ``device`` and divided by 255.0 before the forward pass.
    """
    frame = torch.FloatTensor(observation)
    batch = frame.permute(2, 0, 1).unsqueeze(0).to(device)
    batch = batch / 255.0
    with torch.no_grad():
        q_values = dqn(batch)
    return q_values.argmax().item()
# Build the preprocessed environment the policy will play in
env = customized_pong_env()

# Record a game in which the trained model controls both paddles
generate_pong_video(
    env,
    policy_left=dqn_policy,
    policy_right=dqn_policy,
    video_path='pong_ai_vs_ai.mp4',
    max_frames=10_000,
)

print("🎥 Video saved as pong_ai_vs_ai.mp4")


  dqn.load_state_dict(torch.load("pong_dqn_episode_200.pth", map_location=device))


🎥 Video saved as pong_ai_vs_ai.mp4
