In [96]:
import gymnasium
import flappy_bird_gymnasium
import numpy as np
import pygame
import random
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque
from enum import IntEnum
from torchvision.transforms import Compose, ToTensor, Resize, Grayscale
from flappy_bird_gymnasium.envs.flappy_bird_env import FlappyBirdEnv
from flappy_bird_gymnasium.envs.flappy_bird_env import Actions
from flappy_bird_gymnasium.envs.lidar import LIDAR
from flappy_bird_gymnasium.envs.constants import (
    PLAYER_FLAP_ACC,
    PLAYER_ACC_Y,
    PLAYER_MAX_VEL_Y,
    PLAYER_HEIGHT,
    PLAYER_VEL_ROT,
    PLAYER_WIDTH,
    PIPE_WIDTH,
    PIPE_VEL_X,
)



In [111]:

# Neural Network Model for Q-value approximation
# Fully connected model for 1D state inputs
class DQN(nn.Module):
    """Fully connected Q-network for 1-D state vectors.

    Two ReLU hidden layers of 128 units feed a linear head that emits one
    Q-value per action.

    Args:
        input_dim: Number of features in a single state vector.
        action_space: Number of discrete actions (width of the output layer).
    """

    def __init__(self, input_dim, action_space):
        super().__init__()
        hidden_units = 128  # width shared by both hidden layers
        self.fc1 = nn.Linear(input_dim, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, action_space)

    def forward(self, x):
        """Return Q-value estimates for ``x`` (last axis must hold ``input_dim`` features)."""
        hidden = self.fc1(x).relu()
        hidden = self.fc2(hidden).relu()
        # No activation on the head: Q-values are unbounded real numbers.
        return self.fc3(hidden)



# Replay Memory to store experiences
class ReplayMemory:
    """Fixed-capacity FIFO store of transitions used for experience replay."""

    def __init__(self, capacity):
        # A bounded deque drops the oldest transition automatically once full.
        self.memory = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        """Append one ``(state, action, reward, next_state, done)`` transition."""
        transition = (state, action, reward, next_state, done)
        self.memory.append(transition)

    def sample(self, batch_size):
        """Return ``batch_size`` stored transitions drawn at random without replacement."""
        return random.sample(self.memory, k=batch_size)

    def __len__(self):
        """Return how many transitions are currently stored."""
        return len(self.memory)


# DQN Agent Class
class DQNAgent:
    """DQN agent for environments with 1-D vector observations.

    Owns a policy network, a periodically synchronised target network, an Adam
    optimizer and a replay buffer, and provides ``train`` / ``test`` loops.

    Args:
        env: Gymnasium-style environment. ``env.observation_space.shape[0]`` is
            used as the network input size and ``env.action_space.n`` as the
            number of discrete actions.
    """

    def __init__(self, env):
        # Environment
        self.env = env
        self.state_dim = env.observation_space.shape[0]  # First dimension of observation
        self.action_space = env.action_space.n

        # Hyperparameters
        self.learning_rate = 0.001
        self.discount_factor = 0.99
        self.epsilon = 1.0
        self.epsilon_decay = 0.999
        self.epsilon_min = 0.01
        self.batch_size = 64
        self.memory_size = 10000
        self.episodes = 50000
        self.target_update_freq = 10  # measured in episodes, see train()

        # Initialize policy and target networks
        self.policy_net = DQN(self.state_dim, self.action_space)
        self.target_net = DQN(self.state_dim, self.action_space)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        # Optimizer and Replay Memory
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.learning_rate)
        self.memory = ReplayMemory(self.memory_size)

    def select_action(self, state, testing=False):
        """Pick an action with an epsilon-greedy rule.

        Args:
            state: Current observation (array-like of ``state_dim`` floats).
            testing: When true, exploration is skipped and the greedy action is
                always returned, regardless of ``self.epsilon``.

        Returns:
            The chosen action index as a Python ``int``.
        """
        if not testing and random.random() < self.epsilon:
            return random.randint(0, self.action_space - 1)  # Random action
        with torch.no_grad():
            state = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
            return self.policy_net(state).argmax(dim=1).item()

    def optimize_model(self):
        """Run one gradient step on a random minibatch from replay memory.

        Returns immediately (without updating anything) until at least
        ``batch_size`` transitions have been stored.
        """
        if len(self.memory) < self.batch_size:
            return

        batch = self.memory.sample(self.batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)

        # Stack observations through NumPy first: building a tensor directly from
        # a tuple of ndarrays is converted element by element and is very slow.
        states = torch.as_tensor(np.asarray(states), dtype=torch.float32)
        actions = torch.tensor(actions, dtype=torch.long).unsqueeze(1)
        rewards = torch.tensor(rewards, dtype=torch.float32).unsqueeze(1)
        next_states = torch.as_tensor(np.asarray(next_states), dtype=torch.float32)
        dones = torch.tensor(dones, dtype=torch.float32).unsqueeze(1)

        # Q(s, a) for the actions that were actually taken.
        q_values = self.policy_net(states).gather(1, actions)

        # The bootstrap target is a constant for this update, so keep it out of
        # the autograd graph (the target network is never optimised directly).
        with torch.no_grad():
            next_q_values = self.target_net(next_states).max(1, keepdim=True)[0]
            targets = rewards + (self.discount_factor * next_q_values * (1 - dones))

        # Loss and backpropagation
        loss = nn.MSELoss()(q_values, targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def train(self):
        """Train the agent for ``self.episodes`` episodes.

        Each environment step is stored in replay memory and followed by one
        optimisation step. The target network is synchronised every
        ``target_update_freq`` episodes and epsilon is decayed once per episode.

        Returns:
            A list with the undiscounted total reward of every episode.

        Side effects:
            Closes ``self.env`` when training finishes.
        """
        res = []
        for episode in range(self.episodes):
            state, _ = self.env.reset()
            done = False
            total_reward = 0

            while not done:
                # Select and execute action
                action = self.select_action(state)
                next_state, reward, terminated, truncated, _ = self.env.step(action)

                # Only a real terminal state should stop bootstrapping; a
                # time-limit truncation still has a meaningful next-state value.
                self.memory.push(state, action, reward, next_state, terminated)
                state = next_state
                total_reward += reward

                # Optimize model
                self.optimize_model()

                # Either signal ends the episode; ignoring `truncated` would keep
                # stepping an environment that has already finished.
                done = terminated or truncated

            # Update target network periodically
            if episode % self.target_update_freq == 0:
                self.target_net.load_state_dict(self.policy_net.state_dict())

            # Decay epsilon
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

            res.append(total_reward)
            print(f"Episode: {episode + 1}, Total Reward: {total_reward}, Epsilon: {self.epsilon:.4f}")

        self.env.close()
        print("Training complete!")

        return res

    def test(self, num_episodes=10, render="human"):
        """Run the greedy policy in a freshly created FlappyBird environment.

        Exploration is disabled through ``select_action(..., testing=True)``, so
        ``self.epsilon`` is left untouched and training can be resumed afterwards.

        Args:
            num_episodes: Number of evaluation episodes to play.
            render: ``render_mode`` passed to ``gymnasium.make``.
        """
        print("\nTesting the trained policy...\n")
        test_env = gymnasium.make("FlappyBird-v0", render_mode=render, use_lidar=True)
        total_rewards = []

        try:
            for episode in range(num_episodes):
                state, _ = test_env.reset()
                done = False
                total_reward = 0

                while not done:
                    action = self.select_action(state, testing=True)
                    next_state, reward, terminated, truncated, _ = test_env.step(action)
                    state = next_state
                    total_reward += reward
                    done = terminated or truncated

                total_rewards.append(total_reward)
                print(f"Test Episode: {episode + 1}, Total Reward: {total_reward}")

            avg_reward = np.mean(total_rewards)
            print(f"\nAverage Reward over {num_episodes} Test Episodes: {avg_reward}")
        finally:
            # Release the window/renderer even if an episode raises.
            test_env.close()

# Policy Gradient Neural Network Model
class PolicyGradientNet(nn.Module):
    """Fully connected policy network that outputs action probabilities.

    Two ReLU hidden layers of 128 units feed a linear head whose outputs are
    turned into a probability distribution with softmax.

    Args:
        input_dim: Number of features in a single state vector.
        action_space: Number of discrete actions (width of the output layer).
    """

    def __init__(self, input_dim, action_space):
        super(PolicyGradientNet, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)  # First hidden layer
        self.fc2 = nn.Linear(128, 128)       # Second hidden layer
        self.fc3 = nn.Linear(128, action_space)  # Output layer: one logit per action

    def forward(self, x):
        """Return action probabilities for ``x``; the last axis sums to 1."""
        # The network only owns linear layers, so the state is fed straight
        # through them (there are no convolutional layers to apply).
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return torch.softmax(self.fc3(x), dim=-1)  # Apply softmax for probabilities


# Policy Gradient Agent Class
class PolicyGradientAgent:
    """Monte-Carlo policy-gradient agent for 1-D vector observations.

    Actions are sampled from the policy network during an episode; once the
    episode ends, the stored log-probabilities are weighted by the normalised
    discounted returns and a single optimiser step is taken.

    Args:
        env: Gymnasium-style environment. ``env.observation_space.shape[0]`` is
            used as the network input size and ``env.action_space.n`` as the
            number of discrete actions.
    """

    def __init__(self, env):
        # Environment
        self.env = env
        self.state_dim = env.observation_space.shape[0]  # First dimension of observation
        self.action_space = env.action_space.n

        # Hyperparameters
        self.learning_rate = 0.001
        self.discount_factor = 0.99
        self.episodes = 50000

        # Initialize policy network
        self.policy_net = PolicyGradientNet(self.state_dim, self.action_space)
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.learning_rate)

        # Per-episode memory for rewards and log probabilities
        self.log_probs = []
        self.rewards = []

    def select_action(self, state):
        """Sample an action from the policy and remember its log-probability.

        Args:
            state: Current observation (array-like of ``state_dim`` floats).

        Returns:
            The sampled action index as a Python ``int``.
        """
        state = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
        probs = self.policy_net(state)
        action_dist = torch.distributions.Categorical(probs)
        action = action_dist.sample()
        self.log_probs.append(action_dist.log_prob(action))
        return action.item()

    def compute_returns(self):
        """Compute the discounted return for every step stored in ``self.rewards``.

        Returns:
            A 1-D float tensor with one return per step. When the episode has
            more than one step the returns are standardised (zero mean, unit
            variance); a single-step episode is returned as-is because the
            sample standard deviation of one value is NaN.
        """
        returns = []
        G = 0
        for reward in reversed(self.rewards):
            G = reward + self.discount_factor * G
            returns.append(G)
        returns.reverse()  # built back-to-front; restore chronological order
        returns = torch.tensor(returns, dtype=torch.float32)
        # Normalize returns for better convergence
        if returns.numel() > 1:
            returns = (returns - returns.mean()) / (returns.std() + 1e-8)
        return returns

    def train_step(self):
        """Apply one policy-gradient update from the episode just played.

        Clears the stored log-probabilities and rewards afterwards. Does nothing
        except clearing the buffers when no step was recorded.
        """
        if not self.log_probs:
            self.log_probs = []
            self.rewards = []
            return

        returns = self.compute_returns()
        # Flatten so the per-step log-probs line up with the 1-D returns.
        log_probs = torch.stack(self.log_probs).reshape(-1)
        loss = -(log_probs * returns).sum()  # Negative sign for gradient ascent

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Clear memory
        self.log_probs = []
        self.rewards = []

    def train(self):
        """Train the agent for ``self.episodes`` episodes.

        Returns:
            A list with the undiscounted total reward of every episode.

        Side effects:
            Closes ``self.env`` when training finishes.
        """
        res = []
        for episode in range(self.episodes):
            state, _ = self.env.reset()
            done = False
            total_reward = 0

            while not done:
                action = self.select_action(state)
                next_state, reward, terminated, truncated, _ = self.env.step(action)
                self.rewards.append(reward)
                state = next_state
                total_reward += reward
                # A truncated episode is over too; ignoring it would keep
                # stepping an environment that has already finished.
                done = terminated or truncated

            # Perform a training step after the episode ends
            self.train_step()

            res.append(total_reward)
            print(f"Episode: {episode + 1}, Total Reward: {total_reward}")

        self.env.close()
        print("Training complete!")
        return res

    def test(self, num_episodes=10, render="human"):
        """Run the greedy (most probable action) policy in a fresh FlappyBird environment.

        Args:
            num_episodes: Number of evaluation episodes to play.
            render: ``render_mode`` passed to ``gymnasium.make``.
        """
        print("\nTesting the trained policy...\n")
        test_env = gymnasium.make("FlappyBird-v0", render_mode=render, use_lidar=True)
        total_rewards = []

        try:
            for episode in range(num_episodes):
                state, _ = test_env.reset()
                done = False
                total_reward = 0

                while not done:
                    # Evaluation needs no gradients, so skip building the graph.
                    with torch.no_grad():
                        state = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
                        probs = self.policy_net(state)
                        action = torch.argmax(probs).item()  # Select action with highest probability
                    next_state, reward, terminated, truncated, _ = test_env.step(action)
                    state = next_state
                    total_reward += reward
                    done = terminated or truncated

                total_rewards.append(total_reward)
                print(f"Test Episode: {episode + 1}, Total Reward: {total_reward}")

            avg_reward = np.mean(total_rewards)
            print(f"\nAverage Reward over {num_episodes} Test Episodes: {avg_reward}")
        finally:
            # Release the window/renderer even if an episode raises.
            test_env.close()




In [112]:
# LIDAR observations (use_lidar=True) give the agents a 1-D state vector.
env = gymnasium.make("FlappyBird-v0", render_mode="rgb_array", use_lidar=True)

agent = DQNAgent(env)
agent.train()
agent.test(num_episodes=10)

# DQNAgent.train() closes the environment it was given, so the policy-gradient
# agent needs its own fresh instance rather than the already-closed `env`.
pg_env = gymnasium.make("FlappyBird-v0", render_mode="rgb_array", use_lidar=True)

pg_agent = PolicyGradientAgent(pg_env)
pg_agent.train()
pg_agent.test(num_episodes=10)


  logger.warn(f"{pre} is not within the observation space.")
  logger.warn(f"{pre} is not within the observation space.")


Episode: 1, Total Reward: -5.699999999999998, Epsilon: 0.9950
Episode: 2, Total Reward: -8.099999999999998, Epsilon: 0.9900
Episode: 3, Total Reward: -8.099999999999998, Epsilon: 0.9851
Episode: 4, Total Reward: -8.099999999999998, Epsilon: 0.9801
Episode: 5, Total Reward: -8.099999999999998, Epsilon: 0.9752
Episode: 6, Total Reward: -6.899999999999999, Epsilon: 0.9704
Episode: 7, Total Reward: -6.899999999999999, Epsilon: 0.9655
Episode: 8, Total Reward: -6.899999999999999, Epsilon: 0.9607
Episode: 9, Total Reward: -6.299999999999999, Epsilon: 0.9559
Episode: 10, Total Reward: -6.899999999999999, Epsilon: 0.9511
Episode: 11, Total Reward: -5.099999999999998, Epsilon: 0.9464
Episode: 12, Total Reward: -7.499999999999998, Epsilon: 0.9416
Episode: 13, Total Reward: -8.099999999999998, Epsilon: 0.9369
Episode: 14, Total Reward: -6.899999999999999, Epsilon: 0.9322
Episode: 15, Total Reward: -8.099999999999998, Epsilon: 0.9276
Episode: 16, Total Reward: -8.099999999999998, Epsilon: 0.9229
E

KeyboardInterrupt: 

In [109]:
class DQNCNN(nn.Module):
    """Convolutional Q-network: three conv layers followed by two linear layers.

    Args:
        input_dim: ``(channels, height, width)`` of one network input.
        action_space: Number of discrete actions (width of the output layer).
    """

    def __init__(self, input_dim, action_space):
        super().__init__()
        in_channels = input_dim[0]
        self.conv1 = nn.Conv2d(in_channels, 32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
        # The first linear layer must match whatever the conv stack emits.
        flat_features = self._calculate_conv_output(input_dim)
        self.fc1 = nn.Linear(flat_features, 512)
        self.fc2 = nn.Linear(512, action_space)

    def _calculate_conv_output(self, input_dim):
        """Return the flattened feature count the conv stack yields for ``input_dim``."""
        probe = torch.zeros(1, *input_dim)
        for conv in (self.conv1, self.conv2, self.conv3):
            probe = conv(probe)
        # Element count of the single probe sample == product of its non-batch dims.
        return probe[0].numel()

    def forward(self, x):
        """Return one Q-value per action for every sample in the batch ``x``."""
        batch, height, width = x.size(0), x.size(-2), x.size(-1)
        # Fold every axis between batch and (H, W) into the channel axis.
        features = x.view(batch, -1, height, width)
        for conv in (self.conv1, self.conv2, self.conv3):
            features = torch.relu(conv(features))
        features = features.view(batch, -1)  # flatten per sample
        hidden = torch.relu(self.fc1(features))
        return self.fc2(hidden)
    

class ReplayMemory:
    """Bounded experience-replay buffer; the oldest transitions are evicted first."""

    def __init__(self, capacity):
        # deque(maxlen=...) discards from the left once `capacity` is reached.
        self.memory = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        """Store a single ``(state, action, reward, next_state, done)`` tuple."""
        experience = (state, action, reward, next_state, done)
        self.memory.append(experience)

    def sample(self, batch_size):
        """Draw ``batch_size`` stored transitions at random, without replacement."""
        return random.sample(self.memory, k=batch_size)

    def __len__(self):
        """Return the number of transitions currently held."""
        return len(self.memory)


class DQNAgent:
    """DQN agent that learns from rendered frames instead of vector observations.

    Every rendered frame is converted to a single-channel image, and the last
    four processed frames are stacked to form the network input.

    Note: this notebook also defines a vector-observation class named
    ``DQNAgent`` in another cell; whichever cell was executed last determines
    which class the name refers to.

    Args:
        env: Gymnasium-style environment created with a render mode for which
            ``env.render()`` returns an image array.
        input_dim: ``(stacked_frames, height, width)`` fed to ``DQNCNN``.
        action_space: Number of discrete actions.
    """

    def __init__(self, env, input_dim, action_space):
        self.env = env
        self.input_dim = input_dim
        self.action_space = action_space

        # Hyperparameters
        self.learning_rate = 0.0001
        self.discount_factor = 0.99
        self.epsilon = 1.0
        self.epsilon_decay = 0.995
        self.epsilon_min = 0.1
        self.batch_size = 64
        self.memory_size = 10000
        self.target_update_freq = 10  # measured in episodes, see train()

        # Networks
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.policy_net = DQNCNN(input_dim, action_space).to(self.device)
        self.target_net = DQNCNN(input_dim, action_space).to(self.device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.learning_rate)
        self.memory = ReplayMemory(self.memory_size)

        # Preprocessing pipeline. The resize target follows `input_dim` so the
        # frames always match the spatial size the network was built for.
        frame_size = tuple(input_dim[-2:])
        self.preprocess = Compose([ToTensor(), Resize(frame_size), Grayscale()])

    def preprocess_frame(self, frame):
        """Run ``frame`` through the preprocessing pipeline and return a NumPy array."""
        return self.preprocess(frame).numpy()

    def select_action(self, state, testing=False):
        """Pick an action with an epsilon-greedy rule.

        Args:
            state: Stack of preprocessed frames for the current step.
            testing: When true, exploration is skipped and the greedy action is
                always returned, regardless of ``self.epsilon``.

        Returns:
            The chosen action index as a Python ``int``.
        """
        if not testing and np.random.rand() < self.epsilon:
            return random.randint(0, self.action_space - 1)
        state = torch.as_tensor(np.asarray(state), dtype=torch.float32).unsqueeze(0).to(self.device)
        with torch.no_grad():
            return self.policy_net(state).argmax(dim=1).item()

    def optimize_model(self):
        """Run one gradient step on a random minibatch from replay memory.

        Returns immediately (without updating anything) until at least
        ``batch_size`` transitions have been stored.
        """
        if len(self.memory) < self.batch_size:
            return

        batch = self.memory.sample(self.batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)

        # Stack the frame stacks through NumPy first: building a tensor directly
        # from a tuple of ndarrays is converted element by element and is very slow.
        states = torch.as_tensor(np.asarray(states), dtype=torch.float32).to(self.device)
        actions = torch.LongTensor(actions).unsqueeze(1).to(self.device)
        rewards = torch.FloatTensor(rewards).unsqueeze(1).to(self.device)
        next_states = torch.as_tensor(np.asarray(next_states), dtype=torch.float32).to(self.device)
        dones = torch.FloatTensor(dones).unsqueeze(1).to(self.device)

        # Q-value computation
        q_values = self.policy_net(states).gather(1, actions)
        with torch.no_grad():
            next_q_values = self.target_net(next_states).max(1, keepdim=True)[0]
            target_q_values = rewards + (1 - dones) * self.discount_factor * next_q_values

        loss = nn.MSELoss()(q_values, target_q_values)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def train(self, episodes=1000):
        """Train on rendered frames for ``episodes`` episodes.

        Args:
            episodes: Number of episodes to play.

        Returns:
            A list with the undiscounted total reward of every episode.
        """
        results = []
        for episode in range(episodes):
            self.env.reset()
            frame = self.env.render()  # Query the initial rendered frame
            state = self.preprocess_frame(frame)
            # Start the stack with four copies of the first frame.
            state_stack = np.stack([state] * 4, axis=0)
            total_reward = 0
            done = False

            while not done:
                action = self.select_action(state_stack)
                _, reward, terminated, truncated, _ = self.env.step(action)
                next_frame = self.env.render()  # Query the next rendered frame
                next_state = self.preprocess_frame(next_frame)
                # Slide the window: drop the oldest frame, append the newest.
                next_state_stack = np.append(state_stack[1:], [next_state], axis=0)

                # Only real termination is stored as `done`, so truncated
                # transitions still bootstrap from the next state.
                self.memory.push(state_stack, action, reward, next_state_stack, terminated)
                self.optimize_model()
                state_stack = next_state_stack
                total_reward += reward

                done = terminated or truncated

            if episode % self.target_update_freq == 0:
                self.target_net.load_state_dict(self.policy_net.state_dict())

            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)
            results.append(total_reward)
            print(f"Episode {episode + 1}, Total Reward: {total_reward}, Epsilon: {self.epsilon:.4f}")
        return results

    def test(self, num_episodes=10):
        """Play ``num_episodes`` greedy episodes in ``self.env`` and print the rewards.

        Exploration is disabled through ``select_action(..., testing=True)``, so
        ``self.epsilon`` is left untouched and training can be resumed afterwards.
        """
        total_rewards = []

        for episode in range(num_episodes):
            self.env.reset()
            frame = self.env.render()  # Query the rendered frame
            state = self.preprocess_frame(frame)
            state_stack = np.stack([state] * 4, axis=0)
            total_reward = 0
            done = False

            while not done:
                action = self.select_action(state_stack, testing=True)
                _, reward, terminated, truncated, _ = self.env.step(action)
                next_frame = self.env.render()  # Query the rendered frame
                state_stack = np.append(state_stack[1:], [self.preprocess_frame(next_frame)], axis=0)
                total_reward += reward

                done = terminated or truncated

            total_rewards.append(total_reward)
            print(f"Test Episode {episode + 1}, Total Reward: {total_reward}")

        avg_reward = np.mean(total_rewards)
        print(f"\nAverage Reward over {num_episodes} Test Episodes: {avg_reward}")




In [110]:
env = gymnasium.make("FlappyBird-v0", render_mode="rgb_array")
input_dim = (4, 84, 84)  # 4 stacked grayscale frames
action_space = env.action_space.n

agent = DQNAgent(env, input_dim, action_space)
try:
    agent.train(episodes=1000)
    agent.test(num_episodes=10)
finally:
    # This agent never closes the environment itself, so release it here even
    # when the run is interrupted.
    env.close()


Episode 1, Total Reward: -8.099999999999998, Epsilon: 0.9950
Episode 2, Total Reward: -6.899999999999999, Epsilon: 0.9900
Episode 3, Total Reward: -7.499999999999998, Epsilon: 0.9851
Episode 4, Total Reward: -6.299999999999999, Epsilon: 0.9801
Episode 5, Total Reward: -7.499999999999998, Epsilon: 0.9752
Episode 6, Total Reward: -8.099999999999998, Epsilon: 0.9704
Episode 7, Total Reward: -8.099999999999998, Epsilon: 0.9655
Episode 8, Total Reward: -8.099999999999998, Epsilon: 0.9607


KeyboardInterrupt: 