In [1]:
import gymnasium
import flappy_bird_gymnasium
import numpy as np
import pygame
import itertools
import random
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque
from enum import IntEnum
from torchvision.transforms import Compose, ToTensor, Resize, Grayscale
from flappy_bird_gymnasium.envs.flappy_bird_env import FlappyBirdEnv
from flappy_bird_gymnasium.envs.flappy_bird_env import Actions
from flappy_bird_gymnasium.envs.lidar import LIDAR
from flappy_bird_gymnasium.envs.constants import (
    PLAYER_FLAP_ACC,
    PLAYER_ACC_Y,
    PLAYER_MAX_VEL_Y,
    PLAYER_HEIGHT,
    PLAYER_VEL_ROT,
    PLAYER_WIDTH,
    PIPE_WIDTH,
    PIPE_VEL_X,
)

def new_render(self):
    """Render the next frame without the LIDAR ray overlay.

    In ``"rgb_array"`` mode the frame is drawn without the score and returned
    as an array whose first two axes are swapped relative to what
    ``pygame.surfarray.array3d`` yields. In every other mode the frame is
    drawn with the score, the display is created on first use and refreshed,
    and the FPS clock is ticked at ``metadata["render_fps"]``; nothing is
    returned.
    """
    wants_array = self.render_mode == "rgb_array"

    # The score is only shown when rendering on screen; rays are never shown.
    self._draw_surface(show_score=not wants_array, show_rays=False)

    if wants_array:
        frame = pygame.surfarray.array3d(self._surface)
        # Swap the first two axes so the image has the correct aspect.
        return np.transpose(frame, axes=(1, 0, 2))

    if self._display is None:
        self._make_display()

    self._update_display()
    self._fps_clock.tick(self.metadata["render_fps"])


# Monkey-patch the environment class so that FlappyBirdEnv.render resolves to
# new_render (defined above), which never draws the LIDAR rays.
FlappyBirdEnv.render = new_render

In [7]:
######## Optimized AC ########
######## For Video Record ########

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
from torch.distributions import Categorical

class ActorCriticNet(nn.Module):
    """Two-headed MLP: a shared trunk feeding a policy head and a value head.

    Args:
        input_dim: Size of the last dimension of the input tensor.
        action_space: Number of discrete actions (width of the policy head).
    """

    def __init__(self, input_dim, action_space):
        super().__init__()
        hidden_units = 128

        # Layers are created trunk -> actor -> critic, in that order.
        trunk_layers = [
            nn.Linear(input_dim, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, hidden_units),
            nn.ReLU(),
        ]
        self.shared = nn.Sequential(*trunk_layers)
        self.actor = nn.Sequential(nn.Linear(hidden_units, action_space))
        self.critic = nn.Linear(hidden_units, 1)

    def forward(self, x):
        """Return ``(action_probabilities, state_value)`` for input ``x``.

        The probabilities are a softmax over the last dimension of the actor
        head's output; the value is the raw output of the critic head.
        """
        features = self.shared(x)
        action_probs = F.softmax(self.actor(features), dim=-1)
        state_value = self.critic(features)
        return action_probs, state_value

class OptimizedActorCritic:
    """Episodic actor-critic agent with GAE returns and an entropy bonus.

    One full episode is collected into per-step buffers (states, actions,
    rewards, log-probs, value estimates, masks) and then used for a single
    gradient update in :meth:`train_step`.

    Args:
        env: Environment object exposing ``observation_space.shape``,
            ``action_space.n``, ``reset()`` returning ``(state, info)`` and
            ``step(action)`` returning a 5-tuple
            ``(state, reward, done, truncated, info)``.
        config: Dict of hyperparameters. Recognised keys (with defaults):
            ``actor_lr`` (0.001), ``critic_lr`` (0.005), ``gamma`` (0.99),
            ``gae_lambda`` (0.95), ``entropy_coef`` (0.01),
            ``value_loss_coef`` (0.5), ``max_grad_norm`` (0.5),
            ``episodes`` (5000).
    """

    def __init__(self, env, config):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.env = env
        self.state_dim = env.observation_space.shape[0]
        self.action_space = env.action_space.n

        # Parameters
        self.actor_lr = config.get('actor_lr', 0.001)
        self.critic_lr = config.get('critic_lr', 0.005)
        self.gamma = config.get('gamma', 0.99)
        self.gae_lambda = config.get('gae_lambda', 0.95)
        self.entropy_coef = config.get('entropy_coef', 0.01)
        self.value_loss_coef = config.get('value_loss_coef', 0.5)
        self.max_grad_norm = config.get('max_grad_norm', 0.5)
        self.episodes = config.get('episodes', 5000)

        self.network = ActorCriticNet(self.state_dim, self.action_space).to(self.device)
        # NOTE(review): both optimizers are built over *all* network
        # parameters, so every update applies two Adam steps (one per
        # learning rate) to every weight; actor_lr and critic_lr do not act
        # on separate heads. Kept as-is to preserve training dynamics --
        # confirm whether per-head parameter groups were intended.
        self.actor_optimizer = optim.Adam(self.network.parameters(), lr=self.actor_lr)
        self.critic_optimizer = optim.Adam(self.network.parameters(), lr=self.critic_lr)

        # All six rollout buffers exist from construction onwards, so
        # train_step() never depends on train() having created some of them.
        self._reset_rollout()

    def _reset_rollout(self):
        """Replace every per-step rollout buffer with a fresh empty list."""
        self.states = []
        self.actions = []
        self.log_probs = []
        self.values = []
        self.rewards = []
        self.masks = []

    def _greedy_action(self, state):
        """Return the index of the highest-probability action for ``state``."""
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        with torch.no_grad():
            probs, _ = self.network(state)
        return probs.argmax().item()

    def select_action(self, state):
        """Sample an action from the current policy.

        Returns:
            Tuple ``(action, log_prob, value)`` of plain Python numbers: the
            sampled action index, its log-probability under the policy, and
            the critic's value estimate for ``state``. No gradients are
            tracked here; :meth:`train_step` recomputes them.
        """
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        with torch.no_grad():
            probs, value = self.network(state)
            dist = Categorical(probs)
            action = dist.sample()
            log_prob = dist.log_prob(action)

        return action.item(), log_prob.item(), value.item()

    def compute_gae(self):
        """Compute GAE-based return targets for the buffered episode.

        Walks the buffers backwards, accumulating the generalised advantage
        and adding the stored value estimate to turn it into a return.

        The last buffered step always uses a next-state value of 0, whatever
        its mask. NOTE(review): a truncated (non-terminal) final step is
        therefore not bootstrapped -- confirm this is acceptable.

        Returns:
            1-D float tensor on ``self.device`` with one return per step.
        """
        returns = []
        gae = 0
        last_step = len(self.rewards) - 1
        for step in range(last_step, -1, -1):
            next_value = 0 if step == last_step else self.values[step + 1]

            delta = self.rewards[step] + self.gamma * next_value * self.masks[step] - self.values[step]
            gae = delta + self.gamma * self.gae_lambda * self.masks[step] * gae
            # Append and reverse once at the end rather than insert(0, ...),
            # which is quadratic in the episode length.
            returns.append(gae + self.values[step])

        returns.reverse()
        return torch.FloatTensor(returns).to(self.device)

    def train_step(self):
        """Run one gradient update from the buffered episode, then clear it.

        Does nothing (apart from clearing the buffers) when no transitions
        have been recorded.
        """
        if not self.rewards:
            self._reset_rollout()
            return

        returns = self.compute_gae()

        values = torch.FloatTensor(self.values).to(self.device)
        log_probs = torch.FloatTensor(self.log_probs).to(self.device)
        advantages = returns - values
        # std() of a single element is NaN (Bessel's correction); normalising
        # a one-step episode would turn the loss, and then every weight, into
        # NaN. Only normalise when there is more than one advantage.
        if advantages.numel() > 1:
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

        # Stack into one ndarray first: building a tensor straight from a
        # list of arrays is a very slow path.
        states = torch.as_tensor(np.asarray(self.states), dtype=torch.float32).to(self.device)
        actions = torch.LongTensor(self.actions).to(self.device)

        # Forward pass
        new_probs, new_values = self.network(states)
        dist = Categorical(new_probs)
        new_log_probs = dist.log_prob(actions)
        entropy = dist.entropy().mean()

        # Actor loss: importance ratio against the log-probs recorded at
        # sampling time, plus an entropy bonus.
        ratio = torch.exp(new_log_probs - log_probs)
        surr1 = ratio * advantages
        actor_loss = -(surr1.mean() + self.entropy_coef * entropy)

        # Critic loss
        critic_loss = self.value_loss_coef * F.mse_loss(new_values.squeeze(-1), returns)

        # Total loss
        total_loss = actor_loss + critic_loss

        # Optimize
        self.actor_optimizer.zero_grad()
        self.critic_optimizer.zero_grad()
        total_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.network.parameters(), self.max_grad_norm)
        self.actor_optimizer.step()
        self.critic_optimizer.step()

        # Clear memory
        self._reset_rollout()

    def train(self):
        """Train for ``self.episodes`` episodes, one update per episode.

        Prints the latest reward and the mean of the last (up to) 100 rewards
        every 100 episodes.

        Returns:
            List with the total reward of every episode, in order.
        """
        rewards_history = []

        for episode in range(self.episodes):
            state, _ = self.env.reset()
            done = False
            total_reward = 0

            # Start every episode from empty buffers. If a previous train()
            # call was interrupted mid-episode, its partial data would
            # otherwise leave the buffers with mismatched lengths.
            self._reset_rollout()

            while not done:
                action, log_prob, value = self.select_action(state)
                next_state, reward, done, truncated, _ = self.env.step(action)

                self.states.append(state)
                self.actions.append(action)
                self.rewards.append(reward)
                self.log_probs.append(log_prob)
                self.values.append(value)
                self.masks.append(1 - done)

                state = next_state
                total_reward += reward

                if truncated:
                    break

            self.train_step()
            rewards_history.append(total_reward)

            if episode % 100 == 0:
                avg_reward = np.mean(rewards_history[-100:])
                print(f"Episode {episode}, Reward: {total_reward:.2f}, Avg: {avg_reward:.2f}")

        return rewards_history

    def test(self, num_episodes=10):
        """Evaluate the greedy policy on ``self.env``.

        Args:
            num_episodes: Number of evaluation episodes.

        Returns:
            List with the total reward of every evaluation episode.
        """
        total_rewards = []
        for episode in range(num_episodes):
            state, _ = self.env.reset()
            total_reward = 0
            done = False

            while not done:
                action = self._greedy_action(state)
                state, reward, done, truncated, _ = self.env.step(action)
                total_reward += reward
                if truncated:
                    break

            total_rewards.append(total_reward)
            # print(f"Test Episode {episode}, Total Reward: {total_reward}")

            if episode % 100 == 0:
                avg_reward = np.mean(total_rewards[-100:])
                print(f"Test Episode {episode}, Reward: {total_reward:.2f}, Avg: {avg_reward:.2f}")

        print(f"Average Test Reward: {np.mean(total_rewards):.2f}")
        return total_rewards

    def play(self, num_episodes=1, render=True):
        """
        Play the game with the trained agent and visualize the gameplay using a separate environment.

        Args:
            num_episodes (int): Number of episodes to play
            render (bool): Whether to render the environment. When True a new
                "FlappyBird-v0" environment in "human" render mode is created
                and closed afterwards; when False ``self.env`` is used and
                left open.

        Returns:
            Total reward of the last episode played, or 0 when
            ``num_episodes`` is 0.
        """
        # Create a separate environment for visualization
        if render:
            viz_env = gymnasium.make("FlappyBird-v0", render_mode="human", use_lidar=True)
        else:
            viz_env = self.env

        # Bound up front so the final return is valid when no episode runs.
        total_reward = 0

        try:
            for episode in range(num_episodes):
                state, _ = viz_env.reset()
                done = False
                total_reward = 0
                episode_steps = 0

                while not done:
                    if render:
                        viz_env.render()

                    # Pick the greedy action for the current state
                    action = self._greedy_action(state)

                    # Take action in environment
                    state, reward, done, truncated, _ = viz_env.step(action)
                    total_reward += reward
                    episode_steps += 1

                    if truncated:
                        break

                print(f"Episode {episode + 1} finished with reward: {total_reward:.2f} in {episode_steps} steps")

        finally:
            if render:
                viz_env.close()

        return total_reward
            
            
import gymnasium
import itertools
import pickle
import gc

# Hyperparameter sweep: train and evaluate one agent per combination of the
# list-valued entries in `hyper`, all on a single shared environment.
env = gymnasium.make("FlappyBird-v0", render_mode="rgb_array", use_lidar=True)

# Real
# List-valued entries are swept over; scalar entries are shared by every run.
hyper = {
    "actor_lr": [0.0005],
    "critic_lr": [0.001],
    "gamma": 0.99,
    "gae_lambda": [0.9],
    "entropy_coef": [0.05],
    "value_loss_coef": [1.0],
    "max_grad_norm": 0.5,
    "episodes": 30000,
}

# For test
# hyper = {
#    "actor_lr": [0.0005],
#    "critic_lr": [0.005],
#    "gamma": 0.99,
#    "gae_lambda": [0.9, 0.95],
#    "entropy_coef": [0.01, 0.05],
#    "value_loss_coef": [0.5, 1.0],
#    "max_grad_norm": 0.5,
#    "episodes": 200
# }

param_combinations = itertools.product(
    hyper["actor_lr"],
    hyper["critic_lr"],
    hyper["gae_lambda"],
    hyper["entropy_coef"],
    hyper["value_loss_coef"],
)

exp_res = {}   # exp_key -> per-episode training rewards
test_res = {}  # exp_key -> per-episode evaluation rewards

try:
    for actor_lr, critic_lr, gae_lambda, entropy_coef, value_loss_coef in param_combinations:
        current_hyperparams = {
            "actor_lr": actor_lr,
            "critic_lr": critic_lr,
            "gamma": hyper["gamma"],
            "gae_lambda": gae_lambda,
            "entropy_coef": entropy_coef,
            "value_loss_coef": value_loss_coef,
            "max_grad_norm": hyper["max_grad_norm"],
            "episodes": hyper["episodes"],
        }

        try:
            agent = OptimizedActorCritic(env, current_hyperparams)
            exp_key = f"a_lr={actor_lr}_c_lr={critic_lr}_gae={gae_lambda}_ent={entropy_coef}_val={value_loss_coef}"
            exp_res[exp_key] = agent.train()
            test_res[exp_key] = agent.test(num_episodes=3000)

            # with open("ac_exp_res.pkl", "wb") as f:
            #     pickle.dump(exp_res, f)
            # with open("ac_test_res.pkl", "wb") as f:
            #     pickle.dump(test_res, f)

            print(f"Completed: {exp_key}")

        finally:
            # Release per-run garbage before the next combination starts.
            gc.collect()
finally:
    # Close the shared environment once, after the whole sweep (or on
    # error/interrupt). Closing it inside the loop would leave every
    # combination after the first running on an already-closed environment.
    env.close()

# with open("ac_exp_res.pkl", "rb") as f:
#    exp_res = pickle.load(f)
# with open("ac_test_res.pkl", "rb") as f:
#    test_res = pickle.load(f)

print("Experiment complete")


Episode 0, Reward: -8.10, Avg: -8.10
Episode 100, Reward: 1.40, Avg: -2.01
Episode 200, Reward: 0.80, Avg: -0.10
Episode 300, Reward: 2.10, Avg: 0.05
Episode 400, Reward: -0.90, Avg: 0.16
Episode 500, Reward: 0.80, Avg: 0.35
Episode 600, Reward: 1.50, Avg: 0.35
Episode 700, Reward: 1.80, Avg: 0.59
Episode 800, Reward: -0.90, Avg: 0.19
Episode 900, Reward: 1.50, Avg: 0.65
Episode 1000, Reward: 0.80, Avg: 0.13
Episode 1100, Reward: 1.60, Avg: 0.56
Episode 1200, Reward: 0.80, Avg: 0.28
Episode 1300, Reward: 0.80, Avg: 0.67
Episode 1400, Reward: -0.90, Avg: 0.31
Episode 1500, Reward: 0.80, Avg: 0.07
Episode 1600, Reward: 2.10, Avg: 0.39
Episode 1700, Reward: 0.10, Avg: 0.35
Episode 1800, Reward: 0.80, Avg: 0.06
Episode 1900, Reward: -0.30, Avg: -0.19
Episode 2000, Reward: -0.40, Avg: -0.09
Episode 2100, Reward: -0.30, Avg: 0.80
Episode 2200, Reward: -1.50, Avg: 1.25
Episode 2300, Reward: 12.70, Avg: 3.44
Episode 2400, Reward: 10.50, Avg: 3.51
Episode 2500, Reward: 24.00, Avg: 4.52
Episode 

KeyboardInterrupt: 

In [8]:
# Play 10 greedy episodes with the agent left over from the training cell.
# render defaults to True, so play() creates its own "human"-mode environment;
# the value displayed for this cell is the reward of the last episode played.
agent.play(num_episodes=10)

Episode 1 finished with reward: 17.40 in 296 steps
Episode 2 finished with reward: 25.00 in 633 steps
Episode 3 finished with reward: 29.90 in 445 steps
Episode 4 finished with reward: 13.80 in 296 steps
Episode 5 finished with reward: 147.80 in 2965 steps
Episode 6 finished with reward: 128.00 in 2176 steps
Episode 7 finished with reward: 82.90 in 1347 steps
Episode 8 finished with reward: 15.80 in 331 steps
Episode 9 finished with reward: 9.10 in 219 steps
Episode 10 finished with reward: 28.80 in 818 steps


28.8000000000003