Install the enviroment

In [34]:
# !pip install flappy-bird-gymnasium
# !pip install tensorflow 


Play the game (use space)

In [35]:
#!flappy_bird_gymnasium

Testing the enviroment, checking the returned variables

In [36]:
# import flappy_bird_gymnasium
# import gymnasium
# env = gymnasium.make("FlappyBird-v0", render_mode="human", use_lidar=False)

# obs, _ = env.reset()
# while True:
#     # Next action:
#     # (feed the observation to your agent here)
    
#     action = env.action_space.sample()

#     # Processing:
#     #obs is the next state after performing the action
    
#     obs, reward, terminated, _, info = env.step(action)
    
#     # Checking if the player is still alive
#     if terminated:
#         break

# env.close()
# # the numbers in obs are actually normalized between -1 and 1
# print("Final Observation: ", obs)

Install pytorch for the neural network, i used command for gpu

In [37]:
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124

Looking in indexes: https://download.pytorch.org/whl/cu124


initializing hyperparameters

In [38]:


batchSize =  32
epsilon= 1
epsilonDecay= 0.9992
epsilonMIn= 0.05
copyRate = 10
alpha = 0.99
learningRate = 0.0002

    


In [39]:
!pip install pyyaml



Creating the neural network

In [40]:
import torch
from torch import nn
import torch.nn.functional as F

class DQN(nn.Module):
    def __init__(self):
        super(DQN, self).__init__()
        self.fc1 = nn.Linear(12, 200)
        self.fc2 = nn.Linear(200, 200)
        self.fc3 = nn.Linear(200, 100)
        self.fc4 = nn.Linear(100, 2)
        self.activations = []

    def forward(self, x):
        self.activations = []  # Clear previous activations
        x = F.relu(self.fc1(x))
        self.activations.append(x)
        x = F.relu(self.fc2(x))
        self.activations.append(x)
        x = self.fc3(x)
        self.activations.append(x)
        x = self.fc4(x)
        return x


In [41]:
print(torch.cuda.is_available())

False


Replay buffer class

In [42]:

#A deque is fifo, using it ensures we will not run out of memory
#As we will be removing the oldest memory when we reach the max size
#In general the on element is a tuple of(prev state, action, reward, next state, dead or not)
from collections import deque
import random
class ReplayMemory():
    def __init__(self, maxlen, seed=None):
        self.memory = deque([], maxlen=maxlen)

        
        if seed is not None:
            random.seed(seed)

    def append(self, transition):
        self.memory.append(transition)
    #randomly take a memeory sample with specifed size
    def sample(self, sample_size):
        return random.sample(self.memory, sample_size)

    def __len__(self):
        return len(self.memory)

In [44]:
import torch
import torch.nn as nn
import yaml
import flappy_bird_gymnasium
import gymnasium as gym
import numpy as np
import itertools
import random
import os
import matplotlib
import matplotlib.pyplot as plt
from datetime import datetime, timedelta


DATE_FORMAT = "%m-%d %H:%M:%S"

RUNS_DIR = "runs"
os.makedirs(RUNS_DIR, exist_ok=True)

matplotlib.use('Agg')

device = 'cuda' if torch.cuda.is_available() else 'cpu'

#window = PygameWindow(window_title="MLP Visualization")




class Agent:
    def __init__(self, hyperparameter_set):
        with open('hyperparameters.yml', 'r') as file:
            all_hyperparameter_sets = yaml.safe_load(file)
            hyperparameters = all_hyperparameter_sets[hyperparameter_set]
        self.hyperparameter_set = hyperparameter_set

        # Hyperparameters, can adjust from yml file
        self. envId = hyperparameters['envId']
        self.learningRatea = hyperparameters['learningRatea'] 
        self.discountFactor = hyperparameters['discountFactorg']
        self.networkSyncRate = hyperparameters['networkSyncRate'] 
        
        self.epsilon_init = hyperparameters['epsilonInit']  
        self.epsilon_decay = hyperparameters['epsilonDecay']  
        self.epsilon_min = hyperparameters['epsilonMin'] 
        self.stop_on_reward = hyperparameters['stopOnReward']  
        self.env_make_params = hyperparameters.get('env_make_params', {})  
        self.lossF = nn.MSELoss()
        self.optimizer = None
        self.LOG_FILE = os.path.join(RUNS_DIR, f'{self.hyperparameter_set}.log')
        self.MODEL_FILE = os.path.join(RUNS_DIR, f'{self.hyperparameter_set}.pt')
        

        self.replayMemorySize = hyperparameters['replayMemorySize']  
        self.miniBatchSize = hyperparameters['miniBatchSize']

    def run(self, training=True, render=False):
        if training:
            startTime = datetime.now()
            

            log_message = f"{startTime.strftime(DATE_FORMAT)}:Started training"
            print(log_message)
            with open(self.LOG_FILE, 'w') as file:
                file.write(log_message + '\n')

        env = gym.make(self. envId, render_mode='human' if render else None, **self.env_make_params)
        statesCount = env.observation_space.shape[0]
        rewards = []
        actionsCount = env.action_space.n

        dqn = DQN().to(device)
        if training:
            replayBuffer = ReplayMemory(self.replayMemorySize)
            steps = 0
            epsilons = []
            epsilon = self.epsilon_init
            maxReward = -999999
            # creating the target network, initializing to be the same as training one
            targetDqn = DQN().to(device)
            self.optimizer = torch.optim.Adam(dqn.parameters(), lr=self.learningRatea)
            targetDqn.load_state_dict(dqn.state_dict())
        else:
            # Load learned policy
            dqn.load_state_dict(torch.load(self.MODEL_FILE, map_location=torch.device('cpu')))

            # switch model to evaluation mode
            dqn.eval()

        # train indefinitely until results are satisfying enough
        for episode in itertools.count():
            terminated = False
            state, _ = env.reset()
            state = torch.tensor(state, dtype=torch.float32).to(device)
            epReward = 0.0
            biasedEpisodes = 0
            while not terminated and epReward < self.stop_on_reward:
                # if the random number, between 0,1, is less than epsilon, take a random action
                # now if we will be greedy(exploit), then we will take the action with the highest q value from the network
                if training:
                    if episode < biasedEpisodes:
                        # Apply biased action selection for the first 'biased_episodes' episodes
                        if random.random() < epsilon:
                            # Bias the random action selection towards action 0
                            if random.random() < 0.9:  # 90% probability for action 0
                                action = torch.tensor(0).to(device)
                            else:
                                action = torch.tensor(1).to(device)
                        else:
                            with torch.no_grad():
                                action = dqn(state.unsqueeze(dim=0)).squeeze().argmax()
                    else:
                        # Regular epsilon-greedy action selection after 'biased_episodes' episodes
                        if random.random() < epsilon:
                            action = env.action_space.sample()
                            with open('output.txt', 'w') as file:
                                file.write(str(action.item()))
                            action = torch.tensor(action).to(device)
                        else:
                            with torch.no_grad():
                                action = dqn(state.unsqueeze(dim=0)).squeeze().argmax()
                                with open('output.txt', 'w') as file:
                                    file.write(str(action.item()))
                else:
                    with torch.no_grad():
                        action = dqn(state.unsqueeze(dim=0)).squeeze().argmax()
                        
                
                        

                # Processing:
                # obs is the next state after performing the action
                # giving the network to the training device

                newState, reward, terminated, _, info = env.step(action.item())
                
                # Convert newState to a string format suitable for saving to a file
                newState_str = ' '.join(map(str, newState))

                # Save to input.txt, overwriting previous content
                with open('input.txt', 'w') as file:
                    file.write(newState_str)
                newState = torch.tensor(newState, dtype=torch.float32).to(device)
                reward = torch.tensor(reward, dtype=torch.float32).to(device)
                epReward += reward

                # after taking an action save the tuple in the replay buffer
                if training:
                    replayBuffer.append((state, action, newState, reward, terminated))
                    steps += 1
                # update the state, so that when saving the next action tuple the vars dont get mixed up
                state = newState

            rewards.append(epReward)
            if training:
                if epReward > maxReward:
                    log_message = f"{datetime.now().strftime(DATE_FORMAT)}: New best reward {epReward:0.1f} ({(epReward - maxReward) / maxReward * 100:+.1f}%) at episode {episode}, saving model..."
                    print(log_message)
                    with open(self.LOG_FILE, 'a') as file:
                        file.write(log_message + '\n')

                    torch.save(dqn.state_dict(), self.MODEL_FILE)
                    maxReward = epReward

                
                current_time = datetime.now()
                
                # applying a decay to the current epsilon to encourage exploitation
                # cannot go under epsilonMin so there will always be some sort of exploration

                if len(replayBuffer) > self.miniBatchSize:
                    newBatch = replayBuffer.sample(self.miniBatchSize)
                    self.train(dqn, targetDqn, newBatch)
                    epsilons.append(epsilon)
                    epsilon = max(epsilon * self.epsilon_decay, self.epsilon_min)
                    # i could have used modulus but i am afraid steps can get too big and overflow
                    # anyway after a certain number of steps, copy the training network to the target network
                    # i believe it is best keeping a low rate as we do not want chasing a bad target for long
                    # another thing is, how about increasing the rate as we go
                    if steps > self.networkSyncRate:
                        targetDqn.load_state_dict(dqn.state_dict())
                        steps = 0

    
        

    def train(self, dqn, targetDqn, batch):
        # Transpose the list of experiences and separate each element
        states, actions, new_states, rewards, terminations = zip(*batch)

        # Stack tensors to create batch tensors
        states = torch.stack(states)
        actions = torch.stack(actions)
        newStates = torch.stack(new_states)
        rewards = torch.stack(rewards)
        terminations = torch.tensor(terminations).float().to(device)

        with torch.no_grad():
            # Calculate target Q values (expected returns)
            targetQ = rewards + (1 - terminations) * self.discountFactor * targetDqn(newStates).max(dim=1)[0]
        currQ = dqn(states).gather(dim=1, index=actions.unsqueeze(dim=1)).squeeze()
        loss = self.lossF(currQ, targetQ)

    
        self.optimizer.zero_grad()
        loss.backward()  
        self.optimizer.step()  


hyperparameter_set = 'flappybird'   
dql = Agent(hyperparameter_set=hyperparameter_set)
dql.run(training=False, render=True)


08-10 12:01:12:Started training


  logger.warn(f"{pre} is not within the observation space.")
  logger.warn(f"{pre} is not within the observation space.")


08-10 12:01:15: New best reward -6.9 (-100.0%) at episode 0, saving model...


RuntimeError: mat1 and mat2 shapes cannot be multiplied (124x180 and 12x200)

: 