In [1]:
from utils import *

pygame 2.4.0 (SDL 2.26.4, Python 3.11.3)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [11]:
# Example implementation
random.seed(0)
env = Environment()
current_state = env.reset()
print(current_state)

for _ in range(100):
    instruction = (_ % 20 == 0)
    next_state, reward = env.game_step(instruction)
    if reward != 0: print(f"state: {next_state}, reward: {reward}, old_score: {env.previous_score}, current_score: {env.current_score}")

{'bird_y': 300, 'pipe_low': [435.0, 473], 'pipe_high': [435.0, 173], 'is_alive': True}
state: {'bird_y': 206, 'pipe_low': [363.0, 451], 'pipe_high': [363.0, 151], 'is_alive': True}, reward: 10, old_score: 1, current_score: 1


# 1 - Hyperparameters

In [2]:
MEMORY_SIZE = 100_000     # size of memory buffer
GAMMA = 0.995             # discount factor
ALPHA = 10**(-3)          # learning rate  
NUM_STEPS_FOR_UPDATE = 4  # perform a learning update every C time steps

## 2.1 - Action space
Do Nothing = 0 <br>
Jump = 1

## 2.2 - Observation space

1. Bird's y-pos
2. Next pipe's top and bottom

## 2.3 - Reward

Crossing a pipe gains 15 points, crashing looses 1000.

# 3 - Building the brain of our bird

In [3]:
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.losses import MSE
from tensorflow.keras.optimizers import Adam

In [4]:
state_size = 5  # Observation space's size
num_actions = 2

q_network = Sequential([
    Input(shape=state_size),
    Dense(units=8, activation="relu"),
    Dense(units=8, activation="relu"),
    Dense(units=num_actions, activation="linear")    
])

target_q_network = Sequential([
    Input(shape=state_size),
    Dense(units=8, activation="relu"),
    Dense(units=8, activation="relu"),
    Dense(units=num_actions, activation="linear")
])

optimizer = Adam(learning_rate=ALPHA)

In [5]:
# Store experiences as named tuples
from collections import namedtuple

experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "is_alive"])

In [6]:
def compute_loss(experiences, gamma, q_network, target_q_network):
    """ 
    Args:
      experiences: (tuple) tuple of ["state", "action", "reward", "next_state", "done"] namedtuples
      gamma: (float) The discount factor.
      q_network: (tf.keras.Sequential) Keras model for predicting the q_values
      target_q_network: (tf.keras.Sequential) Keras model for predicting the targets
          
    Returns:
      loss: (TensorFlow Tensor(shape=(0,), dtype=int32)) the Mean-Squared Error between
            the y targets and the Q(s,a) values.
    """

    # Unpack the mini-batch of experience tuples
    states, actions, rewards, next_states, is_alive = experiences
    
    # Compute max Q^(s,a)
    max_qsa = tf.reduce_max(target_q_network(next_states), axis=-1)
    
    # Set y = R if episode terminates, otherwise set y = R + γ max Q^(s,a).
    y_targets = rewards + is_alive * gamma * max_qsa
    
    # Get the q_values and reshape to match y_targets
    q_values = q_network(states)
    q_values = tf.gather_nd(q_values, tf.stack([tf.range(q_values.shape[0]),
                                                tf.cast(actions, tf.int32)], axis=1))
        
    # Compute the loss
    loss = MSE(y_targets, q_values)
    
    return loss

In [7]:
@tf.function
def agent_learn(experiences, gamma):
    # Calculate the loss
    with tf.GradientTape() as tape:
        loss = compute_loss(experiences, gamma, q_network, target_q_network)

    # Get the gradients of the loss with respect to the weights.
    gradients = tape.gradient(loss, q_network.trainable_variables)
    
    # Update the weights of the q_network.
    optimizer.apply_gradients(zip(gradients, q_network.trainable_variables))

    # update the weights of target q_network
    update_target_network(q_network, target_q_network)

# 4 - Training model progressively

## 4.1 - Basic training
`GAP_WIDTH = 300` where HEIGHT = 600px <br>
`pipe.y_center = HEIGHT // 4`

In [9]:
import numpy as np
import time
from collections import deque

start = time.time()

num_episodes = 2_000
max_num_timesteps = 100_000

total_point_history = []

num_p_av = 100    # number of total points to use for averaging
epsilon = 1.0     # initial ε value for ε-greedy policy

# Create a memory buffer D with capacity N
memory_buffer = deque(maxlen=MEMORY_SIZE)

# Set the target network weights equal to the Q-Network weights
target_q_network.set_weights(q_network.get_weights())

for i in range(num_episodes):
    # Reset the environment to the initial state and get the initial state
    state = env.reset()
    state_array = np.array([
        state["bird_y"], 
        state["pipe_low"][0], 
        state["pipe_low"][1], 
        state["pipe_high"][0], 
        state["pipe_high"][1]], dtype=np.float32)
    is_alive = int(state["is_alive"])

    total_points = 0
    
    for t in range(max_num_timesteps):
        # From the current state S choose an action A using an ε-greedy policy
        state_qn = np.expand_dims(state_array, axis=0)  # state needs to be the right shape for the q_network
        q_values = q_network(state_qn)
        action = get_action(q_values, epsilon)  # from utils
        
        # Take action A and receive reward R and the next state S'
        next_state, reward = env.game_step(action)
        next_state_array = np.array([
            next_state["bird_y"], 
            next_state["pipe_low"][0], 
            next_state["pipe_low"][1], 
            next_state["pipe_high"][0], 
            next_state["pipe_high"][1]], dtype=np.float32)
        is_alive = int(state["is_alive"])
        
        # Store experience tuple (S,A,R,S') in the memory buffer.
        # We store the done variable as well for convenience.
        memory_buffer.append(experience(state_array, action, reward, next_state_array, is_alive))
        
        # Only update the network every NUM_STEPS_FOR_UPDATE time steps.
        update = check_update_conditions(t, NUM_STEPS_FOR_UPDATE, memory_buffer)  # from utils
        
        if update:
            # Sample random mini-batch of experience tuples (S,A,R,S') from D
            experiences = get_experiences(memory_buffer)  # from utils
            
            # Set the y targets, perform a gradient descent step,
            # and update the network weights.
            agent_learn(experiences, GAMMA)
        
        state = next_state.copy()
        state_array = np.array([
            state["bird_y"], 
            state["pipe_low"][0], 
            state["pipe_low"][1], 
            state["pipe_high"][0], 
            state["pipe_high"][1]], dtype=np.float32)
        is_alive = int(state["is_alive"])

        total_points += reward
        
        if not is_alive:
            break
            
    total_point_history.append(total_points)
    av_latest_points = np.mean(total_point_history[-num_p_av:])
    
    # Update the ε value
    epsilon = get_new_eps(epsilon)

    print(f"\rEpisode {i+1} | Total point average of the last {num_p_av} episodes: {av_latest_points:.2f}", end="")

    if (i+1) % num_p_av == 0:
        print(f"\rEpisode {i+1} | Total point average of the last {num_p_av} episodes: {av_latest_points:.2f}")

    # We will consider that the environment is solved if we get an
    # average of 100 points in the last 100 episodes.
    if av_latest_points >= 100:
        print(f"\n\nEnvironment solved in {i+1} episodes!")
        q_network.save('flappy_bird_model.h5')
        break
        
tot_time = time.time() - start

print(f"\nTotal Runtime: {tot_time:.2f} s ({(tot_time/60):.2f} min)")

Episode 100 | Total point average of the last 100 episodes: -100.00
Episode 200 | Total point average of the last 100 episodes: -100.00
Episode 300 | Total point average of the last 100 episodes: -98.900
Episode 400 | Total point average of the last 100 episodes: -97.90
Episode 500 | Total point average of the last 100 episodes: -98.40
Episode 600 | Total point average of the last 100 episodes: -98.80
Episode 700 | Total point average of the last 100 episodes: -99.60
Episode 800 | Total point average of the last 100 episodes: -99.80
Episode 900 | Total point average of the last 100 episodes: -99.90
Episode 1000 | Total point average of the last 100 episodes: -100.00
Episode 1100 | Total point average of the last 100 episodes: -100.00
Episode 1200 | Total point average of the last 100 episodes: -100.00
Episode 1300 | Total point average of the last 100 episodes: -98.200
Episode 1400 | Total point average of the last 100 episodes: -97.50
Episode 1500 | Total point average of the last 100

In [10]:
# Reset the environment to the initial state and get the initial state
state = env.reset()
state_array = np.array([
    state["bird_y"], 
    state["pipe_low"][0], 
    state["pipe_low"][1], 
    state["pipe_high"][0], 
    state["pipe_high"][1]], dtype=np.float32)
is_alive = int(state["is_alive"])

total_points = 0

actions = []

for t in range(max_num_timesteps):
    # From the current state S choose an action A using an ε-greedy policy
    state_qn = np.expand_dims(state_array, axis=0)  # state needs to be the right shape for the q_network
    q_values = q_network(state_qn)
    action = get_action(q_values, epsilon)  # from utils

    actions.append(action)
    
    # Take action A and receive reward R and the next state S'
    next_state, reward = env.game_step(action)
    next_state_array = np.array([
        next_state["bird_y"], 
        next_state["pipe_low"][0], 
        next_state["pipe_low"][1], 
        next_state["pipe_high"][0], 
        next_state["pipe_high"][1]], dtype=np.float32)
    is_alive = int(state["is_alive"])
    
    state = next_state.copy()
    state_array = np.array([
        state["bird_y"], 
        state["pipe_low"][0], 
        state["pipe_low"][1], 
        state["pipe_high"][0], 
        state["pipe_high"][1]], dtype=np.float32)
    is_alive = int(state["is_alive"])

    if not is_alive:
        break

    if reward > 0:
        total_points += 1
    
print("points:", total_points)

points: 1470


In [14]:
q_network.save('logs/1.fb_large_width_no_rng.h5')



## 4.2 - Randomise height of pipes in a small range

In [1]:
from utils import *
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.losses import MSE
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import load_model
from collections import namedtuple

MEMORY_SIZE = 100_000     # size of memory buffer
GAMMA = 0.995             # discount factor
ALPHA = 10**(-3)          # learning rate  
NUM_STEPS_FOR_UPDATE = 4  # perform a learning update every C time steps

env = Environment()

state_size = 5  # Observation space's size
num_actions = 2

q_network = Sequential([
    Input(shape=state_size),
    Dense(units=8, activation="relu"),
    Dense(units=8, activation="relu"),
    Dense(units=num_actions, activation="linear")    
])

target_q_network = Sequential([
    Input(shape=state_size),
    Dense(units=8, activation="relu"),
    Dense(units=8, activation="relu"),
    Dense(units=num_actions, activation="linear")
])

optimizer = Adam(learning_rate=ALPHA)

q_network = load_model('logs/1.fb_large_width_no_rng.h5')

experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "is_alive"])

def compute_loss(experiences, gamma, q_network, target_q_network):
    """ 
    Args:
      experiences: (tuple) tuple of ["state", "action", "reward", "next_state", "done"] namedtuples
      gamma: (float) The discount factor.
      q_network: (tf.keras.Sequential) Keras model for predicting the q_values
      target_q_network: (tf.keras.Sequential) Keras model for predicting the targets
          
    Returns:
      loss: (TensorFlow Tensor(shape=(0,), dtype=int32)) the Mean-Squared Error between
            the y targets and the Q(s,a) values.
    """

    # Unpack the mini-batch of experience tuples
    states, actions, rewards, next_states, is_alive = experiences
    
    # Compute max Q^(s,a)
    max_qsa = tf.reduce_max(target_q_network(next_states), axis=-1)
    
    # Set y = R if episode terminates, otherwise set y = R + γ max Q^(s,a).
    y_targets = rewards + is_alive * gamma * max_qsa
    
    # Get the q_values and reshape to match y_targets
    q_values = q_network(states)
    q_values = tf.gather_nd(q_values, tf.stack([tf.range(q_values.shape[0]),
                                                tf.cast(actions, tf.int32)], axis=1))
        
    # Compute the loss
    loss = MSE(y_targets, q_values)
    
    return loss

@tf.function
def agent_learn(experiences, gamma):
    # Calculate the loss
    with tf.GradientTape() as tape:
        loss = compute_loss(experiences, gamma, q_network, target_q_network)

    # Get the gradients of the loss with respect to the weights.
    gradients = tape.gradient(loss, q_network.trainable_variables)
    
    # Update the weights of the q_network.
    optimizer.apply_gradients(zip(gradients, q_network.trainable_variables))

    # update the weights of target q_network
    update_target_network(q_network, target_q_network)
    

pygame 2.4.0 (SDL 2.26.4, Python 3.11.3)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [2]:
import numpy as np
import time
from collections import deque

start = time.time()

num_episodes = 2_000
max_num_timesteps = 100_000

total_point_history = []

num_p_av = 100    # number of total points to use for averaging
epsilon = 1.0     # initial ε value for ε-greedy policy

# Create a memory buffer D with capacity N
memory_buffer = deque(maxlen=MEMORY_SIZE)

# Set the target network weights equal to the Q-Network weights
target_q_network.set_weights(q_network.get_weights())

for i in range(num_episodes):
    # Reset the environment to the initial state and get the initial state
    state = env.reset()
    state_array = np.array([
        state["bird_y"], 
        state["pipe_low"][0], 
        state["pipe_low"][1], 
        state["pipe_high"][0], 
        state["pipe_high"][1]], dtype=np.float32)
    is_alive = int(state["is_alive"])

    total_points = 0
    
    for t in range(max_num_timesteps):
        # From the current state S choose an action A using an ε-greedy policy
        state_qn = np.expand_dims(state_array, axis=0)  # state needs to be the right shape for the q_network
        q_values = q_network(state_qn)
        action = get_action(q_values, epsilon)  # from utils
        
        # Take action A and receive reward R and the next state S'
        next_state, reward = env.game_step(action)
        next_state_array = np.array([
            next_state["bird_y"], 
            next_state["pipe_low"][0], 
            next_state["pipe_low"][1], 
            next_state["pipe_high"][0], 
            next_state["pipe_high"][1]], dtype=np.float32)
        is_alive = int(state["is_alive"])
        
        # Store experience tuple (S,A,R,S') in the memory buffer.
        # We store the done variable as well for convenience.
        memory_buffer.append(experience(state_array, action, reward, next_state_array, is_alive))
        
        # Only update the network every NUM_STEPS_FOR_UPDATE time steps.
        update = check_update_conditions(t, NUM_STEPS_FOR_UPDATE, memory_buffer)  # from utils
        
        if update:
            # Sample random mini-batch of experience tuples (S,A,R,S') from D
            experiences = get_experiences(memory_buffer)  # from utils
            
            # Set the y targets, perform a gradient descent step,
            # and update the network weights.
            agent_learn(experiences, GAMMA)
        
        state = next_state.copy()
        state_array = np.array([
            state["bird_y"], 
            state["pipe_low"][0], 
            state["pipe_low"][1], 
            state["pipe_high"][0], 
            state["pipe_high"][1]], dtype=np.float32)
        is_alive = int(state["is_alive"])

        total_points += reward
        
        if not is_alive:
            break
            
    total_point_history.append(total_points)
    av_latest_points = np.mean(total_point_history[-num_p_av:])
    
    # Update the ε value
    epsilon = get_new_eps(epsilon)

    print(f"\rEpisode {i+1} | Total point average of the last {num_p_av} episodes: {av_latest_points:.2f}", end="")

    if (i+1) % num_p_av == 0:
        print(f"\rEpisode {i+1} | Total point average of the last {num_p_av} episodes: {av_latest_points:.2f}")

    # We will consider that the environment is solved if we get an
    # average of 100 points in the last 100 episodes.
    if av_latest_points >= 100:
        print(f"\n\nEnvironment solved in {i+1} episodes!")
        q_network.save('flappy_bird_model.h5')
        break
        
tot_time = time.time() - start

print(f"\nTotal Runtime: {tot_time:.2f} s ({(tot_time/60):.2f} min)")

Episode 100 | Total point average of the last 100 episodes: -1000.00
Episode 200 | Total point average of the last 100 episodes: -1000.00
Episode 300 | Total point average of the last 100 episodes: -999.100
Episode 400 | Total point average of the last 100 episodes: -998.40
Episode 500 | Total point average of the last 100 episodes: -994.30
Episode 600 | Total point average of the last 100 episodes: -989.30
Episode 700 | Total point average of the last 100 episodes: -999.80
Episode 800 | Total point average of the last 100 episodes: -999.900
Episode 900 | Total point average of the last 100 episodes: -985.70
Episode 1000 | Total point average of the last 100 episodes: -999.90
Episode 1100 | Total point average of the last 100 episodes: -1000.00
Episode 1200 | Total point average of the last 100 episodes: -1000.00
Episode 1300 | Total point average of the last 100 episodes: -1000.00
Episode 1400 | Total point average of the last 100 episodes: -1000.00
Episode 1500 | Total point average 

In [13]:
# Reset the environment to the initial state and get the initial state
random.seed(42)
state = env.reset()
state_array = np.array([
    state["bird_y"], 
    state["pipe_low"][0], 
    state["pipe_low"][1], 
    state["pipe_high"][0], 
    state["pipe_high"][1]], dtype=np.float32)
is_alive = int(state["is_alive"])

total_points = 0

actions = []

for t in range(max_num_timesteps):
    # From the current state S choose an action A using an ε-greedy policy
    state_qn = np.expand_dims(state_array, axis=0)  # state needs to be the right shape for the q_network
    q_values = q_network(state_qn)
    action = get_action(q_values, epsilon)  # from utils

    actions.append(action)
    
    # Take action A and receive reward R and the next state S'
    next_state, reward = env.game_step(action)
    next_state_array = np.array([
        next_state["bird_y"], 
        next_state["pipe_low"][0], 
        next_state["pipe_low"][1], 
        next_state["pipe_high"][0], 
        next_state["pipe_high"][1]], dtype=np.float32)
    is_alive = int(state["is_alive"])
    
    state = next_state.copy()
    state_array = np.array([
        state["bird_y"], 
        state["pipe_low"][0], 
        state["pipe_low"][1], 
        state["pipe_high"][0], 
        state["pipe_high"][1]], dtype=np.float32)
    is_alive = int(state["is_alive"])

    if not is_alive:
        break

    if reward > 0:
        total_points += 1
    
print("points:", total_points)

points: 165


In [5]:
q_network.save('logs/1.fb_large_width_smol_rng.h5')



In [14]:
import pandas as pd 

df = pd.DataFrame(actions)

df.to_csv("archive/instructions.csv", index=False)

## 4.3 - Reduce pipe gap
`GAP_SIZE = 200` which initially was `300`

In [2]:
from utils import *
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.losses import MSE
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import load_model
from collections import namedtuple

MEMORY_SIZE = 100_000     # size of memory buffer
GAMMA = 0.995             # discount factor
ALPHA = 10**(-3)          # learning rate  
NUM_STEPS_FOR_UPDATE = 4  # perform a learning update every C time steps

env = Environment()

state_size = 5  # Observation space's size
num_actions = 2

q_network = Sequential([
    Input(shape=state_size),
    Dense(units=8, activation="relu"),
    Dense(units=8, activation="relu"),
    Dense(units=num_actions, activation="linear")    
])

target_q_network = Sequential([
    Input(shape=state_size),
    Dense(units=8, activation="relu"),
    Dense(units=8, activation="relu"),
    Dense(units=num_actions, activation="linear")
])

optimizer = Adam(learning_rate=ALPHA)

# q_network = load_model('logs/2.fb_large_width_smol_rng.h5')

experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "is_alive"])

def compute_loss(experiences, gamma, q_network, target_q_network):
    """ 
    Args:
      experiences: (tuple) tuple of ["state", "action", "reward", "next_state", "done"] namedtuples
      gamma: (float) The discount factor.
      q_network: (tf.keras.Sequential) Keras model for predicting the q_values
      target_q_network: (tf.keras.Sequential) Keras model for predicting the targets
          
    Returns:
      loss: (TensorFlow Tensor(shape=(0,), dtype=int32)) the Mean-Squared Error between
            the y targets and the Q(s,a) values.
    """

    # Unpack the mini-batch of experience tuples
    states, actions, rewards, next_states, is_alive = experiences
    
    # Compute max Q^(s,a)
    max_qsa = tf.reduce_max(target_q_network(next_states), axis=-1)
    
    # Set y = R if episode terminates, otherwise set y = R + γ max Q^(s,a).
    y_targets = rewards + is_alive * gamma * max_qsa
    
    # Get the q_values and reshape to match y_targets
    q_values = q_network(states)
    q_values = tf.gather_nd(q_values, tf.stack([tf.range(q_values.shape[0]),
                                                tf.cast(actions, tf.int32)], axis=1))
        
    # Compute the loss
    loss = MSE(y_targets, q_values)
    
    return loss

@tf.function
def agent_learn(experiences, gamma):
    # Calculate the loss
    with tf.GradientTape() as tape:
        loss = compute_loss(experiences, gamma, q_network, target_q_network)

    # Get the gradients of the loss with respect to the weights.
    gradients = tape.gradient(loss, q_network.trainable_variables)
    
    # Update the weights of the q_network.
    optimizer.apply_gradients(zip(gradients, q_network.trainable_variables))

    # update the weights of target q_network
    update_target_network(q_network, target_q_network)
    

import numpy as np
import time
from collections import deque

start = time.time()

num_episodes = 2_000
max_num_timesteps = 100_000

total_point_history = []

num_p_av = 100    # number of total points to use for averaging
epsilon = 1.0     # initial ε value for ε-greedy policy

# Create a memory buffer D with capacity N
memory_buffer = deque(maxlen=MEMORY_SIZE)

# Set the target network weights equal to the Q-Network weights
target_q_network.set_weights(q_network.get_weights())

for i in range(num_episodes):
    # Reset the environment to the initial state and get the initial state
    state = env.reset()
    state_array = np.array([
        state["bird_y"], 
        state["pipe_low"][0], 
        state["pipe_low"][1], 
        state["pipe_high"][0], 
        state["pipe_high"][1]], dtype=np.float32)
    is_alive = int(state["is_alive"])

    total_points = 0
    
    for t in range(max_num_timesteps):
        # From the current state S choose an action A using an ε-greedy policy
        state_qn = np.expand_dims(state_array, axis=0)  # state needs to be the right shape for the q_network
        q_values = q_network(state_qn)
        action = get_action(q_values, epsilon)  # from utils
        
        # Take action A and receive reward R and the next state S'
        next_state, reward = env.game_step(action)
        next_state_array = np.array([
            next_state["bird_y"], 
            next_state["pipe_low"][0], 
            next_state["pipe_low"][1], 
            next_state["pipe_high"][0], 
            next_state["pipe_high"][1]], dtype=np.float32)
        is_alive = int(state["is_alive"])
        
        # Store experience tuple (S,A,R,S') in the memory buffer.
        # We store the done variable as well for convenience.
        memory_buffer.append(experience(state_array, action, reward, next_state_array, is_alive))
        
        # Only update the network every NUM_STEPS_FOR_UPDATE time steps.
        update = check_update_conditions(t, NUM_STEPS_FOR_UPDATE, memory_buffer)  # from utils
        
        if update:
            # Sample random mini-batch of experience tuples (S,A,R,S') from D
            experiences = get_experiences(memory_buffer)  # from utils
            
            # Set the y targets, perform a gradient descent step,
            # and update the network weights.
            agent_learn(experiences, GAMMA)
        
        state = next_state.copy()
        state_array = np.array([
            state["bird_y"], 
            state["pipe_low"][0], 
            state["pipe_low"][1], 
            state["pipe_high"][0], 
            state["pipe_high"][1]], dtype=np.float32)
        is_alive = int(state["is_alive"])

        total_points += reward
        
        if not is_alive:
            break
            
    total_point_history.append(total_points)
    av_latest_points = np.mean(total_point_history[-num_p_av:])
    
    # Update the ε value
    epsilon = get_new_eps(epsilon)

    print(f"\rEpisode {i+1} | Total point average of the last {num_p_av} episodes: {av_latest_points:.2f}", end="")

    if (i+1) % num_p_av == 0:
        print(f"\rEpisode {i+1} | Total point average of the last {num_p_av} episodes: {av_latest_points:.2f}")

    # We will consider that the environment is solved if we get an
    # average of 100 points in the last 100 episodes.
    if av_latest_points >= 100:
        print(f"\n\nEnvironment solved in {i+1} episodes!")
        q_network.save('flappy_bird_model.h5')
        break
        
tot_time = time.time() - start

print(f"\nTotal Runtime: {tot_time:.2f} s ({(tot_time/60):.2f} min)")

Episode 100 | Total point average of the last 100 episodes: -500.00
Episode 200 | Total point average of the last 100 episodes: -500.00
Episode 300 | Total point average of the last 100 episodes: -500.00
Episode 400 | Total point average of the last 100 episodes: -500.00
Episode 500 | Total point average of the last 100 episodes: -500.00
Episode 600 | Total point average of the last 100 episodes: -500.00
Episode 700 | Total point average of the last 100 episodes: -499.50
Episode 800 | Total point average of the last 100 episodes: -492.10
Episode 900 | Total point average of the last 100 episodes: -497.00
Episode 1000 | Total point average of the last 100 episodes: -495.90
Episode 1100 | Total point average of the last 100 episodes: -495.30
Episode 1200 | Total point average of the last 100 episodes: -497.60
Episode 1300 | Total point average of the last 100 episodes: -490.90
Episode 1400 | Total point average of the last 100 episodes: -490.80
Episode 1500 | Total point average of the l