# Training a Q-Learning Agent in Gymnasium's Blackjack Environment

## Imports and Installs

In [1]:
!pip install gym gym[atari] gym[accept-rom-license] agilerl accelerate>=0.21.0

[0m

In [2]:
import os
import imageio
import gymnasium as gym
import numpy as np

### These imports will be used to implement the NN Agent ##
#import torch
#from agilerl.algorithms.td3 import TD3
#from agilerl.components.replay_buffer import ReplayBuffer
#from agilerl.hpo.mutation import Mutations
#from agilerl.hpo.tournament import TournamentSelection
#from agilerl.training.train_off_policy import train_off_policy
#from agilerl.utils.utils import create_population, make_vect_envs

from tqdm import tqdm
from __future__ import annotations
from collections import defaultdict

In [10]:
# Number of training episodes; also used as the size of the statistics buffer below.
n_episodes = 50_000

# Blackjack environment rendered to RGB arrays, wrapped so that episode
# statistics are recorded with a buffer of n_episodes entries.
env = gym.wrappers.RecordEpisodeStatistics(
    gym.make("Blackjack-v1", render_mode="rgb_array"),
    deque_size=n_episodes,
)

In [11]:
class BlackjackAgent:
    """Tabular Q-learning agent that follows an epsilon-greedy policy."""

    def __init__(
        self,
        env,
        learning_rate: float,
        initial_epsilon: float,
        epsilon_decay: float,
        final_epsilon: float,
        discount_factor: float = 0.95,
    ):
        """Set up an empty Q-table together with the learning hyperparameters.

        Args:
            env: Environment whose ``action_space.n`` gives the length of each
                per-state array of action values
            learning_rate: Step size applied to each temporal-difference update
            initial_epsilon: Exploration probability at the start of training
            epsilon_decay: Amount subtracted from epsilon on every decay step
            final_epsilon: Lower bound that epsilon never drops below
            discount_factor: Weight given to the estimated future value
        """

        def _fresh_action_values():
            # Evaluated lazily, each time a previously unseen state is looked up.
            return np.zeros(env.action_space.n)

        # Maps an observation to an array holding one value per action.
        self.q_values = defaultdict(_fresh_action_values)

        # History of temporal-difference errors, one entry per update() call.
        self.training_error = []

        self.lr = learning_rate
        self.discount_factor = discount_factor

        self.epsilon = initial_epsilon
        self.epsilon_decay = epsilon_decay
        self.final_epsilon = final_epsilon

    def get_action(self, env, obs: tuple[int, int, bool]) -> int:
        """Choose an action epsilon-greedily.

        A random action sampled from ``env.action_space`` is returned with
        probability epsilon; otherwise the action with the highest stored
        value for ``obs`` is returned.
        """
        explore = np.random.random() < self.epsilon
        if explore:
            return env.action_space.sample()

        # Exploit: pick the highest-valued action for this observation.
        greedy_action = np.argmax(self.q_values[obs])
        return int(greedy_action)

    def update(
        self,
        obs: tuple[int, int, bool],
        action: int,
        reward: float,
        terminated: bool,
        next_obs: tuple[int, int, bool],
    ):
        """Apply one Q-learning update for the transition just observed."""
        # The next state's values are looked up first (and unconditionally);
        # the result is zeroed out when the episode has terminated.
        best_next_value = np.max(self.q_values[next_obs])
        future_q_value = (not terminated) * best_next_value

        current_value = self.q_values[obs][action]
        td_target = reward + self.discount_factor * future_q_value
        temporal_difference = td_target - current_value

        self.q_values[obs][action] += self.lr * temporal_difference
        self.training_error.append(temporal_difference)

    def decay_epsilon(self):
        """Lower epsilon by one decay step, never going below final_epsilon."""
        decayed = self.epsilon - self.epsilon_decay
        self.epsilon = max(self.final_epsilon, decayed)

In [12]:
# Q-learning hyperparameters
learning_rate = 0.01

# Exploration schedule: epsilon starts at start_epsilon and is lowered by
# epsilon_decay on every decay step. The step is sized so that epsilon would
# fall from start_epsilon to zero over half of the episodes; the agent clamps
# it at final_epsilon.
start_epsilon = 1.0
final_epsilon = 0.1
epsilon_decay = start_epsilon / (n_episodes / 2)

agent = BlackjackAgent(
    env,
    learning_rate,
    start_epsilon,
    epsilon_decay,
    final_epsilon,
)

In [13]:
def train(agent, env):
    """Train ``agent`` on ``env`` with Q-learning and return its Q-table.

    Runs ``n_episodes`` episodes (module-level setting). In every step the
    agent picks an action, the environment is advanced, and the agent's
    Q-values are updated from the observed transition. Epsilon is decayed once
    per episode.

    Args:
        agent: Object providing ``get_action(env, obs)``,
            ``update(obs, action, reward, terminated, next_obs)``,
            ``decay_epsilon()``, ``epsilon`` and ``q_values``.
        env: Gymnasium-style environment whose ``reset()`` returns
            ``(observation, info)`` and whose ``step()`` returns
            ``(observation, reward, terminated, truncated, info)``.

    Returns:
        ``agent.q_values`` (the same object held by the agent, not a copy).
    """
    best_reward = -float('inf')
    progress = tqdm(range(n_episodes))
    for _ in progress:
        # reset() returns (observation, info). Unpack it so that `obs` is always
        # the bare observation, exactly like the value returned by step().
        # Indexing obs[0] on later steps would key the Q-table by the first
        # element of the observation only.
        obs, info = env.reset()
        done = False
        total_reward = 0.0
        while not done:
            action = agent.get_action(env, obs)
            next_obs, reward, terminated, truncated, info = env.step(action)

            agent.update(obs, action, reward, terminated, next_obs)
            done = terminated or truncated

            obs = next_obs
            total_reward += reward
        best_reward = max(best_reward, total_reward)
        # Report per-episode stats on the progress bar instead of printing one
        # line per episode; refresh=False leaves redraw timing to tqdm.
        progress.set_postfix(
            reward=total_reward,
            best_reward=best_reward,
            eps=agent.epsilon,
            refresh=False,
        )
        agent.decay_epsilon()

    # Return the trained policy
    return agent.q_values

In [14]:
# Run training; train() returns agent.q_values, so learned_policy refers to the
# agent's own Q-table rather than a copy.
learned_policy = train(agent, env)

Output hidden; open in https://colab.research.google.com to view.

In [15]:
def test(agent, env, policy):
    done = False
    obs = env.reset()
    total_reward = 0.0
    while not done:
        print(obs[0], ' obs')

        action = np.argmax(policy[obs[0]])
        next_obs, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        obs = next_obs
        total_reward += reward
    return total_reward

In [16]:
# Evaluate the trained agent while recording video with the RecordVideo wrapper.
# The episode trigger fires only for episode index 0, so just the first
# evaluation episode is written to ./gym_monitor_output.
env = gym.wrappers.RecordVideo(
    env, "./gym_monitor_output", episode_trigger=lambda episode_id: episode_id == 0)

try:
    test(agent, env, learned_policy)
finally:
    # Close the wrapped environment even if evaluation raises, so the
    # recorder is always shut down.
    env.close()

  logger.warn(


(10, 3, 0)  obs
20  obs
Moviepy - Building video /content/gym_monitor_output/rl-video-episode-0.mp4.
Moviepy - Writing video /content/gym_monitor_output/rl-video-episode-0.mp4



                                                  

Moviepy - Done !
Moviepy - video ready /content/gym_monitor_output/rl-video-episode-0.mp4


