<a href="https://colab.research.google.com/github/zahra-eslamian/artificial_intelligence_A_to_Z/blob/main/A3C_for_Kung_Fu.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# A3C for Kung Fu

## Part 0 - Installing the required packages and importing the libraries

### Installing Gymnasium

In [1]:
!pip install gymnasium
!pip install "gymnasium[atari]"
!pip install ale-py
!apt-get install -y swig
!pip install gymnasium[box2d]

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  swig4.0
Suggested packages:
  swig-doc swig-examples swig4.0-examples swig4.0-doc
The following NEW packages will be installed:
  swig swig4.0
0 upgraded, 2 newly installed, 0 to remove and 41 not upgraded.
Need to get 1,116 kB of archives.
After this operation, 5,542 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 swig4.0 amd64 4.0.2-1ubuntu1 [1,110 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 swig all 4.0.2-1ubuntu1 [5,632 B]
Fetched 1,116 kB in 1s (775 kB/s)
Selecting previously unselected package swig4.0.
(Reading database ... 121713 files and directories currently installed.)
Preparing to unpack .../swig4.0_4.0.2-1ubuntu1_amd64.deb ...
Unpacking swig4.0 (4.0.2-1ubuntu1) ...
Selecting previously unselected package swig.
Preparing to unpack .../swig_4.0.2-1ubunt

### Importing the libraries

In [2]:
import cv2
import math
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.multiprocessing as mp
import torch.distributions as distributions
from torch.distributions import Categorical
import ale_py
import gymnasium as gym
from gymnasium.spaces import Box
from gymnasium import ObservationWrapper

## Part 1 - Building the AI

### Creating the architecture of the Neural Network

In [3]:
class Network(nn.Module):

  def __init__(self, action_size):
    super(Network, self).__init__()
    # in_channels = 4 in the first conv layer: in A3C, we're going to have a stack of 4 gray scale frames from the Kung Fu Environment
    self.conv1 = nn.Conv2d(in_channels = 4, out_channels = 32, kernel_size = (3, 3), stride = 2)
    self.conv2 = nn.Conv2d(in_channels = 32, out_channels = 32, kernel_size = (3, 3), stride = 2)
    self.conv3 = nn.Conv2d(in_channels = 32, out_channels = 32, kernel_size = (3, 3), stride = 2)
    self.flatten = nn.Flatten()
    # 512: number of input features for the first fully connected layer which is actually the number of output feature coming from the flatten layer
    # there is a formula to calculate that, we can also give our Network architecture to ChatGPT and ask it to calculate it for us
    # 512 is the calculated number considering that we will resize 84 to 42 (84x84 is the standard Atari preprocessed frame size used by DeepMind)
    self.fc1 = nn.Linear(512, 128)
    # the followings are the output layers, as we have 2 outputs in A3C, one for action_size-dimension vector for Q(s,a) and one 1-dimention for V(s):
    self.fc2a = nn.Linear(128, action_size)
    self.fc2s = nn.Linear(128, 1)

  def forward(self, state):
    x = F.relu(self.conv1(state))
    x = F.relu(self.conv2(x))
    x = F.relu(self.conv3(x))
    x = self.flatten(x)
    x = F.relu(self.fc1(x))
    action_values = self.fc2a(x)
    state_values = self.fc2s(x)[0] # we get the value not the vector by accessing the index [0]
    return action_values, state_values

## Part 2 - Training the AI

### Setting up the environment

In [4]:
class PreprocessAtari(ObservationWrapper):

  def __init__(self, env, height = 42, width = 42, crop = lambda img: img, dim_order = 'pytorch', color = False, n_frames = 4):
    super(PreprocessAtari, self).__init__(env)
    self.img_size = (height, width)
    self.crop = crop
    self.dim_order = dim_order
    self.color = color
    self.frame_stack = n_frames
    n_channels = 3 * n_frames if color else n_frames
    obs_shape = {'tensorflow': (height, width, n_channels), 'pytorch': (n_channels, height, width)}[dim_order]
    self.observation_space = Box(0.0, 1.0, obs_shape)
    self.frames = np.zeros(obs_shape, dtype = np.float32)

  def reset(self):
    self.frames = np.zeros_like(self.frames)
    obs, info = self.env.reset()
    self.update_buffer(obs)
    return self.frames, info

  def observation(self, img):
    img = self.crop(img)
    img = cv2.resize(img, self.img_size)
    if not self.color:
      if len(img.shape) == 3 and img.shape[2] == 3:
        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    img = img.astype('float32') / 255.
    if self.color:
      self.frames = np.roll(self.frames, shift = -3, axis = 0)
    else:
      self.frames = np.roll(self.frames, shift = -1, axis = 0)
    if self.color:
      self.frames[-3:] = img
    else:
      self.frames[-1] = img
    return self.frames

  def update_buffer(self, obs):
    self.frames = self.observation(obs)

def make_env():
  env = gym.make('KungFuMasterNoFrameskip-v0', render_mode = 'rgb_array') # The KungFuMaster environment was renamed 'KungFuMasterNoFrameskip-v0'
  env = PreprocessAtari(env, height = 42, width = 42, crop = lambda img: img, dim_order = 'pytorch', color = False, n_frames = 4)
  return env

env = make_env()

state_shape = env.observation_space.shape
number_actions = env.action_space.n
print("State shape:", state_shape)
print("Number actions:", number_actions)
print("Action names:", env.env.env.env.get_action_meanings())

  logger.deprecation(


State shape: (4, 42, 42)
Number actions: 14
Action names: ['NOOP', 'UP', 'RIGHT', 'LEFT', 'DOWN', 'DOWNRIGHT', 'DOWNLEFT', 'RIGHTFIRE', 'LEFTFIRE', 'DOWNFIRE', 'UPRIGHTFIRE', 'UPLEFTFIRE', 'DOWNRIGHTFIRE', 'DOWNLEFTFIRE']


### Initializing the hyperparameters

In [5]:
learning_rate = 1e-4
discount_factor = 0.99
number_environments = 10

### Implementing the A3C class

In [6]:
class Agent():

  def __init__(self, action_size):
    self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    self.action_size = action_size
    self.network = Network(action_size).to(self.device)
    self.optimizer = torch.optim.Adam(self.network.parameters(), lr = learning_rate)

  def act(self, state):
    # If the input state has shape (C, H, W), it is a single frame stack.
    # PyTorch convolutional networks expect a *batch* dimension, so we wrap it into a list to create shape (1, C, H, W).
    if state.ndim == 3:
      state = [state]

    # Convert the state(s) into a PyTorch tensor (with the data type of float32) and move it to CPU/GPU.
    # The network only accepts torch tensors, not NumPy arrays.
    state = torch.tensor(state, dtype = torch.float32, device = self.device)

    # Pass the state through the A3C network.
    # Calling self.network(state) automatically invokes the network's forward() method, so we don't write self.network.forward(state)
    # PyTorch handles this via its internal wrapper, self.network.__call__(state):
    # which does preprocessing, calls self.network.forward(state), does postprocessing.
    # Calling .forward() manually skips all of that and may break training.
    # The network returns (action_logits, state_value), but here we only need the logits.
    action_values, _ = self.network(state)

    # Convert the logits to a probability distribution over actions.
    # dim = -1 means apply the softmax along the last dimension of the tensor.
    # action_values has shape:(batch_size, number_of_actions), so the last dimension (−1) is the one containing the action logits.
    policy = F.softmax(action_values, dim = -1)

    # For each state in the batch, sample an action according to its probability distribution.
    return np.array([np.random.choice(len(p), p = p) for p in policy.detach().cpu().numpy()])

  def step(self, state, action, reward, next_state, done):
    # In fact step func arguments (state, action, reward, next_state, done) are batches not single
    # Get the batch size so we can index actions later
    batch_size = state.shape[0]

    # Convert numpy arguments to tensors so the network can process them
    state = torch.tensor(state, dtype=torch.float32, device=self.device)
    next_state = torch.tensor(next_state, dtype=torch.float32, device=self.device)
    reward = torch.tensor(reward, dtype=torch.float32, device=self.device)
    done = torch.tensor(done, dtype=torch.bool, device=self.device).to(dtype=torch.float32)
    action_values, state_value = self.network(state)

    # Get only the critic's value estimate for the next state
    _, next_state_value = self.network(next_state)

    # Compute the TD target: reward + discounted next value (0 if episode ended), “How good was this state, based on what happened next?”
    target_state_value = reward + discount_factor * next_state_value * (1 - done)

    # Advantage = how much better the action was than expected
    advantage = target_state_value - state_value

    # Convert action logits into probabilities for sampling / entropy
    probs = F.softmax(action_values, dim=-1)

    # Log probabilities for computing policy gradients
    logprobs = F.log_softmax(action_values, dim=-1)

    # Entropy bonus encourages exploration (higher entropy = more randomness)
    entropy = -torch.sum(probs * logprobs, axis=-1)

    # Prepare indices so we can pick the logprob of the chosen actions
    batch_idx = np.arange(batch_size)

    # Log probability of each action actually taken by the agent
    # We use log probabilities because they:
    # (1) turn multiplication into addition (easier math),
    # (2) avoid numerical issues with tiny probabilities,
    # (3) are required in the policy-gradient formula.
    logp_actions = logprobs[batch_idx, action]

    # Actor loss: reinforce good actions using advantage + entropy for exploration
    actor_loss = -(logp_actions * advantage.detach()).mean() - 0.001 * entropy.mean()

    # Critic loss: train value function to match the TD target
    critic_loss = F.mse_loss(target_state_value.detach(), state_value)

    # Total loss combines actor and critic objectives
    total_loss = actor_loss + critic_loss

    # Clear old gradients before backpropagation
    self.optimizer.zero_grad()

    # Compute gradients based on total loss
    total_loss.backward()

    # Apply the gradient update to the network weights
    self.optimizer.step()

### Initializing the A3C agent

In [7]:
agent = Agent(number_actions)

### Evaluating our A3C agent on a certain number of episodes

In [8]:
def evaluate(agent, env, n_episodes=1):

    # Store the total reward for each evaluated episode
    episodes_rewards = []

    # Run the agent for the requested number of episodes
    for _ in range(n_episodes):

        # Reset the environment and get the initial state
        state, _ = env.reset()

        # Track how much reward the agent collects in this episode, initializing it to zero
        total_reward = 0

        # Run the episode step-by-step until the game ends
        while True:

            # Use the agent's policy to choose an action (no training here)
            action = agent.act(state)

            # Apply the chosen action in the environment
            state, reward, done, info, _ = env.step(action[0])

            # Accumulate reward to evaluate how well the agent performs
            total_reward += reward

            # If the episode is finished, stop the loop
            if done:
                break

        # Save the total reward for this episode
        episodes_rewards.append(total_reward)

    # Return the list of episode rewards (performance measure)
    return episodes_rewards

### Managing multiple environments simultaneously

In [9]:
class EnvBatch:

    def __init__(self, n_envs=10):
        # Create multiple independent environments to run in parallel.
        # This gives more diverse experience and speeds up training.
        self.envs = [make_env() for _ in range(n_envs)]

    def reset(self):
        # Reset all environments and collect their initial states.
        _states = []
        for env in self.envs:
            # env.reset() returns (state, info), we only need the state
            _states.append(env.reset()[0])
        # Return all initial states as a single batch array
        return np.array(_states)

    def step(self, actions):
        # Step each environment using its corresponding action.
        # zip(envs, actions) pairs each env with the action for that env.
        # map(..., zip(*...)) converts the list of tuples into arrays of
        # next_states, rewards, dones, infos for all envs.
        next_states, rewards, dones, infos, _ = map(
            np.array,
            zip(*[env.step(a) for env, a in zip(self.envs, actions)])
        )

        # If an environment finished (done=True), reset it immediately
        # so the batch always contains valid states.
        for i in range(len(self.envs)):
            if dones[i]:
                next_states[i] = self.envs[i].reset()[0]

        # Return the batch of results for all environments
        return next_states, rewards, dones, infos

### Training the A3C agent

In [10]:
import tqdm

# Create a batch of parallel environments to collect experience faster
env_batch = EnvBatch(number_environments)

# Reset all environments and get their initial states as a batch
batch_states = env_batch.reset()

# Progress bar for training iterations
with tqdm.trange(0, 3001) as progress_bar:
    for i in progress_bar:

        # Let the agent choose an action for each environment in the batch
        batch_actions = agent.act(batch_states)

        # Apply those actions in all environments at once
        batch_next_states, batch_rewards, batch_dones, _ = env_batch.step(batch_actions)

        # Scale rewards to keep training stable (a common A3C trick)
        batch_rewards *= 0.01

        # Update the A3C network using the collected batch experience
        agent.step(batch_states, batch_actions, batch_rewards, batch_next_states, batch_dones)

        # Move to the next states for the next training step
        batch_states = batch_next_states

        # Every 1000 steps, evaluate how good the agent is becoming
        if i % 1000 == 0:
            print("Average agent reward:", np.mean(evaluate(agent, env, n_episodes=10)))

  logger.deprecation(
  critic_loss = F.mse_loss(target_state_value.detach(), state_value)
  state = torch.tensor(state, dtype = torch.float32, device = self.device)
  0%|          | 7/3001 [02:04<10:49:30, 13.02s/it]  

Average agent reward: 900.0


 34%|███▎      | 1007/3001 [04:33<2:32:49,  4.60s/it]

Average agent reward: 830.0


 67%|██████▋   | 2005/3001 [06:54<1:34:12,  5.68s/it]

Average agent reward: 790.0


100%|██████████| 3001/3001 [09:17<00:00,  5.38it/s]

Average agent reward: 790.0





## Part 3 - Visualizing the results

In [11]:
import glob
import io
import base64
import imageio
from IPython.display import HTML, display

def show_video_of_model(agent, env):
  state, _ = env.reset()
  done = False
  frames = []
  while not done:
    frame = env.render()
    frames.append(frame)
    action = agent.act(state)
    state, reward, done, _, _ = env.step(action[0])
  env.close()
  imageio.mimsave('video.mp4', frames, fps=30)

show_video_of_model(agent, env)

def show_video():
    mp4list = glob.glob('*.mp4')
    if len(mp4list) > 0:
        mp4 = mp4list[0]
        video = io.open(mp4, 'r+b').read()
        encoded = base64.b64encode(video)
        display(HTML(data='''<video alt="test" autoplay
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
    else:
        print("Could not find video")

show_video()

