<a href="https://colab.research.google.com/github/wilstermanz/holbertonschool-machine_learning/blob/main/reinforcement_learning/deep_q_learning/dqn_agent_breakout.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import tensorflow as tf
if tf.__version__!='2.11.0':
  !pip install tensorflow==2.11.0 --quiet
  os.kill(os.getpid(), 9)

In [None]:
# Install the reinforcement-learning dependencies: keras-rl2 (presumably the provider of the
# `rl` package imported further down), gym with its Atari extras, and atari-py.
# NOTE(review): none of these installs is version-pinned, so a fresh runtime may resolve
# different releases than the ones this notebook was last run with -- consider pinning.
!pip install keras-rl2 --quiet
!pip install gym[atari] --quiet
!pip install atari-py --quiet

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive. Later cells read the Atari ROMs from
# MyDrive/dqn/roms/ and write weights and logs under MyDrive/dqn/.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Import the Atari ROM files stored in the Drive folder into atari_py's ROM directory.
!python -m atari_py.import_roms /content/drive/MyDrive/dqn/roms/

copying breakout.bin from /content/drive/MyDrive/dqn/roms/Breakout - Breakaway IV.bin to /usr/local/lib/python3.10/dist-packages/atari_py/atari_roms/breakout.bin


In [None]:
import gym
from gym.envs.registration import register

  ROMS = resolve_roms()


In [None]:
# Register the 'Breakout-v4' id that gym.make() is called with later: the Atari 'breakout'
# game with image observations, a frameskip of 1 and episodes capped at 10000 steps.
register(
    id='Breakout-v4',
    entry_point='gym.envs.atari:AtariEnv',
    max_episode_steps=10000,
    nondeterministic=False,
    kwargs=dict(game='breakout', obs_type='image', frameskip=1),
)

  and should_run_async(code)
  logger.warn(f"Overriding environment {spec.id}")


In [None]:
from __future__ import division
import argparse

from PIL import Image
import numpy as np
import gym

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Flatten, Convolution2D, Permute, Normalization
from tensorflow.keras.optimizers.legacy import Adam
import tensorflow.keras.backend as K

from rl.agents.dqn import DQNAgent
from rl.policy import LinearAnnealedPolicy, BoltzmannQPolicy, EpsGreedyQPolicy
from rl.memory import SequentialMemory
from rl.core import Processor
from rl.callbacks import FileLogger, ModelIntervalCheckpoint


# Size every observation is resized to in AtariProcessor.process_observation.
INPUT_SHAPE = (84, 84)
# Number of frames stacked into one network input; also the replay memory's window length.
WINDOW_LENGTH = 4


class AtariProcessor(Processor):
    """Processor that shrinks observations, scales state batches and clips rewards."""

    def process_observation(self, observation):
        """Resize a 3-D observation to INPUT_SHAPE, convert it to grayscale, return it as uint8."""
        assert observation.ndim == 3  # (height, width, channel)
        # Resize first, then convert to 8-bit grayscale ('L' mode).
        grayscale = Image.fromarray(observation).resize(INPUT_SHAPE).convert('L')
        frame = np.array(grayscale)
        assert frame.shape == INPUT_SHAPE
        # uint8 keeps the experience memory small.
        return frame.astype('uint8')

    def process_state_batch(self, batch):
        """Convert a batch to float32 and divide it by 255."""
        # Scaling happens here rather than in `process_observation` so that the stored
        # observations can stay `uint8`; a `float32` array would need 4x the memory, which
        # matters when 1M observations are kept.
        return batch.astype('float32') / 255.

    def process_reward(self, reward):
        """Clip the reward to the interval [-1, 1]."""
        return np.clip(reward, -1., 1.)

# parser = argparse.ArgumentParser()
# parser.add_argument('--mode', choices=['train', 'test'], default='train')
# parser.add_argument('--env-name', type=str, default='BreakoutDeterministic-v4')
# parser.add_argument('--weights', type=str, default=None)
# args = parser.parse_args()

# Run settings (these stand in for the command-line options of the commented-out parser above).
env_name = 'Breakout-v4'
mode = 'train'
weights = None

# Get the environment and extract the number of actions.
env = gym.make(env_name)
np.random.seed(123)
env.seed(123)
nb_actions = env.action_space.n

# Next, we build our model: two convolutional layers, a 256-unit dense layer and one linear
# output per action. The original example describes it as the model of Mnih et al. (2015).
# NOTE(review): the layer sizes (16/32 filters, 256 units) look closer to the smaller 2013
# network than to the 2015 one -- confirm before citing.
input_shape = (WINDOW_LENGTH,) + INPUT_SHAPE
model = Sequential()

# The stacked frames arrive as (WINDOW_LENGTH, 84, 84); move the frame axis last so the
# convolutions see the frames as channels, i.e. (84, 84, WINDOW_LENGTH).
model.add(Permute((2, 3, 1), input_shape=input_shape))
model.add(Convolution2D(16, (8, 8), strides=4))
model.add(Activation('relu'))
model.add(Convolution2D(32, (4, 4), strides=2))
model.add(Activation('relu'))
model.add(Flatten())
model.add(Dense(256))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
# summary() prints the table itself and returns None, so it must not be wrapped in print()
# (that would print a stray "None" after the table).
model.summary()

# Finally, we configure and compile our agent. You can use every built-in tensorflow.keras optimizer and
# even the metrics!
memory = SequentialMemory(limit=1000000, window_length=WINDOW_LENGTH)
processor = AtariProcessor()

# Select a policy. We use eps-greedy action selection, which means that a random action is selected
# with probability eps. We anneal eps from 1.0 to 0.1 over the first 100,000 steps (nb_steps below).
# This is done so that the agent initially explores the environment (high eps) and then gradually
# sticks to what it knows (low eps). We also set a dedicated eps value that is used during testing.
# Note that we set it to 0.05 so that the agent still performs some random actions. This ensures
# that the agent cannot get stuck.
policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.1, value_test=.05,
                              nb_steps=100000)

# The trade-off between exploration and exploitation is difficult and an on-going research topic.
# If you want, you can experiment with the parameters or use a different policy. Another popular one
# is Boltzmann-style exploration:
# policy = BoltzmannQPolicy(tau=1.)
# Feel free to give it a try!

dqn = DQNAgent(model=model, nb_actions=nb_actions, policy=policy, memory=memory,
               processor=processor, nb_steps_warmup=5000, gamma=.99, target_model_update=10000,
               train_interval=4, delta_clip=1.)
dqn.compile(Adam(learning_rate=.00025), metrics=['mae'])

  logger.warn(
  deprecation(
  deprecation(


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 permute (Permute)           (None, 84, 84, 4)         0         
                                                                 
 conv2d (Conv2D)             (None, 20, 20, 16)        4112      
                                                                 
 activation (Activation)     (None, 20, 20, 16)        0         
                                                                 
 conv2d_1 (Conv2D)           (None, 9, 9, 32)          8224      
                                                                 
 activation_1 (Activation)   (None, 9, 9, 32)          0         
                                                                 
 flatten (Flatten)           (None, 2592)              0         
                                                                 
 dense (Dense)               (None, 256)               6

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


In [None]:
# Okay, now it's time to learn something! Training can be aborted prematurely with an interrupt.
# Notice that now you can use the built-in tensorflow.keras callbacks!
weights_filename = f'/content/drive/MyDrive/dqn/dqn_{env_name}_weights.h5f'
# '{step}' is deliberately left unformatted here: the checkpoint callback fills it in.
# Checkpoints are written next to the final weights and the log on Drive instead of the
# notebook's working directory.
checkpoint_weights_filename = '/content/drive/MyDrive/dqn/dqn_' + env_name + '_weights_{step}.h5f'
log_filename = f'/content/drive/MyDrive/dqn/dqn_{env_name}_log.json'
callbacks = [
    # The interval has to be smaller than nb_steps below; with the previous value of 250000
    # a 150000-step run never reached a checkpoint.
    ModelIntervalCheckpoint(checkpoint_weights_filename, interval=50000),
    FileLogger(log_filename, interval=100),
]
dqn.fit(env, callbacks=callbacks, nb_steps=150000, log_interval=10000, verbose=2)

# After training is done, we save the final weights one more time.
dqn.save_weights(weights_filename, overwrite=True)

Training for 150000 steps ...


  updates=self.state_updates,
  logger.deprecation(


    503/150000: episode: 1, duration: 2.617s, episode steps: 503, steps per second: 192, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.577 [0.000, 3.000],  loss: --, mae: --, mean_q: --, mean_eps: --
    996/150000: episode: 2, duration: 1.869s, episode steps: 493, steps per second: 264, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.550 [0.000, 3.000],  loss: --, mae: --, mean_q: --, mean_eps: --
   1921/150000: episode: 3, duration: 3.495s, episode steps: 925, steps per second: 265, episode reward:  3.000, mean reward:  0.003 [ 0.000,  1.000], mean action: 1.462 [0.000, 3.000],  loss: --, mae: --, mean_q: --, mean_eps: --
   2791/150000: episode: 4, duration: 3.334s, episode steps: 870, steps per second: 261, episode reward:  2.000, mean reward:  0.002 [ 0.000,  1.000], mean action: 1.538 [0.000, 3.000],  loss: --, mae: --, mean_q: --, mean_eps: --
   3883/150000: episode: 5, duration: 6.739s, episode steps: 1092, steps per

  updates=self.state_updates,


   5110/150000: episode: 7, duration: 7.438s, episode steps: 726, steps per second:  98, episode reward:  2.000, mean reward:  0.003 [ 0.000,  1.000], mean action: 1.479 [0.000, 3.000],  loss: 0.000717, mae: 0.013818, mean_q: 0.023735, mean_eps: 0.954496
   5619/150000: episode: 8, duration: 17.073s, episode steps: 509, steps per second:  30, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.503 [0.000, 3.000],  loss: 0.000766, mae: 0.009769, mean_q: 0.017193, mean_eps: 0.951724
   6317/150000: episode: 9, duration: 25.495s, episode steps: 698, steps per second:  27, episode reward:  1.000, mean reward:  0.001 [ 0.000,  1.000], mean action: 1.542 [0.000, 3.000],  loss: 0.001074, mae: 0.010412, mean_q: 0.018612, mean_eps: 0.946288
   6924/150000: episode: 10, duration: 19.484s, episode steps: 607, steps per second:  31, episode reward:  1.000, mean reward:  0.002 [ 0.000,  1.000], mean action: 1.519 [0.000, 3.000],  loss: 0.000628, mae: 0.009261, mean_q: 0.015

In [None]:
# Evaluate the agent that was just trained: play 10 episodes without rendering.
dqn.test(env, visualize=False, nb_episodes=10)

In [None]:
# Use the explicitly configured `weights` path when one is set; otherwise fall back to the
# weights file that the training cell saved on Drive.
weights_filename = weights or f'/content/drive/MyDrive/dqn/dqn_{env_name}_weights.h5f'
dqn.load_weights(weights_filename)
# Play 10 evaluation episodes without rendering.
dqn.test(env, visualize=False, nb_episodes=10)

Testing for 10 episodes ...


  updates=self.state_updates,
  logger.deprecation(


Episode 1: reward: 0.000, steps: 10000
Episode 2: reward: 0.000, steps: 10000
Episode 3: reward: 0.000, steps: 10000
Episode 4: reward: 0.000, steps: 10000
Episode 5: reward: 0.000, steps: 10000
Episode 6: reward: 0.000, steps: 10000
Episode 7: reward: 0.000, steps: 10000
Episode 8: reward: 0.000, steps: 10000
Episode 9: reward: 0.000, steps: 10000
Episode 10: reward: 0.000, steps: 10000


<keras.callbacks.History at 0x7e05a73d2320>

In [None]:
# Resume training: reload the saved weights, train for another 150000 steps and save again.
weights_filename = f'/content/drive/MyDrive/dqn/dqn_{env_name}_weights.h5f'
# '{step}' is deliberately left unformatted here: the checkpoint callback fills it in.
# Checkpoints are written next to the final weights and the log on Drive instead of the
# notebook's working directory.
checkpoint_weights_filename = '/content/drive/MyDrive/dqn/dqn_' + env_name + '_weights_{step}.h5f'
log_filename = f'/content/drive/MyDrive/dqn/dqn_{env_name}_log.json'
callbacks = [
    # The interval has to be smaller than nb_steps below; with the previous value of 250000
    # a 150000-step run never reached a checkpoint.
    ModelIntervalCheckpoint(checkpoint_weights_filename, interval=50000),
    FileLogger(log_filename, interval=100),
]

dqn.load_weights(weights_filename)
# NOTE(review): the log of an earlier run of this cell reports mean_eps of about 0.95 shortly
# after step 5000, i.e. exploration appears to start again near 1.0 on each fit() call instead
# of continuing from the previous run -- confirm whether that is intended when resuming.
dqn.fit(env, callbacks=callbacks, nb_steps=150000, log_interval=10000, verbose=2)

# After training is done, we save the final weights one more time.
dqn.save_weights(weights_filename, overwrite=True)

Training for 150000 steps ...
    620/150000: episode: 1, duration: 3.449s, episode steps: 620, steps per second: 180, episode reward:  1.000, mean reward:  0.002 [ 0.000,  1.000], mean action: 1.561 [0.000, 3.000],  loss: --, mae: --, mean_q: --, mean_eps: --
   1431/150000: episode: 2, duration: 4.356s, episode steps: 811, steps per second: 186, episode reward:  2.000, mean reward:  0.002 [ 0.000,  1.000], mean action: 1.491 [0.000, 3.000],  loss: --, mae: --, mean_q: --, mean_eps: --
   1961/150000: episode: 3, duration: 3.353s, episode steps: 530, steps per second: 158, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.525 [0.000, 3.000],  loss: --, mae: --, mean_q: --, mean_eps: --
   2463/150000: episode: 4, duration: 2.137s, episode steps: 502, steps per second: 235, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.512 [0.000, 3.000],  loss: --, mae: --, mean_q: --, mean_eps: --
   2955/150000: episode: 5, duration: 1.847s, 

  updates=self.state_updates,


   5146/150000: episode: 9, duration: 9.633s, episode steps: 507, steps per second:  53, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.554 [0.000, 3.000],  loss: 0.000456, mae: 0.057111, mean_q: 0.076440, mean_eps: 0.954334
   5834/150000: episode: 10, duration: 21.956s, episode steps: 688, steps per second:  31, episode reward:  1.000, mean reward:  0.001 [ 0.000,  1.000], mean action: 1.516 [0.000, 3.000],  loss: 0.000214, mae: 0.059140, mean_q: 0.079460, mean_eps: 0.950590
   6528/150000: episode: 11, duration: 24.084s, episode steps: 694, steps per second:  29, episode reward:  1.000, mean reward:  0.001 [ 0.000,  1.000], mean action: 1.539 [0.000, 3.000],  loss: 0.000235, mae: 0.058436, mean_q: 0.078400, mean_eps: 0.944380
   7332/150000: episode: 12, duration: 26.394s, episode steps: 804, steps per second:  30, episode reward:  2.000, mean reward:  0.002 [ 0.000,  1.000], mean action: 1.575 [0.000, 3.000],  loss: 0.000122, mae: 0.062159, mean_q: 0.0

In [None]:
# Reload the most recently saved weights and play 10 evaluation episodes without rendering.
dqn.load_weights(weights_filename)
dqn.test(env, visualize=False, nb_episodes=10)

Testing for 10 episodes ...
Episode 1: reward: 0.000, steps: 10000
Episode 2: reward: 0.000, steps: 10000
Episode 3: reward: 0.000, steps: 10000
Episode 4: reward: 0.000, steps: 10000
Episode 5: reward: 0.000, steps: 10000
Episode 6: reward: 0.000, steps: 10000
Episode 7: reward: 0.000, steps: 10000
Episode 8: reward: 0.000, steps: 10000
Episode 9: reward: 0.000, steps: 10000
Episode 10: reward: 0.000, steps: 10000


<keras.callbacks.History at 0x7e05a739b490>

In [None]:
# Resume training once more: reload the saved weights, train for another 150000 steps, save.
# The file names and callbacks are rebuilt here instead of being picked up from an earlier
# cell, so this cell does not depend on leftover kernel state and every fit() call gets
# fresh callback objects.
# NOTE(review): an earlier run of this cell, which reused the previous cell's callbacks,
# ended in an AssertionError; reusing callback objects across fit() calls is a possible
# cause -- unconfirmed.
weights_filename = f'/content/drive/MyDrive/dqn/dqn_{env_name}_weights.h5f'
# '{step}' is deliberately left unformatted here: the checkpoint callback fills it in.
checkpoint_weights_filename = '/content/drive/MyDrive/dqn/dqn_' + env_name + '_weights_{step}.h5f'
log_filename = f'/content/drive/MyDrive/dqn/dqn_{env_name}_log.json'
callbacks = [
    # The interval has to be smaller than nb_steps below for a checkpoint to be written.
    ModelIntervalCheckpoint(checkpoint_weights_filename, interval=50000),
    FileLogger(log_filename, interval=100),
]

dqn.load_weights(weights_filename)
dqn.fit(env, callbacks=callbacks, nb_steps=150000, log_interval=10000, verbose=2)

# After training is done, we save the final weights one more time.
dqn.save_weights(weights_filename, overwrite=True)

Training for 150000 steps ...
    508/150000: episode: 1, duration: 3.367s, episode steps: 508, steps per second: 151, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.455 [0.000, 3.000],  loss: --, mae: --, mean_q: --, mean_eps: --
   1010/150000: episode: 2, duration: 3.326s, episode steps: 502, steps per second: 151, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.408 [0.000, 3.000],  loss: --, mae: --, mean_q: --, mean_eps: --
   1816/150000: episode: 3, duration: 3.472s, episode steps: 806, steps per second: 232, episode reward:  2.000, mean reward:  0.002 [ 0.000,  1.000], mean action: 1.433 [0.000, 3.000],  loss: --, mae: --, mean_q: --, mean_eps: --
   2315/150000: episode: 4, duration: 1.885s, episode steps: 499, steps per second: 265, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.559 [0.000, 3.000],  loss: --, mae: --, mean_q: --, mean_eps: --
   2816/150000: episode: 5, duration: 1.951s, 

AssertionError: ignored

In [None]:
# Reload the most recently saved weights and play 10 evaluation episodes without rendering.
dqn.load_weights(weights_filename)
dqn.test(env, visualize=False, nb_episodes=10)