# Keras RL DQN on Pong

Implementation of a DQN agent on the famous game of Pong:

**Pong-v0**
(https://gym.openai.com/envs/Pong-v0/) <br />

**TASK: Import the necessary libraries and create the environment. Also extract the number of possible actions.** <br />

In [1]:
from PIL import Image  # To transform the image in the Processor
import numpy as np
import gym
from gym.utils import play

# Convolutional Backbone Network
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Flatten, Convolution2D, Permute
#from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers.legacy import Adam

# Keras-RL
from rl.agents.dqn import DQNAgent
from rl.policy import LinearAnnealedPolicy, EpsGreedyQPolicy
from rl.memory import SequentialMemory
from rl.core import Processor
from rl.callbacks import FileLogger, ModelIntervalCheckpoint

pygame 2.3.0 (SDL 2.24.2, Python 3.10.9)
Hello from the pygame community. https://www.pygame.org/contribute.html


2023-03-21 14:55:39.114126: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-21 14:55:39.183384: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [2]:
# Create the Pong environment.
env = gym.make("Pong-v0")
# Size of the discrete action space; used later as the width of the network's output layer.
nb_actions = env.action_space.n

**Play the game manually (keys: a and d to move the bars)**

In [None]:
#play.play(env)

**Define an input size and the window length**

In [3]:
# Size every raw frame is resized to before it is stored.
IMG_SHAPE = (84, 84)
# Number of consecutive frames that make up one state (first axis of the model's input shape).
WINDOW_LENGTH = 4

**Create the ImageProcessor** <br />
1. Resize the image
2. Convert it to grayscale
3. Normalize it (scale the pixel values to [0, 1])
4. Be memory efficient (store frames as `uint8` and convert to float only per batch)

In [4]:
class ImageProcessor(Processor):
    """Preprocess raw environment frames for the DQN agent.

    Observations are shrunk to ``IMG_SHAPE`` grayscale ``uint8`` images when
    they are recorded, and only turned into scaled floats when a batch is
    handed to the network.
    """

    def process_observation(self, observation):
        """Return ``observation`` as a resized, grayscale ``uint8`` array."""
        # numpy array -> PIL image, resize, then reduce to a single
        # luminance channel (mode "L").
        frame = Image.fromarray(observation).resize(IMG_SHAPE).convert("L")
        # uint8 keeps the footprint of the experience memory small.
        return np.array(frame).astype('uint8')

    def process_state_batch(self, batch):
        """Return ``batch`` converted to ``float32`` and divided by 255."""
        # Dividing by 255 compresses the pixel values into the interval
        # [0, 1], which supports the training of the network. It is done
        # per batch, not per stored observation, to save memory.
        return batch.astype('float32') / 255.

**The Convolutional Neural Network**

In [5]:
# One state is WINDOW_LENGTH stacked frames, with the frame axis first.
input_shape = (WINDOW_LENGTH, IMG_SHAPE[0], IMG_SHAPE[1])

model = Sequential()
# Move the frame axis to the end so the convolutions see the stacked
# frames as channels.
model.add(Permute((2, 3, 1), input_shape=input_shape))

# Three strided convolutions shrink the spatial size step by step.
model.add(Convolution2D(32, (8, 8), strides=(4, 4), kernel_initializer='he_normal'))
model.add(Activation('relu'))
model.add(Convolution2D(64, (4, 4), strides=(2, 2), kernel_initializer='he_normal'))
model.add(Activation('relu'))
model.add(Convolution2D(64, (3, 3), strides=(1, 1), kernel_initializer='he_normal'))
model.add(Activation('relu'))

# Fully connected head: one linear output per action (the Q-values).
model.add(Flatten())
model.add(Dense(512))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))

# summary() prints the table itself and returns None, so wrapping it in
# print() would only append a stray "None" to the output.
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 permute (Permute)           (None, 84, 84, 4)         0         
                                                                 
 conv2d (Conv2D)             (None, 20, 20, 32)        8224      
                                                                 
 activation (Activation)     (None, 20, 20, 32)        0         
                                                                 
 conv2d_1 (Conv2D)           (None, 9, 9, 64)          32832     
                                                                 
 activation_1 (Activation)   (None, 9, 9, 64)          0         
                                                                 
 conv2d_2 (Conv2D)           (None, 7, 7, 64)          36928     
                                                                 
 activation_2 (Activation)   (None, 7, 7, 64)          0

**Replay Memory**

In [6]:
# Replay memory with limit=1000000; window_length matches the number of frames stacked per state.
memory = SequentialMemory(limit=1000000, window_length=WINDOW_LENGTH)

**Processor**

In [7]:
# Frame preprocessor instance; handed to the agent through its `processor` argument.
processor = ImageProcessor()

**Define the action selection policy**

In [8]:
# Epsilon-greedy policy whose 'eps' attribute is annealed between value_max=1.0 and
# value_min=0.1 over nb_steps=1000000; value_test=0.05 is presumably the epsilon used in test mode.
policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.1, value_test=.05,
                              nb_steps=1000000)

**Create the agent and compile**

In [9]:
# Assemble the agent from the network, exploration policy, replay memory
# and frame processor defined in the previous cells.
dqn = DQNAgent(
    model=model,
    nb_actions=nb_actions,
    policy=policy,
    memory=memory,
    processor=processor,
    nb_steps_warmup=50000,
    gamma=.99,
    target_model_update=10000,
    train_interval=4,
    delta_clip=1,
)

In [10]:
# `learning_rate` is the supported keyword; `lr` is a deprecated alias that
# makes the optimizer emit a warning on construction.
dqn.compile(Adam(learning_rate=.00025), metrics=['mae'])


  super().__init__(name, **kwargs)
2023-03-21 14:57:27.486652: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-21 14:57:27.505486: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:357] MLIR V1 optimization pass is not enabled
2023-03-21 14:57:27.539160: W tensorflow/c/c_api.cc:291] Operation '{name:'dense_2/kernel/Assign' id:210 op device:{requested: '', assigned: ''} def:{{{node dense_2/kernel/Assign}} = AssignVariableOp[_has_manual_control_dependencies=true, dtype=DT_FLOAT, validate_shape=false](dense_2/kernel, dense_2/kernel/Initializer/stateless_random_uniform)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and 

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


**Define a checkpoint callback to store the weights during training.**

In [None]:
# File the final weights are saved to once training has finished.
weights_filename = 'dqn_pong_weights.h5f'
# Template for intermediate checkpoints; "{step}" is presumably filled in by the callback with the current step.
checkpoint_weights_filename = 'dqn_' + "pong" + '_weights_{step}.h5f'
# Checkpoint callback configured with interval=100000 (presumably counted in training steps).
checkpoint_callback = ModelIntervalCheckpoint(checkpoint_weights_filename, interval=100000)

**Train the agent and save the weights**

In [None]:
# Train the agent; the checkpoint callback stores intermediate weights along the way.
dqn.fit(env, nb_steps=1500000, callbacks=[checkpoint_callback], log_interval=10000, visualize=False)

# After training is done, we save the final weights one more time.
dqn.save_weights(weights_filename, overwrite=True)

# Release the environment. It is deliberately not reset afterwards: a closed
# environment should not be used again, and the evaluation cell creates a
# fresh one with gym.make().
env.close()

**Load the weights and create an agent**

In [11]:
# Load the weights
# NOTE(review): this file name differs from the one written by the training
# cell ('dqn_pong_weights.h5f'); presumably these are separately provided,
# pre-trained weights. Confirm the file is present before running.
model.load_weights("dqn_pong_weights_student_2000000.h5f")

memory = SequentialMemory(limit=1000000, window_length=WINDOW_LENGTH)
policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1, value_min=.1, value_test=.05,
                              nb_steps=100000)

processor = ImageProcessor()

# Initialize the DQNAgent with the new model and updated policy and compile it
dqn = DQNAgent(model=model, nb_actions=nb_actions, policy=policy, memory=memory,
               processor=processor, nb_steps_warmup=50000, gamma=.99, target_model_update=10000)
# `learning_rate` is the supported keyword; `lr` is a deprecated alias that
# makes the optimizer emit a warning on construction.
dqn.compile(Adam(learning_rate=.00025), metrics=['mae'])

2023-03-21 14:57:44.882998: W tensorflow/c/c_api.cc:291] Operation '{name:'total_2/Assign' id:376 op device:{requested: '', assigned: ''} def:{{{node total_2/Assign}} = AssignVariableOp[_has_manual_control_dependencies=true, dtype=DT_FLOAT, validate_shape=false](total_2, total_2/Initializer/zeros)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes after running them or create a new session.
2023-03-21 14:57:45.015636: W tensorflow/c/c_api.cc:291] Operation '{name:'dense_3/bias/Assign' id:612 op device:{requested: '', assigned: ''} def:{{{node dense_3/bias/Assign}} = AssignVariableOp[_has_manual_control_dependencies=true, dtype=DT_FLOAT, validate_shape=false](dense_3/bias, dense_3/bias/Initializer/zeros)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes 

**Evaluate the agent.**

In [12]:
import numpy as np
import gym
from gym import logger as gymlogger
from gym.wrappers import Monitor
from pyvirtualdisplay import Display

# Logger level 40: report errors only.
gymlogger.set_level(40)

# Start a non-visible (visible=0) virtual display of 1400x900; presumably
# needed so frames can be rendered on a machine without a screen.
display = Display(visible=0, size=(1400, 900))
display.start()

<pyvirtualdisplay.display.Display at 0x7f86746a5c30>

In [13]:
import glob
import io
import base64
from IPython.display import HTML
from IPython import display as ipythondisplay

def show_video_all():
  """Display every ``.mp4`` file found in ``videopong/`` inline in the notebook.

  For each video the path is printed and the file is embedded, base64
  encoded, in an HTML ``<video>`` element. Does nothing when no video is
  found.
  """
  # glob returns paths in arbitrary order; sort for a reproducible playback order.
  for mp4 in sorted(glob.glob('videopong/*.mp4')):
    print(mp4)
    # Read-only access is sufficient, and the context manager closes the
    # file handle as soon as the bytes have been read.
    with io.open(mp4, 'rb') as video_file:
      encoded = base64.b64encode(video_file.read())
    ipythondisplay.display(HTML(data='''<video alt="{1}" autoplay 
                controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'), mp4)))

def show_video(num):
  """Display one ``.mp4`` file from ``videopong/`` inline in the notebook.

  Args:
    num: Index into the sorted list of video paths. Negative values count
      from the end, as with ordinary list indexing.

  Prints "Could not find video" when ``num`` does not refer to an existing
  video (including when there are no videos at all).
  """
  # glob returns paths in arbitrary order; sort so that an index always
  # refers to the same file.
  mp4list = sorted(glob.glob('videopong/*.mp4'))
  # Accept exactly the indices that list indexing accepts, so that an
  # out-of-range negative index reports "not found" instead of raising.
  if not -len(mp4list) <= num < len(mp4list):
    print("Could not find video")
    return
  mp4 = mp4list[num]
  print(mp4)
  # Read-only access is sufficient, and the context manager closes the
  # file handle as soon as the bytes have been read.
  with io.open(mp4, 'rb') as video_file:
    encoded = base64.b64encode(video_file.read())
  ipythondisplay.display(HTML(data='''<video alt="{1}" autoplay 
                controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'), mp4)))

def wrap_env(env):
  """Return ``env`` wrapped in a ``Monitor`` that records into ``./videopong``.

  ``force=True`` is passed to ``Monitor``; presumably this lets it reuse the
  directory even when earlier recordings are present.
  """
  video_dir = './videopong'
  return Monitor(env, video_dir, force=True)

In [14]:
# Fresh environment for evaluation, wrapped so that the episodes are recorded.
env = gym.make("Pong-v0")
# Cap each episode at 2500 steps.
env._max_episode_steps = 2500
env = wrap_env(env)
try:
    dqn.test(env, nb_episodes=2, visualize=True)
finally:
    # Close the environment even if testing fails part-way, so the wrapped
    # environment is always released.
    env.close()

Testing for 2 episodes ...


  updates=self.state_updates,
2023-03-21 14:57:50.264848: W tensorflow/c/c_api.cc:291] Operation '{name:'activation_4/activation_4/Identity' id:123 op device:{requested: '', assigned: ''} def:{{{node activation_4/activation_4/Identity}} = Identity[T=DT_FLOAT, _has_manual_control_dependencies=true](dense_1/BiasAdd)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes after running them or create a new session.
2023-03-21 14:57:50.286049: W tensorflow/c/c_api.cc:291] Operation '{name:'count_6/Assign' id:759 op device:{requested: '', assigned: ''} def:{{{node count_6/Assign}} = AssignVariableOp[_has_manual_control_dependencies=true, dtype=DT_FLOAT, validate_shape=false](count_6, count_6/Initializer/zeros)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes aft

Episode 1: reward: -5.000, steps: 2500
Episode 2: reward: -6.000, steps: 2500


In [15]:
show_video_all()

videopong/openaigym.video.0.10262.video000000.mp4


videopong/openaigym.video.0.10262.video000001.mp4
