<a href="https://colab.research.google.com/github/volleyfreak/ALS/blob/master/%5Crl%5Cassignment%5CBreakout.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# HSKA AI-Lab RL: Deep Q-Network (DQN)

## Mount Google Drive as folder

In [None]:
# Mount the user's Google Drive at /content/drive (Colab only).
# force_remount=True is passed so that mount() is asked to mount again even if
# a previous mount exists.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
# Change the working directory to the notebooks folder on the mounted drive.
# NOTE(review): presumably the local helper modules imported further down
# (loggers, plot_utils, abstract_agent, atari_helpers) live here - confirm.
%cd /content/drive/My\ Drive/Colab\ Notebooks

Mounted at /content/drive
/content/drive/My Drive/Colab Notebooks


Es soll ein DQN-Agent trainiert werden, der [Atari Breakout](https://www.gymlibrary.ml/environments/atari/breakout/) spielen kann.
Der Ansatz ist frei – ihr könnt euch an Aufgabe 4 orientieren oder die Methode auf eure Art implementieren.

### "Quiz"

- Wann ist der Agent gut genug? Was ist ein gutes Erfolgskriterium?
- Was für eine Architektur soll das Q-Network haben?

### It's dangerous to go alone! Take this.

In [None]:
%tensorflow_version 1.x
%pip install --upgrade pip
%pip install gym[atari]==0.12.5
%pip install pyglet==1.3.2

import gym

import random
from collections import deque
from typing import Tuple
import time
from datetime import datetime
from contextlib import suppress

import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Conv2D, Flatten, Dense, Lambda, multiply, Input
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from tensorflow.losses import huber_loss
from tensorflow.keras.backend import set_session
from loggers import TensorBoardLogger, tf_summary_image

%pip install matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

from plot_utils import plot_statistics
from abstract_agent import AbstractAgent
from atari_helpers import LazyFrames, wrap_deepmind, make_atari

!apt-get install -y xvfb python-opengl
!python -m pip install pyvirtualdisplay
from pyvirtualdisplay import Display
display = Display(visible=0, size=(1400, 900))
display.start()

is_ipython = 'inline' in plt.get_backend()
if is_ipython:
    from IPython import display
    from IPython.display import SVG

plt.ion()

TensorFlow 1.x selected.
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pip
  Downloading pip-22.1.2-py3-none-any.whl (2.1 MB)
[K     |████████████████████████████████| 2.1 MB 33.5 MB/s 
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 21.1.3
    Uninstalling pip-21.1.3:
      Successfully uninstalled pip-21.1.3
Successfully installed pip-22.1.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gym[atari]==0.12.5
  Downloading gym-0.12.5.tar.gz (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m55.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting atari_py~=0.1.4
  Downloading atari_py-0.1.15-cp37-cp37m-manylinux1_x86_64.whl (2.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m

In [None]:
# Build the Breakout environment with the course helpers: `make_atari` creates
# the base environment and `wrap_deepmind` adds the preprocessing wrappers,
# with frame stacking enabled.
env = wrap_deepmind(make_atari('BreakoutNoFrameskip-v0'), frame_stack=True)

# Alternative without the helper wrappers (vanilla OpenAI Gym):
# env = gym.make('BreakoutNoFrameskip-v0')

NoopResetEnv (max 30) wrapper is used.
MaxAndSkipEnv (skip 4) wrapper is used.
EpisodicLifeEnv wrapper is used.
FireResetEnv wrapper is used.
ClipRewardEnv wrapper is used.
FrameStack (4) wrapper is used.


In [None]:
class DQNAgent(AbstractAgent):
    """DQN agent with experience replay, a target network and epsilon-greedy exploration.

    The online network (`self.model`) is trained on random mini-batches drawn
    from a bounded replay memory. Bootstrapped targets are computed with a
    separate target network (`self.target_model`) whose weights are copied from
    the online network every `target_model_update_interval` steps.
    """

    def __init__(self, action_size: int, state_size: int,
                 gamma: float, epsilon: float, epsilon_decay: float, epsilon_min: float,
                 alpha: float, batch_size: int, memory_size: int, start_replay_step: int,
                 target_model_update_interval: int, train_freq: int):
        """Creates the agent, its replay memory and both Q-networks.

        Args:
            action_size [int]: Number of discrete actions.
            state_size [int]: Stored for reference only; the network input shape is fixed in `_build_model`.
            gamma [float]: Discount factor.
            epsilon [float]: Initial exploration probability.
            epsilon_decay [float]: Amount subtracted from epsilon per training step (linear annealing).
            epsilon_min [float]: Lower bound for epsilon.
            alpha [float]: Learning rate of the optimizer.
            batch_size [int]: Number of experiences per replay mini-batch.
            memory_size [int]: Maximum number of experiences kept in the replay memory.
            start_replay_step [int]: Number of steps to collect before training starts.
            target_model_update_interval [int]: Interval (steps) for syncing the target network.
            train_freq [int]: Interval (steps) at which a replay update is executed.
        """
        self.action_size = action_size
        self.state_size = state_size

        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.alpha = alpha

        # Bounded FIFO buffer: the oldest experiences are dropped once it is full.
        self.memory = deque(maxlen=memory_size)
        self.batch_size = batch_size

        self.step = 0
        self.start_replay_step = start_replay_step

        self.target_model_update_interval = target_model_update_interval
        self.train_freq = train_freq  # Frequency (interval) at which model should be trained (steps)

        assert self.start_replay_step >= self.batch_size, \
            "The number of steps to start replay must be at least as large as the batch size"

        # All-ones masks let every Q-value pass through the network's mask input.
        self.action_mask = np.ones((1, self.action_size))
        self.action_mask_batch = np.ones((self.batch_size, self.action_size))

        config = tf.ConfigProto(intra_op_parallelism_threads=8,
                                inter_op_parallelism_threads=4,
                                allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        session = tf.Session(config=config)
        set_session(session)  # set this TensorFlow session as the default session for Keras

        self.model = self._build_model()
        self.target_model = self._build_model()

    def _build_model(self):
        """Builds the convolutional Q-network with an additional action-mask input.

        The layer sizes follow the paper description quoted in the comments below.
        NOTE(review): the quoted sizes (16/32 filters, 256 units) look like the
        smaller 2013 DQN architecture rather than the Nature one - confirm.

        The network output is multiplied element-wise with the mask input, so a
        one-hot mask restricts the output (and therefore the loss) to one action.

        Returns:
            model [Model]
        """
        atari_shape = (84, 84, 4)
        # With the functional API we need to define the inputs.
        # Sequential API no longer works because of merge mask
        frames_input = Input(atari_shape, name='frames')
        action_mask = Input((self.action_size,), name='action_mask')

        # Assuming that the input frames are still encoded from 0 to 255. Transforming to [0, 1]
        normalized = Lambda(lambda x: x / 255.0, name='normalization')(frames_input)

        # "The first hidden layer convolves 16 8x8 filters with stride 4 with the
        # input image and applies a rectifier nonlinearity."
        conv1 = Conv2D(filters=16,
                       kernel_size=(8, 8),
                       strides=(4, 4),
                       activation='relu')(normalized)

        # "The second hidden layer convolves 32 4x4 filters with stride 2, again followed
        # by a rectifier nonlinearity."
        conv2 = Conv2D(filters=32,
                       kernel_size=(4, 4),
                       strides=(2, 2),
                       activation='relu')(conv1)

        # Flattening the last convolutional layer.
        conv_flattened = Flatten()(conv2)

        # "The final hidden layer is fully-connected and consists of 256 rectifier units."
        hidden = Dense(units=256, activation='relu')(conv_flattened)

        # "The output layer is a fully-connected linear layer with a single output
        # for each valid action."
        output = Dense(self.action_size)(hidden)

        filtered_output = multiply([output, action_mask])

        model = Model(inputs=[frames_input, action_mask], outputs=filtered_output)
        model.compile(loss=huber_loss, optimizer=Adam(lr=self.alpha), metrics=None)

        return model

    def _replay(self) -> None:
        """Samples a random mini-batch from memory and performs one Q-function update.

        Returns:
            None
        """
        mini_batch = random.sample(self.memory, self.batch_size)

        # Each experience is (state, action, next_state, reward, done). The states
        # are LazyFrames, so they are converted element by element into one array.
        states = np.array([experience[0] for experience in mini_batch])
        actions = np.array([experience[1] for experience in mini_batch], dtype=np.int64)
        next_states = np.array([experience[2] for experience in mini_batch])
        rewards = np.array([experience[3] for experience in mini_batch], dtype=np.float32)
        dones = np.array([experience[4] for experience in mini_batch], dtype=bool)

        assert all(isinstance(x, np.ndarray) for x in (states, actions, rewards, next_states, dones)), \
            "All experience batches should be of type np.ndarray."
        assert states.shape == (self.batch_size, 84, 84, 4), \
            f"States shape should be: {(self.batch_size, 84, 84, 4)}"
        assert actions.shape == (self.batch_size,), f"Actions shape should be: {(self.batch_size,)}"
        assert rewards.shape == (self.batch_size,), f"Rewards shape should be: {(self.batch_size,)}"
        assert next_states.shape == (self.batch_size, 84, 84, 4), \
            f"Next states shape should be: {(self.batch_size, 84, 84, 4)}"
        assert dones.shape == (self.batch_size,), f"Dones shape should be: {(self.batch_size,)}"

        # Bootstrap from the target network; the all-ones mask exposes every action's Q-value.
        next_q_values = self.target_model.predict([next_states, self.action_mask_batch])

        # Target: reward + gamma * max_a' Q_target(s', a') for non-terminal transitions,
        # and just the reward for terminal ones ((1 - done) removes the bootstrap term).
        not_done = 1.0 - dones.astype(np.float32)
        q_values = rewards + not_done * self.gamma * np.max(next_q_values, axis=1)

        # One-hot encoding of the executed actions (selected action is 1, all others 0).
        one_hot_actions = np.zeros((self.batch_size, self.action_size), dtype=np.float32)
        one_hot_actions[np.arange(self.batch_size), actions] = 1.0

        # Target is non-zero only at the executed action.
        target_q_values = one_hot_actions * q_values[:, np.newaxis]

        # The one-hot encoding is also used as the network's action mask, so the
        # prediction is zeroed for every other action and only the executed
        # action contributes to the loss. (Fitting with an all-ones mask would
        # pull the Q-values of all other actions towards 0.)
        self.model.fit(
            x=[states, one_hot_actions],  # states and mask
            y=target_q_values,  # target Q values
            batch_size=self.batch_size,
            verbose=0
        )

    def act(self, state: LazyFrames) -> int:
        """Selects the action to be executed based on the given state.

        Implements epsilon greedy exploration strategy, i.e. with a probability of
        epsilon, a random action is selected.

        Args:
            state [LazyFrames]: LazyFrames object representing the state based on 4 stacked observations (images)

        Returns:
            action [int]
        """
        if np.random.rand() <= self.epsilon:
            # Explore: uniform random action, independent of any global environment object.
            return random.randrange(self.action_size)

        # Exploit: add a batch dimension, query the online network with the
        # all-ones mask and take the action with the highest Q-value.
        q_values = self.model.predict([np.array([state]), self.action_mask])
        return int(np.argmax(q_values))

    def train(self, experience: Tuple[LazyFrames, int, LazyFrames, float, bool]) -> None:
        """Stores the experience in memory and, once enough steps were played, trains by replay.

        After `start_replay_step` steps, each call
          - anneals epsilon linearly by `epsilon_decay` down to `epsilon_min`,
          - syncs the target network every `target_model_update_interval` steps,
          - runs a replay update every `train_freq` steps.

        Args:
            experience [tuple]: Tuple of state, action, next state, reward, done.

        Returns:
            None
        """
        self.memory.append(experience)

        if self.step >= self.start_replay_step:
            if self.epsilon > self.epsilon_min:
                # Linear annealing, clamped so epsilon never drops below its minimum.
                self.epsilon = max(self.epsilon_min, self.epsilon - self.epsilon_decay)

            if self.step % self.target_model_update_interval == 0:
                self.target_model.set_weights(self.model.get_weights())

            if self.step % self.train_freq == 0:
                self._replay()

        self.step += 1

In [None]:
def interact_with_environment(env, agent, n_episodes=600, max_steps=1000000, train=True, verbose=True):
    """Lets the agent play episodes in the environment and records per-episode statistics.

    Args:
        env: Environment providing `reset()` and `step(action)`.
        agent: Agent providing `act(state)`, `train(experience)` and an `epsilon` attribute.
        n_episodes: Maximum number of episodes to play.
        max_steps: No further episode is started once the accumulated number of steps reaches this value.
        train: If True, every transition is passed to `agent.train`.
        verbose: If True, a progress line is printed every tenth episode.

    Returns:
        List with one dict per completed episode (keys 'episode', 'score', 'steps').
        A KeyboardInterrupt stops the loop early; the statistics gathered so far are returned.
    """
    history = []
    logger = TensorBoardLogger(f'./logs/run-{datetime.now().strftime("%Y-%m-%d_%H:%M:%S")}')
    steps_so_far = 0

    with suppress(KeyboardInterrupt):
        for episode in range(n_episodes):
            state = env.reset()
            started_at = time.time()
            score = 0
            steps = 0
            finished = False

            while not finished:
                action = agent.act(state)
                next_state, reward, finished, _ = env.step(action)

                if train:
                    agent.train((state, action, next_state, reward, finished))

                if episode == 0:
                    # Debugging aid: every state of the very first episode is logged as images.
                    image_tag = f'state_t{steps}:'
                    for frame in state:
                        logger.log_image(image_tag, tf_summary_image(np.array(frame, copy=False)),
                                         global_step=steps_so_far)

                state = next_state
                score += reward
                steps += 1

            steps_so_far += steps

            # Scalars are logged (and optionally printed) for every tenth episode only.
            if not episode % 10:
                speed = steps / (time.time() - started_at)
                logger.log_scalar('score', score, global_step=steps_so_far)
                logger.log_scalar('epsilon', agent.epsilon, global_step=steps_so_far)
                logger.log_scalar('speed', speed, global_step=steps_so_far)
                if verbose:
                    print(f'episode: {episode}/{n_episodes}, score: {score}, steps: {steps}, '
                          f'total steps: {steps_so_far}, e: {agent.epsilon:.3f}, speed: {speed:.2f} steps/s')

            history.append({'episode': episode, 'score': score, 'steps': steps})

            # The step budget is only checked between episodes.
            if steps_so_far >= max_steps:
                break

    return history

In [None]:
# Dimensions read from the environment
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

# Hyperparams (should be sufficient)
gamma = 0.99
alpha = 0.0001
epsilon, epsilon_min = 1.0, 0.01
annealing_steps = 20000  # not episodes!
# Decay parameter derived from the epsilon range and the annealing horizon.
epsilon_decay = (epsilon - epsilon_min) / annealing_steps
batch_size = 64
memory_size = 10000
start_replay_step = 10000
train_freq = 4
target_model_update_interval = 1000

# Collect the constructor arguments in one place and build the agent from them.
agent_kwargs = dict(
    action_size=action_size,
    state_size=state_size,
    gamma=gamma,
    epsilon=epsilon,
    epsilon_decay=epsilon_decay,
    epsilon_min=epsilon_min,
    alpha=alpha,
    batch_size=batch_size,
    memory_size=memory_size,
    start_replay_step=start_replay_step,
    target_model_update_interval=target_model_update_interval,
    train_freq=train_freq,
)
agent = DQNAgent(**agent_kwargs)

# Train, then release the environment and plot the collected statistics.
statistics = interact_with_environment(env, agent, n_episodes=20000, verbose=True)
env.close()
plot_statistics(statistics)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
episode: 0/50000, score: 1.0, steps: 51, total steps: 51, e: 1.000, speed: 250.14 steps/s
episode: 10/50000, score: 1.0, steps: 69, total steps: 468, e: 1.000, speed: 786.53 steps/s
episode: 20/50000, score: 0.0, steps: 23, total steps: 840, e: 1.000, speed: 792.26 steps/s
episode: 30/50000, score: 0.0, steps: 23, total steps: 1141, e: 1.000, speed: 764.53 steps/s
episode: 40/50000, score: 2.0, steps: 98, total steps: 1543, e: 1.000, speed: 767.90 steps/s
episode: 50/50000, score: 0.0, steps: 23, total steps: 1914, e: 1.000, speed: 442.32 steps/s
episode: 60/50000, score: 0.0, steps: 23, total steps: 2338, e: 1.000, speed: 789.18 steps/s
episode: 70/50000, score: 0.0, steps: 23, total steps: 2755, e: 1.000, speed: 788.40 steps/s
episode: 80/50000, score: 1.0, steps: 51, total steps: 3108, e: 1.000, speed: 757.87 s

In [None]:
# Play three demo runs of at most 200 steps each and show the rendered frames
# inline by redrawing one image artist.
for demo_run in range(3):
    state = env.reset()
    canvas = plt.imshow(env.render(mode='rgb_array'))
    for frame_idx in range(200):
        chosen_action = agent.act(state)
        canvas.set_data(env.render(mode='rgb_array'))
        plt.axis('off')
        # Show the current figure, then mark the output for replacement by the next frame.
        display.display(plt.gcf())
        display.clear_output(wait=True)
        state, step_reward, run_finished, _ = env.step(chosen_action)
        if run_finished:
            break

env.close()