<a href="https://colab.research.google.com/github/vicentcamison/idal_ia3/blob/main/4%20Aprendizaje%20reforzado/Sesion%203/DQN_v2_adv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# DQN v2 advantage

In [1]:
import gym
import numpy as np
import os
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt
import requests
import random
from collections import deque
import datetime

In [2]:
# Download utils.py from the course GitHub repository into the working
# directory so that it can be imported further down.

url = 'https://raw.githubusercontent.com/vicentcamison/idal_ia3/main/4%20Aprendizaje%20reforzado/Sesion%203/utils.py'
r = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of writing the error page into utils.py.
r.raise_for_status()

with open('utils.py', 'w', encoding='utf-8') as f:
    f.write(r.text)

In [3]:
# Download dqn_approximators.py from the course GitHub repository into the
# working directory so that it can be imported further down.

url = 'https://raw.githubusercontent.com/vicentcamison/idal_ia3/main/4%20Aprendizaje%20reforzado/Sesion%203/dqn_approximators.py'
r = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of writing the error page into the module file.
r.raise_for_status()

with open('dqn_approximators.py', 'w', encoding='utf-8') as f:
    f.write(r.text)

In [6]:
from utils import discrete_input
from dqn_approximators import DqnAdv

ModuleNotFoundError: ignored

Disable GPU computation for local devices


In [4]:
# os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

## Gym selection and basic configurations

In [5]:
# Per-environment settings, selected with `env_i`:
#   name  - gym environment id passed to gym.make
#   goal  - running-reward threshold used as the stopping criterion
#   v_min / v_max - not read by the code visible in this notebook
#   ep    - the training status is printed every `ep` episodes
gym_name_list = [
    {
        'name': 'CartPole-v0',
        'goal': 180,
        'v_min': 0,
        'v_max': 210,
        'ep': 50
    },
    {
        'name': 'MountainCar-v0',
        'goal': -150,
        'v_min': -210,
        'v_max': 0,
        'ep': 20
    },
    {
        'name': 'Blackjack-v0',
        'goal': 0.10,
        'v_min': -20,
        'v_max': 20,
        'ep': 1000
    }
]

env_i = 0 #@param {type:"slider", min:0, max:2, step:1}

save_model: bool = False # @param {type:"boolean"}
show_plots: bool = True # @param {type:"boolean"}
render_env: bool = False # @param {type:"boolean"}
seed = 42 # @param {type:"integer"}

max_steps_per_episode = 200 # @param {type:"integer"}

stopping_reward_criteria = gym_name_list[env_i]['goal']

gym_name = gym_name_list[env_i]['name']

# Seed every random source used later in the notebook, not only the
# environment: `random` (replay sampling), NumPy (epsilon-greedy draw) and
# TensorFlow (weight initialisation).
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)

env = gym.make(gym_name)  # Create the environment
env.seed(seed)
# The action space keeps its own random generator for sample().
env.action_space.seed(seed)

if isinstance(env.observation_space, gym.spaces.tuple.Tuple):
    # Tuple observations are passed through discrete_input before reaching the
    # model. The space is bound as a default argument so the lambda does not
    # depend on the `env` name, which is rebound to the wrapper right here.
    tuple_space = env.observation_space
    num_inputs = sum(x.n for x in tuple_space)
    env = gym.wrappers.TransformObservation(
        env, lambda obs, space=tuple_space: discrete_input(obs, space))
else:
    num_inputs = env.observation_space.shape[0]  # e.g. 4 for CartPole
num_actions = env.action_space.n  # e.g. 2 for CartPole

## Algorithm hyper-parameters

In [None]:
# Smoothing factor of the exponential moving average (EMA) that tracks the
# episode rewards
ema_ratio = 0.01  # @param {type:"number"}

# Ratio between generating experiences and sampling for training: a training
# update runs on the episode time steps that are multiples of this value
training_ratio: int = 4 # @param  {type:"integer"}

# Size of the batch when sampling experiences
batch_size: int = 32 # @param {type:"integer"}

# Maximum number of experiences kept in the replay buffer
mem_length: int = 2048 # @param {type:"integer"}

# Discount factor for estimating the future rewards
gamma: float = 0.99  # @param {type:"number"}

# Initial and minimum probability of choosing exploration instead of exploitation
epsilon: float = 1.0 # @param {type:"number"}
epsilon_min: float = 0.05 # @param {type:"number"}

# Estimated number of environment steps, used to tune the epsilon decay
approx_iterations: float = 1e6 # @param {type:"number"}

# Multiplicative decay applied to epsilon at every step; chosen so that epsilon
# goes from its initial value to epsilon_min in approx_iterations steps
epsilon_decay: float = (epsilon_min / epsilon) ** (1 / approx_iterations)

# EMA factor that controls the soft update of the target network weights
tau: float = 0.125 # @param {type:"number"}

# Learning rate passed to the optimizer
# NOTE(review): 0.05 looks high for Adam and the recorded run stayed well below
# the goal - consider a smaller value; confirm experimentally.
learning_rate = 0.05 # @param {type:"number"}

# Enables double DQN learning when choosing the next Q-values
double_dqn_learning: bool = True # @param {type:"boolean"}

# Factor used to size the hidden layer heuristically.
hidden_size_factor = 16 # @param {type:"integer"}
num_hidden = num_inputs * num_actions * hidden_size_factor

## Load DQN models as Q-table approximators
We start with double DQN, where:
*   q_model estimates the Q-values used for action selection.
*   t_model is responsible for estimating the target Q-values during training.

In [None]:
# The online and target approximators are built from the same arguments;
# only the `name` argument differs.
approximator_kwargs = dict(
    num_inputs=num_inputs,
    num_actions=num_actions,
    num_hidden=num_hidden,
)
q_model = DqnAdv(name="q_model", **approximator_kwargs)
t_model = DqnAdv(name="t_model", **approximator_kwargs)

# Loss between the predicted and the target Q-values (Huber kept as an alternative).
# loss_function = keras.losses.Huber()
loss_function = keras.losses.MeanSquaredError()
optimizer = keras.optimizers.Adam(learning_rate=learning_rate)

## Tensorboard configuration

In [None]:
# Each run logs to logs/<environment>/<implementation>/T_<timestamp>.
implementation = "DQN_v2"
current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
run_folder = f"T_{current_time}"
train_log_dir = os.path.join("logs", gym_name, implementation, run_folder)
summary_writer = tf.summary.create_file_writer(train_log_dir)

In [None]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

# Optional helpers, left disabled:
# from tensorboard import notebook
# notebook.list() # View open TensorBoard instances

# # Control TensorBoard display. If no port is provided,
# # the most recently launched TensorBoard is used
# notebook.display(port=6006, height=1000)

# Start TensorBoard on the ./logs directory
%tensorboard --logdir ./logs

## Main learning loop
It consists of two main steps:

1.   Collect experiences from the environment in episodes, with exploitation/exploration trade-off
2.   Sample past experiences to train the model using the Bellman equation.

The loop also tracks the mean reward, checks the stop-learning criterion and displays the training status.


In [None]:
# Replay buffer and bookkeeping for the training run.
memory = deque(maxlen=mem_length)
running_reward = None  # EMA of the episode rewards; set after the first episode
episode_count = 0
epoch = 0  # step index used for the TensorBoard summaries
historic_reward = []
# Last training loss. Reset here so that the status print below never reports
# a stale value left in the kernel by a previous run of this cell.
loss = None
while True:  # Run until solved

    state = env.reset()
    state = tf.convert_to_tensor(state)
    state = tf.expand_dims(state, 0)
    episode_reward = 0
    # Note: range(1, max_steps_per_episode) performs at most
    # max_steps_per_episode - 1 steps per episode.
    for time_step in range(1, max_steps_per_episode):
        # env.render(); Adding this line would show the attempts
        # of the agent in a pop up window.

        # Epsilon-greedy action selection with a decaying epsilon
        epsilon *= epsilon_decay
        epsilon = max(epsilon_min, epsilon)
        if np.random.random() < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(q_model(state, training=False))

        # Apply the sampled action in our environment
        next_state, reward, done, _ = env.step(action)

        next_state = tf.convert_to_tensor(next_state)
        next_state = tf.expand_dims(next_state, 0)

        memory.append([state, action, reward, next_state, done])
        episode_reward += reward

        # ##### TRAIN MODEL ###############
        if len(memory) >= 2 * batch_size and time_step % training_ratio == 0:
            samples = random.sample(memory, batch_size)

            # States already carry a leading batch axis of size 1, so they are
            # concatenated; actions, rewards and done flags are scalars and are
            # packed into 1-D tensors with explicit dtypes.
            state_batch = tf.concat([s for s, a, r, n_s, d in samples], axis=0)
            action_batch = tf.constant([a for s, a, r, n_s, d in samples], dtype=tf.int32)
            reward_batch = tf.constant([r for s, a, r, n_s, d in samples], dtype=tf.float32)
            next_state_batch = tf.concat([n_s for s, a, r, n_s, d in samples], axis=0)
            not_done_batch = tf.constant([float(not d) for s, a, r, n_s, d in samples],
                                         dtype=tf.float32)

            # Create a mask so we only calculate loss on the updated Q-values
            masks = tf.one_hot(action_batch, num_actions)

            # Build the updated Q-values for the sampled future states
            # Use the target model for stability
            future_t = t_model(next_state_batch)

            if double_dqn_learning:
                # Double DQN: q_model picks the next action, t_model evaluates it
                future_q = q_model(next_state_batch)
                best_future_action = tf.argmax(future_q, axis=-1)
                next_action_mask = tf.one_hot(best_future_action, num_actions)
                future_q_action = tf.reduce_sum(tf.multiply(future_t, next_action_mask), axis=1)
            else:
                future_q_action = tf.reduce_max(future_t, axis=1)

            # Q value = reward + discount factor * expected future reward
            # (the future term is zeroed for transitions flagged as done)
            updated_q_values = reward_batch + gamma * tf.multiply(future_q_action, not_done_batch)

            with tf.GradientTape() as tape:
                q_values = q_model(state_batch)

                # Apply the masks to the Q-values to get the Q-value for action taken
                q_action = tf.reduce_sum(tf.multiply(q_values, masks), axis=1)
                # Keras losses take (y_true, y_pred)
                loss = loss_function(updated_q_values, q_action)

            # Backpropagation, done outside the tape context so that the tape
            # does not record the gradient computation itself
            grads = tape.gradient(loss, q_model.trainable_variables)
            optimizer.apply_gradients(zip(grads, q_model.trainable_variables))

            # Transfer weights to target model (soft update with factor tau)
            t_model.set_weights([
                q_w * tau + t_w * (1 - tau)
                for q_w, t_w in zip(q_model.get_weights(), t_model.get_weights())
            ])
            # ###############################################

            # Summaries are written only once the first episode has finished
            if running_reward is not None:
                with summary_writer.as_default():
                    tf.summary.scalar('loss', loss.numpy(), step=epoch)
                    tf.summary.scalar('ema_reward', running_reward, step=epoch)
                    tf.summary.scalar('epsilon', epsilon, step=epoch)
                epoch += 1
        state = next_state

        if done:
            break

    if running_reward is None:
        running_reward = episode_reward

    # Update running reward to check condition for solving
    running_reward = ema_ratio * episode_reward + (1 - ema_ratio) * running_reward
    historic_reward.append(running_reward)

    # Log details
    episode_count += 1
    if episode_count % gym_name_list[env_i]['ep'] == 0 and loss is not None:
        template = "running reward: {:.2f} at episode {} with epsilon {:.2f} and loss {:.2f}"
        print(template.format(running_reward, episode_count, epsilon, float(loss)))

    if running_reward > stopping_reward_criteria:  # Condition to consider the task solved
        print("Solved at episode {}!".format(episode_count))
        break

    if show_plots and episode_count % 10000000 == 0:
        plt.plot(historic_reward)
        plt.show()

running reward: 66.70 at episode 24850 with epsilon 0.05 and loss 205.92
running reward: 63.69 at episode 24900 with epsilon 0.05 and loss 3.44
running reward: 54.40 at episode 24950 with epsilon 0.05 and loss 19.48


KeyboardInterrupt: ignored

## Save model weights for a later use (optional).

In [None]:
if save_model:
    # With save_format="tf" the filepath is a checkpoint prefix, not a
    # directory: the weight files are written next to it, inside Ep_<episode count>.
    model_folder = os.path.join("./models", gym_name, implementation, "Ep_" + str(episode_count).zfill(5), "model")
    # Create only the parent directory; exist_ok avoids the check-then-create race.
    os.makedirs(os.path.dirname(model_folder), exist_ok=True)
    q_model.save_weights(filepath=model_folder, save_format="tf")

## Play with the trained agent

In [None]:
# Run the greedy policy of q_model for `episodes` episodes and report the
# distribution of the episode rewards.
episodes = 100 # @param {type:"integer"}
agent_rewards = []
# The loop variable must not be called `env_i`: that name is the environment
# selector of the configuration cell and is read by the training loop.
for _ in range(episodes):
    state = env.reset()
    episode_reward = 0

    for time_step in range(1, max_steps_per_episode):
        if render_env and gym_name != 'Blackjack-v0':
            env.render()  # Show the attempts of the agent in a pop up window.

        state = tf.convert_to_tensor(state)
        state = tf.expand_dims(state, 0)

        # Always exploit: pick the action with the highest predicted Q-value
        action = np.argmax(q_model(state, training=False))

        # Apply the selected action in our environment
        state, reward, done, _ = env.step(action)
        episode_reward += reward

        if done:
            break
    agent_rewards.append(episode_reward)

print(f"After {episodes} episodes the mean reward is {np.mean(agent_rewards)}")

if show_plots:
    num_bins = 50
    x = np.array(agent_rewards)
    fig, ax = plt.subplots()

    # the histogram of the data
    n, bins, patches = ax.hist(x, num_bins, density=True)

    ax.set_xlabel('Episode rewards')
    ax.set_ylabel('Probability density')
    ax.set_title(f'Mean {np.mean(x).round(2)} +/- {np.std(x).round(2)}')

    # Tweak spacing to prevent clipping of ylabel
    fig.tight_layout()
    plt.show()

print("End of script!")