## Environment Setup

In [None]:
from datetime import datetime
import time
from matplotlib import pyplot as plt
import gym
import os
import random
import numpy as np
from collections import deque
from tensorflow.keras.layers import Input, Dense, BatchNormalization, Dropout  # , Reshape
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers.schedules import ExponentialDecay
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.callbacks import TensorBoard
from tensorflow import GradientTape, math, one_hot, square, summary, config
from numpy.random import seed
from tensorflow import random as tf_rand


class NeuralNetwork(object):
    """Fully connected Q-value approximator wrapped around a Keras model.

    The network maps a state vector to one linear output per action. It owns
    an Adam optimizer driven by a staircase exponential learning-rate decay,
    exposed as ``self.optimizer`` so callers can apply their own gradient
    steps, while ``self.model`` provides the forward pass.
    """

    def __init__(self, input_size, output_size, layers, init_learning_rate, decay_steps, decay_rate, weights=None):
        """Create the optimizer and the model, optionally loading weights.

        Args:
            input_size: Number of features in a single state vector.
            output_size: Number of outputs (one per action).
            layers: Sequence with the unit count of each hidden layer.
            init_learning_rate: Initial learning rate of the decay schedule.
            decay_steps: Number of optimizer steps between decays.
            decay_rate: Multiplicative factor applied at each decay.
            weights: Optional list of weight arrays, in the format accepted by
                ``Model.set_weights``, loaded into the freshly built model.
        """
        lr_schedule = ExponentialDecay(
            initial_learning_rate=init_learning_rate,
            decay_steps=decay_steps,
            decay_rate=decay_rate,
            staircase=True)
        self.optimizer = Adam(learning_rate=lr_schedule)
        lr_metric = self.get_lr_metric(self.optimizer)
        self.model = self.create_model(input_size, output_size, layers, lr_metric)
        if weights is not None:
            self.copy_weights(weights)

    # source: https://stackoverflow.com/questions/47490834/how-can-i-print-the-learning-rate-at-each-epoch-with-adam-optimizer-in-keras
    def get_lr_metric(self, optimizer):
        """Return a Keras-style metric function reporting the optimizer's current learning rate.

        The returned function ignores its ``y_true`` / ``y_pred`` arguments.
        """
        def lr(y_true, y_pred):
            # NOTE(review): relies on the optimizer's private `_decayed_lr` helper — confirm it
            # exists in the installed TensorFlow version.
            return optimizer._decayed_lr("float32")

        return lr

    def create_model(self, input_size, output_size, layers, lr_metric):
        """Build, compile and print the summary of the dense Q-network.

        Args:
            input_size: Number of features in a single state vector.
            output_size: Number of linear output units.
            layers: Sequence with the unit count of each ReLU hidden layer.
            lr_metric: Metric function registered next to ``'acc'``.

        Returns:
            The compiled Keras ``Model``.
        """
        state_input = Input(shape=(input_size,), dtype='float64', name="states input")

        x = state_input
        for units in layers:
            x = Dense(units, activation='relu')(x)

        # Linear head: one unbounded output per action.
        x = Dense(output_size, activation='linear')(x)

        model = Model(state_input, x)
        model.compile(loss='mse', optimizer=self.optimizer, metrics=['acc', lr_metric])

        model.summary()
        return model

    def predict(self, x):
        """Run a forward pass and return the model output for ``x``.

        Args:
            x: A single state or a batch of states, given as any array-like
                (ndarray, list, tuple, ...). It is converted to float64 and
                promoted to at least two dimensions before the model is called.

        Returns:
            Whatever ``self.model`` returns for the 2-D float64 input.
        """
        # np.asarray (instead of x.astype) also accepts plain Python sequences.
        states = np.atleast_2d(np.asarray(x, dtype='float64'))
        return self.model(states)

    def copy_weights(self, other_weights):
        """Overwrite this model's weights with ``other_weights`` via ``Model.set_weights``."""
        self.model.set_weights(other_weights)


def sample_batch(deque_to_sample_from, n):
    """Draw ``n`` distinct transitions uniformly at random from the replay buffer.

    The buffer is first copied into a list, then sampled without replacement.
    """
    population = [*deque_to_sample_from]
    return random.sample(population, k=n)


def sample_batch_stochastic_prioritized(deque_to_sample_from, n, alpha=0.95):
    """Sample ``n`` transitions with rank-based stochastic prioritization.

    Transitions are ranked by the magnitude of their stored ``"td_error"``
    (rank 1 = largest magnitude) and drawn, with replacement, with probability
    proportional to ``(1 / rank) ** alpha``.

    Args:
        deque_to_sample_from: Non-empty iterable of transition dicts, each
            holding a ``"td_error"`` entry that supports ``abs()`` and ordering.
        n: Number of transitions to draw.
        alpha: Prioritization exponent; 0 yields uniform sampling, larger
            values favour high-error transitions more strongly.

    Returns:
        A list of ``n`` transitions (duplicates are possible).
    """
    # The sampled population must be the ranked list itself so that the i-th
    # weight really belongs to the i-th ranked transition.
    ranked = sorted(deque_to_sample_from, key=lambda transition: abs(transition["td_error"]), reverse=True)
    # random.choices accepts relative weights, so no explicit normalisation is needed.
    weights = [(1 / rank) ** alpha for rank in range(1, len(ranked) + 1)]
    return random.choices(ranked, weights=weights, k=n)


def epsilon_decay(old_epsilon, final_epsilon, decay_rate):
    """Return the exploration rate after one multiplicative decay step.

    The decayed value ``decay_rate * old_epsilon`` is floored at
    ``final_epsilon``.
    """
    decayed = decay_rate * old_epsilon
    if decayed > final_epsilon:
        return decayed
    return final_epsilon


def sample_action(epsilon, actions_q_values):
    """Pick an action epsilon-greedily.

    With probability ``epsilon`` a uniformly random action index is returned
    (the number of actions is read from ``actions_q_values.shape[1]``);
    otherwise the index of the largest Q-value is returned.
    """
    explore = random.random() < epsilon
    if not explore:
        return np.argmax(actions_q_values)
    last_action_index = actions_q_values.shape[1] - 1
    return random.randint(0, last_action_index)


def _td_target(target_network, reward, next_state, done, discount_factor):
    """Return the one-step Q-learning target for a single transition.

    Terminal transitions use the bare reward; otherwise the discounted maximum
    target-network Q-value of the next state is added.
    """
    if done:
        return reward
    next_q_values = target_network.predict(next_state)[0]
    return reward + discount_factor * max(next_q_values)


def train_agent(q_hyper_params, nn_hyper_params, log_dir, avg_return_steps=100):
    """Train a DQN agent with prioritized replay on ``CartPole-v1``.

    Every environment step decays epsilon, acts epsilon-greedily, stores the
    transition together with its TD error and, once the replay buffer holds
    more than ``10 * batch_size`` transitions, performs one gradient step on a
    prioritized mini-batch. The target network is synchronised every ``C``
    steps. Training stops after ``M`` episodes or as soon as the average
    return of the last ``avg_return_steps`` episodes exceeds 475.

    Args:
        q_hyper_params: Dict with keys ``"N"`` (replay capacity), ``"M"``
            (episodes), ``"C"`` (target sync interval in steps),
            ``"epsilon_init"``, ``"epsilon_end"``, ``"epsilon_decay_rate"``
            and ``"discount_factor"``.
        nn_hyper_params: Dict with keys ``"layers"``, ``"initial_lr"``,
            ``"decay_steps"``, ``"decay_rate"`` and ``"batch_size"``.
        log_dir: Directory the TensorBoard summaries are written to.
        avg_return_steps: Number of most recent episodes averaged for the
            stopping criterion.

    Returns:
        Tuple ``(q_value_network, losses, episodes_total_rewards, train_time)``
        where ``losses`` holds one loss per gradient step and ``train_time``
        is the wall-clock training duration in seconds.
    """
    env = gym.make("CartPole-v1")
    start_time = time.time()

    summary_writer = summary.create_file_writer(log_dir)

    n_actions = env.action_space.n
    discount_factor = q_hyper_params["discount_factor"]
    batch_size = nn_hyper_params["batch_size"]

    experience_replay = deque(maxlen=q_hyper_params["N"])  # D
    q_value_network = NeuralNetwork(input_size=env.observation_space.shape[0], output_size=n_actions,
                                    layers=nn_hyper_params["layers"], init_learning_rate=nn_hyper_params["initial_lr"],
                                    decay_steps=nn_hyper_params["decay_steps"],
                                    decay_rate=nn_hyper_params["decay_rate"])
    # Start the target network as an exact copy of the online network so the
    # first bootstrapped targets are not produced by unrelated random weights.
    target_network = NeuralNetwork(input_size=env.observation_space.shape[0], output_size=n_actions,
                                   layers=nn_hyper_params["layers"],
                                   weights=q_value_network.model.get_weights(),
                                   init_learning_rate=nn_hyper_params["initial_lr"],
                                   decay_steps=nn_hyper_params["decay_steps"],
                                   decay_rate=nn_hyper_params["decay_rate"])
    episodes_total_rewards = []
    losses = []

    s_t = env.reset()

    passed_avg_return_of_475 = False
    training_started = False
    total_steps_counter = 0
    epsilon = q_hyper_params["epsilon_init"]
    loss = -1  # placeholder shown by the progress print before the first gradient step
    for episode_index in range(q_hyper_params["M"]):
        episode_total_rewards = 0
        episode_total_losses = []
        done = False
        # play an episode - single trajectory
        while not done and not passed_avg_return_of_475:
            epsilon = epsilon_decay(old_epsilon=epsilon, final_epsilon=q_hyper_params["epsilon_end"],
                                    decay_rate=q_hyper_params["epsilon_decay_rate"])

            # selecting action
            state_action_q_values = q_value_network.predict(s_t)
            action = sample_action(epsilon, state_action_q_values)

            # taking a single step
            s_t_1, reward, done, _ = env.step(action)
            total_steps_counter += 1
            episode_total_rewards += reward

            # TD error of the fresh transition; it is the priority used by the replay sampler
            td_target = _td_target(target_network, reward, s_t_1, done, discount_factor)
            td_error = td_target - math.reduce_sum(
                state_action_q_values * one_hot(action, n_actions),
                axis=1)

            # Storing transition
            experience_replay.append({"s_t": s_t, "action": action, "reward": reward, "s_t_1": s_t_1, "done": done,
                                      "td_error": td_error})

            s_t = s_t_1

            if len(experience_replay) > batch_size * 10:  # enough initial observations
                training_started = True
                transitions_batch = sample_batch_stochastic_prioritized(experience_replay, batch_size)

                states_batch = np.asarray([transition["s_t"] for transition in transitions_batch])
                actions_batch = np.asarray([transition["action"] for transition in transitions_batch])
                y = np.asarray([
                    _td_target(target_network, transition["reward"], transition["s_t_1"], transition["done"],
                               discount_factor)
                    for transition in transitions_batch])

                # gradient step
                with GradientTape() as tape:
                    # Q-value of the action that was actually taken in each sampled state
                    predicted_action_value_q = math.reduce_sum(
                        q_value_network.model(states_batch, training=True) * one_hot(actions_batch, n_actions),
                        axis=1)
                    # Compute the loss value for this minibatch.
                    loss = math.reduce_mean(square(y - predicted_action_value_q))

                variables = q_value_network.model.trainable_weights
                gradients = tape.gradient(loss, variables)
                q_value_network.optimizer.apply_gradients(zip(gradients, variables))

                losses.append(loss)
                episode_total_losses.append(loss)
                with summary_writer.as_default():
                    summary.scalar("steps' losses", loss, step=total_steps_counter)
            if done:
                s_t = env.reset()
                episodes_total_rewards.append(episode_total_rewards)
            if total_steps_counter % q_hyper_params["C"] == 0:
                target_network.copy_weights(q_value_network.model.get_weights())
            if total_steps_counter % 100 == 0:
                print(
                    "Step %d, Episode %d Training loss (for one batch): %.4f, Episode total rewards %.2f, Episode epsilon %.5f, Learning Rate %.5f" % (
                        total_steps_counter, episode_index + 1, float(loss), episode_total_rewards, epsilon,
                        q_value_network.optimizer._decayed_lr("float32")))

        recent_returns = episodes_total_rewards[-avg_return_steps:]
        last_episodes_avg_returns = sum(recent_returns) / len(recent_returns)
        if episode_index >= avg_return_steps and not passed_avg_return_of_475:
            if last_episodes_avg_returns > 475:
                passed_avg_return_of_475 = True
                print(
                    f"Average reward per episode for the last {avg_return_steps} episodes passed 475 after episode: {episode_index + 1}")
                break
        if training_started:
            recent_losses = losses[-1000:]
            last_steps_avg_losses = sum(recent_losses) / len(recent_losses)
            print(
                f"Episode {episode_index + 1}, episode reward: {episode_total_rewards},running avg reward (100 episodes): {last_episodes_avg_returns}")
            with summary_writer.as_default():
                summary.scalar('episode reward', episode_total_rewards, step=episode_index)
                summary.scalar('running avg reward (100 episodes)', last_episodes_avg_returns, step=episode_index)
                # This series is the running average of the losses, so it is tagged as such.
                summary.scalar('running avg loss (1000 steps)', last_steps_avg_losses, step=episode_index)
                summary.scalar("episode's steps average loss", sum(episode_total_losses) / len(episode_total_losses),
                               step=episode_index)

    env.close()

    end_time = time.time()
    train_time = end_time - start_time
    print(f"Training Time (in seconds): {train_time}")
    return q_value_network, losses, episodes_total_rewards, train_time


def test_agent(q_net, path):
    """Play one greedy, rendered CartPole-v1 episode recorded under ``path``.

    Args:
        q_net: Object whose ``predict(state)`` returns the Q-values of a state;
            the action with the largest Q-value is always taken.
        path: Directory handed to ``gym.wrappers.Monitor`` for the recording
            (existing content is overwritten because ``force=True``).
    """
    env = gym.make("CartPole-v1")
    env = gym.wrappers.Monitor(env, path, force=True)

    try:
        s_t = env.reset()
        done = False
        while not done:
            env.render()  # enable in test
            state_action_q_values = q_net.predict(s_t)
            action = np.argmax(state_action_q_values)
            s_t, _, done, _ = env.step(action)
    finally:
        # Always release the environment and the recorder, even if rendering or stepping fails.
        env.close()


def plot_line(x, y, x_label, y_label, data_label, title, color):
    """Show a single labelled line plot of ``y`` against ``x`` on a new 10x10 figure."""
    plt.figure(figsize=(10, 10))
    plt.plot(x, y, label=data_label, color=color)
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.legend(shadow=True, loc='best')
    plt.show()


def main():
    """Train the DQN agent, plot its learning curves and record ten test episodes.

    TensorBoard logs go to ``logs/dqn/<timestamp>`` and test recordings to
    ``videos/<timestamp>/<i>`` under the current working directory.
    """
    dqn_params = {
        "N": 1000,  # experience_replay deque size
        "M": 1000,  # number of episodes #50000
        "C": 25,  # steps update interval for the target network's weights
        "epsilon_init": 0.6,  # 0.6
        "epsilon_end": 0.0001,
        "epsilon_decay_rate": 0.999,
        "discount_factor": 0.99}
    nn_hyper_parameters = {
        "batch_size": 32,
        "initial_lr": 1e-2,
        "decay_steps": 150,
        "decay_rate": 0.95,
        "loss_fn": MeanSquaredError(),
        "layers": [6, 6, 6, 5, 4]}

    # One timestamp names both the log directory and the video directory of this run.
    start_time = datetime.now().strftime("%Y%m%d-%H%M%S")
    log_path = 'logs/dqn/' + start_time

    q_network, losses, episodes_total_rewards, training_time = train_agent(q_hyper_params=dqn_params,
                                                                           nn_hyper_params=nn_hyper_parameters,
                                                                           avg_return_steps=100, log_dir=log_path)

    # `losses` holds one value per gradient step, so the title refers to steps, matching the x axis.
    plot_line(range(len(losses)), losses, x_label="steps", y_label="MSE", data_label="losses",
              title="MSE for each training step",
              color="b")
    plot_line(range(len(episodes_total_rewards)), episodes_total_rewards, x_label="episodes", y_label="rewards sum",
              data_label="rewards per episode",
              title="total reward for each episode", color="c")
    for i in range(10):
        test_agent(q_network, path=os.path.join(os.getcwd(), f"videos/{start_time}/{i}"))


# Run the whole experiment (training, plots, recorded test episodes) when this cell executes.
main()


Model: "model_18"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 states input (InputLayer)   [(None, 4)]               0         
                                                                 
 dense_80 (Dense)            (None, 7)                 35        
                                                                 
 dense_81 (Dense)            (None, 7)                 56        
                                                                 
 dense_82 (Dense)            (None, 4)                 32        
                                                                 
 dense_83 (Dense)            (None, 2)                 10        
                                                                 
Total params: 133
Trainable params: 133
Non-trainable params: 0
_________________________________________________________________
Model: "model_19"
__________________________________________

In [None]:
# Load the TensorBoard notebook extension
# and point TensorBoard at the log directory of one training run.
# NOTE(review): the run directory below is hard-coded to a single timestamp — change it to the run to inspect.
%load_ext tensorboard
%tensorboard --logdir /content/logs/dqn/20211201-135918

<IPython.core.display.Javascript object>

In [None]:
from tensorboard import notebook

# Show the TensorBoard instance listening on port 6006 inside the notebook, 1000 pixels tall.
notebook.display(port=6006, height=1000) 

Selecting TensorBoard with logdir /content/logs/dqn/20211201-135918 (started 0:00:31 ago; port 6006, pid 175).


<IPython.core.display.Javascript object>