Imports 

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import PrometheusMetricsCSV as prom
import pandas as pd
import subprocess
import time
import import_ipynb
# import SwarmEnvironment as envi
import Environment as envi
import HelpFunctions as hf
import matplotlib
import pickle

Setup

In [None]:
# --- Learning hyperparameters ---
# Discount factor applied to the target network's next-state value estimate.
gamma = 0.99
# Number of transitions sampled from the replay buffer per gradient step.
batch_size = 10
# Steps per episode; the training loop paces each step to 40 seconds.
max_steps_per_episode = 42 # scaling action every 40s

# --- Epsilon-greedy exploration schedule ---
# epsilon starts at 1.0 and is decayed by the training loop, never dropping
# below epsilon_min; epsilon_interval is the total span of that decay.
epsilon_min = 0.1
epsilon_max = 1.0
epsilon_interval = epsilon_max - epsilon_min
epsilon = 1.0

# --- Environment ---
# NOTE(review): the meaning of the argument 4 is defined by
# Environment.Environment and is not visible here -- confirm.
env = envi.Environment(4)
# Presumably gives the freshly created environment time to come up before
# the first state is read -- TODO confirm the required warm-up time.
time.sleep(60)

Implement the Deep Q-Network

In [None]:
# Size of the discrete action space: the Q-network emits one value per action
# and random exploration draws from range(num_actions).
# NOTE(review): the meaning of each action index is defined by env.step -- confirm.
num_actions = 3


def create_q_model(state_dim=37, hidden_units=(64, 128, 256), n_actions=None):
    """Build the fully connected Q-network.

    The network layer-normalizes the state vector, passes it through a stack
    of ReLU dense layers and ends in a linear layer with one output per
    action. Called with no arguments it builds exactly the original
    37 -> 64 -> 128 -> 256 -> num_actions architecture.

    Args:
        state_dim: Length of the state vector fed to the network.
        hidden_units: Width of each hidden dense layer, in order.
        n_actions: Number of outputs; defaults to the module-level
            ``num_actions`` (looked up at call time).

    Returns:
        An uncompiled ``keras.Model`` mapping a batch of states to a batch of
        per-action Q-values.
    """
    if n_actions is None:
        n_actions = num_actions

    inputs = layers.Input(shape=(state_dim,))
    # axis=1 is the feature axis of the (batch, state_dim) input.
    hidden = layers.LayerNormalization(axis=1)(inputs)
    for units in hidden_units:
        hidden = layers.Dense(
            units, activation="relu", kernel_initializer='glorot_uniform'
        )(hidden)

    # Linear head: raw Q-value estimates, one per action.
    action = layers.Dense(
        n_actions, activation="linear", kernel_initializer='glorot_uniform'
    )(hidden)

    return keras.Model(inputs=inputs, outputs=action)


# `model` is the online network that is trained and used to pick actions;
# `model_target` is a second, independently initialised copy that supplies
# the bootstrap targets and is periodically overwritten with `model`'s weights.
model, model_target = create_q_model(), create_q_model()

# Uncomment to resume from previously saved weights.
# NOTE(review): the training loop saves to "DQNmodel.h5"/"targetDQNmodel.h5",
# not to the file names below -- adjust before enabling.
# model.load_weights("model.h5")
# model_target.load_weights("target_model.h5")

In [None]:
# Per-step measurements collected for plotting. The training loop appends to
# these every timestep and clears them after saving at the end of an episode.
delay_consumer, delay_producer, number_of_brokers, time_list = [], [], [], []

In [None]:
# Optimizer for the online network; gradients are clipped to unit norm.
optimizer = keras.optimizers.RMSprop(learning_rate=2.5e-4, clipnorm=1.0)
# Huber loss is used for stability.
loss_function = keras.losses.Huber()

# Experience replay buffers: parallel lists, one entry per stored transition.
state_history = []
action_history = []
rewards_history = []
state_next_history = []
done_history = []
# Total reward of every finished episode.
episode_reward_history = []

Number_Of_Episodes = 10

# During the first two episodes, steps below this count always act randomly
# (previously 10).
epsilon_random_steps = 10
# Replay buffer capacity: the same as the number of timesteps per episode.
max_memory_length = 42

# Run a training update after every action.
update_after_actions = 1
# Copy the online weights into the target network every this many steps.
update_target_network = 5

# Main DQN training loop.
#
# For every episode the environment is (re)created, then for each timestep the
# agent picks an action epsilon-greedily, applies it, stores the transition in
# the replay buffers, performs one gradient step on a sampled minibatch and
# periodically syncs the target network. Each timestep is paced to 40 seconds.
# Weights are saved after every episode; episode rewards are pickled at the end.
for episode in range(0, Number_Of_Episodes):
    if episode != 0:
        # Tear down the previous environment and build a fresh one.
        print("RESETING THE ENVIRONMENT!")
        env.reset()
        time.sleep(5)
        env = envi.Environment(4)
        time.sleep(60)
        print("ALL GOOD!")

    # In the last 2 episodes only exploit what has been learned. epsilon is
    # zeroed *before* the first action of the episode so that no step of these
    # episodes can explore.
    exploit_only = episode >= Number_Of_Episodes - 2
    if exploit_only:
        epsilon = 0

    episode_reward = 0
    step_count = 0
    state = env.state()

    for timestep in range(0, max_steps_per_episode):
        print("------------------------------------")
        start_time = time.time()

        step_count += 1
        forced_random = step_count < epsilon_random_steps and (episode == 0 or episode == 1)
        if forced_random or epsilon > np.random.rand(1)[0]:
            # Take random action
            print(f"Taking random action:{step_count}!")
            action = np.random.choice(num_actions)
        else:
            # Greedy action: argmax of the online network's Q-values
            # for the current environment state.
            print(f"Action based on the model:{step_count}!")
            state_tensor = tf.expand_dims(state, 0)
            action_probs = model(state_tensor, training=False)
            action = tf.argmax(action_probs[0]).numpy()

        if not exploit_only:
            # Linearly decay the probability of taking a random action so it
            # reaches epsilon_min halfway through training.
            epsilon -= epsilon_interval / (Number_Of_Episodes * max_steps_per_episode / 2)
            epsilon = max(epsilon, epsilon_min)

        # Apply the chosen action in the environment; the second argument of
        # step selects the reward function.
        state_next, reward = env.step(action, 2)
        print(reward)
        episode_reward += reward

        # Record measurements for plotting.
        delay_producer.append(env.producerLatency)
        delay_consumer.append(env.consumerLatency)
        number_of_brokers.append(env.activeBrokers)
        time_list.append(timestep)

        # The episode ends only by reaching the step limit.
        done = 1 if step_count == max_steps_per_episode else 0

        # Save the transition in the replay buffers.
        action_history.append(action)
        state_history.append(state)
        state_next_history.append(state_next)
        rewards_history.append(reward)
        done_history.append(done)

        state = state_next

        # Train once the buffer holds more than batch_size transitions.
        if len(done_history) > batch_size:

            # Indices of the sampled transitions (drawn with replacement).
            indices = np.random.choice(range(len(done_history)), size=batch_size)

            state_sample = np.array([state_history[i] for i in indices])
            state_next_sample = np.array([state_next_history[i] for i in indices])
            # Explicit float32 tensors so the target arithmetic below does not
            # depend on implicit list-to-tensor conversion of the rewards.
            rewards_sample = tf.convert_to_tensor(
                [rewards_history[i] for i in indices], dtype=tf.float32
            )
            action_sample = [action_history[i] for i in indices]
            done_sample = tf.convert_to_tensor([float(done_history[i]) for i in indices])

            # Q-value estimates for the sampled next states from the target
            # network. The model is called directly rather than via predict():
            # for a single small batch this avoids predict()'s per-call
            # overhead and progress output.
            future_rewards = model_target(state_next_sample, training=False)

            # Bellman target: reward + gamma * max_a' Q_target(s', a')
            updated_q_values = rewards_sample + gamma * tf.reduce_max(future_rewards, axis=1)

            # For the final timestep of an episode the target is the reward alone.
            updated_q_values = updated_q_values * (1 - done_sample) + done_sample * rewards_sample

            # Mask so the loss is computed only on the Q-value of the action taken.
            masks = tf.one_hot(action_sample, num_actions)
            with tf.GradientTape() as tape:
                q_values = model(state_sample)
                q_action = tf.reduce_sum(tf.multiply(q_values, masks), axis=1)
                loss = loss_function(updated_q_values, q_action)

            grads = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))

        # Periodically copy the online weights into the target network.
        if step_count % update_target_network == 0:
            model_target.set_weights(model.get_weights())
            template = "running reward: {:.2f} at step {}"
            print(template.format(episode_reward, step_count))

        # Keep the replay buffers at most max_memory_length entries long by
        # dropping the oldest transition.
        if len(rewards_history) > max_memory_length:
            del rewards_history[:1]
            del state_history[:1]
            del state_next_history[:1]
            del action_history[:1]
            del done_history[:1]

        # Pace the loop: sleep for whatever is left of the 40-second step
        # (whole seconds, elapsed time truncated).
        print(f"time:{time.time()-start_time},reward:{episode_reward}")
        a = int(round(time.time() - start_time, 2))
        if (40 - a) > 0:
            print(f"sleeping for:{(40-a)}")
            time.sleep(40 - a)

        if done == 1:
            # Persist this episode's measurements and start the next episode
            # with empty plotting buffers.
            hf.saveDataDQN(episode, time_list, delay_producer, delay_consumer, number_of_brokers)
            time_list.clear()
            delay_producer.clear()
            delay_consumer.clear()
            number_of_brokers.clear()

            break

    episode_reward_history.append(episode_reward)
    # Checkpoint both networks after every episode.
    model.save_weights("DQNmodel.h5")
    model_target.save_weights("targetDQNmodel.h5")

# NOTE: the "DQN" directory must already exist, otherwise open() raises.
with open("DQN/DQNrewards.txt", "wb") as fp:
    pickle.dump(episode_reward_history, fp)
print('The training is done for DQN!')

In [None]:
# Reset the environment once training has finished.
# NOTE(review): presumably this tears down whatever the environment deployed;
# the exact effect is defined by Environment.reset -- confirm.
env.reset()