In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import PrometheusMetricsCSV as prom
from tensorflow.keras import initializers
import pandas as pd
import subprocess
import time
import import_ipynb
#import SwarmEnvironment as envi
import Environment as envi
import HelpFunctions as hf
import matplotlib
import pickle

importing Jupyter notebook from Environment.ipynb
importing Jupyter notebook from HelpFunctions.ipynb


In [2]:
class D3QN(keras.Model):
    """Dueling Q-network.

    The input is layer-normalised, passed through three ReLU dense layers
    (64 -> 128 -> 256 units) and then split into two linear heads:

    * ``Value``     -- one unit, the state value V(s)
    * ``Advantage`` -- three units, one advantage A(s, a) per action

    ``call`` combines them as ``Q = V + (A - mean_a A)``.
    """

    def __init__(self):
        super(D3QN, self).__init__()
        # Attribute names and creation order are kept unchanged so that weight
        # files written by ``save_weights`` keep the same layer ordering.
        self.norm = layers.LayerNormalization(axis=1)
        self.layer1 = layers.Dense(64, activation="relu",kernel_initializer='glorot_uniform')
        self.layer2 = layers.Dense(128, activation="relu",kernel_initializer='glorot_uniform')
        self.layer3 = layers.Dense(256, activation="relu",kernel_initializer='glorot_uniform')

        self.Value =  layers.Dense(1, activation="linear")
        self.Advantage = keras.layers.Dense(3, activation="linear")

    def _features(self, state):
        """Run the shared trunk (normalisation + three dense layers) on ``state``."""
        x = self.norm(state)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        return x

    def call(self, state):
        """Return the Q-values for a batch of states (one column per action)."""
        x = self._features(state)

        V = self.Value(x)
        A = self.Advantage(x)

        # Subtracting the mean advantage keeps the V/A decomposition identifiable.
        Q = (V + (A - tf.math.reduce_mean(A, axis=1, keepdims=True)))

        return Q

    def advantage(self, state):
        """Return only the advantage head's output for a batch of states.

        Uses the same normalised trunk as ``call``. The previous version
        computed ``self.norm(state)`` but then fed the raw ``state`` into
        ``layer1``, so this method saw un-normalised inputs while the network
        was trained on normalised ones.
        """
        return self.Advantage(self._features(state))

In [3]:
class ReplayBuffer():
    """Bounded first-in-first-out store of transitions.

    Transitions are kept in five parallel lists; position ``i`` in every list
    belongs to the same transition.
    """

    def __init__(self,max_memory_length):
        # Maximum number of transitions retained before the oldest is dropped.
        self.max_memory_length = max_memory_length

        self.state_history = []
        self.action_history = []
        self.rewards_history = []
        self.state_next_history = []
        self.done_history = []

    def store_transition(self, state, action, reward, next_state, done):
        """Append one transition and drop the oldest one if the buffer overflows."""
        columns = (
            (self.state_history, state),
            (self.action_history, action),
            (self.rewards_history, reward),
            (self.state_next_history, next_state),
            (self.done_history, done),
        )
        for history, item in columns:
            history.append(item)

        # At most one entry is evicted per call, matching one entry added per call.
        if len(self.rewards_history) > self.max_memory_length:
            for history, _ in columns:
                history.pop(0)

    def sample_buffer(self, batch_size):
        """Draw ``batch_size`` stored transitions uniformly at random (with replacement).

        Returns ``(states, actions, rewards, next_states, dones)`` where states
        and next_states are NumPy arrays, actions and rewards are plain lists
        and dones is a float tensor.
        """
        picks = np.random.choice(range(len(self.done_history)), size=batch_size)

        def gather(history):
            return [history[i] for i in picks]

        states = np.array(gather(self.state_history))
        actions = gather(self.action_history)
        rewards = gather(self.rewards_history)
        next_states = np.array(gather(self.state_next_history))
        dones = tf.convert_to_tensor([float(flag) for flag in gather(self.done_history)])

        return states, actions, rewards, next_states, dones

In [4]:
class Agent():
    """Epsilon-greedy agent trained with a double-DQN update.

    Holds an online ``D3QN`` network, a target ``D3QN`` network, a
    ``ReplayBuffer`` and an RMSprop optimizer with a Huber loss.

    Parameters
    ----------
    lr : learning rate passed to RMSprop.
    gamma : discount factor applied to the bootstrapped next-state value.
    epsilon : initial exploration probability.
    batch_size : number of transitions sampled per ``learn`` call.
    epsilon_min, epsilon_max : bounds used to compute the linear epsilon decay.
    epsilon_random_steps : during episodes 0 and 1, steps up to this count
        always take a random action.
    Number_Of_Episodes, max_steps_per_episode : training length; epsilon
        decays from max to min over half of their product.
    update_target_network : the target network is synced whenever
        ``step_count`` is a multiple of this value.
    mem_size : replay buffer capacity.
    num_actions : size of the action space. NOTE(review): ``D3QN`` hard-codes
        a 3-unit advantage head, so values other than 3 will not match it.
    """

    def __init__(self, lr, gamma, epsilon, batch_size,
                 epsilon_min,epsilon_max, epsilon_random_steps,Number_Of_Episodes,max_steps_per_episode,
                 update_target_network,mem_size=42,num_actions=3):
        self.lr = lr
        self.gamma = gamma
        self.epsilon = epsilon
        self.batch_size = batch_size
        self.epsilon_min = epsilon_min
        self.epsilon_max = epsilon_max
        self.epsilon_random_steps = epsilon_random_steps
        self.update_target_network = update_target_network
        self.epsilon_interval = (self.epsilon_max - self.epsilon_min)
        self.Number_Of_Episodes = Number_Of_Episodes
        self.max_steps_per_episode = max_steps_per_episode
        self.mem_size=mem_size
        self.num_actions=num_actions

        self.memory = ReplayBuffer(self.mem_size)

        self.model = D3QN()

        self.model_target = D3QN()

        self.optimizer = keras.optimizers.RMSprop(learning_rate=self.lr,clipnorm=1.0)
        self.loss_function = keras.losses.Huber()

    def store_transition(self, state, action, reward, new_state, done):
        """Forward one transition to the replay buffer."""
        self.memory.store_transition(state, action, reward, new_state, done)

    def choose_action(self, step_count, episode, state):
        """Pick an action epsilon-greedily and advance the epsilon schedule.

        ``state`` must carry a leading batch dimension (the training loop
        passes ``tf.expand_dims(state, 0)``). Returns the chosen action index.
        """
        last_episode = (episode == self.Number_Of_Episodes - 1)
        # The final episode is meant to be purely greedy. Zero epsilon *before*
        # selecting, otherwise the first action of that episode could still be
        # random (epsilon used to be zeroed only after the choice was made).
        if last_episode:
            self.epsilon = 0

        forced_exploration = step_count <= self.epsilon_random_steps and episode in (0, 1)
        if forced_exploration or self.epsilon > np.random.rand(1)[0]:
            print(f"Taking random action:{step_count}!")
            action = np.random.choice(self.num_actions)
        else:
            print(f"Action based on the model:{step_count}!")
            # The greedy action is taken from the advantage head only; Q differs
            # from A by a per-state offset, so the argmax is the same.
            action_probs = self.model.advantage(state)
            print(action_probs)
            action = tf.argmax(action_probs[0]).numpy()

        # Linear decay: epsilon goes from max to min over half of all training steps.
        self.epsilon -= self.epsilon_interval / (self.Number_Of_Episodes*self.max_steps_per_episode/2)
        self.epsilon = max(self.epsilon,self.epsilon_min)
        if last_episode:
            # max() above would lift epsilon back to epsilon_min; keep it at zero.
            self.epsilon=0

        return action

    def learn(self,step_count):
        """Run one double-DQN gradient step on a batch sampled from the replay buffer.

        Does nothing until the buffer holds at least ``batch_size`` transitions.
        """
        if len(self.memory.done_history) < self.batch_size:
            return

        state_sample, action_sample, rewards_sample, state_next_sample, done_sample = self.memory.sample_buffer(self.batch_size)

        # Subclassed Keras models create their variables on first call, and
        # ``advantage()`` bypasses ``Model.__call__`` (it never touches the Value
        # head). Build both networks fully before syncing so that
        # get_weights()/set_weights() always see matching variable lists.
        if not self.model.built or not self.model_target.built:
            self.model(state_sample)
            self.model_target(state_sample)

        if  step_count % self.update_target_network == 0:
             self.model_target.set_weights(self.model.get_weights())

        # Explicit float32 tensors: the buffer returns rewards as a plain list,
        # which would become an int tensor (and fail to combine with float
        # Q-values) if the environment ever emitted integer rewards.
        rewards = tf.convert_to_tensor(rewards_sample, dtype=tf.float32)
        done_sample = tf.cast(done_sample, tf.float32)

        #DDQN: the online network selects the next action, the target network scores it.
        # Direct call instead of ``predict``: ``predict`` is designed for large
        # batched inference and adds per-call overhead inside a training loop.
        actions=tf.argmax(self.model(state_next_sample),axis=1)
        target_r=self.model_target(state_next_sample)
        mask_target = tf.one_hot(actions,self.num_actions)
        q_values_next_state=tf.reduce_sum(tf.multiply(target_r,mask_target),axis=1)
        # For terminal transitions (done == 1) the target is the reward alone;
        # algebraically identical to (r + g*q)*(1 - d) + d*r.
        updated_q_values = rewards + self.gamma * (1.0 - done_sample) * q_values_next_state

        masks = tf.one_hot(action_sample, self.num_actions)
        with tf.GradientTape() as tape:
            q_values = self.model(state_sample)
            # Keep only the Q-value of the action that was actually taken.
            q_action = tf.reduce_sum(tf.multiply(q_values, masks), axis=1)
            loss = self.loss_function(updated_q_values, q_action)
        grads = tape.gradient(loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.model.trainable_variables))

In [None]:
if __name__ == '__main__':
    # Training driver: runs `Episodes` episodes of `max_steps_per_episode`
    # steps against the environment, training the D3QN agent after every step
    # and saving per-episode metrics, model weights and the reward history.
    import os  # local import: only used to create the rewards output directory

    # Timing constants (seconds).
    ENV_WARMUP_SECONDS = 60    # pause after creating an Environment
    RESET_PAUSE_SECONDS = 5    # pause between env.reset() and re-creation
    STEP_PERIOD_SECONDS = 40   # each step is padded with sleep up to this duration
    REWARDS_PATH = "DQN/D3QNrewards.txt"

    # for plotting
    delay_consumer = []
    delay_producer = []
    number_of_brokers = []
    time_list = []
    episode_reward_history = []

    Episodes = 10
    max_steps_per_episode = 42
    env = envi.Environment(4)
    time.sleep(ENV_WARMUP_SECONDS)
    # max_steps_per_episode is passed as the variable (it used to be a second
    # literal 42) so the agent's epsilon schedule cannot drift from the loop length.
    agent = Agent(lr=0.00025,gamma=0.99,epsilon=1.0,epsilon_min=0.1,epsilon_max=1.0,batch_size=10,
                  epsilon_random_steps=10,Number_Of_Episodes=Episodes,
                  max_steps_per_episode=max_steps_per_episode,
                  update_target_network=5)

    for episode in range(0,Episodes):
        if episode != 0:
            # Every episode after the first starts from a freshly created environment.
            print("RESETING THE ENVIRONMENT!")
            env.reset()
            time.sleep(RESET_PAUSE_SECONDS)
            env = envi.Environment(4)
            time.sleep(ENV_WARMUP_SECONDS)
            print("ALL GOOD!")

        episode_reward = 0
        step_count = 0
        state = env.state()

        for timestep in range(0, max_steps_per_episode):
            print("------------------------------------")
            start_time = time.time()

            step_count +=1
            state_tensor = tf.expand_dims(state, 0)
            action = agent.choose_action(step_count,episode,state_tensor)
            # NOTE(review): the meaning of the second argument to env.step is
            # defined in Environment.ipynb and is not visible here.
            state_next, reward = env.step(action,2)
            episode_reward += reward

            #plotting
            delay_producer.append(env.producerLatency)
            delay_consumer.append(env.consumerLatency)
            number_of_brokers.append(env.activeBrokers)
            time_list.append(timestep)

            # The last step of the episode is flagged terminal for the replay buffer.
            if step_count == max_steps_per_episode:
                done = 1
            else:
                done = 0

            agent.store_transition(state,action,reward,state_next,done)

            state = state_next

            agent.learn(step_count)

            elapsed = time.time()-start_time
            print(f"time:{elapsed},reward:{episode_reward}")
            # Pad the step with sleep so that it lasts about STEP_PERIOD_SECONDS in total.
            a = int(round(elapsed,2))
            if(STEP_PERIOD_SECONDS-a)>0:
                print(f"sleeping for:{(STEP_PERIOD_SECONDS-a)}")
                time.sleep(STEP_PERIOD_SECONDS-a)
            if done == 1:
                # Persist this episode's metrics, then reuse the lists for the next one.
                hf.saveDataDQN(episode, time_list, delay_producer, delay_consumer, number_of_brokers)
                time_list.clear()
                delay_producer.clear()
                delay_consumer.clear()
                number_of_brokers.clear()
                break

        episode_reward_history.append(episode_reward)
        # Checkpoint both networks after every episode.
        agent.model.save_weights("D3QNmodel.h5")
        agent.model_target.save_weights("targetD3QNmodel.h5")
    # Make sure the target directory exists so a multi-hour run is not lost to
    # a FileNotFoundError at the very end.
    os.makedirs(os.path.dirname(REWARDS_PATH), exist_ok=True)
    with open(REWARDS_PATH,"wb") as fp:
        pickle.dump(episode_reward_history,fp)
    print('The training is done for D3QN!')
    env.reset()

------------------------------------
Taking random action:1!
We are scaling up!
im here:414.6633333333333,445.5633333333332,186.74222222222215
time:39.823575496673584,reward:-0.635
sleeping for:1
------------------------------------
Taking random action:2!
we are scaling down!
im here:671.3199999999998,667.4466666666666,346.2555555555555
time:32.82739973068237,reward:-1.335
sleeping for:8
------------------------------------
Taking random action:3!
We are scaling up!
im here:562.8599999999999,636.9766666666667,299.94555555555553
time:34.864434003829956,reward:-2.11
sleeping for:6
------------------------------------
Taking random action:4!
we are scaling down!
im here:693.1266666666666,687.02,360.0488888888889
time:34.29184365272522,reward:-2.8099999999999996
sleeping for:6
------------------------------------
Taking random action:5!
No scaling action was taken!
im here:723.4799999999998,722.3266666666667,381.9355555555555
time:20.07270860671997,reward:-3.51
sleeping for:20
-----------

In [None]:
# Stand-alone cleanup cell: calls env.reset() on the environment left over from
# the training cell (presumably for use after an interrupted run -- confirm).
env.reset()