Q-Agent

In [None]:
class QAgent:
    """Tabular Q-learning agent that picks one scaling action per service.

    Each service is controlled independently.  Its state is the tuple
    ``(resources, cpu_level, service_idx)`` and its action is one of
    ``0`` (scale down), ``1`` (no change) or ``2`` (scale up).  Q-values are
    stored in ``q_table`` under the key ``(state_tuple, action)``.

    The agent is heavily rule-guided: per-service CPU thresholds decide the
    action outright in the clear-cut bands, bias exploration in every band,
    and provide the prior Q-value of every state.

    The observation passed to :meth:`select_action` and
    :meth:`update_q_table` is read as ``num_services`` resource counts
    followed by ``num_services`` CPU utilisations.  CPU values are multiplied
    by 100 before use, so they are presumably normalised to ``[0, 1]``
    (confirm against the environment).
    """

    # Observation assumed for a service whose metrics cannot be read.
    _DEFAULT_RESOURCES = 2
    _DEFAULT_CPU = 50

    _ACTIONS = (0, 1, 2)
    _DISCOUNT = 0.9
    _TD_CLIP = 5.0

    # Prior Q-values, indexed by action, for each CPU/resource band.
    _PRIOR_OVERLOADED = (-20.0, 0.0, 20.0)     # cpu >= high
    _PRIOR_IDLE = (10.0, 0.0, -10.0)           # cpu <= low, resources > 1
    _PRIOR_IDLE_MINIMAL = (-15.0, 10.0, -5.0)  # cpu <= low, resources <= 1
    _PRIOR_OPTIMAL = (-5.0, 10.0, -5.0)        # opt_low <= cpu <= opt_high
    _PRIOR_WARM = (-10.0, 5.0, 8.0)            # opt_high < cpu < high
    _PRIOR_COOL = (8.0, 5.0, -8.0)             # remaining band, resources > 1
    _PRIOR_COOL_MINIMAL = (-10.0, 10.0, -5.0)  # remaining band, resources <= 1

    def __init__(self, bins_per_dim=6, action_size_per_service=3, num_services=3):
        """Create the agent and seed its Q-table with rule-derived priors.

        Args:
            bins_per_dim: Stored and persisted with the model; not otherwise
                used by this class.
            action_size_per_service: Stored and persisted with the model.
                Action selection itself always works with actions 0, 1, 2.
            num_services: Number of services controlled.  Thresholds are
                defined for service indices 0..2 only.
        """
        self.bins_per_dim = bins_per_dim
        self.action_size_per_service = action_size_per_service
        self.num_services = num_services

        self.q_table = defaultdict(float)

        # Kept for compatibility; no method of this class reads them.
        self.eligibility_traces = defaultdict(float)
        self.lambda_param = 0.7

        # Exploration schedule (see update_epsilon).
        self.epsilon = 0.3
        self.epsilon_min = 0.05
        self.epsilon_decay = 0.99
        self.steps_done = 0

        # Bounds for get_adaptive_learning_rate.
        self.initial_lr = 0.08
        self.min_lr = 0.01

        # Bookkeeping / diagnostics.
        self.reward_history = []
        self.epsilon_history = []
        self.loss_history = []
        self.service_states = {i: {} for i in range(num_services)}
        self.action_counts = {i: {0: 0, 1: 0, 2: 0} for i in range(num_services)}
        self.high_cpu_actions = {i: {0: 0, 1: 0, 2: 0} for i in range(num_services)}
        self.service_performance = {i: {'rewards': [], 'actions': []} for i in range(num_services)}

        # CPU percentage bands per service.
        self.service_thresholds = {
            0: {'low': 15, 'high': 85, 'opt_low': 30, 'opt_high': 70},
            1: {'low': 18, 'high': 80, 'opt_low': 35, 'opt_high': 65},
            2: {'low': 15, 'high': 85, 'opt_low': 30, 'opt_high': 70},
        }

        self._initialize_q_table()

        # Kept for compatibility; no method of this class reads it.
        self.state_action_cache = {}

    def _initialize_q_table(self):
        """Pre-populate the Q-table on a coarse grid of states.

        The grid covers CPU levels 0, 5, ..., 100 and resources 1..10 for
        every service.  States off this grid receive the same prior lazily
        in :meth:`get_q_value`.
        """
        print("Initializing Q-table with guided values...")
        for cpu_level in range(0, 101, 5):
            for resources in range(1, 11):
                for service_idx in range(self.num_services):
                    state_key = (resources, cpu_level, service_idx)
                    for action in self._ACTIONS:
                        self.q_table[(state_key, action)] = self.get_initial_q_value(
                            state_key, service_idx, action
                        )

        print("Q-table initialization complete.")

    def get_adaptive_learning_rate(self):
        """Return the learning rate for the next update.

        The base rate decays by 1% for every 300 increments of
        ``steps_done`` (advanced by :meth:`update_epsilon`).  Once more than
        ten rewards are recorded, the rate is nudged up 5% when the mean of
        the last three exceeds the mean of the seven before them, and down 5%
        when it is lower.  The result is clipped to
        ``[min_lr, initial_lr]``.
        """
        lr = max(self.min_lr, self.initial_lr * (0.99 ** (self.steps_done // 300)))

        if len(self.reward_history) > 10:
            window = self.reward_history[-10:]
            newest_mean = np.mean(window[-3:])
            older_mean = np.mean(window[:7])
            if newest_mean > older_mean:
                lr *= 1.05  # Increase if improving
            elif newest_mean < older_mean:
                lr *= 0.95  # Decrease if getting worse

        return np.clip(lr, self.min_lr, self.initial_lr)

    def _read_service_metrics(self, state, service_idx):
        """Return ``(resources, cpu_percent)`` for one service, or ``None``.

        ``None`` means the observation is too short to hold this service's
        CPU entry.  The CPU entry sits ``num_services`` positions after the
        resource entry and is scaled by 100.
        """
        cpu_idx = service_idx + self.num_services
        if service_idx < 0 or cpu_idx >= len(state):
            return None
        return state[service_idx], state[cpu_idx] * 100

    def get_simplified_state(self, state, service_idx):
        """Discretise one service's observation into a Q-table state.

        Args:
            state: Full observation (see the class docstring for layout).
            service_idx: Index of the service to extract.

        Returns:
            ``(resources, cpu_level, service_idx)`` with ``resources``
            rounded and clamped to 0..10 and ``cpu_level`` rounded and
            clamped to 0..100.  If the observation cannot be read, the
            neutral state ``(2, 50, service_idx)`` is returned.
        """
        fallback = (self._DEFAULT_RESOURCES, self._DEFAULT_CPU, service_idx)
        try:
            metrics = self._read_service_metrics(state, service_idx)
            if metrics is None:
                return fallback

            raw_resources, cpu_util = metrics
            # Clamp on both sides so a negative reading cannot create
            # state keys outside the intended range.
            resources = int(max(0, min(round(raw_resources), 10)))
            cpu_level = int(max(0, min(round(cpu_util), 100)))
            return (resources, cpu_level, service_idx)
        except Exception as e:
            # Best effort: a malformed observation (None, NaN, wrong type)
            # maps to the neutral state instead of aborting the caller.
            print(f"Error in get_simplified_state: {e}")
            return fallback

    def get_q_value(self, state_tuple, service_idx, action):
        """Return Q(state, action), seeding unseen entries with the prior.

        The pre-populated grid only covers CPU levels that are multiples of
        five, while observed states use 1% steps.  An entry that does not
        exist yet is therefore initialised from
        :meth:`get_initial_q_value` (falling back to 0.0 if no prior can be
        computed) and stored, so every state starts from the guided value.
        """
        key = (state_tuple, action)
        if key not in self.q_table:
            try:
                prior = self.get_initial_q_value(state_tuple, service_idx, action)
            except (KeyError, IndexError, TypeError, ValueError):
                prior = 0.0
            self.q_table[key] = float(prior)
        return self.q_table[key]

    def set_q_value(self, state_tuple, action, value):
        """Store ``value`` (coerced to ``float``) as Q(state, action)."""
        self.q_table[(state_tuple, action)] = float(value)

    def _greedy_action(self, simple_state, service_idx, cpu_util, resources, thresholds):
        """Pick the exploit-branch action: fixed rules first, biased Q-values otherwise."""
        if cpu_util >= thresholds['high']:
            return 2
        if cpu_util <= thresholds['low'] and resources > 1:
            return 0
        if cpu_util <= thresholds['low'] and resources <= 1:
            return 1  # No change
        if thresholds['opt_low'] <= cpu_util <= thresholds['opt_high']:
            return 1

        # Transitional bands: consult the Q-table, nudged toward the rule.
        q_values = [self.get_q_value(simple_state, service_idx, a) for a in self._ACTIONS]
        if thresholds['opt_high'] < cpu_util < thresholds['high']:
            q_values[0] -= 5.0
            q_values[2] += (cpu_util - thresholds['opt_high']) / 10.0
        elif thresholds['low'] < cpu_util < thresholds['opt_low'] and resources > 1:
            q_values[0] += (thresholds['opt_low'] - cpu_util) / 10.0
            q_values[2] -= 5.0

        # Plain int so the returned actions hold no numpy scalars.
        return int(np.argmax(q_values))

    def _exploratory_action(self, cpu_util, resources, thresholds):
        """Sample the explore-branch action from band-dependent probabilities."""
        if cpu_util >= thresholds['high']:
            probs = [0.0, 0.05, 0.95]
        elif cpu_util <= thresholds['low'] and resources > 1:
            probs = [0.95, 0.05, 0.0]
        elif thresholds['opt_low'] <= cpu_util <= thresholds['opt_high']:
            probs = [0.05, 0.9, 0.05]
        elif thresholds['opt_high'] < cpu_util < thresholds['high']:
            # Action 2 grows from 0.3 to 0.9 as CPU approaches 'high'.
            scale_factor = (cpu_util - thresholds['opt_high']) / (thresholds['high'] - thresholds['opt_high'])
            up_prob = 0.3 + 0.6 * scale_factor
            probs = [0.0, 1.0 - up_prob, up_prob]
        elif resources > 1:
            # Action 0 grows from 0.3 to 0.9 as CPU approaches 'low'.
            scale_factor = (thresholds['opt_low'] - cpu_util) / (thresholds['opt_low'] - thresholds['low'])
            down_prob = 0.3 + 0.6 * scale_factor
            probs = [down_prob, 1.0 - down_prob, 0.0]
        else:
            probs = [0.0, 0.9, 0.1]

        if resources <= 1:
            # Never sample action 0 at or below one resource; renormalise.
            probs[0] = 0.0
            total = sum(probs)
            if total > 0:
                probs = [p / total for p in probs]
            else:
                probs = [0.0, 0.9, 0.1]

        return random.choices([0, 1, 2], weights=probs)[0]

    def select_action(self, state):
        """Choose one action per service for the given observation.

        With probability ``1 - epsilon`` the exploit branch is used,
        otherwise the explore branch.  The latest readable metrics of each
        service are cached in ``service_states``, and ``action_counts`` /
        ``high_cpu_actions`` (CPU above 80) are updated.

        Returns:
            ``{service_idx: action}``.  If anything raises, the error is
            printed and every service gets action 1.
        """
        try:
            actions = {}
            for i in range(self.num_services):
                metrics = self._read_service_metrics(state, i)
                if metrics is not None:
                    self.service_states[i] = {'resources': metrics[0], 'cpu': metrics[1]}

                cpu_util = self.service_states[i].get('cpu', self._DEFAULT_CPU)
                resources = self.service_states[i].get('resources', self._DEFAULT_RESOURCES)
                simple_state = self.get_simplified_state(state, i)
                thresholds = self.service_thresholds[i]

                if random.random() > self.epsilon:
                    chosen = self._greedy_action(simple_state, i, cpu_util, resources, thresholds)
                else:
                    chosen = self._exploratory_action(cpu_util, resources, thresholds)
                actions[i] = chosen

                self.action_counts[i][chosen] += 1
                if cpu_util > 80:
                    self.high_cpu_actions[i][chosen] += 1

            return actions

        except Exception as e:
            # Deliberate best effort: never let action selection crash the
            # control loop; fall back to "no change" everywhere.
            print(f"Error in select_action: {e}")
            print(traceback.format_exc())
            return {i: 1 for i in range(self.num_services)}

    def update_q_table(self, state, actions, reward, next_state):
        """Apply one Q-learning update per service for a transition.

        The shared ``reward`` (NaN is treated as 0.0) is split evenly between
        services, then shaped: +2 when the action matches the threshold rule
        for the observed band, -5 when it does the opposite in an extreme
        band.  The TD error uses a discount of 0.9 and is clipped to +/-5.
        When the rule was followed, the updated value is blended 70/30 with
        the prior so rule-consistent entries stay anchored to it.

        The CPU and resource figures used for shaping are read from
        ``state`` itself, so the update is correct even when it is not
        immediately preceded by ``select_action`` on the same observation.
        The reward is appended to ``reward_history``.  Any exception is
        printed and swallowed.
        """
        try:
            learning_rate = self.get_adaptive_learning_rate()

            if np.isnan(reward):
                reward = 0.0

            for i in range(self.num_services):
                if i not in actions:
                    continue
                action = actions[i]

                simple_state = self.get_simplified_state(state, i)
                next_simple_state = self.get_simplified_state(next_state, i)

                metrics = self._read_service_metrics(state, i)
                if metrics is None:
                    resources, cpu_util = self._DEFAULT_RESOURCES, self._DEFAULT_CPU
                else:
                    resources, cpu_util = metrics
                thresholds = self.service_thresholds[i]

                overloaded = cpu_util >= thresholds['high']
                idle_with_spare = cpu_util <= thresholds['low'] and resources > 1
                in_optimal_band = thresholds['opt_low'] <= cpu_util <= thresholds['opt_high']

                followed_rules = (
                    (overloaded and action == 2)
                    or (idle_with_spare and action == 0)
                    or (in_optimal_band and action == 1)
                )

                effective_reward = reward / self.num_services
                if followed_rules:
                    effective_reward += 2.0
                elif (overloaded and action == 0) or (idle_with_spare and action == 2):
                    effective_reward -= 5.0

                current_q = self.get_q_value(simple_state, i, action)
                max_next_q = max(
                    self.get_q_value(next_simple_state, i, a) for a in self._ACTIONS
                )

                td_error = effective_reward + self._DISCOUNT * max_next_q - current_q
                # Prevent large updates.
                td_error = np.clip(td_error, -self._TD_CLIP, self._TD_CLIP)

                new_q = current_q + learning_rate * td_error

                if followed_rules:
                    original_rule_value = self.get_initial_q_value(simple_state, i, action)
                    new_q = 0.7 * original_rule_value + 0.3 * new_q

                # A NaN/inf result is dropped rather than stored.
                if np.isfinite(new_q):
                    self.set_q_value(simple_state, action, new_q)

            self.reward_history.append(reward)

        except Exception as e:
            print(f"Error in update_q_table: {e}")
            print(traceback.format_exc())

    def get_initial_q_value(self, state_tuple, service_idx, action):
        """Return the rule-derived prior Q-value for a state/action pair.

        Args:
            state_tuple: ``(resources, cpu_level, service_idx)``.
            service_idx: Selects the threshold set to apply.
            action: Index into the band's three prior values.

        Raises:
            KeyError: If ``service_idx`` has no thresholds.
            IndexError: If ``action`` is not a valid index.
        """
        resources, cpu_level, _ = state_tuple
        thresholds = self.service_thresholds[service_idx]

        if cpu_level >= thresholds['high']:
            band = self._PRIOR_OVERLOADED
        elif cpu_level <= thresholds['low'] and resources > 1:
            band = self._PRIOR_IDLE
        elif cpu_level <= thresholds['low'] and resources <= 1:
            band = self._PRIOR_IDLE_MINIMAL
        elif thresholds['opt_low'] <= cpu_level <= thresholds['opt_high']:
            band = self._PRIOR_OPTIMAL
        elif thresholds['opt_high'] < cpu_level < thresholds['high']:
            band = self._PRIOR_WARM
        elif resources > 1:
            band = self._PRIOR_COOL
        else:
            band = self._PRIOR_COOL_MINIMAL
        return band[action]

    def update_epsilon(self):
        """Decay epsilon (not below ``epsilon_min``), record it, and bump ``steps_done``."""
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)
        self.epsilon_history.append(self.epsilon)
        self.steps_done += 1

    def save_model(self, file_path):
        """Pickle the Q-table, exploration state and diagnostics to ``file_path``.

        Errors are printed with a traceback and otherwise ignored.
        """
        try:
            model_data = {
                'q_table': dict(self.q_table),
                'epsilon': self.epsilon,
                'reward_history': self.reward_history,
                'epsilon_history': self.epsilon_history,
                'bins_per_dim': self.bins_per_dim,
                'action_size_per_service': self.action_size_per_service,
                'num_services': self.num_services,
                'loss_history': self.loss_history,
                'action_counts': self.action_counts,
                'high_cpu_actions': self.high_cpu_actions,
                'service_performance': self.service_performance,
                'service_thresholds': self.service_thresholds,
            }

            with open(file_path, 'wb') as f:
                pickle.dump(model_data, f)

            print(f"Model saved to {file_path}")
        except Exception as e:
            print(f"Error saving model: {e}")
            print(traceback.format_exc())

    def load_model(self, file_path):
        """Restore state written by :meth:`save_model`.

        Returns:
            ``True`` on success; ``False`` if the file does not exist or
            cannot be loaded (the error is printed).
        """
        if not os.path.exists(file_path):
            return False

        try:
            # SECURITY: pickle.load can execute arbitrary code embedded in
            # the file.  Only load model files from a trusted source.
            with open(file_path, 'rb') as f:
                model_data = pickle.load(f)

            self.q_table = defaultdict(float, model_data['q_table'])
            self.epsilon = model_data.get('epsilon', self.epsilon_min)
            self.reward_history = model_data.get('reward_history', [])
            self.epsilon_history = model_data.get('epsilon_history', [])
            self.bins_per_dim = model_data.get('bins_per_dim', self.bins_per_dim)
            self.loss_history = model_data.get('loss_history', [])
            self.action_counts = model_data.get(
                'action_counts',
                {i: {0: 0, 1: 0, 2: 0} for i in range(self.num_services)})
            self.high_cpu_actions = model_data.get(
                'high_cpu_actions',
                {i: {0: 0, 1: 0, 2: 0} for i in range(self.num_services)})
            self.service_performance = model_data.get(
                'service_performance',
                {i: {'rewards': [], 'actions': []} for i in range(self.num_services)})
            self.service_thresholds = model_data.get('service_thresholds', self.service_thresholds)

            print(f"Model loaded from {file_path}")
            return True
        except Exception as e:
            print(f"Error loading model: {e}")
            print(traceback.format_exc())
            return False

Training function

In [None]:
def train_agent(env, agent, num_episodes=50, save_path='cloud_dqn.pkl', save_best=True):
    """Run the training loop and checkpoint the agent along the way.

    Each episode resets ``env``, then repeatedly asks the agent for actions,
    steps the environment, and feeds the transition to
    ``agent.update_q_table`` until ``done``.  Epsilon is decayed once per
    episode.

    Checkpoints are derived from ``save_path`` by inserting a suffix before
    the file extension: ``_best`` for a new best episode (only when
    ``save_best`` is true) and ``_ep<N>`` every 10th episode.  Training
    progress is visualised every 20th episode, and the final model is
    written to ``save_path`` itself.

    Args:
        env: Environment exposing ``reset()``, ``step(actions)`` returning
            ``(next_state, reward, done, info)``, an iterable ``services``,
            and ``current_resources`` / ``current_cpu`` indexable by service.
        agent: Agent exposing ``select_action``, ``update_q_table``,
            ``update_epsilon``, ``save_model``, ``epsilon``, ``q_table`` and
            ``reward_history``.
        num_episodes: Number of episodes to run.
        save_path: Destination of the final model and base name for
            checkpoints.
        save_best: Whether to save a checkpoint on each new best episode.

    Returns:
        ``(agent, best_episode, best_reward)`` where ``best_episode`` is
        1-based (0 if no episode ran).
    """
    # Local import so this function does not rely on the notebook's import
    # cell providing ``os``.
    import os.path

    # Split once so suffixes go before the real extension.  str.replace on
    # '.pkl' would be a no-op for other extensions (all checkpoints would
    # then overwrite save_path) and would also touch directory names.
    path_root, path_ext = os.path.splitext(save_path)

    def checkpoint_path(tag):
        return f"{path_root}_{tag}{path_ext}"

    episode_rewards = []
    best_reward = float('-inf')
    best_episode = 0

    start_time = time.time()

    # Per-episode traces handed to the progress visualisation.
    all_resources = []
    all_cpu = []

    for episode in range(num_episodes):
        episode_number = episode + 1
        state = env.reset()
        episode_reward = 0
        steps = 0
        done = False

        episode_resources = {service: [] for service in env.services}
        episode_cpu = {service: [] for service in env.services}

        print(f"\nStarting Episode {episode_number}/{num_episodes}...")
        sys.stdout.flush()

        while not done:
            actions = agent.select_action(state)

            # Record the metrics as they are before the actions are applied.
            for service in env.services:
                episode_resources[service].append(env.current_resources[service])
                episode_cpu[service].append(env.current_cpu[service])

            next_state, reward, done, info = env.step(actions)

            agent.update_q_table(state, actions, reward, next_state)

            state = next_state
            episode_reward += reward
            steps += 1

            if steps % 10 == 0:
                print(f"  Step {steps}, Reward so far: {episode_reward:.2f}")
                sys.stdout.flush()

        all_resources.append(episode_resources)
        all_cpu.append(episode_cpu)

        agent.update_epsilon()

        episode_rewards.append(episode_reward)
        # NOTE(review): update_q_table already appends every per-step reward
        # to agent.reward_history, so this mixes episode totals into the same
        # list the adaptive learning rate reads.  Kept for compatibility with
        # saved histories; confirm whether it is intended.
        agent.reward_history.append(episode_reward)

        elapsed_time = time.time() - start_time

        print(f"Episode {episode_number}/{num_episodes} completed | Steps: {steps} | "
              f"Reward: {episode_reward:.2f} | Epsilon: {agent.epsilon:.2f} | "
              f"Time: {elapsed_time:.1f}s | Q-table size: {len(agent.q_table)}")

        # Track the best episode unconditionally so the returned values are
        # meaningful even when save_best is False; only the save is gated.
        if episode_reward > best_reward:
            best_reward = episode_reward
            best_episode = episode_number
            if save_best:
                agent.save_model(checkpoint_path('best'))
                print(f"New best model saved with reward: {best_reward:.2f}")

        if episode_number % 10 == 0:
            agent.save_model(checkpoint_path(f'ep{episode_number}'))

            if episode_number % 20 == 0:
                visualize_training_progress(episode_rewards, agent, all_resources, all_cpu, episode)

    agent.save_model(save_path)
    print(f"Final model saved to {save_path}")
    print(f"Best model was from episode {best_episode} with reward {best_reward:.2f}")

    plot_training_metrics(agent, episode_rewards)

    return agent, best_episode, best_reward
