In [2]:
import gymnasium as gym
import numpy as np
from mealpy import FloatVar, GA
from stable_baselines3 import DQN
from stable_baselines3.common.evaluation import evaluate_policy
import matplotlib.pyplot as plt

class MetaheuristicQLearning:
    """Tune tabular Q-learning hyperparameters with a metaheuristic search.

    The continuous observation vector of a Gymnasium environment is mapped
    onto a fixed grid so that a dense Q-table can be used. A genetic
    algorithm (mealpy ``GA.BaseGA``) then searches over
    ``[learning_rate, discount_factor, epsilon]``, scoring each candidate by
    training a fresh Q-table and averaging the returns of the last episodes.

    Note that the score is a noisy estimate: action sampling and environment
    resets are not seeded, so repeated evaluations of the same candidate
    generally differ.
    """

    # Substitute range limit for observation dimensions without a usable bound.
    _FALLBACK_BOUND = 4.0
    # Some Gymnasium releases mark an "unbounded" dimension with the largest
    # finite float32 instead of +/-inf; such bounds are treated as unbounded.
    _UNBOUNDED_SENTINEL = float(np.finfo(np.float32).max)

    def __init__(self, env_name="CartPole-v1", discrete_bins=20):
        """Create the environment and record its space dimensions.

        Args:
            env_name: Gymnasium environment id passed to ``gym.make``.
            discrete_bins: Number of grid cells per observation dimension.
        """
        self.env_name = env_name
        self.env = gym.make(env_name)
        self.state_size = self.env.observation_space.shape[0]
        self.action_size = self.env.action_space.n

        # Discretize continuous state space for Q-table
        self.discrete_bins = discrete_bins
        self.state_bounds = list(zip(self.env.observation_space.low,
                                     self.env.observation_space.high))

    def close(self):
        """Release the resources held by the underlying environment."""
        self.env.close()

    @classmethod
    def _is_unbounded(cls, bound):
        """Return True if ``bound`` is inf/NaN or the float32-max sentinel."""
        return (not np.isfinite(bound)) or abs(float(bound)) >= cls._UNBOUNDED_SENTINEL

    def discretize_state(self, state):
        """Convert a continuous observation into a tuple of bin indices.

        Each dimension is clipped to its (possibly substituted) bounds and
        scaled linearly onto ``0 .. discrete_bins - 1``.

        Args:
            state: Indexable observation with one value per entry of
                ``self.state_bounds``.

        Returns:
            Tuple of ``int`` bin indices, usable directly as a Q-table index.
        """
        top_bin = self.discrete_bins - 1
        discrete_state = []
        for value, (low, high) in zip(state, self.state_bounds):
            # Unusable bounds would collapse every realistic value into a
            # single bin, so fall back to a fixed symmetric range instead.
            if self._is_unbounded(low):
                low = -self._FALLBACK_BOUND
            if self._is_unbounded(high):
                high = self._FALLBACK_BOUND

            clipped = np.clip(value, low, high)
            discrete_state.append(int((clipped - low) / (high - low) * top_bin))
        return tuple(discrete_state)

    def q_learning_episode(self, learning_rate, discount_factor, epsilon,
                           n_episodes=100, eval_window=10):
        """Train a fresh Q-table and return the mean return of the last episodes.

        Despite the name, this runs ``n_episodes`` full episodes, starting
        from an all-zero Q-table each time it is called.

        Args:
            learning_rate: Step size of the Q-value update.
            discount_factor: Discount applied to the bootstrapped next value.
            epsilon: Probability of taking a random action (kept constant).
            n_episodes: Number of training episodes to run.
            eval_window: Number of trailing episodes averaged for the score.

        Returns:
            Mean undiscounted return over the last ``eval_window`` episodes.

        Raises:
            ValueError: If ``n_episodes`` or ``eval_window`` is below 1.
        """
        if n_episodes < 1 or eval_window < 1:
            raise ValueError("n_episodes and eval_window must both be >= 1")

        q_table = np.zeros([self.discrete_bins] * self.state_size + [self.action_size])
        episode_returns = []

        for _ in range(n_episodes):
            observation, _info = self.env.reset()
            state = self.discretize_state(observation)
            episode_return = 0.0

            while True:
                # Epsilon-greedy action selection
                if np.random.random() < epsilon:
                    action = self.env.action_space.sample()
                else:
                    action = int(np.argmax(q_table[state]))

                observation, reward, terminated, truncated, _info = self.env.step(action)
                next_state = self.discretize_state(observation)

                # A terminal state has no future value, so only bootstrap when
                # the episode did not terminate. Truncation (time limit) still
                # bootstraps because the underlying task would have continued.
                if terminated:
                    future_value = 0.0
                else:
                    future_value = discount_factor * np.max(q_table[next_state])

                cell = state + (action,)
                q_table[cell] += learning_rate * (reward + future_value - q_table[cell])

                state = next_state
                episode_return += reward

                if terminated or truncated:
                    break

            episode_returns.append(episode_return)

        return float(np.mean(episode_returns[-eval_window:]))

    def objective_function(self, solution):
        """Score one candidate ``[learning_rate, discount_factor, epsilon]``.

        Args:
            solution: Sequence of three floats proposed by the optimizer.

        Returns:
            Negated average reward, because the problem is posed as a
            minimization while higher reward is better.
        """
        learning_rate, discount_factor, epsilon = solution

        # Ensure parameters are in valid ranges
        learning_rate = np.clip(learning_rate, 0.001, 1.0)
        discount_factor = np.clip(discount_factor, 0.1, 0.99)
        epsilon = np.clip(epsilon, 0.01, 1.0)

        avg_reward = self.q_learning_episode(learning_rate, discount_factor, epsilon)
        return -avg_reward

    def optimize_hyperparameters(self, epoch=20, pop_size=10):
        """Search for good Q-learning hyperparameters with a genetic algorithm.

        Args:
            epoch: Number of GA generations.
            pop_size: Number of candidates per generation.

        Returns:
            Tuple ``(best_params, best_fitness)`` where ``best_params`` is the
            best ``[learning_rate, discount_factor, epsilon]`` found and
            ``best_fitness`` is its (sign-restored) average reward.
        """
        # Define problem bounds: [learning_rate, discount_factor, epsilon]
        problem_dict = {
            "obj_func": self.objective_function,
            "bounds": FloatVar(lb=[0.001, 0.1, 0.01], ub=[1.0, 0.99, 1.0]),
            "minmax": "min",
        }

        optimizer = GA.BaseGA(epoch=epoch, pop_size=pop_size, pc=0.85, pm=0.1)
        optimizer.solve(problem_dict)

        best_params = optimizer.g_best.solution
        # The objective is the negated reward; flip the sign back for reporting.
        best_fitness = -optimizer.g_best.target.fitness

        print("Best hyperparameters found:")
        print(f"Learning Rate: {best_params[0]:.4f}")
        print(f"Discount Factor: {best_params[1]:.4f}")
        print(f"Epsilon: {best_params[2]:.4f}")
        print(f"Best Average Reward: {best_fitness:.2f}")

        return best_params, best_fitness

# Script entry point: run the hyperparameter search on CartPole
if __name__ == "__main__":
    # Build the optimizer wrapper around the CartPole-v1 environment
    mql = MetaheuristicQLearning(env_name="CartPole-v1")

    # Run the search; yields the best parameter vector and its average reward
    best_params, best_reward = mql.optimize_hyperparameters()


2025/07/01 02:05:28 PM, INFO, mealpy.evolutionary_based.GA.BaseGA: BaseGA(epoch=20, pop_size=10, pc=0.85, pm=0.1)
2025/07/01 02:05:29 PM, INFO, mealpy.evolutionary_based.GA.BaseGA: >>>Problem: P, Epoch: 1, Current best: -38.0, Global best: -41.5, Runtime: 0.55542 seconds
2025/07/01 02:05:30 PM, INFO, mealpy.evolutionary_based.GA.BaseGA: >>>Problem: P, Epoch: 2, Current best: -53.1, Global best: -53.1, Runtime: 0.68382 seconds
2025/07/01 02:05:30 PM, INFO, mealpy.evolutionary_based.GA.BaseGA: >>>Problem: P, Epoch: 3, Current best: -53.1, Global best: -53.1, Runtime: 0.67910 seconds
2025/07/01 02:05:31 PM, INFO, mealpy.evolutionary_based.GA.BaseGA: >>>Problem: P, Epoch: 4, Current best: -65.4, Global best: -65.4, Runtime: 0.72986 seconds
2025/07/01 02:05:32 PM, INFO, mealpy.evolutionary_based.GA.BaseGA: >>>Problem: P, Epoch: 5, Current best: -65.4, Global best: -65.4, Runtime: 0.81023 seconds
2025/07/01 02:05:33 PM, INFO, mealpy.evolutionary_based.GA.BaseGA: >>>Problem: P, Epoch: 6, Curr

Best hyperparameters found:
Learning Rate: 0.7996
Discount Factor: 0.7772
Epsilon: 0.2217
Best Average Reward: 72.70
