# Environment setup details

```
pipx install uv
uv venv
uv pip install -r requirements.txt
```


In [1]:
from typing import Tuple, List, Dict, Optional, Union
from dataclasses import dataclass, asdict

import time
from datetime import datetime
from pathlib import Path

import json

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns 
from IPython.display import clear_output

import openpyxl
from pprint import pprint

from mazelib import Maze
from mazelib.generate.Sidewinder import Sidewinder
from mazelib.solve.BacktrackingSolver import BacktrackingSolver







# Configuration

In [2]:
@dataclass
class Config:
    """Configuration parameters for the maze navigation system.

    One instance bundles the maze, Q-learning, reward and convergence
    settings that the environment, agent and training loop read.
    """
    # Maze parameters
    maze_size: int = 5  # used for both dimensions passed to the maze generator
    maze_id: Optional[int] = None  # None for random generation, int for specific seed

    # Algorithm parameters
    learning_rate: float = 0.1  # step size α ∈ (0, 1]
    discount_factor: float = 0.9  # γ
    epsilon: float = 0.1  # small ε > 0
    num_episodes: int = 100
    agent_seed: Optional[int] = None  # None leaves NumPy's global RNG unseeded by the agent

    # Reward structure
    goal_reward: float = 100
    wall_penalty: float = -10
    step_penalty: float = -1

    # Training parameter
    # Number of consecutive optimal-path test runs required to declare convergence
    optimal_path_convergence_window: int = 5

# Agent, Environment, and Control classes

Go to the execution section to configure, train, and test.

In [3]:
class MazeEnvironment:
    """Maze world: builds the grid, scores moves and draws itself."""

    def __init__(self, config: Config):
        """Store the configuration and immediately generate a maze.

        Args:
            config: Settings read for the maze size, the optional maze id
                and the reward values.
        """
        self.config = config
        # All of the following are populated by generate().
        self.grid = None
        self.start = None
        self.end = None
        self._maze = None
        self.seed = None
        self.optimal_path = None
        self.optimal_path_length = None
        self.generate()

    def generate(self) -> None:
        """Build a fresh Sidewinder maze and solve it for the reference path."""
        configured_id = self.config.maze_id
        if configured_id is None:
            # No maze requested explicitly: draw a seed in [1, 1000).
            self.seed = np.random.randint(1, 1000)
        else:
            self.seed = configured_id

        self._maze = Maze(self.seed)
        maze = self._maze
        maze.generator = Sidewinder(self.config.maze_size, self.config.maze_size)
        maze.generate()
        maze.generate_entrances()

        self.grid = maze.grid

        # Start in the first cell inside the border, finish in the last one.
        n_rows, n_cols = self.grid.shape[0], self.grid.shape[1]
        self.start = (1, 1)
        self.end = (n_rows - 2, n_cols - 2)

        self._calculate_optimal_path()

    def _calculate_optimal_path(self) -> None:
        """Solve the maze from start to end and record the first solution."""
        maze = self._maze
        maze.solver = BacktrackingSolver()
        maze.start = self.start
        maze.end = self.end
        maze.solve()

        if not maze.solutions:
            # Nothing found: leave both reference values unset.
            self.optimal_path = None
            self.optimal_path_length = None
            return

        self.optimal_path = maze.solutions[0]
        self.optimal_path_length = len(self.optimal_path) + 1

    def get_minimum_steps(self) -> Optional[int]:
        """Return the stored optimal path length (None when no solution exists)."""
        return self.optimal_path_length

    def get_state(self, position: Tuple[int, int]) -> Tuple[int, int]:
        """Return the state for a position.

        The environment is fully observable, so the state is the position
        itself. Sensor noise or external disturbances would be injected here
        if a partially observable variant were wanted.
        """
        return position

    def get_reward(self, state: Tuple[int, int], next_state: Tuple[int, int]) -> float:
        """Score the transition into ``next_state``.

        Returns the wall penalty for an illegal target, the goal reward when
        the target is the end cell, and the step penalty otherwise.
        """
        if not self.is_valid_move(next_state):
            return self.config.wall_penalty
        if next_state == self.end:
            return self.config.goal_reward
        return self.config.step_penalty

    def is_valid_move(self, state: Tuple[int, int]) -> bool:
        """Tell whether ``state`` lies inside the grid and is not a wall (1)."""
        row, col = state
        n_rows, n_cols = self.grid.shape[0], self.grid.shape[1]
        # Bounds are tested first so the grid is never indexed out of range.
        if not (0 <= row < n_rows and 0 <= col < n_cols):
            return False
        return bool(self.grid[state] != 1)

    def visualize(self, path: Optional[List[Tuple[int, int]]] = None,
                    show_optimal: bool = False,
                    save_path: Optional[Path] = None) -> None:
        """Draw the maze, optionally with the optimal path and a given path.

        Args:
            path: Optional list of positions to mark with '#'.
            show_optimal: When true, mark the stored optimal path with 'O'.
            save_path: When given, the figure is written there and closed
                instead of being shown.
        """
        plt.figure(figsize=(5, 5))
        plt.title(f"Maze #{self.seed} - Min Steps: {self.get_minimum_steps()}")
        plt.imshow(self.grid, cmap='binary')

        # (label, cell, colour, font size) in drawing order: start and goal
        # first, then the optimal path, then the supplied path on top.
        markers = [('S', self.start, 'red', 20), ('G', self.end, 'green', 20)]
        if show_optimal and self.optimal_path:
            markers.extend(('O', cell, 'green', 15) for cell in self.optimal_path)
        if path:
            markers.extend(('#', cell, 'blue', 20) for cell in path)

        for label, cell, colour, size in markers:
            # Cells are (row, col) while matplotlib expects (x, y).
            plt.text(cell[1], cell[0], label,
                     ha='center', va='center', color=colour, fontsize=size)

        plt.xticks([])
        plt.yticks([])

        if not save_path:
            plt.show()
            return
        plt.savefig(save_path)
        plt.close()

In [4]:

class QLearningAgent:
    """Tabular Q-learning agent for a grid maze.

    The Q-table holds one value per (row, column, action); it is sized from
    the environment grid and starts at zero.
    """

    def __init__(self, env: MazeEnvironment, config: Config):
        """Build a zero-initialised Q-table and optionally seed the RNG.

        Args:
            env: Environment whose ``grid.shape`` sizes the Q-table.
            config: Supplies ``epsilon``, ``learning_rate``,
                ``discount_factor`` and ``agent_seed``.
        """
        self.env = env
        self.config = config
        # Actions the agent can take, each as (row_change, column_change).
        self.actions = [
            (-1, 0),  # Up: one step up, row index decreases by 1
            (1, 0),   # Down: one step down, row index increases by 1
            (0, -1),  # Left: one step left, column index decreases by 1
            (0, 1)    # Right: one step right, column index increases by 1
        ]
        maze_height, maze_width = env.grid.shape
        self.q_table = np.zeros((maze_height, maze_width, len(self.actions)))
        self.exploration_rate = config.epsilon

        # Seed NumPy's global RNG so exploration is reproducible.
        if self.config.agent_seed is not None:
            np.random.seed(self.config.agent_seed)

    def get_action(self, state: Tuple[int, int], training: bool = True) -> int:
        """Select an action index using an ε-greedy policy.

        Args:
            state: (row, column) position used to index the Q-table.
            training: When false, the greedy action is always returned.

        Returns:
            Index into ``self.actions``.
        """
        # Explore with probability ε (the exploration rate), not 1 - ε.
        if training and np.random.rand() < self.exploration_rate:
            return np.random.randint(len(self.actions))
        # Exploit: highest-valued action (ties resolve to the lowest index).
        return int(np.argmax(self.q_table[state]))

    def update(self, state: Tuple[int, int], action: int,
              reward: float, next_state: Tuple[int, int]) -> None:
        """Apply one Q-learning update to ``Q(state, action)``.

        Q(S, A) ← Q(S, A) + α [R + γ max_a Q(S', a) - Q(S, A)]
        """
        current_q = self.q_table[state][action]
        # max_a Q(S', a)
        next_q = np.max(self.q_table[next_state])
        td_target = reward + self.config.discount_factor * next_q
        self.q_table[state][action] = current_q + self.config.learning_rate * (
            td_target - current_q)
        


In [5]:
class AgentControl:
    """Manages training, testing and metrics for one agent/environment pair.

    Every measurement taken while training or testing is accumulated in
    ``self.metrics``.
    """

    def __init__(self, env: MazeEnvironment, agent: QLearningAgent, config: Config):
        """Bind the environment, agent and configuration and reset metrics."""
        self.env = env
        self.agent = agent
        self.config = config
        # Greedy action per cell at the previous stability check
        # (None until the first check has run).
        self._last_policy = None
        self.metrics = {
            'rewards': [],
            'steps': [],
            'success_rate': [],
            'episode_status': [],  # Track if episode reached goal
            'training_start_time': None,
            'training_duration': None,
            'path_lengths': [],    # Track path length per episode
            'final_qtable': None,  # Store final Q-table state
            'policy_stability': [], # Track changes in policy
            'reached_goal_test': [], # Track if the episode test reached the goal
            'steps_test': [], # Track the episode test step count
            'path_optimality_test': [], # Track the episode test path optimality
            'steps_to_first_optimal': [], # Track the number of steps before the test agent uses optimal path
            'episodes_to_convergence': None, # Track the number of steps before the test agent *always* uses optimal path
        }

    def calculate_policy_stability(self) -> float:
        """Return the fraction of cells whose greedy action is unchanged.

        The greedy action of every cell is compared with the one recorded at
        the previous call. The first call has nothing to compare against and
        returns 0.0.
        """
        # argmax over the action axis yields the greedy policy for all cells
        # at once; the result is a new array, safe to keep for the next call.
        current_policy = np.argmax(self.agent.q_table, axis=-1)
        previous_policy = getattr(self, '_last_policy', None)
        self._last_policy = current_policy
        if previous_policy is None:
            return 0.0
        return float(np.mean(current_policy == previous_policy))

    def _path_optimality(self, steps: int) -> float:
        """Return optimal length / steps taken, or 0.0 when either is missing or zero."""
        optimal_length = self.env.optimal_path_length
        if not optimal_length or not steps:
            return 0.0
        return optimal_length / steps

    def run_episode(self, training: bool = True) -> Tuple[float, int, List[Tuple[int, int]], bool]:
        """Run one episode from the start cell.

        Args:
            training: When true, actions are chosen with exploration and the
                agent is updated after every step.

        Returns:
            (total reward, steps taken, visited positions, reached goal).
            The position list starts with the start cell and repeats the
            current cell whenever a move was blocked.
        """
        current_state = self.env.start
        episode_reward = 0
        steps = 0
        path = [current_state]
        # Step budget so an episode always terminates.
        max_steps = self.env.grid.shape[0] * self.env.grid.shape[1] * 10
        reached_goal = False

        while steps < max_steps:
            action = self.agent.get_action(current_state, training)
            row_change, col_change = self.agent.actions[action]
            attempted_state = (current_state[0] + row_change,
                               current_state[1] + col_change)

            # Score the move that was attempted, so hitting a wall earns the
            # wall penalty; only afterwards is the agent kept in place.
            reward = self.env.get_reward(current_state, attempted_state)
            if self.env.is_valid_move(attempted_state):
                next_state = attempted_state
            else:
                next_state = current_state

            if training:
                self.agent.update(current_state, action, reward, next_state)

            episode_reward += reward
            steps += 1
            path.append(next_state)

            if next_state == self.env.end:
                reached_goal = True
                break

            current_state = next_state

        return episode_reward, steps, path, reached_goal

    def train(self, save_path: Optional[str] = None, experiment_id: Optional[str] = None, iteration_count: Optional[str] = None) -> None:
        """Run the training loop, recording metrics and live plots.

        After every training episode one greedy test episode is run. Training
        stops early once ``optimal_path_convergence_window`` consecutive test
        episodes match the optimal path length; the zero-based index of that
        episode is stored as ``episodes_to_convergence``.

        Args:
            save_path: Directory that receives the final training plot;
                nothing is saved when omitted.
            experiment_id: Optional label shown in the plot footer.
            iteration_count: Optional progress label shown in the plot footer.
        """
        plt.ion()
        fig, (ax_reward, ax_steps, ax_stability, ax_optimality) = plt.subplots(1, 4, figsize=(15, 5))

        window_size = 20
        moving_rewards = []
        moving_steps = []
        episodes_to_convergence = None
        path_optimality_test_counter = 0

        self.experiment_id = experiment_id if experiment_id else None
        self.iteration_count = iteration_count if iteration_count else None

        self.metrics['training_start_time'] = time.time()

        try:
            for episode in range(self.config.num_episodes):
                # Recreate the figure periodically to manage memory. Episode 0
                # is skipped because the figure above was only just created.
                if episode and episode % 50 == 0:
                    plt.close('all')
                    fig, (ax_reward, ax_steps, ax_stability, ax_optimality) = plt.subplots(1, 4, figsize=(15, 5))

                reward, steps, path, reached_goal = self.run_episode(training=True)

                # Update metrics
                self.metrics['rewards'].append(reward)
                self.metrics['steps'].append(steps)
                self.metrics['episode_status'].append(reached_goal)
                self.metrics['policy_stability'].append(self.calculate_policy_stability())

                # Moving averages: the slice covers the whole history until
                # more than window_size episodes have been recorded.
                moving_rewards.append(np.mean(self.metrics['rewards'][-window_size:]))
                moving_steps.append(np.mean(self.metrics['steps'][-window_size:]))

                # Greedy test episode (no exploration, no learning)
                _, steps_test, path_test, reached_goal_test = self.run_episode(training=False)

                self.metrics['reached_goal_test'].append(reached_goal_test)
                self.metrics['steps_test'].append(steps_test)

                path_optimality_test = self._path_optimality(steps_test)
                self.metrics['path_optimality_test'].append(path_optimality_test)

                # Check for convergence on the optimal path
                if path_optimality_test == 1:
                    path_optimality_test_counter += 1
                    if path_optimality_test_counter == self.config.optimal_path_convergence_window:
                        episodes_to_convergence = episode
                        break
                else:
                    path_optimality_test_counter = 0

                # Update plots less frequently than every episode
                if episode % 5 == 0:
                    self._update_training_plots(fig, ax_reward, ax_steps, ax_stability, ax_optimality,
                                            moving_rewards, moving_steps)

        except Exception as e:
            print(f"Training stopped due to error: {str(e)}")
            raise
        finally:
            self.metrics['training_duration'] = time.time() - self.metrics['training_start_time']
            self.metrics['episodes_to_convergence'] = episodes_to_convergence

            # One final plot update
            self._update_training_plots(fig, ax_reward, ax_steps, ax_stability, ax_optimality,
                                    moving_rewards, moving_steps)

            if save_path:
                timestamp = int(time.time())
                filename = (f"{timestamp}_maze{self.config.maze_size}_"
                        f"lr{self.config.learning_rate}_"
                        f"df{self.config.discount_factor}_"
                        f"eps{self.config.epsilon}.png")
                fig.savefig(Path(save_path) / filename, bbox_inches='tight')

            plt.close('all')  # Clean up all plots
            plt.ioff()

    def test(self, display: bool = False) -> None:
        """Run one greedy episode, print its outcome and optionally draw the path."""
        episode_reward, steps, path, reached_goal = self.run_episode(training=False)
        print(f"Test Results - Steps: {steps}, Reward: {episode_reward}, Successful: {reached_goal}")
        if display:
            self.env.visualize(path)

    def test_consistency(self, num_tests: int = 10) -> Dict[str, float]:
        """Run several greedy episodes and summarise them.

        Args:
            num_tests: Number of test episodes to run.

        Returns:
            Dict with ``success_rate``, mean/std of the step counts and
            mean/std of the path optimality. All values stay 0 when
            ``num_tests`` is 0.
        """
        test_results = {
            'success_rate': 0,
            'avg_steps': 0,
            'std_steps': 0,
            'avg_path_optimality': 0,
            'std_path_optimality': 0
        }

        successes = 0
        steps_list = []
        optimality_list = []
        for _ in range(num_tests):
            _, steps, _, reached_goal = self.run_episode(training=False)
            if reached_goal:
                successes += 1
            optimality_list.append(self._path_optimality(steps))
            steps_list.append(steps)

        if steps_list:
            test_results['success_rate'] = successes / num_tests
            test_results['avg_steps'] = np.mean(steps_list)
            test_results['std_steps'] = np.std(steps_list)
            test_results['avg_path_optimality'] = np.mean(optimality_list)
            test_results['std_path_optimality'] = np.std(optimality_list)

        return test_results

    @staticmethod
    def _finish_axis(ax, ylabel: str) -> None:
        """Apply the labels, legend and grid shared by every training panel."""
        ax.set_xlabel('Episode')
        ax.set_ylabel(ylabel)
        ax.legend()
        ax.grid(True)

    def _update_training_plots(self, fig, ax_reward, ax_steps, ax_stability, ax_optimality,
                             moving_rewards, moving_steps) -> None:
        """Redraw the four training panels and the configuration header.

        Args:
            fig: Figure holding the four axes.
            ax_reward, ax_steps, ax_stability, ax_optimality: Axes for the
                reward, step count, policy stability and path optimality.
            moving_rewards, moving_steps: Moving averages drawn on top of the
                raw reward and step series.
        """
        clear_output(wait=True)

        # Add configuration header
        elapsed_time = time.time() - self.metrics['training_start_time']
        header = (f"Agent Seed: {self.config.agent_seed} | "
                f"Maze Size: {self.config.maze_size}x{self.config.maze_size} | "
                f"Maze ID: {self.env.seed} | "
                f"Min Steps: {self.env.get_minimum_steps()} | "
                f"Episodes: {self.config.num_episodes}\n"
                f"Learning Rate: {self.config.learning_rate} | "
                f"Discount: {self.config.discount_factor} | "
                f"Epsilon: {self.config.epsilon}\n"
                f"Episodes to Convergence: {self.metrics['episodes_to_convergence']} | "
                f"Training Time: {elapsed_time:.1f}s")

        # Add the footer only when experiment metadata was actually supplied;
        # train() always sets both attributes, so their values must be tested.
        experiment_id = getattr(self, 'experiment_id', None)
        iteration_count = getattr(self, 'iteration_count', None)
        if experiment_id is not None and iteration_count is not None:
            plt.subplots_adjust()
            footer = f"Experiment: {experiment_id} | Iteration: {iteration_count}"
            fig.text(0.5, 0, footer, ha='center', va='center', fontsize=10)

        fig.suptitle(header, wrap=True, y=1.05)

        # Reward and steps panels: faint raw series plus its moving average
        averaged_panels = (
            (ax_reward, self.metrics['rewards'], moving_rewards, 'r-', 'Rewards', 'Reward'),
            (ax_steps, self.metrics['steps'], moving_steps, 'b-', 'Steps', 'Steps'),
        )
        for ax, raw_series, smoothed, line_format, label, ylabel in averaged_panels:
            ax.clear()
            ax.plot(raw_series, line_format, alpha=0.3, label=label)
            ax.plot(smoothed, line_format, label='Moving Average')
            self._finish_axis(ax, ylabel)

        # Policy stability and path optimality panels: a single series each
        single_panels = (
            (ax_stability, 'policy_stability', 'Policy Stability', 'Stability'),
            (ax_optimality, 'path_optimality_test', 'Path Optimality', 'Optimality'),
        )
        for ax, metric_key, label, ylabel in single_panels:
            ax.clear()
            ax.plot(self.metrics[metric_key], 'g-', label=label)
            self._finish_axis(ax, ylabel)

        plt.tight_layout()
        # NOTE(review): `display` is not imported in the visible import cell;
        # presumably the notebook's built-in display - confirm before running
        # this outside IPython.
        display(fig)
        plt.pause(0.1)

In [6]:
import pandas as pd
from dataclasses import asdict
import json
from typing import List, Dict, Any
import time
from pathlib import Path

class ExperimentRunner:
    """Manages automated testing and reporting.

    Each runner writes its artefacts below ``experiments/<experiment_id>``.
    """

    def __init__(self, base_config: Config, experiment_id: Optional[str] = None):
        """Create the experiment folder.

        Args:
            base_config: Configuration used for the untrained baseline tests.
            experiment_id: Folder name for this experiment; a
                ``YYYYmmdd_HHMMSS`` timestamp is used when omitted.
        """
        self.base_config = base_config
        self.results = []
        self.experiment_id = experiment_id or datetime.now().strftime("%Y%m%d_%H%M%S")
        self.base_path = Path("experiments") / self.experiment_id
        self.base_path.mkdir(parents=True, exist_ok=True)

    def _train_and_test_agent(self, config, num_tests, save_path, experiment_id=None, iteration_count=None):
        """Train a fresh agent on a fresh maze and return one flat result row.

        Args:
            config: Configuration for the environment, agent and training.
            num_tests: Number of greedy test episodes run after training.
            save_path: Directory that receives the training plot.
            experiment_id: Label forwarded to the training plot and the row.
            iteration_count: Progress label forwarded likewise.

        Returns:
            Dict merging the config fields, the consistency test results and
            selected training metrics.
        """
        env = MazeEnvironment(config)
        agent = QLearningAgent(env, config)
        control = AgentControl(env, agent, config)

        # Train and test
        control.train(
            save_path=str(save_path),
            experiment_id=experiment_id,
            iteration_count=iteration_count
        )
        test_results = control.test_consistency(num_tests=num_tests)

        # No stability samples exist when training ran zero episodes.
        stability_history = control.metrics['policy_stability']
        final_stability = stability_history[-1] if stability_history else None

        flattened_result = {
            # Unpack all parameters from config
            **asdict(config),
            # Unpack all test results
            **test_results,
            # Add the additional metrics
            'training_duration': control.metrics['training_duration'],
            'final_policy_stability': final_stability,
            'episodes_to_convergence': control.metrics['episodes_to_convergence'],
            # Add experiment metadata
            'experiment_id': experiment_id,
            'iteration_count': iteration_count
        }

        return flattened_result

    def run_hyperparameter_sweep(self, param_grid: Dict = None) -> List[Dict]:
        """Train and test one agent per hyperparameter combination.

        Args:
            param_grid: Lists under ``agent_seeds``, ``learning_rate``,
                ``discount_factor`` and ``epsilon`` plus scalar ``maze_id``,
                ``num_episodes`` and ``maze_size``. A small built-in grid is
                used when omitted.

        Returns:
            One flat result dict per combination, in sweep order.
        """
        # Create the training results folder
        save_path = self.base_path / "hyperparameter_sweep"
        save_path.mkdir(exist_ok=True)

        if param_grid is None:
            param_grid = {
                'learning_rate': [0.1, 0.11],
                'discount_factor': [1, 0.9],
                'epsilon': [0.25, 0.3],
                'maze_id': 2,
                'num_episodes': 1500,
                'maze_size': 5,
                'agent_seeds': [1, 2, 3, 4, 5],
            }

        # Seeds vary slowest and epsilon fastest, as in nested loops.
        combinations = [
            (agent_seed, lr, df, eps)
            for agent_seed in param_grid['agent_seeds']
            for lr in param_grid['learning_rate']
            for df in param_grid['discount_factor']
            for eps in param_grid['epsilon']
        ]
        total_iterations = len(combinations)

        results = []
        for current_iteration, (agent_seed, lr, df, eps) in enumerate(combinations, start=1):
            config = Config(
                maze_size=param_grid['maze_size'],
                maze_id=param_grid['maze_id'],
                num_episodes=param_grid['num_episodes'],
                learning_rate=lr,
                discount_factor=df,
                epsilon=eps,
                agent_seed=agent_seed
            )
            results.append(self._train_and_test_agent(
                config,
                num_tests=10,
                save_path=save_path,
                experiment_id=self.experiment_id,
                iteration_count=f"{current_iteration}/{total_iterations}"
            ))

        return results

    def _run_untrained_tests(self, num_trials: int) -> Dict:
        """Run the consistency tests with an agent that was never trained."""
        env = MazeEnvironment(self.base_config)
        agent = QLearningAgent(env, self.base_config)
        control = AgentControl(env, agent, self.base_config)
        return control.test_consistency(num_trials)

    def save_report(self, experiment_results: Dict[str, Any]) -> None:
        """Write the hyperparameter results to a timestamped CSV file.

        Only the ``'hyperparameters'`` entry is written; nothing is saved
        when that key is absent.
        """
        if 'hyperparameters' not in experiment_results:
            return
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_path = self.base_path / f"hyperparameter_results_{timestamp}.csv"
        df_hyper = pd.DataFrame(experiment_results['hyperparameters'])
        df_hyper.to_csv(output_path, index=False)


In [None]:
class MazeExperiments:
    """Manages maze learning experiments with comprehensive tracking"""
    
    def __init__(self, base_path: str = "experiments"):
        self.base_path = Path(base_path)
        self.base_path.mkdir(parents=True, exist_ok=True)
        self.experiment_id = datetime.now().strftime("%Y%m%d_%H%M%S")
        
    def run_experiment(self, maze_sizes: List[int], param_grid: Dict, trials: int = 5):
        """Run every parameter combination on every maze size.

        Args:
            maze_sizes: Maze sizes to evaluate, processed in order.
            param_grid: Lists under ``learning_rate``, ``discount_factor``
                and ``epsilon``, a ``num_episodes`` value and an optional
                ``maze_id``.
            trials: Repetitions per combination; the trial index is used as
                the agent seed.

        Returns:
            The value returned by ``_save_results`` for the collected runs.
        """
        learning_rates = param_grid['learning_rate']
        discount_factors = param_grid['discount_factor']
        epsilons = param_grid['epsilon']
        total_iterations = len(maze_sizes) * len(learning_rates) * \
                          len(discount_factors) * len(epsilons) * trials

        # Directory layout: <base>/<experiment id>/training_plots/size_<n>/...
        experiment_dir = self.base_path / self.experiment_id
        experiment_dir.mkdir(exist_ok=True)
        plots_dir = experiment_dir / 'training_plots'
        plots_dir.mkdir(exist_ok=True)

        results = {}
        completed = 0
        for size in maze_sizes:
            size_dir = plots_dir / f'size_{size}'
            size_dir.mkdir(exist_ok=True)

            # Learning rate varies slowest and the trial index fastest.
            combinations = (
                (lr, df, eps, trial)
                for lr in learning_rates
                for df in discount_factors
                for eps in epsilons
                for trial in range(trials)
            )

            runs_for_size = []
            for lr, df, eps, trial in combinations:
                completed += 1
                config = Config(
                    maze_size=size,
                    learning_rate=lr,
                    discount_factor=df,
                    epsilon=eps,
                    num_episodes=param_grid['num_episodes'],
                    maze_id=param_grid.get('maze_id'),
                    agent_seed=trial
                )
                runs_for_size.append(self._run_single_experiment(
                    config=config,
                    save_path=size_dir / f"lr{lr}_df{df}_eps{eps}_trial{trial}",
                    experiment_id=self.experiment_id,
                    iteration_count=f"{completed}/{total_iterations}"
                ))

            results[size] = runs_for_size

        # Record what was run next to the results.
        metadata = {
            'timestamp': self.experiment_id,
            'param_grid': param_grid,
            'maze_sizes': maze_sizes,
            'trials': trials,
            'total_iterations': total_iterations
        }
        with open(experiment_dir / 'metadata.json', 'w') as metadata_file:
            json.dump(metadata, metadata_file, indent=4)

        return self._save_results(results)
    
    def _save_maze_visualization(self, env: MazeEnvironment, save_path: Path):
        """Save visualization of maze with optimal path"""
        plt.figure(figsize=(5, 5))
        # Add title showing Maze ID and minimum steps
        title = f"Maze #{env.seed} - Min Steps: {env.get_minimum_steps()}"
        plt.title(title)
        plt.imshow(env.grid, cmap='binary')
        
        # Plots the start (S) and goal (G)
        plt.text(env.start[1], env.start[0], 'S', 
                ha='center', va='center', color='red', fontsize=20)
        plt.text(env.end[1], env.end[0], 'G', 
                ha='center', va='center', color='green', fontsize=20)
        
        # Plot optimal path
        if env.optimal_path:
            for pos in env.optimal_path:
                plt.text(pos[1], pos[0], "O", 
                        ha='center', va='center', color='green', fontsize=15)
        
        plt.xticks([])
        plt.yticks([])
        plt.savefig(save_path / 'maze_optimal.png')
        plt.close()

    def _run_single_experiment(self, config, save_path, experiment_id=None, iteration_count=None):
        """Generate a maze, train an agent on it and return one result record.

        Args:
            config: Configuration for the environment, agent and training.
            save_path: Directory for the maze image, maze metadata and
                training plot; created when missing.
            experiment_id: Label forwarded to training and stored in the record.
            iteration_count: Progress label forwarded likewise.

        Returns:
            Dict merging the config fields, the consistency test results,
            training metrics with their histories, and the run metadata.
        """
        env = MazeEnvironment(config)

        # Persist the maze picture and its description before training.
        output_dir = Path(save_path)
        output_dir.mkdir(parents=True, exist_ok=True)
        self._save_maze_visualization(env, output_dir)

        maze_metadata = dict(
            seed=env.seed,
            grid=env.grid.tolist(),  # numpy array is not JSON serialisable
            start=env.start,
            end=env.end,
            optimal_path=env.optimal_path,
            optimal_path_length=env.optimal_path_length,
        )
        with open(output_dir / 'maze_metadata.json', 'w') as metadata_file:
            json.dump(maze_metadata, metadata_file, indent=4)

        control = AgentControl(env, QLearningAgent(env, config), config)

        # Training also writes the training plot into save_path.
        control.train(
            save_path=str(save_path),
            experiment_id=experiment_id,
            iteration_count=iteration_count
        )

        consistency = control.test_consistency(num_tests=10)

        # Later updates override earlier keys: config, then test results,
        # then training metrics and run metadata.
        metrics = control.metrics
        record = asdict(config)
        record.update(consistency)
        record.update({
            'training_duration': metrics['training_duration'],
            'final_policy_stability': metrics['policy_stability'][-1],
            'episodes_to_convergence': metrics['episodes_to_convergence'],
            'policy_stability_history': metrics['policy_stability'],
            'reward_history': metrics['rewards'],
            'steps_history': metrics['steps'],
            'experiment_id': experiment_id,
            'iteration_count': iteration_count,
            'save_path': str(save_path)
        })
        return record
    
    def _save_results(self, results: Dict) -> str:
        """Save results with comprehensive structure"""
        experiment_dir = self.base_path / self.experiment_id
        results_dir = experiment_dir / 'results'
        results_dir.mkdir(exist_ok=True)
        
        # Save each maze size results with full detail
        for size, size_results in results.items():
            with open(results_dir / f"size_{size}.json", 'w') as f:
                json.dump(size_results, f, indent=4)
        
        return str(experiment_dir)
    
    def analyze_results(self, experiment_dir: Optional[str] = None):
        """Analyze and visualize experiment results"""
        if experiment_dir is None:
            experiment_dir = str(self.base_path / self.experiment_id)
            
        # Load results and metadata
        exp_path = Path(experiment_dir)
        with open(exp_path / 'metadata.json') as f:
            metadata = json.load(f)
            
        results = {}
        for size in metadata['maze_sizes']:
            with open(exp_path / 'results' / f"size_{size}.json") as f:
                results[size] = json.load(f)
        
        # Generate analysis
        self._generate_analysis(results, metadata, exp_path)
    
    def _generate_analysis(self, results: Dict, metadata: Dict, save_path: Path):
        """Generate comprehensive analysis and visualizations"""
        analysis_dir = save_path / 'analysis'
        analysis_dir.mkdir(exist_ok=True)
        
        # Convert to DataFrame for analysis
        dfs = []
        for size, size_results in results.items():
            df = pd.DataFrame(size_results)
            dfs.append(df)
        df_all = pd.concat(dfs)
        
        # Generate plots
        self._plot_convergence_analysis(df_all, analysis_dir)
        self._plot_parameter_impact(df_all, analysis_dir)
        
        # Generate summary report
        self._generate_summary_report(df_all, metadata, analysis_dir)
    
    def _plot_convergence_analysis(self, df: pd.DataFrame, save_path: Path):
        """Draw a 2x2 grid of box plots, one metric per panel, grouped by maze size.

        The figure is written to ``convergence_analysis.png`` inside ``save_path``.
        """
        # (grid position, DataFrame column, panel title, y-axis label)
        panels = [
            ((0, 0), 'episodes_to_convergence', 'Episodes to Convergence by Maze Size', 'Episodes'),
            ((0, 1), 'training_duration', 'Training Duration by Maze Size', 'Duration (s)'),
            ((1, 0), 'final_policy_stability', 'Final Policy Stability by Maze Size', 'Stability'),
            ((1, 1), 'success_rate', 'Success Rate by Maze Size', 'Success Rate'),
        ]

        _, grid = plt.subplots(2, 2, figsize=(15, 12))
        for position, column, title, y_label in panels:
            panel = grid[position]
            sns.boxplot(data=df, x='maze_size', y=column, ax=panel)
            panel.set_title(title)
            panel.set_ylabel(y_label)

        plt.tight_layout()
        plt.savefig(save_path / 'convergence_analysis.png')
        plt.close()

    def _plot_parameter_impact(self, df: pd.DataFrame, save_path: Path):
        """Plot how each hyperparameter relates to episodes-to-convergence.

        Three panels show box plots of ``episodes_to_convergence`` against
        learning rate, discount factor and epsilon, split by maze size. The
        fourth panel is a heatmap of mean episodes per (learning rate,
        epsilon) pair, with each maze-size column divided by its own minimum.
        The figure is written to ``parameter_impact.png`` inside ``save_path``.
        """
        _, grid = plt.subplots(2, 2, figsize=(15, 12))

        # (grid position, hyperparameter column, panel title)
        hyperparameter_panels = [
            ((0, 0), 'learning_rate', 'Learning Rate Impact by Maze Size'),
            ((0, 1), 'discount_factor', 'Discount Factor Impact by Maze Size'),
            ((1, 0), 'epsilon', 'Epsilon Impact by Maze Size'),
        ]
        for position, column, title in hyperparameter_panels:
            panel = grid[position]
            sns.boxplot(
                data=df,
                x=column,
                y='episodes_to_convergence',
                hue='maze_size',
                ax=panel,
                palette="pastel",
            )
            panel.set_title(title)
            panel.set_ylabel('Episodes to Convergence')
            # Anchor the legend outside the axes so it does not cover the boxes
            panel.legend(title='Maze Size', bbox_to_anchor=(1.05, 1), loc='upper left')

        # Rows: (learning_rate, epsilon) pairs; columns: maze sizes
        mean_episodes = (
            df.groupby(['learning_rate', 'epsilon', 'maze_size'])['episodes_to_convergence']
            .mean()
            .unstack()
        )
        # Divide every maze-size column by its own minimum so that columns
        # with very different episode counts share one colour scale
        ratio_to_best = mean_episodes.div(mean_episodes.min(), axis='columns')

        heatmap_panel = grid[1, 1]
        sns.heatmap(
            ratio_to_best,
            ax=heatmap_panel,
            cmap='YlOrRd',
            annot=True,
            fmt='.1f',
            cbar_kws={'label': 'Ratio to minimum episodes for maze size'},
        )
        heatmap_panel.set_title('Normalized Episodes to Convergence\nby Learning Rate and Epsilon')

        plt.tight_layout()
        plt.savefig(save_path / 'parameter_impact.png', dpi=300, bbox_inches='tight')
        plt.close()
    
    def _generate_summary_report(self, df: pd.DataFrame, metadata: Dict, save_path: Path):
        """Generate detailed summary report"""
        with open(save_path / 'summary_report.md', 'w') as f:
            f.write("# Experiment Summary\n\n")
            f.write(f"**Experiment ID**: {metadata['timestamp']}\n\n")
            f.write(f"**Total Iterations**: {metadata['total_iterations']}\n\n")

            f.write("## Best Configurations by Maze Size\n")
            for size in df['maze_size'].unique():
                size_df = df[df['maze_size'] == size]
                best_idx = size_df['episodes_to_convergence'].idxmin()
                best_config = size_df.loc[best_idx]

                f.write(f"\n### Maze Size {size}x{size}\n")
                f.write(f"- **Learning Rate**: {best_config['learning_rate']}\n")
                f.write(f"- **Discount Factor**: {best_config['discount_factor']}\n")
                f.write(f"- **Epsilon**: {best_config['epsilon']}\n")
                f.write(f"- **Episodes to Convergence**: {best_config['episodes_to_convergence']:.1f}\n")
                f.write(f"- **Success Rate**: {best_config['success_rate']:.1%}\n")
                f.write(f"- **Final Policy Stability**: {best_config['final_policy_stability']:.2f}\n")

                # Add a link to the corresponding maze visualization
                image_path = save_path / f"training_plots/size_{size}/maze_optimal.png"
                if image_path.exists():
                    relative_image_path = image_path.relative_to(save_path.parent)
                    f.write(f"![Maze Visualization](../{relative_image_path})\n\n")
                else:
                    f.write("*Maze visualization not found.*\n\n")
                
# Usage Example:
if __name__ == "__main__":
    # Hyperparameter sweep definition: some entries are lists of candidate
    # values, others are single scalar values.
    param_grid = dict(
        learning_rate=[0.2, 0.3, 0.4],
        discount_factor=[1.0, 0.99],
        epsilon=[0.06, 0.08, 0.1],
        num_episodes=1000,
        maze_id=42,  # Optional: fixed maze for reproducibility
    )

    experiments = MazeExperiments()
    experiment_dir = experiments.run_experiment(
        # Maze sizes 3 through 11; use range(3, 9) for a shorter sweep
        maze_sizes=list(range(3, 12)),
        param_grid=param_grid,
        trials=5,
    )
    experiments.analyze_results(experiment_dir)