In [None]:
import sys
from pathlib import Path
import os

# Make `simulate_run` importable.
# The notebook is expected to be started from the root of the frenztoolkit
# checkout, with simulate_run.py living in the ./app sub-directory.
notebook_dir = Path.cwd()
app_dir = notebook_dir / 'app'


def _put_first_on_sys_path(directory):
    """Insert `directory` at the front of sys.path unless it is already listed."""
    entry = str(directory)
    if entry not in sys.path:
        sys.path.insert(0, entry)


_put_first_on_sys_path(app_dir)

print(f"App directory for imports: {app_dir}")
print(f"Current sys.path: {sys.path}")

# First attempt: resolve simulate_run through the 'app' directory.
try:
    from simulate_run import simulate_session_from_files
except ImportError as primary_error:
    print(f"Error importing simulate_session_from_files: {primary_error}")
    print("Please ensure simulate_run.py is in the 'app' directory and there are no circular dependencies.")
    # Last resort: simulate_run.py may have been moved next to the notebook,
    # so expose the notebook directory as well and retry once.
    _put_first_on_sys_path(notebook_dir)
    try:
        from simulate_run import simulate_session_from_files
    except ImportError as fallback_error:
        print(f"Fallback import also failed: {fallback_error}")
        print("Please check the location of simulate_run.py")
    else:
        print("Successfully imported simulate_session_from_files from root directory as a fallback.")
else:
    print("Successfully imported simulate_session_from_files from app.simulate_run")

## Sleep Session Simulation and Analysis

This notebook runs the `simulate_session_from_files` function from `app.simulate_run` on multiple datasets, collects the output metadata, and then performs an analysis on the simulated audio cueing during REM sleep.

In [None]:
# Configuration
# Make sure these paths are correct for your environment.
# It's assumed this notebook is in the root of the 'frenztoolkit' directory.

# Path to the directory containing all sample dataset folders
# Example: sample_data/dataset1/, sample_data/dataset2/, ...
SAMPLE_DATA_BASE_DIR = Path.cwd() / "sample_data"

# Path to the directory where simulation outputs (including logs and metadata) will be saved.
# A subdirectory will be created for each simulation run.
SIMULATION_OUTPUT_BASE_DIR = Path.cwd() / "simulation_output"

# Create the base output directory if it doesn't exist
SIMULATION_OUTPUT_BASE_DIR.mkdir(parents=True, exist_ok=True)

print(f"Sample data expected in subfolders of: {SAMPLE_DATA_BASE_DIR}")
print(f"Simulation outputs will be saved in subfolders of: {SIMULATION_OUTPUT_BASE_DIR}")

# List of dataset folder names to process (relative to SAMPLE_DATA_BASE_DIR)
# If empty, the script will attempt to process all subdirectories in SAMPLE_DATA_BASE_DIR.
# Example: DATASET_NAMES_TO_PROCESS = ["E23B6B24FX14_1743438214000", "another_dataset_folder_name"]
DATASET_NAMES_TO_PROCESS = []  # Empty -> auto-discover datasets (capped, see below)

# Upper bound on how many auto-discovered datasets are processed in one run.
# Only applies when DATASET_NAMES_TO_PROCESS is left empty.
MAX_AUTO_DISCOVERED_DATASETS = 20

if not SAMPLE_DATA_BASE_DIR.is_dir():
    print(f"ERROR: SAMPLE_DATA_BASE_DIR does not exist or is not a directory: {SAMPLE_DATA_BASE_DIR}")
    # Later cells check this list, so it must exist even when nothing can be processed.
    datasets_to_process_paths = []
else:
    if not DATASET_NAMES_TO_PROCESS:
        # Auto-discovery: every visible sub-directory is a candidate dataset.
        # Hidden folders (e.g. '.ipynb_checkpoints', which Jupyter creates on its own)
        # are skipped so they are not mistaken for datasets.
        all_dataset_subdirs = sorted(
            d for d in SAMPLE_DATA_BASE_DIR.iterdir()
            if d.is_dir() and not d.name.startswith(".")
        )
        DATASET_NAMES_TO_PROCESS = [d.name for d in all_dataset_subdirs[:MAX_AUTO_DISCOVERED_DATASETS]]
        if not DATASET_NAMES_TO_PROCESS:
            print(f"No dataset subdirectories found in {SAMPLE_DATA_BASE_DIR}. Please check the path and directory structure.")
        else:
            print(f"Found {len(DATASET_NAMES_TO_PROCESS)} datasets to process (max {MAX_AUTO_DISCOVERED_DATASETS}): {DATASET_NAMES_TO_PROCESS}")
    else:
        # Explicitly listed names are used as given; the auto-discovery cap is not applied to them.
        print(f"Specified {len(DATASET_NAMES_TO_PROCESS)} datasets to process: {DATASET_NAMES_TO_PROCESS}")

    # Create full paths for the datasets to be processed
    datasets_to_process_paths = [SAMPLE_DATA_BASE_DIR / name for name in DATASET_NAMES_TO_PROCESS]

# Constants shared with simulate_run. Re-imported here so this cell also works
# when cells are executed out of order.
try:
    from simulate_run import METADATA_FILENAME, REM_SLEEP_STAGE_VALUE
except ImportError:
    print("Warning: Could not re-import METADATA_FILENAME, REM_SLEEP_STAGE_VALUE from simulate_run. Ensure first cell ran correctly.")
    # Manual fallbacks (not ideal: they can drift from simulate_run's real values).
    # Each name is only defined if it is not already bound in this kernel.
    if 'METADATA_FILENAME' not in globals():
        METADATA_FILENAME = "session_metadata.npz"
    if 'REM_SLEEP_STAGE_VALUE' not in globals():
        REM_SLEEP_STAGE_VALUE = 3  # Default REM value

### Run Simulation for Each Dataset

In [None]:
import time

# Results of this cell, consumed by the analysis cells below.
all_simulation_output_paths = []  # Paths at the SIM_DATASET_TIMESTAMP level
failed_simulations_summary = []   # One {"dataset", "reason"} dict per failure seen here

if 'simulate_session_from_files' not in globals():
    print("ERROR: simulate_session_from_files function not imported. Please check the first cell and ensure simulate_run.py is accessible.")
elif not datasets_to_process_paths:
    print("ERROR: No datasets found or specified to process. Check the configuration cell and your sample_data directory.")
else:
    print(f"Starting simulations for {len(datasets_to_process_paths)} datasets...")
    for dataset_full_path in datasets_to_process_paths:
        dataset_name = dataset_full_path.name
        if not dataset_full_path.is_dir():
            print(f"Dataset directory {dataset_full_path} not found or is not a directory. Skipping.")
            failed_simulations_summary.append({"dataset": dataset_name, "reason": "Directory not found"})
            continue

        print(f"\n--- Processing dataset: {dataset_name} from {dataset_full_path} ---")
        start_time = time.time()
        output_session_folder_level1 = None  # This will be SIM_DATASETNAME_TIMESTAMP
        try:
            # simulate_session_from_files returns the path to the *inner* session data folder,
            # e.g. simulation_output/SIM_dataset_timestamp/<timestamp>/ ; the folder collected
            # in all_simulation_output_paths is its parent (SIM_dataset_timestamp).
            simulation_return_value = simulate_session_from_files(dataset_full_path, SIMULATION_OUTPUT_BASE_DIR)
            end_time = time.time()

            if simulation_return_value:
                # Normalise to Path so a plain string return value is handled as well.
                output_session_folder_level2_timestamped = Path(simulation_return_value)
                output_session_folder_level1 = output_session_folder_level2_timestamped.parent
                all_simulation_output_paths.append(output_session_folder_level1)
                # Verify metadata existence within the timestamped folder
                metadata_file = output_session_folder_level2_timestamped / METADATA_FILENAME
                if metadata_file.exists():
                    print(f"Simulation for {dataset_name} completed. Main output folder: {output_session_folder_level1}. Metadata in: {metadata_file}")
                else:
                    # The call itself succeeded, so the folder stays in all_simulation_output_paths.
                    print(f"Simulation for {dataset_name} completed (returned path: {output_session_folder_level2_timestamped}), but METADATA FILE NOT FOUND at {metadata_file}. It might be an error metadata.")
            else:
                # A falsy return value means the simulation gave up before creating the session structure.
                print(f"Simulation function returned None for {dataset_name}. Critical failure. Check logs if any were created by simulate_run.py in a folder like SIM_{dataset_name}_* under {SIMULATION_OUTPUT_BASE_DIR}.")
                failed_simulations_summary.append({"dataset": dataset_name, "reason": "Simulation function returned None (critical internal error)"})

            print(f"Time taken for {dataset_name}: {end_time - start_time:.2f} seconds.")

        except Exception as e:
            end_time = time.time()
            print(f"CRITICAL ERROR during the call to simulate_session_from_files for {dataset_name}: {e}")
            import traceback
            traceback.print_exc()
            failed_simulations_summary.append({"dataset": dataset_name, "reason": f"Exception in notebook: {e}"})
            print(f"Time taken before error: {end_time - start_time:.2f} seconds.")

            # Heuristic: look for a partially written output folder so its logs can be inspected.
            # Several folders can match (earlier runs of the same dataset), so take the most
            # recently modified one instead of an arbitrary glob result.
            partial_candidates = []
            for candidate in SIMULATION_OUTPUT_BASE_DIR.glob(f"SIM_{dataset_name}_*"):
                try:
                    if candidate.is_dir():
                        partial_candidates.append((candidate.stat().st_mtime, candidate))
                except OSError:
                    continue  # Folder vanished or is unreadable; nothing to inspect there.
            if partial_candidates:
                newest_mtime, newest_partial = max(partial_candidates, key=lambda item: item[0])
                if newest_partial not in all_simulation_output_paths:
                    all_simulation_output_paths.append(newest_partial)
                    print(f"Added potential partial output folder for inspection: {newest_partial}")
                if newest_mtime < start_time:
                    print("  Note: this folder was last modified before the current run started; it may belong to an earlier run.")

    print("\n--- Simulation Run Summary ---")
    print(f"Attempted to process {len(datasets_to_process_paths)} datasets.")
    print(f"Number of simulation output folders (may include those with errors but having a folder structure): {len(all_simulation_output_paths)}.")
    if failed_simulations_summary:
        print(f"Summary of specific failures encountered by the notebook for {len(failed_simulations_summary)} datasets:")
        for fail_info in failed_simulations_summary:
            print(f"  - Dataset: {fail_info['dataset']}, Reason: {fail_info['reason']}")
    else:
        print("No critical errors caught by this notebook cell during simulation calls.")

    print("\nPaths to simulation session group folders (each contains a timestamped subfolder with logs and metadata.npz, or error info):")
    for p in all_simulation_output_paths:
        print(p)

### Analyze Simulation Results

Now, load the generated `session_metadata.npz` files and analyze the REM cycle detection and audio cue triggering.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd  # Ensure pandas is imported here if not already

# REM_SLEEP_STAGE_VALUE should be available from the config cell or imported from simulate_run
# If not, define it: REM_SLEEP_STAGE_VALUE = 3 

def _npz_scalar(npz_data, key, default=None):
    """Return npz_data[key] with 0-d arrays unwrapped to plain Python objects.

    Falls back to `default` when the key is absent or the stored value is None.
    """
    if key not in npz_data:
        return default
    value = npz_data[key]
    if isinstance(value, np.ndarray) and value.ndim == 0:
        value = value.item()
    return default if value is None else value


def _npz_entry_count(npz_data, key):
    """Return how many entries are stored under `key` (0 if absent or stored as None)."""
    if key not in npz_data:
        return 0
    value = np.asarray(npz_data[key])
    if value.ndim == 0:
        # A value saved as None comes back as a 0-d object array; len() would raise on it.
        return 0 if value.item() is None else 1
    return int(value.shape[0])


def analyze_rem_cuing_from_metadata(metadata_file_path: Path, rem_stage_value=None):
    """Summarise REM windows and audio cueing from one session metadata .npz file.

    Args:
        metadata_file_path: Path to the metadata archive. The dataset name is taken
            from the name of the folder two levels above the file.
        rem_stage_value: Sleep-stage value counted as REM. When None, the
            module-level REM_SLEEP_STAGE_VALUE is used.

    Returns:
        A dict with the keys "metadata_path" (str), "dataset_name", "session_id",
        "total_windows", "rem_windows", "rem_segments_script_detected",
        "cues_triggered_in_rem_logic", "error_in_simulation_metadata" and
        "analysis_error_reason". When the file cannot be loaded, the dict also
        carries an "error" key and "analysis_error_reason" is set, so failed loads
        are never mistaken for clean results by downstream filters.

    The function does not raise for unreadable or incomplete files; problems are
    printed and reported through the returned dict.
    """
    metadata_file_path = Path(metadata_file_path)
    if rem_stage_value is None:
        rem_stage_value = REM_SLEEP_STAGE_VALUE

    results = {
        "metadata_path": str(metadata_file_path),
        # Name of the SIM_DATASETNAME_TIMESTAMP folder (two levels above the file).
        "dataset_name": metadata_file_path.parent.parent.name,
        "session_id": "UnknownSessionID",
        "total_windows": 0,
        "rem_windows": 0,
        "rem_segments_script_detected": 0,  # Based on audio_cue_sequence_initiation_timestamps
        "cues_triggered_in_rem_logic": 0,
        "error_in_simulation_metadata": None,  # Error reported by the simulation itself
        "analysis_error_reason": None  # Error raised while analysing the file here
    }

    try:
        # NOTE(security): allow_pickle=True executes pickled payloads on load.
        # Only use this on metadata files produced by your own simulation runs.
        loaded = np.load(metadata_file_path, allow_pickle=True)
    except FileNotFoundError:
        print(f"Metadata file not found: {metadata_file_path}")
        results["error"] = "FileNotFound"
        results["analysis_error_reason"] = "FileNotFound"
        return results
    except Exception as e:
        print(f"Error loading metadata {metadata_file_path}: {e}")
        results["error"] = f"LoadError: {e}"
        results["analysis_error_reason"] = results["error"]
        return results

    if not hasattr(loaded, "files"):
        # np.load returned something other than an .npz archive (e.g. a bare array).
        results["error"] = "LoadError: file is not an .npz archive"
        results["analysis_error_reason"] = results["error"]
        print(f"Error loading metadata {metadata_file_path}: file is not an .npz archive")
        return results

    # The archive keeps a file handle open; the context manager closes it on every path.
    with loaded as data:
        try:
            results["session_id"] = _npz_scalar(data, "session_id", "UnknownSessionID")
            results["error_in_simulation_metadata"] = _npz_scalar(data, "error_message")

            if results["error_in_simulation_metadata"]:
                print(f"Note: Simulation for {results['dataset_name']} (Session: {results['session_id']}) reported an error in its metadata: {results['error_in_simulation_metadata']}")
                # Partial data may still be present, so keep going.

            scores = _npz_scalar(data, "scores")
            stage_values = scores.get("array__sleep_stage") if isinstance(scores, dict) else None
            if stage_values is None:
                results["analysis_error_reason"] = "Missing or malformed sleep stage data in metadata."
                print(f"{results['analysis_error_reason']} for {metadata_file_path}")
                return results  # Return partially filled results

            # asarray makes the == comparison element-wise even if a list was stored.
            sleep_stages = np.atleast_1d(np.asarray(stage_values))
            results["total_windows"] = int(sleep_stages.shape[0])
            results["rem_windows"] = int(np.sum(sleep_stages == rem_stage_value))

            # Absent keys or None values simply mean "no cues"; that is not an error.
            results["cues_triggered_in_rem_logic"] = _npz_entry_count(data, "audio_cue_timestamps")
            results["rem_segments_script_detected"] = _npz_entry_count(
                data, "audio_cue_sequence_initiation_timestamps"
            )

        except KeyError as ke:
            err_msg = f"KeyError processing {metadata_file_path}: {ke}. Data might be incomplete."
            print(err_msg)
            results["analysis_error_reason"] = err_msg
        except Exception as e:
            err_msg = f"Unexpected error during analysis of {metadata_file_path}: {e}"
            print(err_msg)
            results["analysis_error_reason"] = err_msg

    return results


# --- Main Analysis Execution ---
# Step 1: locate one metadata file per simulation output folder.
# Step 2: analyse each file and collect the results in `analysis_df`.
all_analysis_results_list = []
processed_metadata_files = []


def _is_timestamp_folder_name(name):
    """True for all-digit names and for decimal timestamps such as '1716400000.25'."""
    whole, separator, fraction = name.partition(".")
    return whole.isdigit() and (separator == "" or fraction.isdigit())


# globals().get keeps this cell usable on a fresh kernel where the simulation cell has not run.
_simulation_output_paths = globals().get('all_simulation_output_paths') or []

if not _simulation_output_paths:
    print("No simulation output paths found. Run the simulation cell first.")
else:
    print(f"\n--- Starting Analysis of {len(_simulation_output_paths)} Simulation Outputs ---")
    for session_group_folder_path in _simulation_output_paths:  # Path to SIM_DATASETNAME_TIMESTAMP
        if not session_group_folder_path.is_dir():
            print(f"Simulation output folder does not exist (anymore): {session_group_folder_path}. Skipping analysis for this entry.")
            continue

        # The session data lives in a subfolder named after the session start timestamp.
        # Both integer and decimal timestamp names are accepted; sorting by numeric value
        # makes the selection below deterministic.
        potential_timestamp_subfolders = sorted(
            (d for d in session_group_folder_path.iterdir()
             if d.is_dir() and _is_timestamp_folder_name(d.name)),
            key=lambda d: float(d.name),
        )

        if not potential_timestamp_subfolders:
            # Error metadata may have been written directly into the group folder.
            direct_metadata_file = session_group_folder_path / METADATA_FILENAME
            if direct_metadata_file.exists():
                print(f"Found metadata (likely error) directly in group folder: {direct_metadata_file}")
                if direct_metadata_file not in processed_metadata_files:
                    processed_metadata_files.append(direct_metadata_file)
            else:
                print(f"No timestamp subfolder OR direct metadata file found in {session_group_folder_path}. Skipping analysis for this entry.")
            continue

        # Prefer the most recent timestamp folder that actually contains a metadata file.
        subfolders_with_metadata = [
            d for d in potential_timestamp_subfolders if (d / METADATA_FILENAME).exists()
        ]
        if subfolders_with_metadata:
            metadata_file_to_analyze = subfolders_with_metadata[-1] / METADATA_FILENAME
            # The same group folder can be listed more than once; analyse each file only once.
            if metadata_file_to_analyze not in processed_metadata_files:
                processed_metadata_files.append(metadata_file_to_analyze)
        else:
            metadata_file_to_analyze = potential_timestamp_subfolders[-1] / METADATA_FILENAME
            print(f"Metadata file NOT FOUND at expected location: {metadata_file_to_analyze}. Skipping for this entry.")

if not processed_metadata_files:
    print("No metadata files (session_metadata.npz) found to analyze. Ensure simulations ran and produced output.")
    analysis_df = pd.DataFrame()  # Later cells expect analysis_df to exist.
else:
    print(f"\nFound {len(processed_metadata_files)} metadata files to analyze.")
    for mf_path in processed_metadata_files:
        print(f"Analyzing: {mf_path}")
        all_analysis_results_list.append(analyze_rem_cuing_from_metadata(mf_path))

    analysis_df = pd.DataFrame(all_analysis_results_list)
    print("\n--- Analysis Results Table (Raw) ---")
    if not analysis_df.empty:
        cols_to_show = ['dataset_name', 'session_id', 'total_windows', 'rem_windows', 'rem_segments_script_detected', 'cues_triggered_in_rem_logic', 'error_in_simulation_metadata', 'analysis_error_reason']
        # Only select columns that exist; rows that failed to load may not provide all of them.
        display_df_cols = [col for col in cols_to_show if col in analysis_df.columns]
        if display_df_cols:
            display(analysis_df[display_df_cols])
        else:
            print("No relevant columns found for raw display in analysis_df.")
            display(analysis_df)  # Display all if specific ones are missing
    else:
        print("Analysis DataFrame is empty after processing.")

# --- Create a prettier version of the analysis table for presentation ---
# Builds `presentation_df` / `styled_df` from the rows of `analysis_df` that were
# loaded, simulated and analysed without any reported error.
if 'analysis_df' in globals() and not analysis_df.empty:
    # Status columns may be absent (e.g. when every metadata file failed to load).
    # reindex() supplies missing ones as all-NaN instead of raising KeyError.
    # 'error' is set on rows whose metadata file could not be loaded at all.
    status_columns = ['analysis_error_reason', 'error_in_simulation_metadata', 'error']
    row_has_no_error = analysis_df.reindex(columns=status_columns).isna().all(axis=1)
    presentation_worthy_df = analysis_df[row_has_no_error].copy()

    if not presentation_worthy_df.empty:
        # Select and rename columns for presentation
        cols_for_presentation = {
            'dataset_name': 'Dataset (Sim Group)',
            'total_windows': 'Total Windows',
            'rem_windows': 'REM Windows',
            'rem_segments_script_detected': 'REM Segments Detected',
            'cues_triggered_in_rem_logic': 'Audio Cues Triggered'
        }

        # Keep only the columns that are actually present.
        valid_cols_for_presentation = {k: v for k, v in cols_for_presentation.items() if k in presentation_worthy_df.columns}

        if not valid_cols_for_presentation:
            print("No valid columns for the presentation table after filtering.")
        else:
            presentation_df = (
                presentation_worthy_df[list(valid_cols_for_presentation.keys())]
                .rename(columns=valid_cols_for_presentation)
            )

            # Use the dataset name as index only when it is unique: the Styler functions
            # used below do not support a non-unique index.
            if 'Dataset (Sim Group)' in presentation_df.columns:
                if presentation_df['Dataset (Sim Group)'].is_unique:
                    presentation_df = presentation_df.set_index('Dataset (Sim Group)')
                else:
                    print("'Dataset (Sim Group)' values are not unique; keeping the default index.")

            # Style the dataframe for presentation
            styled_df = presentation_df.style.set_properties(**{
                'text-align': 'center',
                'font-weight': 'normal',
                'border': '1px solid #CCCCCC',
                'padding': '8px',
                'background-color': 'white',
                'color': 'black'
            }).set_table_styles([
                {'selector': 'thead th', 'props': [  # Header cells
                    ('background-color', '#4F81BD'),
                    ('color', 'white'),
                    ('font-weight', 'bold'),
                    ('text-align', 'center'),
                    ('padding', '10px'),
                    ('border', '1px solid #CCCCCC')]},
                {'selector': 'tbody tr:nth-child(even)', 'props': [('background-color', '#DCE6F1')]},
                {'selector': 'tbody tr:nth-child(odd)', 'props': [('background-color', 'white')]},
                {'selector': 'caption', 'props': [
                    ('font-size', '16px'),
                    ('font-weight', 'bold'),
                    ('color', '#365F91'),
                    ('caption-side', 'top'),
                    ('margin-bottom', '10px'),
                    ('background-color', 'white')]},
                {'selector': 'table', 'props': [('background-color', 'white'), ('width', '100%')]}
            ]).set_caption('REM Sleep Simulation Analysis Summary')

            # In-cell bars for the numeric columns; vmin=0 anchors every bar at zero.
            bar_cols_map = {
                'Total Windows': '#AEC7E8',          # Light blue
                'REM Windows': '#FFBB78',            # Light orange
                'REM Segments Detected': '#98DF8A',  # Light green
                'Audio Cues Triggered': '#FF9896'    # Light red
            }
            for col_name, color in bar_cols_map.items():
                if col_name in presentation_df.columns:
                    styled_df = styled_df.bar(subset=[col_name], color=color, vmin=0)

            display(styled_df)

            # Summary statistics over the rows shown in the table.
            total_rem_windows_sum = presentation_df['REM Windows'].sum() if 'REM Windows' in presentation_df else 0
            total_rem_segments_sum = presentation_df['REM Segments Detected'].sum() if 'REM Segments Detected' in presentation_df else 0
            total_audio_cues_sum = presentation_df['Audio Cues Triggered'].sum() if 'Audio Cues Triggered' in presentation_df else 0

            # Ratio of totals (not a mean of per-dataset ratios); 0 when no segment was detected.
            cue_rate_per_segment = (total_audio_cues_sum / total_rem_segments_sum * 100) if total_rem_segments_sum > 0 else 0

            summary_text_html = f"""
            <div style="padding:15px; margin-top:20px; background-color:#EBF4FF; border-left:5px solid #4F81BD; color:black; font-family: Arial, sans-serif;">
                <h3 style="margin:0 0 10px 0; color:#365F91;">Overall Summary Statistics (from Presented Data)</h3>
                <p style="margin: 5px 0;"><b>Total REM Windows:</b> {total_rem_windows_sum}</p>
                <p style="margin: 5px 0;"><b>Total Detected REM Segments:</b> {total_rem_segments_sum}</p>
                <p style="margin: 5px 0;"><b>Total Audio Cues Triggered:</b> {total_audio_cues_sum}</p>
                <p style="margin: 5px 0;"><b>Overall Cueing Rate (Cues per Detected Segment):</b> {cue_rate_per_segment:.1f}%</p>
            </div>
            """
            from IPython.display import HTML
            display(HTML(summary_text_html))
    else:
        print("No data suitable for the styled presentation table after filtering (e.g., all simulations had errors or analysis issues).")
else:
    print("No analysis_df available or it is empty. Skipping presentation table.")


### Visualization of Results (Example)

In [None]:
# Plots and summary statistics for the simulations that were analysed without errors.
if analysis_df.empty:
    print("Analysis DataFrame is empty. Skipping plotting.")
else:
    # --- Report on Simulation Errors from Metadata ---
    if 'error_in_simulation_metadata' in analysis_df.columns:
        sim_errors_df = analysis_df[analysis_df['error_in_simulation_metadata'].notna()]
        if not sim_errors_df.empty:
            print("\n--- Simulations with Errors Reported in Their Own Metadata ---")
            for _, row in sim_errors_df.iterrows():
                print(f"  Dataset: {row.get('dataset_name', 'N/A')}, Session: {row.get('session_id', 'N/A')}, Error: {row['error_in_simulation_metadata']}")
        else:
            print("\nNo simulations reported errors directly in their metadata files (among those analyzed).")
    else:
        print("\n'error_in_simulation_metadata' column not found in analysis_df.")

    # --- Filter for Plotting ---
    # Keep rows whose analysis did not fail and that carry all count columns.
    # A simulation-side error alone does not exclude a row, as long as the counts exist.
    count_columns = ['rem_windows', 'cues_triggered_in_rem_logic', 'rem_segments_script_detected']
    numeric_columns = ['total_windows'] + count_columns
    # reindex() adds any missing column as NaN so the filters below cannot raise KeyError
    # (columns can be absent when every metadata file failed to load).
    plot_source_df = analysis_df.reindex(
        columns=list(analysis_df.columns)
        + [c for c in ['analysis_error_reason'] + numeric_columns if c not in analysis_df.columns]
    )
    # Coerce to numeric so plotting and the ratio below work even for object-typed columns.
    plot_source_df[numeric_columns] = plot_source_df[numeric_columns].apply(pd.to_numeric, errors='coerce')
    plot_df = (
        plot_source_df[plot_source_df['analysis_error_reason'].isna()]
        .dropna(subset=count_columns)
    )
    plot_df = plot_df[plot_df['total_windows'] > 0].copy()  # Meaningful data was processed

    if plot_df.empty:
        print("\nNo data suitable for plotting after filtering for analysis errors or missing essential values.")
    else:
        print(f"\n--- Generating Plots for {len(plot_df)} Successfully Analyzed Simulations ---")

        # Plot 1: Total REM windows vs. Cues Triggered vs. REM Segments Detected
        ax = plot_df.plot(x='dataset_name', y=count_columns,
                          kind='bar', figsize=(17, 8),
                          title='REM Windows, Cues Triggered, and Script-Detected REM Segments per Dataset')
        ax.set_ylabel('Count')
        ax.set_xlabel('Dataset (Simulation Session Group Name)')
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        plt.show()

        # Plot 2: cues per script-detected REM segment, as a percentage (0 when no segment).
        detected_segments = plot_df['rem_segments_script_detected']
        plot_df['cued_rem_segment_percentage'] = (
            plot_df['cues_triggered_in_rem_logic'] / detected_segments.where(detected_segments > 0) * 100
        ).fillna(0)
        # Only datasets with at least one script-detected REM segment are plotted.
        plot_df_rem_segments_for_hist = plot_df[plot_df['rem_segments_script_detected'] > 0]

        if not plot_df_rem_segments_for_hist.empty:
            ax = plot_df_rem_segments_for_hist.plot(x='dataset_name', y='cued_rem_segment_percentage', kind='bar', figsize=(17, 8),
                                                    title='Percentage of Script-Detected REM Segments Cued per Dataset')
            ax.set_ylabel('Percentage of Segments Cued (%)')
            ax.set_xlabel('Dataset (Simulation Session Group Name)')
            plt.xticks(rotation=45, ha='right')
            # Leave headroom above the tallest bar, but never show less than 0-110 %.
            ax.set_ylim(0, max(110, plot_df_rem_segments_for_hist['cued_rem_segment_percentage'].max() * 1.1))
            plt.tight_layout()
            plt.show()

            # Histogram of these percentages
            fig, hist_ax = plt.subplots(figsize=(10, 6))
            plot_df_rem_segments_for_hist['cued_rem_segment_percentage'].hist(bins=10, edgecolor='black', ax=hist_ax)
            hist_ax.set_title('Distribution of Cued REM Segment Percentages')
            hist_ax.set_xlabel('Percentage of Script-Detected REM Segments Cued')
            hist_ax.set_ylabel('Number of Datasets')
            hist_ax.grid(axis='y', alpha=0.75)
            plt.show()
        else:
            print("No datasets with script-detected REM segments found for 'cued_rem_segment_percentage' plot.")

        # --- Overall Summary Statistics from Plot DF ---
        total_rem_windows_overall = plot_df['rem_windows'].sum()
        total_cues_triggered_overall = plot_df['cues_triggered_in_rem_logic'].sum()
        total_rem_segments_script_detected_overall = plot_df['rem_segments_script_detected'].sum()

        summary_metrics = {
            'Total REM Windows (across plotted datasets)': total_rem_windows_overall,
            'Total Cues Triggered (across plotted datasets)': total_cues_triggered_overall,
            'Total Script-Detected REM Segments (across plotted datasets)': total_rem_segments_script_detected_overall
        }
        print("\n--- Overall Summary Statistics (from plotted data) ---")
        for metric, value in summary_metrics.items():
            print(f"{metric}: {value}")

        if total_rem_segments_script_detected_overall > 0:
            # Ratio of totals (total cues / total segments). This is deliberately NOT the mean
            # of the per-dataset percentages, which would over-weight datasets with few segments.
            avg_cue_per_segment_overall = (total_cues_triggered_overall / total_rem_segments_script_detected_overall) * 100
            print(f"Overall average cues per script-detected REM segment (total cues / total segments): {avg_cue_per_segment_overall:.2f}%")
        else:
            print("No REM segments detected by script across plotted datasets to calculate overall cue success rate.")

        # Re-display the styled presentation table built by the earlier presentation cell, if any.
        # It is based on its own filter of analysis_df and may differ from the plotted rows.
        if 'presentation_df' in globals() and 'styled_df' in globals() and not presentation_df.empty:
            print("\n--- Formatted Presentation Table (from original cell, may include data not in current plots) ---")
            display(styled_df)
            # The presentation cell stores its HTML summary as `summary_text_html`;
            # `summary_text` is accepted as well in case an older kernel state uses that name.
            presentation_summary_html = globals().get('summary_text_html', globals().get('summary_text'))
            if presentation_summary_html is not None:
                from IPython.display import HTML
                display(HTML(presentation_summary_html))
        else:
            print("\nOriginal presentation table (cell 577f78d7) data not available or not generated.")

### Detailed Plot for a Single Successful Simulation

Let's pick one of the successfully simulated datasets and plot its detailed sleep stages and audio cues, similar to the `test_app_data.ipynb` visualization, but adapted for the simulation output structure.

In [None]:
from datetime import datetime # Ensure datetime is imported
import matplotlib.pyplot as plt # Ensure pyplot is imported
import numpy as np # Ensure numpy is imported

# --- Configuration for this cell ---
# Set this to a specific simulation group name (a value of plot_df['dataset_name'],
# i.e. the SIM_DATASET_TIMESTAMP folder name) or leave as None to auto-select one.
CHOSEN_DATASET_NAME_FOR_DETAIL_PLOT = None
# Example: CHOSEN_DATASET_NAME_FOR_DETAIL_PLOT = "E23B6B24FX14_1743438214000" # Use the actual folder name


# --- Auto-selection logic ---
def auto_select_dataset_for_detail_plot(plot_frame=None, analysis_frame=None):
    """Pick a dataset name for the detailed plot, or return None if none is suitable.

    Args:
        plot_frame: DataFrame of successfully plotted simulations (may be None or empty).
            Rows with script-detected REM segments are preferred.
        analysis_frame: DataFrame of all analysis results, used only when
            `plot_frame` is None or empty.

    Returns:
        The chosen 'dataset_name' value, or None. The choice is also printed.
    """
    if plot_frame is not None and not plot_frame.empty:
        has_segment_column = 'rem_segments_script_detected' in plot_frame.columns
        if has_segment_column and plot_frame['rem_segments_script_detected'].sum() > 0:
            with_segments = plot_frame[plot_frame['rem_segments_script_detected'] > 0]
            chosen_name = with_segments['dataset_name'].iloc[0]
            print(f"Auto-selected dataset for detailed plot (had REM segments): {chosen_name}")
            return chosen_name
        chosen_name = plot_frame['dataset_name'].iloc[0]
        print(f"Auto-selected first available dataset from plot_df: {chosen_name}")
        return chosen_name

    if analysis_frame is not None and not analysis_frame.empty:
        # reindex() supplies missing columns as NaN instead of raising KeyError.
        status = analysis_frame.reindex(columns=['analysis_error_reason', 'total_windows'])
        # Parentheses are required: `&` binds tighter than `>`, so without them the
        # expression would be evaluated as (isna & total_windows) > 0.
        is_valid = status['analysis_error_reason'].isna() & (status['total_windows'] > 0)
        valid_for_plot = analysis_frame[is_valid]
        if not valid_for_plot.empty and 'dataset_name' in valid_for_plot.columns:
            chosen_name = valid_for_plot['dataset_name'].iloc[0]
            print(f"Auto-selected first valid dataset from analysis_df: {chosen_name}")
            return chosen_name
        print("No suitable dataset found automatically for detailed plot.")
        return None

    print("No analysis data available to auto-select a dataset for detailed plot.")
    return None


if CHOSEN_DATASET_NAME_FOR_DETAIL_PLOT is None:
    # globals().get tolerates a kernel in which the earlier analysis cells were not run.
    CHOSEN_DATASET_NAME_FOR_DETAIL_PLOT = auto_select_dataset_for_detail_plot(
        globals().get('plot_df'), globals().get('analysis_df')
    )

# --- Plotting Function ---
def plot_single_simulation_details_revised(dataset_sim_group_name: str, base_output_dir: Path):
    """Load one simulation's saved metadata and plot sleep stages with audio cue markers.

    The metadata file (``METADATA_FILENAME``) is read from a digit-named
    subfolder of ``base_output_dir / dataset_sim_group_name``. Sleep stages are
    drawn as a step line against time relative to the session start; cue
    sequence initiations and triggered cues are drawn as vertical lines on a
    twin axis.

    Args:
        dataset_sim_group_name: Name of the simulation group folder.
        base_output_dir: Directory containing the simulation group folders.

    Returns:
        None. Missing folders/files and missing or inconsistent data are
        reported with print() and end the call early; unexpected errors while
        loading or plotting are printed together with a traceback.
    """
    # Local imports keep the function usable even if an earlier cell did not import these.
    import traceback
    from datetime import datetime

    session_group_folder = base_output_dir / dataset_sim_group_name
    if not session_group_folder.is_dir():
        print(f"Session group folder not found: {session_group_folder}")
        return

    # iterdir() yields entries in arbitrary order; sorting makes the choice of
    # subfolder reproducible when more than one digit-named folder exists.
    potential_ts_subfolders = sorted(
        (d for d in session_group_folder.iterdir() if d.is_dir() and d.name.isdigit()),
        key=lambda d: d.name,
    )
    if not potential_ts_subfolders:
        print(f"No timestamped data subfolder found in {session_group_folder}.")
        # Check for direct error metadata
        direct_metadata = session_group_folder / METADATA_FILENAME
        if direct_metadata.exists():
            # NOTE: allow_pickle=True can execute arbitrary code from a crafted
            # file; only load metadata produced by your own simulation runs.
            with np.load(direct_metadata, allow_pickle=True) as data_direct:
                if "error_message" in data_direct:
                    print(f"Error metadata found directly in {session_group_folder}: {data_direct['error_message']}")
        return

    metadata_path = potential_ts_subfolders[0] / METADATA_FILENAME  # First one in sorted order

    if not metadata_path.exists():
        print(f"Metadata file not found: {metadata_path}")
        return

    print(f"Loading metadata for detailed plot from: {metadata_path}")
    try:
        # Read every field inside the context manager so the archive's file
        # handle is released before plotting starts.
        # NOTE: allow_pickle=True — see the security note above.
        with np.load(metadata_path, allow_pickle=True) as data:
            error_message = data.get("error_message")
            if error_message is not None:
                print(f"NOTE: This simulation ({dataset_sim_group_name}) reported an error in its metadata: {error_message}")

            timestamps_unix = data.get("metadata_eeg_eog_timestamps")
            scores_item = data.get("scores")
            audio_cues_unix = data.get("audio_cue_timestamps")
            audio_cue_initiations_unix = data.get("audio_cue_sequence_initiation_timestamps")
            session_start_iso_str = data.get("session_start_iso_str")

        # "scores" is expected to wrap a dict holding the per-sample stage array.
        scores_dict = scores_item.item() if scores_item is not None else None
        sleep_stages = scores_dict.get("array__sleep_stage") if isinstance(scores_dict, dict) else None

        # Default reference is the first sample timestamp; 0 means "unknown".
        has_timestamps = timestamps_unix is not None and len(timestamps_unix) > 0
        session_start_unix_ref = timestamps_unix[0] if has_timestamps else 0
        if session_start_iso_str:
            try:
                session_start_unix_ref = datetime.fromisoformat(str(session_start_iso_str)).timestamp()
            except (ValueError, OverflowError, OSError):
                # Malformed or out-of-range start string: keep the fallback reference.
                print(f"Could not parse session_start_iso_str: {session_start_iso_str}. Using first timestamp as reference.")
        elif session_start_unix_ref == 0:
            print("Could not determine session start time for relative plotting.")
            return

        if timestamps_unix is None or sleep_stages is None:
            print(f"Essential data (timestamps or sleep_stages) missing in {metadata_path}. Cannot generate detailed plot.")
            return

        n_points = min(len(sleep_stages), len(timestamps_unix))
        if len(sleep_stages) != len(timestamps_unix):
            print(f"Warning: Mismatch between sleep stages ({len(sleep_stages)}) and timestamps ({len(timestamps_unix)}). Truncating to shorter length.")
            sleep_stages = sleep_stages[:n_points]
            timestamps_unix = timestamps_unix[:n_points]
        # Checked for both the truncated and the already-equal case so an empty
        # recording never produces a blank figure.
        if n_points == 0:
            print("No data points to plot after length adjustment.")
            return

        relative_time_s = timestamps_unix - session_start_unix_ref

        fig, ax1 = plt.subplots(figsize=(20, 7))

        # Plot sleep stages
        color_stage = 'tab:blue'
        start_label = datetime.fromtimestamp(session_start_unix_ref).strftime("%Y-%m-%d %H:%M:%S")
        ax1.set_xlabel(f'Time from Session Start ({start_label}) (seconds)')
        ax1.set_ylabel('Sleep Stage', color=color_stage)
        ax1.plot(relative_time_s, sleep_stages, color=color_stage, drawstyle='steps-post', label='Simulated Sleep Stage', linewidth=1.5)
        ax1.tick_params(axis='y', labelcolor=color_stage)
        ax1.set_yticks([0, 1, 2, 3, 4])
        ax1.set_yticklabels(['Wake', 'Light', 'Deep', 'REM', 'Unknown'])
        ax1.grid(True, axis='y', linestyle=':', alpha=0.7)
        ax1.set_ylim(-0.5, 4.5)  # Give some padding to y-axis for stages

        # Twin axis for cues
        ax2 = ax1.twinx()
        ax2.set_ylim(0, 1)  # Dummy scale for vlines
        ax2.set_yticks([])  # No ticks needed for this axis

        cue_legend_handles = []
        if audio_cue_initiations_unix is not None and len(audio_cue_initiations_unix) > 0:
            relative_initiations_s = audio_cue_initiations_unix - session_start_unix_ref
            line = ax2.vlines(relative_initiations_s, 0.2, 0.8, color='green', linestyle='--', linewidth=2, label='REM Segment Start (Script Logic)')
            cue_legend_handles.append(line)

        if audio_cues_unix is not None and len(audio_cues_unix) > 0:
            relative_cues_s = audio_cues_unix - session_start_unix_ref
            line = ax2.vlines(relative_cues_s, 0.1, 0.9, color='red', linestyle='-', linewidth=2, label='Audio Cue Triggered')
            cue_legend_handles.append(line)

        # Merge the stage line and cue markers into one legend on the primary axis.
        stage_handles, stage_labels = ax1.get_legend_handles_labels()
        if cue_legend_handles:
            ax1.legend(stage_handles + cue_legend_handles, stage_labels + [h.get_label() for h in cue_legend_handles], loc='upper right')
        else:
            ax1.legend(loc='upper right')

        ax1.set_title(f'Simulated Sleep Stages & Audio Cues for: {dataset_sim_group_name}', fontsize=16)
        fig.tight_layout(rect=[0, 0, 1, 0.97])  # Leave a little headroom above the title
        plt.show()

    except Exception as e:
        print(f"CRITICAL Error plotting details for {metadata_path}: {e}")
        traceback.print_exc()

# --- Run the detailed plot for the selected dataset ---
if not CHOSEN_DATASET_NAME_FOR_DETAIL_PLOT:
    # Nothing was selected: explain why only when plot_df is missing or empty.
    if 'plot_df' not in globals() or plot_df.empty:
        print("\nNo suitable dataset was chosen or found for detailed plotting. ")
        print("Set CHOSEN_DATASET_NAME_FOR_DETAIL_PLOT manually or ensure previous cells run successfully and produce data.")
elif 'SIMULATION_OUTPUT_BASE_DIR' not in globals() or not SIMULATION_OUTPUT_BASE_DIR.is_dir():
    # A dataset is selected but there is no output directory to read it from.
    print("SIMULATION_OUTPUT_BASE_DIR is not defined or not a directory. Cannot plot details.")
else:
    print(f"\n--- Plotting detailed view for: {CHOSEN_DATASET_NAME_FOR_DETAIL_PLOT} ---")
    plot_single_simulation_details_revised(CHOSEN_DATASET_NAME_FOR_DETAIL_PLOT, SIMULATION_OUTPUT_BASE_DIR)

