# In [2]:
import json
import os
import numpy as np
import pandas as pd
from scipy import stats
from scipy.stats import ttest_rel, ttest_ind
import matplotlib.pyplot as plt
import seaborn as sns
import tkinter as tk
from tkinter import filedialog, messagebox, ttk
from tkinter import Text, Listbox, Scrollbar, END, EXTENDED
from pathlib import Path
import webbrowser
import tempfile
import html

class RyffAnalyzer:
    def __init__(self):
        """Create the main window, set up the Ryff scoring tables, and build the GUI."""
        # Top-level application window
        window = tk.Tk()
        window.title("AI Wellbeing - Ryff Scale Analysis Tool")
        window.geometry("1000x700")
        self.root = window

        # Responses are given on a 7-point scale (1 to 7)
        self.scale_points = 7

        # Item numbers that make up each subscale (Ryff 42-item version)
        self.subscales = dict(
            Autonomy=[1, 13, 24, 35, 41, 10, 21],
            Environmental_Mastery=[3, 15, 26, 36, 42, 12, 23],
            Personal_Growth=[5, 17, 28, 37, 2, 14, 25],
            Positive_Relations=[7, 18, 30, 38, 4, 16, 27],
            Purpose_in_Life=[9, 20, 32, 39, 6, 29, 33],
            Self_Acceptance=[11, 22, 34, 40, 8, 19, 31],
        )

        # Items that are reverse-scored (as per standard Ryff 42-item scoring)
        self.reverse_items = [
            1, 2, 3, 4, 6, 7, 11,
            13, 17, 20, 21, 22, 23, 27,
            29, 31, 35, 36, 37, 38, 40,
        ]

        # Per-file state, all keyed by the (de-duplicated) filename
        self.loaded_files = {}    # raw JSON content
        self.processed_data = {}  # scoring results
        self.file_labels = {}     # user-assigned labels

        self.setup_gui()
    
    def setup_gui(self):
        """Build the two-tab interface inside ``self.root``.

        Tab 1 ("File Management") holds the load button, the multi-select
        file list, the label entry and the remove/clear/show-labels buttons.
        Tab 2 ("Analysis") holds the analysis buttons, the export buttons,
        the scrollable results text area and a "Clear Results" button.

        Stores ``self.file_listbox``, ``self.label_entry`` and
        ``self.results_text`` for use by the other methods.
        """
        # Create notebook for tabs
        notebook = ttk.Notebook(self.root)
        notebook.pack(fill='both', expand=True, padx=10, pady=10)
        
        # Tab 1: File Management
        file_frame = ttk.Frame(notebook)
        notebook.add(file_frame, text="File Management")
        
        ttk.Button(file_frame, text="Load JSON Files", 
                  command=self.load_files).pack(pady=5)
        
        # File list with labels (with multiple selection)
        list_frame = ttk.Frame(file_frame)
        list_frame.pack(fill='both', expand=True, pady=5)
        
        ttk.Label(list_frame, text="Loaded Files (Ctrl+Click for multiple selection):").pack(anchor='w')
        # Rows are rebuilt from self.loaded_files by update_file_list()
        self.file_listbox = Listbox(list_frame, height=8, selectmode=EXTENDED)
        self.file_listbox.pack(fill='both', expand=True)
        
        # Labeling section
        label_frame = ttk.Frame(file_frame)
        label_frame.pack(fill='x', pady=5)
        
        ttk.Label(label_frame, text="Label selected file(s) as:").pack(side='left')
        self.label_entry = ttk.Entry(label_frame, width=20)
        self.label_entry.pack(side='left', padx=5)
        ttk.Button(label_frame, text="Set Label", command=self.set_file_label).pack(side='left')
        
        # Management buttons
        mgmt_frame = ttk.Frame(file_frame)
        mgmt_frame.pack(fill='x', pady=5)
        
        ttk.Button(mgmt_frame, text="Remove Selected File(s)", 
                  command=self.remove_files).pack(side='left', padx=5)
        ttk.Button(mgmt_frame, text="Clear All", 
                  command=self.cleanup).pack(side='left', padx=5)
        ttk.Button(mgmt_frame, text="Show File Labels", 
                  command=self.show_labels).pack(side='left', padx=5)
        
        # Tab 2: Analysis
        analysis_frame = ttk.Frame(notebook)
        notebook.add(analysis_frame, text="Analysis")
        
        # Analysis buttons
        button_frame = ttk.Frame(analysis_frame)
        button_frame.pack(fill='x', pady=5)
        
        # (button text, callback) pairs, in display order
        buttons = [
            ("1. Process Ryff Scores", self.process_ryff_scores),
            ("2. Check Internal Consistency", self.check_consistency),
            ("3. Compare Groups A vs B", self.compare_groups),
            ("4. Cross-File Similarity", self.analyze_similarity),
            ("5. Error Analysis", self.error_analysis)
        ]
        
        # Lay the buttons out in a two-column grid, filling row by row
        for i, (text, command) in enumerate(buttons):
            row = i // 2
            col = i % 2
            btn = ttk.Button(button_frame, text=text, command=command)
            btn.grid(row=row, column=col, padx=5, pady=2, sticky='ew')
        
        # Equal weights so both columns share any extra width
        button_frame.grid_columnconfigure(0, weight=1)
        button_frame.grid_columnconfigure(1, weight=1)
        
        # Export buttons
        export_frame = ttk.Frame(analysis_frame)
        export_frame.pack(fill='x', pady=5)
        
        ttk.Button(export_frame, text="Save as TXT", 
                  command=self.save_as_txt).pack(side='left', padx=5)
        ttk.Button(export_frame, text="Save as HTML", 
                  command=self.save_as_html).pack(side='left', padx=5)
        ttk.Button(export_frame, text="Save as PDF", 
                  command=self.save_as_pdf).pack(side='left', padx=5)
        
        # Results display
        results_frame = ttk.Frame(analysis_frame)
        results_frame.pack(fill='both', expand=True, pady=5)
        
        self.results_text = Text(results_frame, height=20, wrap='word', font=('Consolas', 10))
        # Wire the scrollbar and text widget to each other in both directions
        scrollbar = Scrollbar(results_frame, orient="vertical", command=self.results_text.yview)
        self.results_text.configure(yscrollcommand=scrollbar.set)
        
        self.results_text.pack(side='left', fill='both', expand=True)
        scrollbar.pack(side='right', fill='y')
        
        # Clear results button
        ttk.Button(analysis_frame, text="Clear Results", 
                  command=self.clear_results).pack(pady=5)
    
    def load_files(self):
        """Ask the user for JSON files and load each one into ``self.loaded_files``.

        A file that fails to load is reported in an error dialog and skipped.
        When a basename is already loaded, ``_1``, ``_2``, ... is inserted
        before the extension until the name is unique. Every newly loaded
        file starts with the label "unlabeled".
        """
        try:
            chosen_paths = filedialog.askopenfilenames(
                title="Select JSON files",
                filetypes=[("JSON files", "*.json")]
            )

            # Nothing to do when the dialog was cancelled
            if not chosen_paths:
                return

            loaded_count = 0
            for file_path in chosen_paths:
                try:
                    with open(file_path, 'r', encoding='utf-8') as handle:
                        payload = json.load(handle)

                    base_name = os.path.basename(file_path)
                    stem, suffix = os.path.splitext(base_name)

                    # Find the first unused key: name.json, name_1.json, name_2.json, ...
                    key = base_name
                    attempt = 1
                    while key in self.loaded_files:
                        key = f"{stem}_{attempt}{suffix}"
                        attempt += 1

                    self.loaded_files[key] = payload
                    self.file_labels[key] = "unlabeled"
                    loaded_count += 1

                except Exception as exc:
                    messagebox.showerror("Error", f"Failed to load {file_path}: {exc}")

            self.update_file_list()
            if loaded_count > 0:
                self.log_result(f"Successfully loaded {loaded_count} files.")

        except Exception as exc:
            messagebox.showerror("Error", f"Unexpected error during file loading: {exc}")
    
    def set_file_label(self):
        """Assign the text in the label entry to every file selected in the listbox.

        Warns and does nothing when no file is selected or the entry is blank.
        On success the list is refreshed, the entry is emptied and the change
        is written to the results log.
        """
        try:
            picked_rows = self.file_listbox.curselection()
            if not picked_rows:
                messagebox.showwarning("Warning", "Please select one or more files first!")
                return

            label = self.label_entry.get().strip()
            if not label:
                messagebox.showwarning("Warning", "Please enter a label!")
                return

            # Listbox rows are in the same order as the keys of loaded_files
            names_in_order = list(self.loaded_files.keys())
            labeled_files = [
                names_in_order[row] for row in picked_rows if row < len(names_in_order)
            ]
            for name in labeled_files:
                self.file_labels[name] = label

            self.update_file_list()
            self.label_entry.delete(0, END)
            self.log_result(f"Labeled {len(labeled_files)} files as '{label}': {', '.join(labeled_files)}")

        except Exception as exc:
            messagebox.showerror("Error", f"Error setting file labels: {exc}")
    
    def update_file_list(self):
        """Rebuild the listbox so each row reads ``<filename> [<label>]``."""
        listbox = self.file_listbox
        listbox.delete(0, END)
        for name in self.loaded_files:
            tag = self.file_labels.get(name, "unlabeled")
            listbox.insert(END, f"{name} [{tag}]")
    
    def show_labels(self):
        """Show all file labels"""
        self.log_result("\n=== FILE LABELS ===")
        for filename, label in self.file_labels.items():
            self.log_result(f"{filename}: {label}")
    
    def remove_files(self):
        """Drop the files selected in the listbox from every internal store.

        Warns and does nothing when no row is selected. Removed names are
        logged in reverse selection order.
        """
        try:
            picked_rows = self.file_listbox.curselection()
            if not picked_rows:
                messagebox.showwarning("Warning", "Please select one or more files first!")
                return

            # Snapshot of the keys: listbox rows follow this order
            names_in_order = list(self.loaded_files.keys())
            to_remove = [
                names_in_order[row]
                for row in reversed(picked_rows)
                if row < len(names_in_order)
            ]

            for name in to_remove:
                del self.loaded_files[name]
                # Label and processed results may or may not exist for this file
                self.file_labels.pop(name, None)
                self.processed_data.pop(name, None)

            self.update_file_list()
            self.log_result(f"Removed {len(to_remove)} files: {', '.join(to_remove)}")

        except Exception as exc:
            messagebox.showerror("Error", f"Error removing files: {exc}")
    
    def cleanup(self):
        """Forget all loaded files, labels and results, then reset the display."""
        try:
            for store in (self.loaded_files, self.processed_data, self.file_labels):
                store.clear()
            self.update_file_list()
            self.clear_results()
            self.log_result("All data cleared.")
        except Exception as exc:
            messagebox.showerror("Error", f"Error clearing data: {exc}")
    
    def clear_results(self):
        """Erase everything shown in the results text widget."""
        # "1.0" is line 1, character 0, i.e. the very start of the widget
        self.results_text.delete("1.0", END)
    
    def get_results_text(self):
        """Return the entire contents of the results text widget."""
        widget = self.results_text
        return widget.get("1.0", END)
    
    def save_as_txt(self):
        """Write the current results to a UTF-8 text file chosen by the user.

        Warns when the results area is empty; does nothing when the save
        dialog is cancelled.
        """
        try:
            content = self.get_results_text()
            if not content.strip():
                messagebox.showwarning("Warning", "No results to save!")
                return

            target = filedialog.asksaveasfilename(
                defaultextension=".txt",
                filetypes=[("Text files", "*.txt"), ("All files", "*.*")]
            )
            if not target:
                return

            with open(target, 'w', encoding='utf-8') as out_file:
                out_file.write(content)
            messagebox.showinfo("Success", f"Results saved to {target}")

        except Exception as exc:
            messagebox.showerror("Error", f"Failed to save file: {exc}")
    
    def save_as_html(self):
        """Write the current results to a standalone HTML file chosen by the user.

        The results text is HTML-escaped and placed in a monospace,
        whitespace-preserving body. Warns when the results area is empty;
        does nothing when the save dialog is cancelled.
        """
        try:
            content = self.get_results_text()
            if not content.strip():
                messagebox.showwarning("Warning", "No results to save!")
                return

            filename = filedialog.asksaveasfilename(
                defaultextension=".html",
                filetypes=[("HTML files", "*.html"), ("All files", "*.*")]
            )
            if not filename:
                return

            # The file is written as UTF-8 and the report contains non-ASCII
            # symbols, so the document must declare its encoding; without the
            # <meta charset> a browser may guess a legacy encoding.
            html_content = (
                "<!DOCTYPE html>\n"
                "<html>\n"
                "<head>\n"
                '    <meta charset="utf-8">\n'
                "    <title>Ryff Scale Analysis Results</title>\n"
                "    <style>\n"
                "        body { font-family: 'Courier New', monospace; white-space: pre-wrap; margin: 20px; }\n"
                "        .header { color: #2c3e50; font-weight: bold; }\n"
                "    </style>\n"
                "</head>\n"
                "<body>\n"
                f"{html.escape(content)}\n"
                "</body>\n"
                "</html>\n"
            )

            with open(filename, 'w', encoding='utf-8') as f:
                f.write(html_content)
            messagebox.showinfo("Success", f"Results saved to {filename}")

        except Exception as e:
            messagebox.showerror("Error", f"Failed to save file: {str(e)}")
    
    def save_as_pdf(self):
        """Write the current results to a PDF file chosen by the user.

        Requires the optional ``reportlab`` package; an error dialog with
        install instructions is shown when it cannot be imported. Warns when
        the results area is empty; does nothing when the dialog is cancelled.
        """
        try:
            content = self.get_results_text()
            if not content.strip():
                messagebox.showwarning("Warning", "No results to save!")
                return

            target = filedialog.asksaveasfilename(
                defaultextension=".pdf",
                filetypes=[("PDF files", "*.pdf"), ("All files", "*.*")]
            )
            if not target:
                return

            try:
                from reportlab.lib.pagesizes import letter
                from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
                from reportlab.lib.styles import getSampleStyleSheet
                from reportlab.lib.units import inch

                document = SimpleDocTemplate(target, pagesize=letter)
                code_style = getSampleStyleSheet()['Code']

                # One escaped paragraph per non-blank line; blank lines become spacers
                story = [
                    Paragraph(html.escape(line), code_style)
                    if line.strip()
                    else Spacer(1, 0.1*inch)
                    for line in content.split('\n')
                ]

                document.build(story)
                messagebox.showinfo("Success", f"Results saved to {target}")

            except ImportError:
                messagebox.showerror("Error", "PDF export requires reportlab library. Install with: pip install reportlab")

        except Exception as exc:
            messagebox.showerror("Error", f"Failed to save PDF: {exc}")
    
    def reverse_score(self, value):
        """Apply reverse scoring formula for a 1-7 Likert scale."""
        if value is None:
            return None
        
        try:
            numeric_value = float(value)
            if not (1 <= numeric_value <= self.scale_points) or not numeric_value.is_integer():
                return None
            
            # For a 7-point scale, this is 8 - value
            return (self.scale_points + 1) - int(numeric_value)
            
        except (ValueError, TypeError):
            return None
    
    def safe_std(self, data, ddof=1):
        """Safely calculate standard deviation, handling edge cases"""
        if not data or len(data) <= ddof: # Needs at least 2 points for ddof=1
            return None
        try:
            return np.std(data, ddof=ddof)
        except:
            return None
    
    def safe_mean(self, data):
        """Safely calculate mean, handling edge cases"""
        if not data:
            return None
        try:
            return np.mean(data)
        except:
            return None
    
    def process_single_file(self, data, filename):
        """
        Process a single JSON file for Ryff scoring.
        Calculates total score, subscale sums, and subscale averages.
        """
        processed = {
            'filename': filename,
            'raw_scores': {},
            'reversed_scores': {},
            'subscale_sums': {},
            'subscale_averages': {},
            'global_ryff_total_score': None, # Initialize as None, set if valid
            'errors': [],
            'invalid_entries': [],
            'valid': True
        }
        
        # Process each item (1 to 42)
        error_count = 0
        valid_reversed_scores_for_total = {} # Collect valid reversed scores for final sum
        
        for item_num in range(1, 43):
            item_key = str(item_num)
            
            if item_key not in data:
                processed['errors'].append(f"Missing item {item_num}")
                processed['invalid_entries'].append(f"Item {item_num}: missing")
                error_count += 1
                continue
            
            raw_value = data[item_key]
            
            if raw_value is None:
                processed['errors'].append(f"Null value for item {item_num}")
                processed['invalid_entries'].append(f"Item {item_num}: null")
                error_count += 1
                continue
            
            if isinstance(raw_value, (list, dict)):
                processed['errors'].append(f"Multiple values (list/dict) for item {item_num}")
                processed['invalid_entries'].append(f"Item {item_num}: {raw_value}")
                error_count += 1
                continue
            
            try:
                numeric_value = float(raw_value)
                if not (1 <= numeric_value <= self.scale_points) or not numeric_value.is_integer():
                    processed['errors'].append(f"Invalid value (out of range 1-{self.scale_points} or not integer) for item {item_num}: {raw_value}")
                    processed['invalid_entries'].append(f"Item {item_num}: {raw_value}")
                    error_count += 1
                    continue
                numeric_value = int(numeric_value)
            except (ValueError, TypeError):
                processed['errors'].append(f"Non-numeric value for item {item_num}: {raw_value}")
                processed['invalid_entries'].append(f"Item {item_num}: {raw_value}")
                error_count += 1
                continue
            
            # Successfully processed raw item
            processed['raw_scores'][item_num] = numeric_value
            
            # Apply reverse scoring if needed
            reversed_val = None
            if item_num in self.reverse_items:
                reversed_val = self.reverse_score(numeric_value)
            else:
                reversed_val = numeric_value
            
            if reversed_val is not None:
                processed['reversed_scores'][item_num] = reversed_val
                valid_reversed_scores_for_total[item_num] = reversed_val # Add to valid list for total sum
            else:
                processed['errors'].append(f"Error during reverse/direct scoring for item {item_num}: {raw_value}")
                processed['invalid_entries'].append(f"Item {item_num}: Scoring error")
                error_count += 1
        
        # Mark file as invalid if too many errors
        if error_count >= 8:
            processed['valid'] = False
            return processed
        
        # Calculate Subscale Sums and Averages
        for subscale_name, items in self.subscales.items():
            subscale_item_values = []
            for item_num in items:
                if item_num in processed['reversed_scores']: # Use only successfully reversed items
                    subscale_item_values.append(processed['reversed_scores'][item_num])
            
            if subscale_item_values:
                processed['subscale_sums'][subscale_name] = sum(subscale_item_values)
                processed['subscale_averages'][subscale_name] = self.safe_mean(subscale_item_values)
            else:
                processed['subscale_sums'][subscale_name] = None
                processed['subscale_averages'][subscale_name] = None
        
        # Calculate Global Ryff Total Score (sum of all VALID reversed items)
        # Ensure that only items successfully processed and reversed contribute to the total
        if valid_reversed_scores_for_total:
            processed['global_ryff_total_score'] = sum(valid_reversed_scores_for_total.values())
        else:
            processed['global_ryff_total_score'] = None
            processed['valid'] = False # If no valid items for total score, mark as invalid
        
        return processed
    
    def create_scores_table(self, valid_files):
        """Create formatted tables for scores"""
        self.log_result("\nGLOBAL RYFF TOTAL SCORES TABLE (Sum of valid reversed items):")
        self.log_result("=" * 80)
        self.log_result(f"{'Filename':<30} {'Label':<15} {'Total Score':<12}")
        self.log_result("-" * 80)
        
        for filename in valid_files:
            label = self.file_labels.get(filename, "unlabeled")
            score = self.processed_data[filename].get('global_ryff_total_score') # Use .get for safety
            if score is not None:
                self.log_result(f"{filename:<30} {label:<15} {score:<12.3f}")
            else:
                self.log_result(f"{filename:<30} {label:<15} {'N/A':<12}")
        
        self.log_result(f"\nSUBSCALE SUMS TABLE:")
        self.log_result("=" * 120)
        
        # Header for subscale sums
        header_sums = f"{'Filename':<20} {'Label':<10}"
        for subscale in self.subscales.keys():
            header_sums += f" {subscale[:8]:<9}"
        self.log_result(header_sums)
        self.log_result("-" * 120)
        
        # Data rows for subscale sums
        for filename in valid_files:
            label = self.file_labels.get(filename, "unlabeled")
            row_sums = f"{filename:<20} {label:<10}"
            
            for subscale in self.subscales.keys():
                score = self.processed_data[filename]['subscale_sums'].get(subscale)
                if score is not None:
                    row_sums += f" {score:<9.3f}"
                else:
                    row_sums += f" {'N/A':<9}"
            
            self.log_result(row_sums)

        self.log_result(f"\nSUBSCALE AVERAGES TABLE:")
        self.log_result("=" * 120)
        
        # Header for subscale averages
        header_averages = f"{'Filename':<20} {'Label':<10}"
        for subscale in self.subscales.keys():
            header_averages += f" {subscale[:8]:<9}"
        self.log_result(header_averages)
        self.log_result("-" * 120)
        
        # Data rows for subscale averages
        for filename in valid_files:
            label = self.file_labels.get(filename, "unlabeled")
            row_averages = f"{filename:<20} {label:<10}"
            
            for subscale in self.subscales.keys():
                score = self.processed_data[filename]['subscale_averages'].get(subscale)
                if score is not None:
                    row_averages += f" {score:<9.3f}"
                else:
                    row_averages += f" {'N/A':<9}"
            
            self.log_result(row_averages)
            
    def analyze_subscale_extremes(self, valid_files):
        """Analyze highest and lowest scoring subscales based on subscale averages."""
        self.log_result(f"\nSUBSCALE EXTREMES ANALYSIS (Based on Subscale Averages):")
        self.log_result("=" * 60)
        
        self.log_result("INDIVIDUAL FILE ANALYSIS:")
        self.log_result("-" * 40)
        
        for filename in valid_files:
            data = self.processed_data[filename]
            label = self.file_labels.get(filename, "unlabeled")
            subscale_averages = data['subscale_averages']
            
            if subscale_averages:
                filtered_averages = {k: v for k, v in subscale_averages.items() if v is not None}
                
                if filtered_averages:
                    highest_subscale = max(filtered_averages.items(), key=lambda x: x[1])
                    lowest_subscale = min(filtered_averages.items(), key=lambda x: x[1])
                    
                    self.log_result(f"\n{filename} [{label}]:")
                    self.log_result(f"  Highest: {highest_subscale[0]} ({highest_subscale[1]:.3f})")
                    self.log_result(f"  Lowest:  {lowest_subscale[0]} ({lowest_subscale[1]:.3f})")
                    self.log_result(f"  Range:   {highest_subscale[1] - lowest_subscale[1]:.3f}")
                else:
                    self.log_result(f"\n{filename} [{label}]: No valid subscale averages to analyze extremes.")
            else:
                self.log_result(f"\n{filename} [{label}]: No subscale averages available for analysis.")
        
        self.log_result(f"\nOVERALL ANALYSIS ACROSS ALL FILES:")
        self.log_result("-" * 40)
        
        subscale_group_averages = {}
        for subscale_name in self.subscales.keys():
            scores_across_files = []
            for filename in valid_files:
                subscale_avg = self.processed_data[filename]['subscale_averages'].get(subscale_name)
                if subscale_avg is not None:
                    scores_across_files.append(subscale_avg)
            
            if scores_across_files:
                subscale_group_averages[subscale_name] = self.safe_mean(scores_across_files)
        
        if subscale_group_averages:
            valid_group_averages = {k: v for k, v in subscale_group_averages.items() if v is not None}
            
            if valid_group_averages:
                highest_overall = max(valid_group_averages.items(), key=lambda x: x[1])
                lowest_overall = min(valid_group_averages.items(), key=lambda x: x[1])
                
                self.log_result(f"\nAverage subscale scores across {len(valid_files)} files:")
                for subscale, avg_score in sorted(valid_group_averages.items(), key=lambda x: x[1], reverse=True):
                    self.log_result(f"  {subscale:<20}: {avg_score:.3f}")
                
                self.log_result(f"\nOVERALL EXTREMES:")
                self.log_result(f"  Highest scoring subscale: {highest_overall[0]} ({highest_overall[1]:.3f})")
                self.log_result(f"  Lowest scoring subscale:  {lowest_overall[0]} ({lowest_overall[1]:.3f})")
                self.log_result(f"  Overall range:            {highest_overall[1] - lowest_overall[1]:.3f}")
            else:
                self.log_result("\nNo valid overall subscale averages available for extreme analysis.")
        else:
            self.log_result("\nNo overall subscale averages available for extreme analysis.")
            
    def process_ryff_scores(self):
        """
        Process all loaded files for Ryff scoring, calculating total scores,
        subscale sums, and subscale averages for each file.
        """
        if not self.loaded_files:
            messagebox.showwarning("Warning", "No files loaded!")
            return
        
        try:
            self.log_result("=" * 60)
            self.log_result("RYFF PSYCHOLOGICAL WELL-BEING SCALE PROCESSING")
            self.log_result("=" * 60)
            
            self.log_result("\nMETHODOLOGY EXPLANATION:")
            self.log_result("The Ryff Scale measures psychological well-being across 6 dimensions.")
            self.log_result(f"Each dimension is composed of 7 items, rated on a {self.scale_points}-point Likert scale (1=strongly disagree, {self.scale_points}=strongly agree).")
            self.log_result("This process calculates scores for each AI response file (subject) based on these items.")
            
            self.log_result("\nREVERSE SCORING:")
            self.log_result(f"Items worded negatively are reverse-scored to ensure higher scores consistently reflect higher well-being.")
            self.log_result(f"Formula used: ({self.scale_points + 1} - original_score). For a 1-7 scale, this is (8 - original_score).")
            self.log_result(f"Items identified for reverse-scoring: {sorted(self.reverse_items)}")
            
            self.log_result("\nSCORING FORMULAS:")
            self.log_result("- **Subscale Sum:** Simple sum of the 7 valid reversed item scores within each specific subscale.")
            self.log_result("- **Subscale Average:** Mean (average) of the 7 valid reversed item scores within each specific subscale.")
            self.log_result(f"- **Global Ryff Total Score:** Simple SUM of all 42 valid individual item scores (after reverse scoring).")
            self.log_result(f"  - This total score can range from {42 * 1} (all minimums) to {42 * self.scale_points} (all maximums). For a 1-7 scale, this range is {42} to {294}.")
            
            self.log_result("\nVALIDITY THRESHOLD:")
            self.log_result("Files with 8 or more invalid responses (missing items, non-numeric, out-of-range values, multiple values) are marked as INVALID.")
            self.log_result("INVALID files are excluded from statistical comparisons and consistency analyses to ensure reliable results.")
            self.log_result("\nACCEPTABLE ITEM VALUES:")
            self.log_result(f"Only single integer values from 1 to {self.scale_points} are considered valid responses for individual items.")
            self.log_result("-" * 60)
            
            valid_files = []
            invalid_files = []
            
            for filename, data in self.loaded_files.items():
                processed = self.process_single_file(data, filename)
                self.processed_data[filename] = processed
                
                label = self.file_labels.get(filename, "unlabeled")
                
                if processed['valid']:
                    valid_files.append(filename)
                    self.log_result(f"\n✓ {filename} [{label}] - VALID FILE")
                    score = processed['global_ryff_total_score']
                    if score is not None:
                        self.log_result(f"  Calculated Global Ryff Total Score: {score:.3f}")
                    else:
                        self.log_result(f"  Global Ryff Total Score: N/A (Insufficient valid items)")
                    self.log_result(f"  Number of errors/invalid entries detected: {len(processed['errors'])}")
                    
                    self.log_result("  Subscale Sums:")
                    for subscale, score_sum in processed['subscale_sums'].items():
                        if score_sum is not None:
                            self.log_result(f"    {subscale}: {score_sum:.3f}")
                        else:
                            self.log_result(f"    {subscale}: N/A (Insufficient valid items)")
                    
                    self.log_result("  Subscale Averages:")
                    for subscale, score_avg in processed['subscale_averages'].items():
                        if score_avg is not None:
                            self.log_result(f"    {subscale}: {score_avg:.3f}")
                        else:
                            self.log_result(f"    {subscale}: N/A (Insufficient valid items)")
                else:
                    invalid_files.append(filename)
                    self.log_result(f"\n✗ {filename} [{label}] - INVALID FILE ({len(processed['errors'])} errors)")
                    self.log_result(f"  This file exceeded the validity threshold (≥8 errors) and will be excluded from further analyses.")
                    if processed['invalid_entries']:
                        self.log_result(f"  Example invalid entries: {', '.join(processed['invalid_entries'][:5])}{'...' if len(processed['invalid_entries']) > 5 else ''}")
            
            # Create summary tables at the end
            if valid_files:
                self.create_scores_table(valid_files)
                self.analyze_subscale_extremes(valid_files)
            else:
                self.log_result("\nNo valid files were processed for summary tables.")
            
            self.log_result(f"\n" + "=" * 40)
            self.log_result(f"PROCESSING SUMMARY:")
            self.log_result(f"Total files attempted: {len(self.loaded_files)}")
            self.log_result(f"Valid files for analysis: {len(valid_files)}")
            self.log_result(f"Invalid files (excluded): {len(invalid_files)}")
            total_files_sum = len(valid_files) + len(invalid_files)
            if total_files_sum > 0:
                self.log_result(f"Success rate: {len(valid_files)/total_files_sum*100:.1f}%")
            else:
                self.log_result("Success rate: N/A")
            self.log_result("=" * 40)
            
        except Exception as e:
            messagebox.showerror("Error", f"Error processing Ryff scores: {str(e)}")
    
    def check_consistency(self):
        """
        Log a within-subscale response-consistency report for every processed file.

        For each file marked valid in ``self.processed_data``, the sample standard
        deviation (``ddof=1``) of the reversed item scores is computed per subscale
        through ``self.safe_std``. A subscale is only scored when at least 5 of its
        items are present; it is flagged when its SD exceeds 2.0. A file is reported
        as globally inconsistent when more than 2 of its subscales are flagged.
        Files marked invalid are logged as skipped, and an overall summary is logged
        after all files have been visited.

        Shows a warning dialog and returns early when nothing has been processed.
        Any exception raised during the analysis is reported in an error dialog.
        """
        if not self.processed_data:
            messagebox.showwarning("Warning", "Please process Ryff scores first!")
            return

        min_items_per_subscale = 5
        max_flagged_subscales = 2  # more flagged subscales than this => globally inconsistent
        heavy_rule = "=" * 60

        try:
            log = self.log_result

            # --- Report header and methodology notes ---
            for line in (
                "\n" + heavy_rule,
                "INTERNAL CONSISTENCY (WITHIN SUBSCALE) ANALYSIS",
                heavy_rule,
                "\nMETHODOLOGY EXPLANATION:",
                "This analysis assesses if an AI responded consistently to items measuring the same underlying construct.",
                "We calculate the Standard Deviation (SD) of the reversed item scores *within each subscale* for *each file*.",
                "A minimum of 5 valid items (out of 7) is required to calculate the SD for a subscale to ensure the measure is meaningful.",
                "\nCONSISTENCY RULES & THRESHOLDS:",
                "1. **Subscale Inconsistency:** A subscale is flagged if its item response SD > 2.0.",
                "2. **Global Inconsistency:** An entire file is flagged as 'Globally Inconsistent' if it has MORE THAN 2 inconsistent subscales (i.e., fewer than 4 consistent scales).",
                "-" * 60,
            ):
                log(line)

            # Number of files having exactly k flagged subscales, for k = 1..6.
            files_by_flag_count = dict.fromkeys(range(1, 7), 0)
            globally_inconsistent_total = 0

            # --- Per-file analysis ---
            for filename, data in self.processed_data.items():
                label = self.file_labels.get(filename, "unlabeled")
                if not data['valid']:
                    log(f"\n{filename} [{label}]: SKIPPED (marked as INVALID in processing stage).")
                    continue

                log(f"\nANALYZING FILE: {filename} [{label}]")
                log("  Subscale Item Response Standard Deviations:")

                flagged_here = 0
                for subscale_name, items in self.subscales.items():
                    # Only items that actually have a reversed score take part in the SD.
                    scores = [
                        data['reversed_scores'][item_num]
                        for item_num in items
                        if item_num in data['reversed_scores']
                    ]

                    if len(scores) < min_items_per_subscale:
                        log(f"    {subscale_name:<20}: N/A (Cannot calculate. Found {len(scores)}/{len(items)} valid items, need at least {min_items_per_subscale}).")
                        continue

                    spread = self.safe_std(scores, ddof=1)
                    if spread is None:
                        log(f"    {subscale_name:<20}: N/A (Cannot calculate SD).")
                        continue

                    if spread > 2.0:
                        flagged_here += 1
                        marker = "⚠️"
                    else:
                        marker = "✓"
                    log(f"    {marker} {subscale_name:<20}: {spread:.3f} --- Scores: {tuple(scores)}")

                # Files without any flagged subscale are not tallied in the breakdown.
                if flagged_here > 0:
                    files_by_flag_count[flagged_here] += 1

                if flagged_here > max_flagged_subscales:
                    globally_inconsistent_total += 1
                    log(f"  ⚠️ This file is considered GLOBALLY INCONSISTENT ({flagged_here} inconsistent scales found, exceeds threshold of {max_flagged_subscales}).")
                else:
                    log(f"  ✓ This file is considered GLOBALLY CONSISTENT ({flagged_here} inconsistent scales found).")

            # --- Overall summary ---
            valid_total = sum(1 for entry in self.processed_data.values() if entry['valid'])
            log("\n" + heavy_rule)
            log("OVERALL CONSISTENCY SUMMARY")
            log(heavy_rule)
            log(f"Total valid files analyzed for consistency: {valid_total}")

            log("\nBreakdown by number of inconsistent scales per file:")
            populated = [(num, count) for num, count in files_by_flag_count.items() if count > 0]
            for num, count in populated:
                log(f"  - Files with exactly {num} inconsistent scale(s): {count}")
            if not populated and valid_total > 0:
                log("  ✓ No files were found with any inconsistent scales.")

            log(f"\nRule: A file is 'Globally Inconsistent' if it has >{max_flagged_subscales} inconsistent scales (i.e., fewer than 4 consistent scales).")
            log(f"Total files flagged as Globally Inconsistent: {globally_inconsistent_total}")

            if valid_total > 0:
                consistent_share = (valid_total - globally_inconsistent_total) / valid_total * 100
                log(f"Overall File Consistency Rate (files deemed 'Globally Consistent'): {consistent_share:.1f}%")
            else:
                log("Overall File Consistency Rate: N/A (no valid files)")
            log(heavy_rule)

        except Exception as e:
            messagebox.showerror("Error", f"Error checking consistency: {str(e)}")

    def determine_comparison_type(self, n_a, n_b):
        """
        Name the comparison scenario implied by the two group sizes.

        Returns one of "one_sample_a_vs_multi_b", "multi_a_vs_one_sample_b",
        "two_sample_multi_vs_multi" or "both_single". Any size combination that is
        not covered by the first three (including an empty group) yields
        "both_single".
        """
        a_is_multi = n_a > 1
        b_is_multi = n_b > 1

        if a_is_multi and b_is_multi:
            return "two_sample_multi_vs_multi"
        if a_is_multi and n_b == 1:
            return "multi_a_vs_one_sample_b"
        if n_a == 1 and b_is_multi:
            return "one_sample_a_vs_multi_b"
        # Everything else, typically one file on each side.
        return "both_single"
            
    def calculate_cohens_d(self, mean_sample, sd_sample, mean_reference, sd_reference=None, comparison_type=None):
        """
        Return Cohen's d (always non-negative) for the given comparison scenario.

        mean_sample / sd_sample: mean and SD of the group treated as the sample.
        mean_reference: mean of the reference group, or the single reference score.
        sd_reference: SD of the reference group; only read for
            "two_sample_multi_vs_multi".
        comparison_type: scenario name as a string.

        One-sample scenarios divide the absolute mean difference by ``sd_sample``.
        The two-sample scenario divides it by the square root of the mean of the two
        variances. 0.0 is returned whenever a required SD is missing, the
        denominator is not positive, or the scenario is anything else.
        """
        gap = abs(mean_sample - mean_reference)

        if comparison_type in ("one_sample_a_vs_multi_b", "multi_a_vs_one_sample_b"):
            # The SD comes from whichever group has more than one observation.
            usable = sd_sample is not None and sd_sample > 0
            return gap / sd_sample if usable else 0.0

        if comparison_type == "two_sample_multi_vs_multi":
            if sd_sample is None or sd_reference is None:
                return 0.0
            if not (sd_sample > 0 or sd_reference > 0):
                return 0.0
            # Denominator: root of the average of the two group variances.
            averaged_sd = np.sqrt((sd_sample**2 + sd_reference**2) / 2)
            if averaged_sd > 0:
                return gap / averaged_sd
            return 0.0

        # "both_single" (and any unknown scenario) has no effect size here.
        return 0.0
        
    def perform_subscale_comparison(self, group_a_files, group_b_files, group_a_label, group_b_label):
        """
        Compare two groups of files on every subscale and log the results.

        For each subscale in ``self.subscales`` the per-file subscale averages are
        read from ``self.processed_data`` (files missing from it, or with a ``None``
        average for that subscale, are ignored). Descriptive statistics are logged,
        then the test matching the group sizes is run: a one-sample t-test when one
        group has a single value, Welch's t-test when both have several, and no test
        when both are single. Cohen's d comes from ``self.calculate_cohens_d``.

        Args:
            group_a_files: filenames belonging to group A.
            group_b_files: filenames belonging to group B.
            group_a_label: display label for group A.
            group_b_label: display label for group B.

        Returns:
            None. All output goes through ``self.log_result``.
        """
        self.log_result("\nSUBSCALE LEVEL COMPARISON:")
        self.log_result("-" * 40)

        def collect_averages(filenames, subscale_name):
            # Per-file averages for one subscale; unknown files and None values are dropped.
            averages = []
            for filename in filenames:
                if filename in self.processed_data:
                    subscale_avg = self.processed_data[filename]['subscale_averages'].get(subscale_name)
                    if subscale_avg is not None:
                        averages.append(subscale_avg)
            return averages

        significant_subscales = []  # Names of subscales with p < 0.05

        for subscale_name in self.subscales:
            group_a_subscale_averages = collect_averages(group_a_files, subscale_name)
            group_b_subscale_averages = collect_averages(group_b_files, subscale_name)
            n_a = len(group_a_subscale_averages)
            n_b = len(group_b_subscale_averages)

            self.log_result(f"\n  Subscale: {subscale_name}")

            # --- Descriptive Statistics for Subscales ---
            a_mean_sub = self.safe_mean(group_a_subscale_averages)
            b_mean_sub = self.safe_mean(group_b_subscale_averages)

            a_std_sub = self.safe_std(group_a_subscale_averages, ddof=1)
            b_std_sub = self.safe_std(group_b_subscale_averages, ddof=1)

            if a_mean_sub is None or b_mean_sub is None or n_a == 0 or n_b == 0:
                self.log_result(f"    Insufficient valid data for this subscale comparison. Group A N={n_a}, Group B N={n_b}.")
                continue  # Skip to next subscale

            self.log_result(f"    Group A ({group_a_label}): Mean={a_mean_sub:.3f}{f', SD={a_std_sub:.3f}' if a_std_sub is not None else ''} (n={n_a})")
            self.log_result(f"    Group B ({group_b_label}): Mean={b_mean_sub:.3f}{f', SD={b_std_sub:.3f}' if b_std_sub is not None else ''} (n={n_b})")
            self.log_result(f"    Absolute difference: {abs(a_mean_sub - b_mean_sub):.3f}")

            t_stat = p_val = df = effect_size = None
            test_type = "Not applicable"  # Default if no test is performed

            # The scenario is decided per subscale because missing averages can
            # shrink either group for this particular subscale.
            current_subscale_comp_type = self.determine_comparison_type(n_a, n_b)

            # --- Apply appropriate statistical test and effect size calculation ---
            try:
                if current_subscale_comp_type == "one_sample_a_vs_multi_b":
                    # Comparing Group B (multiple) mean to Group A (single) score
                    if n_b >= 2:
                        t_stat, p_val = stats.ttest_1samp(group_b_subscale_averages, group_a_subscale_averages[0])
                        effect_size = self.calculate_cohens_d(b_mean_sub, b_std_sub, group_a_subscale_averages[0], None, current_subscale_comp_type)
                        df = n_b - 1
                        test_type = "One-sample t-test"
                        self.log_result(f"    Formula: t = (Mean_GroupB - Single_Score_GroupA) / (SD_GroupB / sqrt(n_GroupB))")
                        self.log_result(f"    Cohen's d Formula: d = |Mean_GroupB - Single_Score_GroupA| / SD_GroupB")
                    else:
                        self.log_result(f"    Cannot perform One-Sample t-test: Group B must have at least 2 files for SD calculation. Current N={n_b}.")

                elif current_subscale_comp_type == "multi_a_vs_one_sample_b":
                    # Comparing Group A (multiple) mean to Group B (single) score
                    if n_a >= 2:
                        t_stat, p_val = stats.ttest_1samp(group_a_subscale_averages, group_b_subscale_averages[0])
                        effect_size = self.calculate_cohens_d(a_mean_sub, a_std_sub, group_b_subscale_averages[0], None, current_subscale_comp_type)
                        df = n_a - 1
                        test_type = "One-sample t-test"
                        self.log_result(f"    Formula: t = (Mean_GroupA - Single_Score_GroupB) / (SD_GroupA / sqrt(n_GroupA))")
                        self.log_result(f"    Cohen's d Formula: d = |Mean_GroupA - Single_Score_GroupB| / SD_GroupA")
                    else:
                        self.log_result(f"    Cannot perform One-Sample t-test: Group A must have at least 2 files for SD calculation. Current N={n_a}.")

                elif current_subscale_comp_type == "two_sample_multi_vs_multi":
                    # Comparing Group A (multiple) mean to Group B (multiple) mean
                    if n_a >= 2 and n_b >= 2:
                        t_stat, p_val = stats.ttest_ind(group_a_subscale_averages, group_b_subscale_averages, equal_var=False)
                        effect_size = self.calculate_cohens_d(a_mean_sub, a_std_sub, b_mean_sub, b_std_sub, current_subscale_comp_type)
                        test_type = "Welch's Two-sample t-test"
                        # Welch-Satterthwaite degrees of freedom
                        if a_std_sub is not None and b_std_sub is not None:
                            s1_sq_n1 = a_std_sub**2 / n_a
                            s2_sq_n2 = b_std_sub**2 / n_b
                            df_denominator = s1_sq_n1**2 / (n_a - 1) + s2_sq_n2**2 / (n_b - 1)
                            if df_denominator > 0:
                                df = (s1_sq_n1 + s2_sq_n2)**2 / df_denominator
                            else:
                                # Both groups have zero variance: the formula is 0/0,
                                # so report it as unavailable instead of dividing.
                                df = "N/A (SDs not calculable or zero)"
                        else:
                            df = "N/A (SDs not calculable)"
                        self.log_result(f"    Formula: t = (Mean_GroupA - Mean_GroupB) / sqrt((Var_GroupA/n_GroupA) + (Var_GroupB/n_GroupB))")
                        self.log_result(f"    Cohen's d Formula: d = |Mean_GroupA - Mean_GroupB| / sqrt((SD_GroupA^2 + SD_GroupB^2)/2)")
                    else:
                        self.log_result(f"    Cannot perform Welch's t-test: Both groups must have at least 2 files for SD calculation. Group A N={n_a}, Group B N={n_b}.")

                else:  # both_single
                    self.log_result(f"    No statistical test can be performed between two single files at the subscale level.")

            except Exception as e:
                self.log_result(f"    Error during statistical calculation for {subscale_name}: {str(e)}")
                # Never report a half-computed result: the block below formats
                # every value and would fail on a leftover None.
                t_stat = p_val = df = effect_size = None

            # --- Report Statistical Results & Interpretation for Subscale ---
            if t_stat is not None:
                self.log_result(f"    Statistical Test: {test_type}")
                self.log_result(f"    t-statistic: {t_stat:.3f}")
                if isinstance(df, (int, float)):  # df is either a number or an explanatory string
                    self.log_result(f"    Degrees of freedom: {df:.1f}")
                else:
                    self.log_result(f"    Degrees of freedom: {df}")
                self.log_result(f"    p-value: {p_val:.3f}")
                self.log_result(f"    Effect size (Cohen's d): {effect_size:.3f}")

                if p_val < 0.05:
                    significant_subscales.append(subscale_name)
                    self.log_result(f"    ⚠️  Statistically Significant Difference (p < 0.05)")
                else:
                    self.log_result(f"    ✓ No Statistically Significant Difference (p ≥ 0.05)")

                if effect_size >= 0.8:
                    self.log_result(f"    ⚠️  LARGE practical effect size (d ≥ 0.8)")
                elif effect_size >= 0.5:
                    self.log_result(f"    ⚠️  MEDIUM practical effect size (d ≥ 0.5)")
                elif effect_size >= 0.2:
                    self.log_result(f"    ~ SMALL practical effect size (d ≥ 0.2)")
                else:
                    self.log_result(f"    ✓ NEGLIGIBLE practical effect size (d < 0.2)")
            else:
                self.log_result(f"    Statistical test results are not available for this comparison scenario.")

        # Final Summary for Subscale Comparison
        if significant_subscales:
            self.log_result(f"\n⚠️  SUMMARY: SUBSCALES WITH STATISTICALLY SIGNIFICANT DIFFERENCES:")
            for subscale in significant_subscales:
                self.log_result(f"    - {subscale}")
        else:
            self.log_result(f"\n✓ SUMMARY: No subscales show statistically significant differences.")
    
    def compare_groups(self):
        """
        Compares two groups of files (defined by user labels) based on their Global Ryff Total Scores
        and performs appropriate statistical tests (One-Sample T-test or Welch's T-test).
        Then proceeds to perform detailed subscale comparisons.
        """
        if not self.processed_data:
            messagebox.showwarning("Warning", "Please process Ryff scores first!")
            return
        
        try:
            # Get all unique labels
            unique_labels = set(self.file_labels.values())
            unique_labels.discard("unlabeled")
            
            if len(unique_labels) < 2:
                messagebox.showwarning("Warning", 
                    "Need at least 2 different labels to compare groups!\n"
                    "Please label your files first (e.g., 'A', 'B', 'det', 'nondet', etc.)")
                return
            
            label_list = sorted(list(unique_labels))
            
            # Create a dialog to select two groups
            dialog = tk.Toplevel(self.root)
            dialog.title("Select Groups to Compare")
            dialog.geometry("400x300")
            dialog.transient(self.root)
            dialog.grab_set()
            
            tk.Label(dialog, text="Select Group A:").pack(pady=5)
            group_a_var = tk.StringVar(value=label_list[0])
            group_a_combo = ttk.Combobox(dialog, textvariable=group_a_var, values=label_list, state="readonly")
            group_a_combo.pack(pady=5)
            
            tk.Label(dialog, text="Select Group B:").pack(pady=5)
            group_b_var = tk.StringVar(value=label_list[1] if len(label_list) > 1 else label_list[0])
            group_b_combo = ttk.Combobox(dialog, textvariable=group_b_var, values=label_list, state="readonly")
            group_b_combo.pack(pady=5)
            
            result = {"proceed": False, "group_a": None, "group_b": None}
            
            def on_compare():
                result["proceed"] = True
                result["group_a"] = group_a_var.get()
                result["group_b"] = group_b_var.get()
                dialog.destroy()
            
            def on_cancel():
                dialog.destroy()
            
            tk.Button(dialog, text="Compare", command=on_compare).pack(pady=10)
            tk.Button(dialog, text="Cancel", command=on_cancel).pack(pady=5)
            
            dialog.wait_window()
            
            if not result["proceed"]:
                return
            
            group_a_label = result["group_a"]
            group_b_label = result["group_b"]
            
            # Get valid files for each group based on their assigned labels
            group_a_files = [f for f, label in self.file_labels.items() 
                            if label == group_a_label and f in self.processed_data 
                            and self.processed_data[f]['valid'] and self.processed_data[f]['global_ryff_total_score'] is not None]
            
            group_b_files = [f for f, label in self.file_labels.items() 
                            if label == group_b_label and f in self.processed_data 
                            and self.processed_data[f]['valid'] and self.processed_data[f]['global_ryff_total_score'] is not None]
            
            if not group_a_files or not group_b_files:
                messagebox.showwarning("Warning", 
                    f"Cannot perform comparison: Insufficient valid files with Global Ryff Total Scores for one or both groups!\n"
                    f"Group A ({group_a_label}): {len(group_a_files)} valid files\n"
                    f"Group B ({group_b_label}): {len(group_b_files)} valid files")
                return
            
            self.log_result("\n" + "=" * 60)
            self.log_result(f"GROUP COMPARISON: {group_a_label.upper()} vs {group_b_label.upper()}")
            self.log_result("=" * 60)
            
            # Determine the comparison type (e.g., N vs 1, N vs N, 1 vs 1)
            comparison_type = self.determine_comparison_type(len(group_a_files), len(group_b_files))
            
            self.log_result("\nMETHODOLOGY EXPLANATION (GLOBAL RYFF TOTAL SCORE COMPARISON):")
            self.log_result("This analysis compares the **Global Ryff Total Scores** (sum of all valid reversed items) between the selected groups.")
            self.log_result(f"Each AI response file yields one Global Ryff Total Score, ranging from {42 * 1} to {42 * self.scale_points} ({42} to {294} for 1-7 Likert).")
            
            # Extract Global Ryff Total Scores for calculations
            group_a_global_scores = [self.processed_data[f]['global_ryff_total_score'] for f in group_a_files]
            group_b_global_scores = [self.processed_data[f]['global_ryff_total_score'] for f in group_b_files]
            
            # Calculate descriptive statistics
            group_a_mean = self.safe_mean(group_a_global_scores)
            group_b_mean = self.safe_mean(group_b_global_scores)
            
            group_a_std = self.safe_std(group_a_global_scores, ddof=1)
            group_b_std = self.safe_std(group_b_global_scores, ddof=1)
            
            if group_a_mean is None or group_b_mean is None:
                self.log_result("Cannot calculate means for comparison. This should not happen if files list is valid.")
                return
            
            self.log_result(f"\nDESCRIPTIVE STATISTICS (Global Ryff Total Scores):")
            self.log_result(f"  Group A ({group_a_label}): Mean = {group_a_mean:.3f}, n = {len(group_a_global_scores)}{f', SD = {group_a_std:.3f}' if group_a_std is not None else ''}")
            self.log_result(f"  Group B ({group_b_label}): Mean = {group_b_mean:.3f}, n = {len(group_b_global_scores)}{f', SD = {group_b_std:.3f}' if group_b_std is not None else ''}")
            self.log_result(f"  Absolute difference between means: {abs(group_a_mean - group_b_mean):.3f}")
            
            t_stat = p_value = df = effect_size = None
            test_description = "No statistical test performed"

            try:
                if comparison_type == "one_sample_a_vs_multi_b":
                    self.log_result("\nCOMPARISON SCENARIO: ONE PEER (Group A) vs. MULTIPLE INSTANCES (Group B)")
                    self.log_result("  This is ideal for comparing a deterministic baseline AI (Group A) to multiple non-deterministic AI runs (Group B).")
                    self.log_result("  **Statistical Test:** One-Sample t-test.")
                    self.log_result(f"  **Goal:** To determine if the mean Global Ryff Total Score of Group B significantly differs from the single Global Ryff Total Score of Group A.")
                    self.log_result(f"  **Formula (t-statistic):** t = (Mean_GroupB - Score_GroupA) / (SD_GroupB / sqrt(n_GroupB))")
                    self.log_result(f"  **Null Hypothesis (H0):** Mean of Group B's Global Scores = Group A's single Global Score.")
                    self.log_result(f"  **Alternative Hypothesis (H1):** Mean of Group B's Global Scores ≠ Group A's single Global Score.")
                    self.log_result(f"  **Cohen's d Formula:** d = |Mean_GroupB - Score_GroupA| / SD_GroupB")
                    
                    if len(group_b_global_scores) >= 2:
                        t_stat, p_value = stats.ttest_1samp(group_b_global_scores, group_a_global_scores[0])
                        effect_size = self.calculate_cohens_d(group_b_mean, group_b_std, group_a_global_scores[0], None, comparison_type)
                        df = len(group_b_global_scores) - 1
                        test_description = f"One-Sample t-test (Group B vs Group A's single score)"
                    else:
                        self.log_result(f"  Cannot perform One-Sample t-test: Group B must have at least 2 files to calculate a standard deviation. Current N={len(group_b_global_scores)}.")
                
                elif comparison_type == "multi_a_vs_one_sample_b":
                    self.log_result("\nCOMPARISON SCENARIO: MULTIPLE INSTANCES (Group A) vs. ONE PEER (Group B)")
                    self.log_result("  This is ideal for comparing multiple non-deterministic AI runs (Group A) to a deterministic baseline AI (Group B).")
                    self.log_result("  **Statistical Test:** One-Sample t-test.")
                    self.log_result(f"  **Goal:** To determine if the mean Global Ryff Total Score of Group A significantly differs from the single Global Ryff Total Score of Group B.")
                    self.log_result(f"  **Formula (t-statistic):** t = (Mean_GroupA - Score_GroupB) / (SD_GroupA / sqrt(n_GroupA))")
                    self.log_result(f"  **Null Hypothesis (H0):** Mean of Group A's Global Scores = Group B's single Global Score.")
                    self.log_result(f"  **Alternative Hypothesis (H1):** Mean of Group A's Global Scores ≠ Mean of Group B's Global Score.")
                    self.log_result(f"  **Cohen's d Formula:** d = |Mean_GroupA - Score_GroupB| / SD_GroupA")
                    
                    if len(group_a_global_scores) >= 2:
                        t_stat, p_value = stats.ttest_1samp(group_a_global_scores, group_b_global_scores[0])
                        effect_size = self.calculate_cohens_d(group_a_mean, group_a_std, group_b_global_scores[0], None, comparison_type)
                        df = len(group_a_global_scores) - 1
                        test_description = f"One-Sample t-test (Group A vs Group B's single score)"
                    else:
                        self.log_result(f"  Cannot perform One-Sample t-test: Group A must have at least 2 files to calculate a standard deviation. Current N={len(group_a_global_scores)}.")

                elif comparison_type == "two_sample_multi_vs_multi":
                    self.log_result("\nCOMPARISON SCENARIO: MULTIPLE INSTANCES (Group A) vs. MULTIPLE INSTANCES (Group B)")
                    self.log_result("  This is ideal for comparing two different sets of non-deterministic AI runs (e.g., perturbation 1a vs perturbation 3).")
                    self.log_result("  **Statistical Test:** Welch's Independent Samples t-test (assumes unequal variances).")
                    self.log_result(f"  **Goal:** To determine if the mean Global Ryff Total Score of Group A significantly differs from the mean of Group B.")
                    self.log_result(f"  **Formula (t-statistic):** t = (Mean_GroupA - Mean_GroupB) / sqrt((Var_GroupA/n_GroupA) + (Var_GroupB/n_GroupB))")
                    self.log_result(f"  **Null Hypothesis (H0):** Mean of Group A's Global Scores = Mean of Group B's Global Scores.")
                    self.log_result(f"  **Alternative Hypothesis (H1):** Mean of Group A's Global Scores ≠ Mean of Group B's Global Scores.")
                    self.log_result(f"  **Cohen's d Formula (for Welch's):** d = |Mean_GroupA - Mean_GroupB| / sqrt((SD_GroupA^2 + SD_GroupB^2)/2)")

                    if len(group_a_global_scores) >= 2 and len(group_b_global_scores) >= 2:
                        t_stat, p_value = stats.ttest_ind(group_a_global_scores, group_b_global_scores, equal_var=False)
                        effect_size = self.calculate_cohens_d(group_a_mean, group_a_std, group_b_mean, group_b_std, comparison_type)
                        
                        # Welch's degrees of freedom calculation
                        if group_a_std is not None and group_b_std is not None and group_a_std > 0 and group_b_std > 0:
                            s1_sq_n1 = group_a_std**2 / len(group_a_global_scores)
                            s2_sq_n2 = group_b_std**2 / len(group_b_global_scores)
                            df = (s1_sq_n1 + s2_sq_n2)**2 / (s1_sq_n1**2/(len(group_a_global_scores)-1) + s2_sq_n2**2/(len(group_b_global_scores)-1))
                        else:
                            df = "N/A (SDs not calculable or zero)"
                        test_description = f"Welch's Two-sample t-test"
                    else:
                        self.log_result(f"  Cannot perform Welch's t-test: Both groups must have at least 2 files to calculate standard deviations. Group A N={len(group_a_global_scores)}, Group B N={len(group_b_global_scores)}.")
                
                else: # both_single
                    self.log_result("\nCOMPARISON SCENARIO: SINGLE INSTANCE (Group A) vs. SINGLE INSTANCE (Group B)")
                    self.log_result("  When comparing two single AI instances, no statistical test (t-test) can be performed as there is no variability within each 'group' to assess.")
                    self.log_result("  Only a direct comparison of their Global Ryff Total Scores is possible.")
                    self.log_result(f"  Difference in Global Ryff Total Score: {abs(group_a_mean - group_b_mean):.3f}")
                    
                    # For effect size estimation in 'both_single' case, use overall population SD if available
                    all_global_scores_from_processed = []
                    for d in self.processed_data.values():
                        if d['valid'] and d['global_ryff_total_score'] is not None:
                            all_global_scores_from_processed.append(d['global_ryff_total_score'])
                    
                    if len(all_global_scores_from_processed) >= 2: # Need at least two for SD
                        population_std_estimate = self.safe_std(all_global_scores_from_processed, ddof=1)
                        if population_std_estimate is not None and population_std_estimate > 0:
                            effect_size_estimated = abs(group_a_mean - group_b_mean) / population_std_estimate
                            self.log_result(f"  Estimated Effect Size (Cohen's d, using overall population SD estimate from ALL valid files): {effect_size_estimated:.3f}")
                        else:
                            self.log_result(f"  Cannot estimate effect size (overall population standard deviation is zero or cannot be calculated).")
                    else:
                        self.log_result(f"  Cannot estimate effect size (not enough overall valid files to estimate population standard deviation).")
                    
                    # For 'both_single', the subscale comparison is particularly relevant
                    self.log_result(f"\n  Proceeding to Subscale Level Comparison for these single files:")
                    self.perform_subscale_comparison(group_a_files, group_b_files, group_a_label, group_b_label)
                    return # EXIT as no global T-test results to print here
                
                # --- GLOBAL RYFF TOTAL SCORE STATISTICAL RESULTS & INTERPRETATION ---
                if t_stat is not None:  # Only print if a statistical test was successfully performed
                    self.log_result(f"\nSTATISTICAL TEST RESULTS (Global Ryff Total Scores):")
                    self.log_result(f"  Test Performed: {test_description}")
                    self.log_result(f"  t-statistic: {t_stat:.3f}")
                    if isinstance(df, (int, float)): # Check if df is a number or string
                        self.log_result(f"  Degrees of freedom (df): {df:.1f}")
                    else:
                        self.log_result(f"  Degrees of freedom (df): {df}") # Print string directly
                    self.log_result(f"  p-value: {p_value:.3f}")
                    self.log_result(f"  Effect size (Cohen's d): {effect_size:.3f}")
                    
                    self.log_result(f"\nINTERPRETATION (Global Ryff Total Scores):")
                    
                    # Statistical significance interpretation
                    if p_value < 0.001:
                        self.log_result(f"  ⚠️⚠️ HIGHLY STATISTICALLY SIGNIFICANT difference (p < 0.001)")
                        self.log_result(f"      Very strong evidence that Global Ryff Total Scores differ between groups.")
                    elif p_value < 0.01:
                        self.log_result(f"  ⚠️  VERY STATISTICALLY SIGNIFICANT difference (p < 0.01)")
                        self.log_result(f"      Strong evidence that Global Ryff Total Scores differ between groups.")
                    elif p_value < 0.05:
                        self.log_result(f"  ⚠️  STATISTICALLY SIGNIFICANT difference (p < 0.05)")
                        self.log_result(f"      Evidence that Global Ryff Total Scores differ between groups.")
                    elif p_value < 0.10:
                        self.log_result(f"  ~ MARGINALLY STATISTICALLY SIGNIFICANT (p < 0.10)")
                        self.log_result(f"      Weak evidence of difference in Global Ryff Total Scores, warrants further investigation.")
                    else:
                        self.log_result(f"  ✓ NO STATISTICALLY SIGNIFICANT difference (p ≥ 0.10)")
                        self.log_result(f"      No sufficient evidence that Global Ryff Total Scores differ between groups.")
                    
                    # Practical significance (Effect size) interpretation
                    if effect_size >= 0.8: # Using >= for clarity on thresholds
                        self.log_result(f"  ⚠️  LARGE PRACTICAL EFFECT SIZE (d ≥ 0.8)")
                        self.log_result(f"      The difference in Global Ryff Total Scores is substantial and practically meaningful.")
                    elif effect_size >= 0.5:
                        self.log_result(f"  ⚠️  MEDIUM PRACTICAL EFFECT SIZE (d ≥ 0.5)")
                        self.log_result(f"      The difference in Global Ryff Total Scores is moderate and practically noticeable.")
                    elif effect_size >= 0.2:
                        self.log_result(f"  ~ SMALL PRACTICAL EFFECT SIZE (d ≥ 0.2)")
                        self.log_result(f"      The difference in Global Ryff Total Scores is minor, though might still be relevant depending on context.")
                    else: # effect_size < 0.2
                        self.log_result(f"  ✓ NEGLIGIBLE PRACTICAL EFFECT SIZE (d < 0.2)")
                        self.log_result(f"      Global Ryff Total Scores are practically equivalent, despite any statistical significance.")
                    
                    # Combined interpretation
                    self.log_result(f"\nOVERALL ASSESSMENT (Global Ryff Total Scores):")
                    if p_value < 0.05 and effect_size >= 0.5:
                        self.log_result(f"  ✅ MEANINGFUL DIFFERENCE: Both statistically significant AND practically important.")
                        self.log_result(f"      The AI's overall well-being assessments are reliably and substantially different between these conditions.")
                    elif p_value < 0.05 and effect_size < 0.5:
                        self.log_result(f"  ~ STATISTICALLY REAL, BUT SMALL PRACTICAL IMPACT: A difference exists, but its real-world importance may be limited.")
                        self.log_result(f"      The AI's overall well-being assessments differ, but the magnitude of the difference might not be highly impactful.")
                    elif p_value >= 0.05 and effect_size >= 0.5:
                        self.log_result(f"  ~ PRACTICALLY IMPORTANT, BUT NOT STATISTICALLY CONFIRMED: A large observed difference, but not enough statistical power (or too much variability) to confirm it's not due to chance.")
                        self.log_result(f"      Consider increasing the number of AI runs/files to potentially detect this difference statistically.")
                    else:
                        self.log_result(f"  ✓ GROUPS ARE ESSENTIALLY EQUIVALENT: Neither statistically significant nor practically important difference.")
                        self.log_result(f"      The AI's overall well-being assessments are consistent across these conditions.")
                
                # After Global Comparison, proceed to Subscale Comparison
                self.log_result(f"\n" + "=" * 60)
                self.log_result(f"INITIATING SUBSCALE LEVEL COMPARISON...")
                self.log_result("=" * 60)
                self.perform_subscale_comparison(group_a_files, group_b_files, group_a_label, group_b_label)
                
            except Exception as e:
                self.log_result(f"Error in main global statistical analysis: {str(e)}")
            
        except Exception as e:
            messagebox.showerror("Error", f"Error in group comparison execution: {str(e)}")
    
    def analyze_similarity(self):
        """
        Analyze cross-file similarity of Global Ryff Total Scores and subscale averages.

        Uses every entry of ``self.processed_data`` that is flagged ``valid`` and has a
        non-None ``global_ryff_total_score``. Through ``self.log_result`` it reports:
          - mean, sample SD (ddof=1), range and coefficient of variation (CV) of the
            global scores, together with an interpretation of the CV;
          - mean, SD and CV of every subscale average across files;
          - z-score based outliers (|z| > 2.0 moderate, |z| > 3.0 extreme);
          - a short summary.

        A warning dialog is shown when nothing has been processed, and the analysis
        stops with a log message when fewer than 2 valid files are available.
        Unexpected exceptions are reported through an error dialog. Returns None.
        """
        if not self.processed_data:
            messagebox.showwarning("Warning", "Please process Ryff scores first!")
            return
        
        try:
            self.log_result("\n" + "=" * 60)
            self.log_result("CROSS-FILE SIMILARITY ANALYSIS")
            self.log_result("=" * 60)
            
            self.log_result("\nMETHODOLOGY EXPLANATION:")
            self.log_result("This analysis examines the overall variability and similarity of Ryff scores across all valid loaded AI response files.")
            self.log_result("It helps identify if AI responses, overall, tend to cluster closely or if there's significant dispersion, potentially indicating outliers or diverse AI 'behavior'.")
            
            self.log_result("\nSTATISTICAL MEASURES USED:")
            self.log_result("1. **DESCRIPTIVE STATISTICS (Mean, Standard Deviation, Range):**")
            self.log_result("   - Provide a basic summary of the central tendency and spread of Global Ryff Total Scores across all files.")
            self.log_result("2. **COEFFICIENT OF VARIATION (CV):**")
            self.log_result("   - Formula: CV = (Standard Deviation / Mean)")
            self.log_result("   - Measures relative variability, expressing the standard deviation as a percentage of the mean.")
            self.log_result("   - Useful for comparing variability across measures with different scales or means.")
            self.log_result("3. **Z-SCORE OUTLIER DETECTION:**")
            self.log_result("   - Formula: z = (individual_score - overall_mean) / overall_standard_deviation")
            self.log_result("   - Identifies how many standard deviations an individual file's score is from the overall mean.")
            
            self.log_result("\nTHRESHOLDS FOR CONCERN:")
            self.log_result("   - **Overall CV < 0.10:** Indicates very low relative variability (extremely high similarity across files).")
            self.log_result("   - **Overall CV > 0.20:** Indicates moderately high relative variability (some dispersion across files).")
            self.log_result("   - **Overall CV > 0.30:** Indicates high relative variability (significant dispersion, potential inconsistency across files or diverse AI types).")
            self.log_result("   - **|z| > 2.0:** Indicates a potential outlier (score is more than 2 standard deviations from the mean).")
            self.log_result("   - **|z| > 3.0:** Indicates an extreme outlier (score is more than 3 standard deviations from the mean).")
            self.log_result("-" * 60)
            
            valid_files = [f for f, data in self.processed_data.items() if data['valid'] and data['global_ryff_total_score'] is not None]
            
            if len(valid_files) < 2:
                self.log_result("Need at least 2 valid files with Global Ryff Total Scores to perform meaningful cross-file similarity analysis.")
                return
            
            # Overall statistics for Global Ryff Total Scores
            total_global_scores = [self.processed_data[f]['global_ryff_total_score'] for f in valid_files]
            
            overall_mean_global = self.safe_mean(total_global_scores)
            overall_std_global = self.safe_std(total_global_scores, ddof=1)
            
            if overall_mean_global is None:
                self.log_result("Cannot calculate overall mean for Global Ryff Total Scores.")
                return
            
            # The CV is only defined when an SD is available and the mean is positive.
            # Keep it as None otherwise, so an uncomputable CV is never reported as
            # "0.000" and mistaken for perfect consistency.
            if overall_std_global is not None and overall_mean_global > 0:
                overall_cv_global = overall_std_global / overall_mean_global
            else:
                overall_cv_global = None
            
            self.log_result(f"\nOVERALL STATISTICS FOR GLOBAL RYFF TOTAL SCORES ({len(total_global_scores)} valid scores):")
            self.log_result(f"  Mean: {overall_mean_global:.3f}")
            if overall_std_global is not None:
                self.log_result(f"  Standard Deviation (SD): {overall_std_global:.3f}")
            else:
                self.log_result(f"  Standard Deviation (SD): N/A (single valid score or calculation issue)")
            self.log_result(f"  Range: {min(total_global_scores):.3f} - {max(total_global_scores):.3f}")
            
            # Interpret overall variability (CV). The summary label is chosen in the
            # same branch so the interpretation and the final summary always agree,
            # including at the exact 0.20 / 0.30 boundaries.
            if overall_cv_global is None:
                self.log_result("  Coefficient of Variation (CV): N/A (requires a calculable SD and a positive mean)")
                self.log_result("  ~ Relative variability cannot be assessed for these scores.")
                consistency_desc = "N/A (CV not calculable)"
            else:
                self.log_result(f"  Coefficient of Variation (CV): {overall_cv_global:.3f}")
                if overall_cv_global < 0.10:
                    self.log_result(f"  ✓ Very Low relative variability (extremely high overall similarity across files).")
                    consistency_desc = "VERY HIGH"
                elif overall_cv_global > 0.30:
                    self.log_result(f"  ⚠️  High relative variability (significant dispersion across files, potentially indicating diverse AI types or inconsistent behavior).")
                    consistency_desc = "LOW (Significant Dispersion)"
                elif overall_cv_global > 0.20:
                    self.log_result(f"  ~ Moderately High relative variability (some notable dispersion across files).")
                    consistency_desc = "MODERATE"
                else:
                    self.log_result(f"  ✓ Low relative variability (good overall consistency across files).")
                    consistency_desc = "GOOD"
            
            # Subscale level similarity (average of subscale averages across files)
            self.log_result(f"\nSUBSCALE LEVEL SIMILARITY (Mean & SD of Subscale Averages across files):")
            subscale_cvs = {}
            
            for subscale_name in self.subscales:
                per_file_averages = [
                    self.processed_data[filename]['subscale_averages'].get(subscale_name)
                    for filename in valid_files
                ]
                subscale_averages_across_files = [avg for avg in per_file_averages if avg is not None]
                
                if len(subscale_averages_across_files) > 1:  # Need at least 2 for SD/CV
                    sub_mean = self.safe_mean(subscale_averages_across_files)
                    sub_std = self.safe_std(subscale_averages_across_files, ddof=1)
                    if sub_mean is not None and sub_std is not None and sub_mean > 0:
                        sub_cv = sub_std / sub_mean
                        subscale_cvs[subscale_name] = sub_cv
                        self.log_result(f"  {subscale_name}: Mean={sub_mean:.3f}, SD={sub_std:.3f}, CV={sub_cv:.3f}")
                    else:
                        # sub_mean may be None in this branch; formatting None with
                        # ':.3f' would raise TypeError and abort the whole analysis.
                        mean_text = f"{sub_mean:.3f}" if sub_mean is not None else "N/A"
                        self.log_result(f"  {subscale_name}: Mean={mean_text}, SD/CV calculation failed (check for zero mean or single value)")
                elif len(subscale_averages_across_files) == 1:
                    self.log_result(f"  {subscale_name}: Single value ({subscale_averages_across_files[0]:.3f}), SD/CV not applicable.")
                else:
                    self.log_result(f"  {subscale_name}: No valid data across files.")
            
            # Identify most/least variable subscale across files
            if subscale_cvs:
                most_variable_subscale = max(subscale_cvs.items(), key=lambda x: x[1])
                least_variable_subscale = min(subscale_cvs.items(), key=lambda x: x[1])
                self.log_result(f"\n  Most variable subscale across files (highest CV): {most_variable_subscale[0]} (CV: {most_variable_subscale[1]:.3f})")
                self.log_result(f"  Least variable subscale across files (lowest CV): {least_variable_subscale[0]} (CV: {least_variable_subscale[1]:.3f})")
            else:
                self.log_result(f"\nNo subscale variability data available across files.")
            
            # Outlier detection based on Global Ryff Total Scores.
            # z-scores are only meaningful with a positive SD; otherwise no file is flagged.
            outliers = []
            extreme_outliers = []
            if overall_std_global is not None and overall_std_global > 0:
                for filename, score in zip(valid_files, total_global_scores):
                    z = (score - overall_mean_global) / overall_std_global
                    label = self.file_labels.get(filename, "unlabeled")
                    if abs(z) > 3.0:
                        extreme_outliers.append((filename, label, z))
                    elif abs(z) > 2.0:
                        outliers.append((filename, label, z))
            
            self.log_result(f"\nOUTLIER ANALYSIS (Based on Global Ryff Total Scores):")
            if extreme_outliers:
                self.log_result(f"  ⚠️⚠️ EXTREME OUTLIERS (|z-score| > 3.0) - Highly atypical Global Scores:")
                for filename, label, z_score in extreme_outliers:
                    self.log_result(f"    - {filename} [{label}]: z-score={z_score:.2f}")
            
            if outliers:
                self.log_result(f"  ⚠️  MODERATE OUTLIERS (|z-score| > 2.0) - Potentially atypical Global Scores:")
                for filename, label, z_score in outliers:
                    self.log_result(f"    - {filename} [{label}]: z-score={z_score:.2f}")
            
            if not outliers and not extreme_outliers:
                self.log_result(f"  ✓ No significant outliers detected in Global Ryff Total Scores.")
            
            # Final Summary
            self.log_result(f"\n" + "=" * 40)
            self.log_result(f"CROSS-FILE SIMILARITY SUMMARY:")
            self.log_result(f"Total valid files analyzed: {len(valid_files)}")
            self.log_result(f"Overall Global Score consistency: {consistency_desc}")
            self.log_result(f"Total outliers detected: {len(outliers) + len(extreme_outliers)}")
            self.log_result("=" * 40)
            
        except Exception as e:
            messagebox.showerror("Error", f"Error in similarity analysis: {str(e)}")
    
    def error_analysis(self):
        """
        Log a data-quality report covering every loaded file.

        For each file in ``self.loaded_files`` the cached entry in
        ``self.processed_data`` is used when present; otherwise the file is run
        through ``self.process_single_file`` and the result is stored in
        ``self.processed_data``. Each file's status, error count and invalid
        entries are logged, followed by an overall summary, a breakdown of error
        types (most frequent first) and a data-quality assessment.

        A warning dialog is shown and nothing is logged when no files are loaded.
        Unexpected exceptions are reported through an error dialog. Returns None.
        """
        if not self.loaded_files:
            messagebox.showwarning("Warning", "No files loaded to perform error analysis!")
            return
        
        from collections import Counter
        
        # Ordered classification rules: the first matching rule names the category.
        error_type_rules = (
            (lambda msg: 'Missing' in msg, 'Missing Items'),
            (lambda msg: 'Null value' in msg, 'Null Values'),
            (lambda msg: 'Multiple values' in msg, 'Multiple Values'),
            (lambda msg: 'out of range' in msg.lower() or 'not integer' in msg.lower(), 'Out of Range / Non-Integer'),
            (lambda msg: 'Non-numeric' in msg, 'Non-Numeric'),
        )
        
        try:
            self.log_result("\n" + "=" * 60)
            self.log_result("COMPREHENSIVE ERROR ANALYSIS")
            self.log_result("=" * 60)
            
            self.log_result("\nMETHODOLOGY EXPLANATION:")
            self.log_result("This analysis thoroughly examines data quality issues across all loaded files, regardless of whether they were marked 'valid' for score processing.")
            self.log_result("It identifies and categorizes various types of invalid responses and provides an overall assessment of data quality.")
            
            self.log_result("\nERROR CATEGORIES DETECTED:")
            self.log_result("1. **MISSING ITEMS:** Expected item numbers (1-42) were not found in the JSON file.")
            self.log_result("2. **NULL VALUES:** An item was present, but its value was explicitly null (e.g., '\"1\": null').")
            self.log_result("3. **MULTIPLE VALUES:** An item's value was a list, object, or other structure, indicating more than one response where a single value was expected (e.g., '\"1\": [5, 6]').")
            self.log_result("4. **OUT OF RANGE / NON-INTEGER VALUES:** The item's numeric value was outside the expected 1 to 7 Likert scale range, or it was a decimal number (e.g., 8, 0, 3.5).")
            self.log_result("5. **NON-NUMERIC VALUES:** The item's value could not be converted to a number (e.g., '\"1\": \"abc\"').")
            self.log_result("\nVALIDITY THRESHOLD RECAP:")
            self.log_result("  - Files with 8 or more individual item errors are internally flagged as 'INVALID' and automatically excluded from scoring, consistency, and group comparison analyses.")
            self.log_result("  - This threshold ensures that statistical analyses are performed only on datasets with a reasonable amount of complete and correctly formatted data.")
            self.log_result("\nACCEPTABLE ITEM VALUES:")
            self.log_result(f"  Only single integer values from 1 to {self.scale_points} are considered valid responses for individual items.")
            self.log_result("-" * 60)
            
            # At least 1 here: the empty case already returned above, so the
            # percentages below can divide by total_files unconditionally.
            total_files = len(self.loaded_files)
            valid_files_count = 0
            invalid_files_count = 0
            overall_error_summary = Counter()
            all_invalid_entries_list = []
            
            for filename, raw_data in self.loaded_files.items():
                # Reuse the cached result when available; otherwise process the
                # file now and cache it so later analyses see the same data.
                try:
                    processed = self.processed_data[filename]
                except KeyError:
                    processed = self.process_single_file(raw_data, filename)
                    self.processed_data[filename] = processed
                
                label = self.file_labels.get(filename, "unlabeled")
                file_error_count = len(processed['errors'])
                
                if processed['valid']:
                    status = "VALID"
                    valid_files_count += 1
                else:
                    status = "INVALID"
                    invalid_files_count += 1
                
                self.log_result(f"\nFILE: {filename} [{label}] - Status: {status} ({file_error_count} errors)")
                
                if processed['invalid_entries']:
                    self.log_result(f"  Specific Invalid Entries in this file:")
                    for entry_detail in processed['invalid_entries']:
                        self.log_result(f"    - {entry_detail}")
                        all_invalid_entries_list.append(f"File '{filename}' ({label}): {entry_detail}")
                else:
                    self.log_result(f"  No specific invalid entries found in this file.")
                
                # Aggregate error types for the overall summary
                for error_message in processed['errors']:
                    error_type = next(
                        (name for matches, name in error_type_rules if matches(error_message)),
                        'Other Errors',
                    )
                    overall_error_summary[error_type] += 1
            
            self.log_result(f"\n" + "=" * 40)
            self.log_result(f"OVERALL ERROR ANALYSIS SUMMARY:")
            self.log_result(f"Total files analyzed: {total_files}")
            self.log_result(f"Files marked as VALID: {valid_files_count} ({valid_files_count/total_files*100:.1f}%)")
            self.log_result(f"Files marked as INVALID (excluded from analyses): {invalid_files_count} ({invalid_files_count/total_files*100:.1f}%)")
            self.log_result(f"Total individual invalid entries found across all files: {len(all_invalid_entries_list)}")
            
            if overall_error_summary:
                self.log_result(f"\nBreakdown of Error Types (Total occurrences):")
                # most_common() sorts by count, highest first; ties keep first-seen order.
                for error_type, count in overall_error_summary.most_common():
                    self.log_result(f"  - {error_type:<30}: {count} occurrences")
            else:
                self.log_result("\nNo errors of any type were detected across all loaded files. Excellent data quality!")
            
            self.log_result(f"\n" + "=" * 40)
            self.log_result(f"DATA QUALITY ASSESSMENT:")
            
            validity_percentage = valid_files_count / total_files
            if validity_percentage >= 0.95:
                self.log_result(f"✓ EXCELLENT data quality (≥95% valid files).")
            elif validity_percentage >= 0.80:
                self.log_result(f"✓ GOOD data quality (≥80% valid files).")
            elif validity_percentage >= 0.50:
                self.log_result(f"⚠️  MODERATE data quality (≥50% valid files). Some attention to data collection or AI output format may be needed.")
            else:
                self.log_result(f"⚠️⚠️ POOR data quality (<50% valid files). Strongly recommend reviewing AI output generation process and data collection method.")
            
            if len(all_invalid_entries_list) == 0:
                self.log_result(f"✓ Absolutely no invalid item responses detected across all files. Perfect data integrity!")
            elif len(all_invalid_entries_list) <= total_files * 3:  # Allowance for a few errors per file
                self.log_result(f"✓ Low overall error rate in item responses. Data quality is generally good, minor issues.")
            else:
                self.log_result(f"⚠️  High overall error rate in item responses. This indicates consistent issues with how the AI provides responses or how data is being collected/formatted. Review strongly recommended.")
            
            self.log_result("=" * 40)
            
        except Exception as e:
            messagebox.showerror("Error", f"Error in error analysis: {str(e)}")
    
    def log_result(self, message):
        """
        Append *message* plus a newline to the results text widget and scroll to it.

        Any failure while writing to the widget is caught and reported on stdout
        instead, so logging never raises to the caller.
        """
        try:
            output_widget = self.results_text
            output_widget.insert(tk.END, message + "\n")
            output_widget.see(tk.END)
            # Queue a no-op idle callback rather than calling update(), which
            # the original author avoided because of recursion issues.
            self.root.after_idle(lambda: None)
        except Exception as log_error:
            # Fallback when the Tkinter widget cannot be written to.
            print(f"Error logging result: {log_error}")
    
    def run(self):
        """
        Enter the Tk main loop of ``self.root``.

        An exception escaping the main loop is not re-raised: its message is
        printed to stdout and the traceback is printed via ``traceback``.
        """
        try:
            self.root.mainloop()
        except Exception as fatal_error:
            import traceback

            print(f"Fatal error running application: {fatal_error}")
            traceback.print_exc()

# Script entry point: build the analyzer window and start its event loop.
# Startup failures are printed with a traceback instead of propagating.
if __name__ == "__main__":
    try:
        app = RyffAnalyzer()
        app.run()
    except Exception as startup_error:
        import traceback

        print(f"Fatal error during application startup: {startup_error}")
        traceback.print_exc()