In [4]:
import pandas as pd
import pickle
import numpy as np
from pprint import pprint
import os

def load_and_analyze_pickle(file_path):
    """Load a pickle file and print a short summary of its root object.

    Prints the path being analyzed, then either an error line or the type
    (and, for sized objects, the length) of the unpickled root object.

    Args:
        file_path: Path of the pickle file; it is opened in binary mode.

    Returns:
        The unpickled object, or None when the file does not exist or
        unpickling raises an exception.
    """
    print(f"🔍 Analyzing: {file_path}")
    print("="*60)

    if not os.path.exists(file_path):
        print(f"❌ File not found: {file_path}")
        return None

    # SECURITY: pickle.load can execute arbitrary code embedded in the file;
    # only use this on pickle files that come from a trusted source.
    # The try block covers loading only, so a problem while summarising the
    # object below is never misreported as a load failure.
    try:
        with open(file_path, 'rb') as f:
            data = pickle.load(f)
    except Exception as e:
        print(f"❌ Error loading pickle: {e}")
        return None

    print("✅ Successfully loaded pickle file")
    print(f"📊 Root object type: {type(data)}")

    # Some objects expose __len__ but are unsized (e.g. 0-d numpy arrays, whose
    # len() raises TypeError); skip the length line instead of losing the data.
    if hasattr(data, '__len__'):
        try:
            print(f"📏 Root object length: {len(data)}")
        except TypeError:
            pass

    return data

def _safe_len(obj):
    """Return len(obj), or None when obj is unsized (len() raises TypeError)."""
    try:
        return len(obj)
    except TypeError:
        return None


def explore_dict_structure(data, max_items=10, max_depth=2, current_depth=0):
    """Print an indented outline of a nested dict/list structure.

    Args:
        data: Object to describe. Dicts are listed key by key, lists are
            summarised, anything else is reported as a single object.
        max_items: Maximum number of dict keys printed per level; the
            remaining count is summarised on one line.
        max_depth: Nested dicts are only expanded while current_depth is
            below this value (and only when they hold at most 5 keys).
        current_depth: Current nesting level; controls the indentation.

    Returns:
        None. All information is written to stdout.
    """
    indent = "  " * current_depth

    if isinstance(data, dict):
        print(f"{indent}📁 Dictionary with {len(data)} keys:")

        for i, (key, value) in enumerate(data.items()):
            if i >= max_items:
                print(f"{indent}   ... and {len(data) - max_items} more keys")
                break

            # Strings never get a length suffix. Other values get one only when
            # len() actually works: having __len__ is not enough, since 0-d
            # numpy arrays and class objects expose it but raise TypeError.
            size = None if isinstance(value, str) else _safe_len(value)
            if size is not None:
                print(f"{indent}   🔑 {key}: {type(value)} (length: {size})")
            else:
                print(f"{indent}   🔑 {key}: {type(value)}")

            # Show sample values for simple types
            if isinstance(value, (str, int, float, bool)) and len(str(value)) < 100:
                print(f"{indent}      💡 Value: {value}")

            # Recurse for nested structures (limited depth, small dicts only)
            if current_depth < max_depth and isinstance(value, dict) and len(value) <= 5:
                explore_dict_structure(value, max_items, max_depth, current_depth + 1)

    elif isinstance(data, list):
        print(f"{indent}📋 List with {len(data)} items")
        if len(data) > 0:
            print(f"{indent}   First item type: {type(data[0])}")
            # Short lists are enumerated item by item
            if len(data) <= 5:
                for i, item in enumerate(data):
                    print(f"{indent}   [{i}]: {type(item)}")
    else:
        print(f"{indent}📄 Single object: {type(data)}")

# Pickle files whose structure is inspected in this notebook
files_to_analyze = [
    '../results/lsd_closed_hamza/lsd_closed_hamza_binary_baseline_rs42.pkl',
    '../results/lsd_closed_hamza/lsd_closed_Hamza.pkl',
]

# Loaded root objects keyed by file base name; a failed load is stored as None
results = {}
for file_path in files_to_analyze:
    file_name = os.path.basename(file_path)
    print(f"\n{'='*80}")
    loaded = load_and_analyze_pickle(file_path)
    results[file_name] = loaded
    if loaded is None:
        continue
    # Only successfully loaded files get a structural outline
    explore_dict_structure(loaded, max_items=5, max_depth=2)



🔍 Analyzing: ../results/lsd_closed_hamza/lsd_closed_hamza_binary_baseline_rs42.pkl
✅ Successfully loaded pickle file
📊 Root object type: <class 'dict'>
📏 Root object length: 3
📁 Dictionary with 3 keys:
   🔑 Logistic Regression: <class 'dict'> (length: 6)
   🔑 Random Forest: <class 'dict'> (length: 6)
   🔑 SVC: <class 'dict'> (length: 6)

🔍 Analyzing: ../results/lsd_closed_hamza/lsd_closed_Hamza.pkl
✅ Successfully loaded pickle file
📊 Root object type: <class 'dict'>
📏 Root object length: 1
📁 Dictionary with 1 keys:
   🔑 classification_baseline: <class 'dict'> (length: 271)


In [5]:
# Drill into the small binary-baseline file: for every model print the stored
# keys, the metric summaries, the feature-importance names and the prediction keys.
print("🔬 DETAILED ANALYSIS: Binary Baseline File")
print("="*80)

binary_file = 'lsd_closed_hamza_binary_baseline_rs42.pkl'
if binary_file not in results or results[binary_file] is None:
    print("❌ Binary baseline file not loaded successfully")
else:
    data = results[binary_file]

    print(f"📋 Found {len(data)} models:")
    for model_name, model_data in data.items():
        print(f"\n🤖 Model: {model_name}")
        print(f"   📊 Data keys: {list(model_data.keys())}")

        # Metric scores: list each metric's sub-keys and print the scalar ones
        if 'metric_scores' in model_data:
            metrics = model_data['metric_scores']
            print(f"   📈 Metrics available: {list(metrics.keys())}")

            for metric_name, metric_data in metrics.items():
                if not isinstance(metric_data, dict):
                    continue
                print(f"      {metric_name}: {list(metric_data.keys())}")
                # Non-numeric entries (e.g. per-fold arrays) are skipped here
                scalar_entries = (
                    (sub_key, sub_value)
                    for sub_key, sub_value in metric_data.items()
                    if isinstance(sub_value, (int, float, np.number))
                )
                for sub_key, sub_value in scalar_entries:
                    print(f"         {sub_key}: {sub_value:.4f}")

        # Feature importances: only reported when stored as a dict
        fi = model_data.get('feature_importances')
        if isinstance(fi, dict):
            print(f"   🎯 Feature importance keys: {list(fi.keys())}")
            print(f"      Number of features: {len(fi)}")

        # Predictions: only reported when stored as a dict
        pred = model_data.get('predictions')
        if isinstance(pred, dict):
            print(f"   🎲 Prediction keys: {list(pred.keys())}")


🔬 DETAILED ANALYSIS: Binary Baseline File
📋 Found 3 models:

🤖 Model: Logistic Regression
   📊 Data keys: ['model_name', 'metric_scores', 'feature_importances', 'predictions', 'params', 'folds_estimators']
   📈 Metrics available: ['accuracy', 'f1', 'roc_auc']
      accuracy: ['mean', 'std', 'fold_scores']
         mean: 0.5667
         std: 0.1167
      f1: ['mean', 'std', 'fold_scores']
         mean: 0.5508
         std: 0.1108
      roc_auc: ['mean', 'std', 'fold_scores']
         mean: 0.5667
         std: 0.1167
   🎯 Feature importance keys: ['feature_0', 'feature_1', 'feature_2', 'feature_3', 'feature_4', 'feature_5', 'feature_6', 'feature_7', 'feature_8', 'feature_9', 'feature_10', 'feature_11']
      Number of features: 12
   🎲 Prediction keys: ['y_true', 'y_pred', 'y_proba', 'fold_preds']

🤖 Model: Random Forest
   📊 Data keys: ['model_name', 'metric_scores', 'feature_importances', 'predictions', 'params', 'folds_estimators']
   📈 Metrics available: ['accuracy', 'f1', 'roc_auc

In [6]:
# Inspect the large Hamza file: root keys, the layout of the first
# 'classification_baseline' entry, and how often each key occurs across entries.
# NOTE(review): the messages call the entries "subjects", but IDs such as 'MRC41'
# also appear in this notebook's config spatial-unit list, so they are probably
# sensors/regions rather than subjects - confirm before relying on the wording.
print("🔬 DETAILED ANALYSIS: Large Hamza File")
print("="*80)

hamza_file = 'lsd_closed_Hamza.pkl'
if hamza_file not in results or results[hamza_file] is None:
    print("❌ Large Hamza file not loaded successfully")
else:
    data = results[hamza_file]

    print(f"📋 Root keys: {list(data.keys())}")

    if 'classification_baseline' in data:
        baseline_data = data['classification_baseline']
        n_entries = len(baseline_data)
        entry_ids = list(baseline_data.keys())
        print(f"\n📊 Classification baseline contains {n_entries} subjects")

        # Preview of the first few entry IDs
        subject_ids = entry_ids[:10]
        print(f"🏷️  First 10 subject IDs: {subject_ids}")

        # Use the first entry as a template for the per-entry layout
        if n_entries > 0:
            first_subject_id = entry_ids[0]
            first_subject_data = baseline_data[first_subject_id]

            print(f"\n🔍 Structure of first subject ({first_subject_id}):")
            print(f"   Type: {type(first_subject_data)}")

            if hasattr(first_subject_data, '__len__'):
                print(f"   Length: {len(first_subject_data)}")

            if isinstance(first_subject_data, dict):
                print(f"   Keys: {list(first_subject_data.keys())}")

                for key, value in first_subject_data.items():
                    # Sized non-string values get a length suffix on the same line
                    is_sized = hasattr(value, '__len__') and not isinstance(value, str)
                    suffix = f" (length: {len(value)})" if is_sized else ""
                    print(f"      {key}: {type(value)}{suffix}")

                    # Short scalars and short sequences are printed verbatim
                    if isinstance(value, (str, int, float, bool)) and len(str(value)) < 100:
                        print(f"         Value: {value}")
                    elif isinstance(value, (list, np.ndarray)) and len(value) <= 10:
                        print(f"         Sample values: {value}")

        print(f"\n📈 PATTERN ANALYSIS across all {n_entries} subjects:")

        # Only dict-valued entries contribute keys to the consistency check
        dict_entries = [entry for entry in baseline_data.values() if isinstance(entry, dict)]

        all_keys = set()
        for entry in dict_entries:
            all_keys.update(entry.keys())

        if all_keys:
            print(f"   🔑 Unique keys found across all subjects: {sorted(all_keys)}")

            # Count in how many entries each key occurs
            key_counts = {}
            for entry in dict_entries:
                for key in entry.keys():
                    key_counts[key] = key_counts.get(key, 0) + 1

            print(f"   📊 Key frequency across subjects:")
            for key, count in sorted(key_counts.items()):
                percentage = (count / n_entries) * 100
                print(f"      {key}: {count}/{n_entries} subjects ({percentage:.1f}%)")


🔬 DETAILED ANALYSIS: Large Hamza File
📋 Root keys: ['classification_baseline']

📊 Classification baseline contains 271 subjects
🏷️  First 10 subject IDs: ['MRC41', 'MLF61', 'MLC15', 'MRP32', 'MRT41', 'MLF41', 'MLT55', 'MRF25', 'MRC16', 'MRO14']

🔍 Structure of first subject (MRC41):
   Type: <class 'dict'>
   Length: 3
   Keys: ['Logistic Regression', 'Random Forest', 'SVC']
      Logistic Regression: <class 'dict'> (length: 6)
      Random Forest: <class 'dict'> (length: 6)
      SVC: <class 'dict'> (length: 6)

📈 PATTERN ANALYSIS across all 271 subjects:
   🔑 Unique keys found across all subjects: ['Logistic Regression', 'Random Forest', 'SVC']
   📊 Key frequency across subjects:
      Logistic Regression: 271/271 subjects (100.0%)
      Random Forest: 271/271 subjects (100.0%)
      SVC: 271/271 subjects (100.0%)


In [8]:
# Summary and comparison of the two result files: on-disk size, top-level
# structure, and a few takeaways whose counts are taken from the loaded data.
print("📊 SUMMARY AND COMPARISON")
print("="*80)

# File size comparison
import os
files = [
    '../results/lsd_closed_hamza/lsd_closed_hamza_binary_baseline_rs42.pkl',
    '../results/lsd_closed_hamza/lsd_closed_Hamza.pkl'
]

for file_path in files:
    if os.path.exists(file_path):
        size_mb = os.path.getsize(file_path) / (1024 * 1024)
        print(f"📁 {os.path.basename(file_path)}: {size_mb:.2f} MB")

print("\n🔍 STRUCTURE COMPARISON:")
print("-" * 40)

binary_file = 'lsd_closed_hamza_binary_baseline_rs42.pkl'
hamza_file = 'lsd_closed_Hamza.pkl'

# These stay None when the corresponding file (or key) is unavailable, so the
# insights below never report numbers that were not actually observed.
n_models = None
n_baseline_entries = None

if binary_file in results and results[binary_file] is not None:
    binary_data = results[binary_file]
    n_models = len(binary_data)
    print("Binary Baseline File:")
    print(f"  • Structure: Dictionary with {n_models} ML models")
    print(f"  • Models: {list(binary_data.keys())}")
    # The default avoids StopIteration when the dict holds no models at all
    first_model_entry = next(iter(binary_data.values()), None)
    if first_model_entry is not None:
        print(f"  • Each model contains: {list(first_model_entry.keys())}")

if hamza_file in results and results[hamza_file] is not None:
    hamza_data = results[hamza_file]
    print("\nLarge Hamza File:")
    print(f"  • Structure: Dictionary with {len(hamza_data)} top-level key(s)")
    if 'classification_baseline' in hamza_data:
        baseline_data = hamza_data['classification_baseline']
        n_baseline_entries = len(baseline_data)
        print(f"  • Contains: {n_baseline_entries} subject records")
        print(f"  • Subject ID pattern: {list(baseline_data.keys())[:3]}... (showing first 3)")

# NOTE(review): the "subject" wording is kept from the original messages, but
# IDs such as 'MRC41' also appear in this notebook's config spatial-unit list,
# so these entries are probably sensors/regions - confirm.
print("\n💡 KEY INSIGHTS:")
if n_models is not None:
    print(f"  • Binary baseline file: Contains ML model results ({n_models} models with metrics)")
if n_baseline_entries is not None:
    print(f"  • Large Hamza file: Contains individual subject data ({n_baseline_entries} subjects)")
if n_models is not None and n_baseline_entries is not None:
    print("  • These appear to be different types of ML analysis results")
if n_models is not None:
    print("  • Binary baseline: Model comparison results")
if n_baseline_entries is not None:
    print("  • Large Hamza: Subject-level analysis results")
if n_models is None and n_baseline_entries is None:
    print("  • No result file was loaded, so there is nothing to compare")


📊 SUMMARY AND COMPARISON
📁 lsd_closed_hamza_binary_baseline_rs42.pkl: 0.68 MB
📁 lsd_closed_Hamza.pkl: 190.34 MB

🔍 STRUCTURE COMPARISON:
----------------------------------------
Binary Baseline File:
  • Structure: Dictionary with 3 ML models
  • Models: ['Logistic Regression', 'Random Forest', 'SVC']
  • Each model contains: ['model_name', 'metric_scores', 'feature_importances', 'predictions', 'params', 'folds_estimators']

Large Hamza File:
  • Structure: Dictionary with 1 top-level key(s)
  • Contains: 271 subject records
  • Subject ID pattern: ['MRC41', 'MLF61', 'MLC15']... (showing first 3)

💡 KEY INSIGHTS:
  • Binary baseline file: Contains ML model results (3 models with metrics)
  • Large Hamza file: Contains individual subject data (271 subjects)
  • These appear to be different types of ML analysis results
  • Binary baseline: Model comparison results
  • Large Hamza: Subject-level analysis results


In [9]:
# Utility functions for further exploration.
# These two prints only emit the section banner for this cell's output.
print("🛠️  UTILITY FUNCTIONS FOR FURTHER EXPLORATION")
print("="*80)

def get_model_metrics(model_name, file_key='lsd_closed_hamza_binary_baseline_rs42.pkl'):
    """Return the 'metric_scores' entry stored for one model.

    Args:
        model_name: Key of the model inside the loaded file.
        file_key: Key of the loaded file inside the module-level `results` dict.

    Returns:
        The model's 'metric_scores' value, or None when the file is missing or
        failed to load, the model is unknown, or it has no 'metric_scores'.
    """
    file_data = results.get(file_key)
    if file_data is None:
        return None
    if model_name not in file_data:
        return None
    model_entry = file_data[model_name]
    return model_entry['metric_scores'] if 'metric_scores' in model_entry else None

def get_subject_data(subject_id, file_key='lsd_closed_Hamza.pkl'):
    """Return the 'classification_baseline' entry stored under one ID.

    Args:
        subject_id: Key to look up inside the file's 'classification_baseline' dict.
        file_key: Key of the loaded file inside the module-level `results` dict.

    Returns:
        The stored entry, or None when the file is missing or failed to load,
        has no 'classification_baseline' key, or does not contain the ID.
    """
    file_data = results.get(file_key)
    if file_data is None or 'classification_baseline' not in file_data:
        return None
    baseline = file_data['classification_baseline']
    return baseline[subject_id] if subject_id in baseline else None

def list_available_subjects(file_key='lsd_closed_Hamza.pkl'):
    """List the keys of the file's 'classification_baseline' dict.

    Args:
        file_key: Key of the loaded file inside the module-level `results` dict.

    Returns:
        A list of the keys in insertion order, or an empty list when the file
        is missing or failed to load, or has no 'classification_baseline' key.
    """
    file_data = results.get(file_key)
    if file_data is None or 'classification_baseline' not in file_data:
        return []
    return list(file_data['classification_baseline'].keys())

def get_feature_importance(model_name, file_key='lsd_closed_hamza_binary_baseline_rs42.pkl'):
    """Return the 'feature_importances' entry stored for one model.

    Args:
        model_name: Key of the model inside the loaded file.
        file_key: Key of the loaded file inside the module-level `results` dict.

    Returns:
        The model's 'feature_importances' value, or None when the file is
        missing or failed to load, the model is unknown, or the key is absent.
    """
    file_data = results.get(file_key)
    if file_data is None:
        return None
    if model_name not in file_data:
        return None
    model_entry = file_data[model_name]
    return model_entry['feature_importances'] if 'feature_importances' in model_entry else None

# Example usage of the helper functions defined earlier in this cell
print("🎯 EXAMPLE USAGE:")
print("-" * 30)

# Show available models
binary_key = 'lsd_closed_hamza_binary_baseline_rs42.pkl'
# A failed load is stored in `results` as None, so membership alone is not
# enough: calling .keys() on None would raise AttributeError.
if binary_key in results and results[binary_key] is not None:
    models = list(results[binary_key].keys())
    print(f"Available models: {models}")
    
    # Get metrics for first model
    if models:
        first_model_metrics = get_model_metrics(models[0])
        print(f"\nMetrics for {models[0]}:")
        if first_model_metrics:
            pprint(first_model_metrics)

# Show sample subjects (an empty list means the Hamza file is unavailable)
subjects = list_available_subjects()
if subjects:
    print(f"\nTotal subjects available: {len(subjects)}")
    print(f"Sample subject IDs: {subjects[:5]}")
    
    # Get data for first subject
    first_subject_data = get_subject_data(subjects[0])
    print(f"\nData for {subjects[0]}:")
    if first_subject_data:
        if isinstance(first_subject_data, dict):
            print(f"Keys: {list(first_subject_data.keys())}")
        else:
            print(f"Type: {type(first_subject_data)}")

print("\n✅ Structure analysis complete!")
print("💡 You can now use the utility functions above to explore specific models or subjects.")


🛠️  UTILITY FUNCTIONS FOR FURTHER EXPLORATION
🎯 EXAMPLE USAGE:
------------------------------
Available models: ['Logistic Regression', 'Random Forest', 'SVC']

Metrics for Logistic Regression:
{'accuracy': {'fold_scores': array([0.625     , 0.625     , 0.625     , 0.625     , 0.33333333]),
              'mean': np.float64(0.5666666666666667),
              'std': np.float64(0.11666666666666667)},
 'f1': {'fold_scores': array([0.61904762, 0.56363636, 0.61904762, 0.61904762, 0.33333333]),
        'mean': np.float64(0.5508225108225109),
        'std': np.float64(0.11084198963013722)},
 'roc_auc': {'fold_scores': array([0.625     , 0.625     , 0.625     , 0.625     , 0.33333333]),
             'mean': np.float64(0.5666666666666667),
             'std': np.float64(0.11666666666666665)}}

Total subjects available: 271
Sample subject IDs: ['MRC41', 'MLF61', 'MLC15', 'MRP32', 'MRT41']

Data for MRC41:
Keys: ['Logistic Regression', 'Random Forest', 'SVC']

✅ Structure analysis complete!
💡 You can now use the utility functions above to explore specific models or subjects.

In [None]:
# Deep analysis of the brain spatial regions and features structure.
# Uses the first 'classification_baseline' entry (and its first model) as a
# template, then checks the feature count of the first five entries.
print("🧠 BRAIN SPATIAL REGIONS AND FEATURES ANALYSIS")
print("="*80)

hamza_file = 'lsd_closed_Hamza.pkl'
if hamza_file in results and results[hamza_file] is not None:
    data = results[hamza_file]
    
    if 'classification_baseline' in data:
        baseline_data = data['classification_baseline']
        region_ids = list(baseline_data.keys())
        
        if not region_ids:
            # Taking the first region of an empty mapping would raise IndexError
            print("⚠️ classification_baseline is empty - no regions to analyze")
        else:
            # Get first brain region data to understand structure
            first_region_id = region_ids[0]
            first_region_data = baseline_data[first_region_id]
            
            print(f"🏷️ Analyzing brain region: {first_region_id}")
            print(f"📊 Total brain regions: {len(baseline_data)}")
            
            # An empty per-region dict has no first model to inspect
            if isinstance(first_region_data, dict) and first_region_data:
                print(f"🤖 Models per region: {list(first_region_data.keys())}")
                
                # Check first model to understand feature structure
                first_model = list(first_region_data.keys())[0]
                model_data = first_region_data[first_model]
                
                print(f"\n🔍 Examining model '{first_model}' for region '{first_region_id}':")
                print(f"   📋 Model data keys: {list(model_data.keys())}")
                
                # Look at feature importances to understand the feature set
                if 'feature_importances' in model_data:
                    features = model_data['feature_importances']
                    print(f"\n🎯 FEATURE ANALYSIS:")
                    print(f"   📏 Number of features: {len(features)}")
                    print(f"   🏷️ Feature names: {list(features.keys())}")
                    
                    # Show actual feature importance values
                    # NOTE(review): the .6f format assumes each importance is a
                    # scalar number - confirm against the stored data.
                    print(f"\n   📊 Feature importance values for {first_region_id}:")
                    for i, (feature_name, importance) in enumerate(features.items()):
                        print(f"      {i+1:2d}. {feature_name}: {importance:.6f}")
                
                # Look at metrics structure
                if 'metric_scores' in model_data:
                    metrics = model_data['metric_scores']
                    print(f"\n📈 METRICS STRUCTURE:")
                    for metric_name, metric_data in metrics.items():
                        if isinstance(metric_data, dict) and 'mean' in metric_data:
                            print(f"   {metric_name}: {metric_data['mean']:.4f} ± {metric_data.get('std', 0):.4f}")
        
        # Sample a few more regions to check consistency
        print(f"\n🔄 CONSISTENCY CHECK across regions:")
        sample_regions = region_ids[:5]
        
        for region_id in sample_regions:
            region_data = baseline_data[region_id]
            # Skip non-dict and empty entries: models[0] needs at least one model
            if isinstance(region_data, dict) and region_data:
                models = list(region_data.keys())
                first_model_data = region_data[models[0]]
                if 'feature_importances' in first_model_data:
                    n_features = len(first_model_data['feature_importances'])
                    print(f"   {region_id}: {n_features} features, models: {models}")

print("\n" + "="*80)


In [None]:
# Corrected reading of the data structure: the top-level keys of
# 'classification_baseline' are treated as brain regions/channels, each holding
# one result dict per ML model. Dimensions are read from the first region/model.
print("🧠 CORRECTED DATA STRUCTURE ANALYSIS")
print("="*80)

hamza_file = 'lsd_closed_Hamza.pkl'
if hamza_file in results and results[hamza_file] is not None:
    data = results[hamza_file]
    
    if 'classification_baseline' in data:
        baseline_data = data['classification_baseline']
        
        # Walk down to the first region and its first model. Empty levels fall
        # back to empty dicts so the counts below become 0 instead of the cell
        # failing with IndexError.
        first_region = next(iter(baseline_data), None)
        region_data = baseline_data[first_region] if first_region is not None else {}
        first_model = next(iter(region_data), None)
        model_data = region_data[first_model] if first_model is not None else {}
        
        # Optional sections default to empty dicts so every later lookup is
        # safe (the feature-name print previously raised KeyError when
        # 'feature_importances' was absent).
        feature_imp = model_data.get('feature_importances', {})
        metric_scores = model_data.get('metric_scores', {})
        
        # Extract key information
        n_regions = len(baseline_data)
        n_models = len(region_data)
        n_features = len(feature_imp)
        
        # Get number of subjects from predictions
        # NOTE(review): assumes 'y_true' holds one label per subject - confirm.
        n_subjects = 0
        if 'predictions' in model_data and 'y_true' in model_data['predictions']:
            n_subjects = len(model_data['predictions']['y_true'])
        
        # Get number of CV folds from the first metric's per-fold scores
        n_folds = 0
        first_metric = next(iter(metric_scores.values()), None)
        if isinstance(first_metric, dict) and 'fold_scores' in first_metric:
            n_folds = len(first_metric['fold_scores'])
        
        print(f"📊 DATASET DIMENSIONS:")
        print(f"   🧠 Brain regions/channels: {n_regions}")
        print(f"   🎯 Features per region: {n_features}")
        print(f"   👥 Subjects in dataset: {n_subjects}")
        print(f"   🤖 ML models per region: {n_models}")
        print(f"   🔄 Cross-validation folds: {n_folds}")
        
        print(f"\n🏷️ SAMPLE IDENTIFIERS:")
        print(f"   Brain regions: {list(baseline_data.keys())[:10]}...")
        print(f"   ML models: {list(region_data.keys())}")
        print(f"   Features: {list(feature_imp.keys())}")
        
        print(f"\n📈 SAMPLE PERFORMANCE (region: {first_region}, model: {first_model}):")
        for metric, metric_data in metric_scores.items():
            if isinstance(metric_data, dict) and 'mean' in metric_data:
                mean_score = metric_data['mean']
                std_score = metric_data.get('std', 0)
                print(f"   {metric}: {mean_score:.4f} ± {std_score:.4f}")
        
        print(f"\n🎯 FEATURE IMPORTANCE SAMPLE (region: {first_region}, model: {first_model}):")
        # Only the first five features are listed; the rest are summarised
        for feat_name, importance in list(feature_imp.items())[:5]:
            print(f"   {feat_name}: {importance:.6f}")
        if len(feature_imp) > 5:
            print(f"   ... and {len(feature_imp) - 5} more features")

print("\n" + "="*80)


In [None]:
# Compare with config file to understand feature names and spatial units.
# The two lists below are hard-coded copies of config values; the cell reports
# how they overlap with the keys and feature names found in the pickle.
print("📋 COMPARISON WITH CONFIG FILE")
print("="*80)

# Features from config file
config_features = [
    "feature-lzivComplexityMeanEpochs",
    "feature-spectralEntropyMeanEpochs", 
    "feature-svdEntropyMeanEpochs",
    "feature-hjorthComplexityMeanEpochs",
    "feature-hjorthMobilityMeanEpochs",
    "feature-higuchiFdMeanEpochs",
    "feature-petrosianFdMeanEpochs"
]

# Spatial units from config file  
config_spatial_units = [
    'MLC11', 'MLC12', 'MLC21', 'MLC22', 'MLC31', 'MLC32', 'MLF24', 'MLF31', 'MLF42', 'MLF51', 'MLF62', 
    'MLP21', 'MLP31', 'MLP42', 'MLP51', 'MLT11', 'MLT21', 'MLT31', 'MLT41', 'MLT51', 'MLO11', 'MLO21', 
    'MLO31', 'MLO41', 'MRC11', 'MRC21', 'MRC31', 'MRC41', 'MRF21', 'MRF31', 'MRF41', 'MRF51', 'MRP21', 
    'MRP31', 'MRP41', 'MRT21', 'MRT31', 'MRT41', 'MZC01', 'MZF01', 'MZO01', 'MZP01'
]

print(f"🎯 Config file features ({len(config_features)}):")
for i, feature in enumerate(config_features, 1):
    print(f"   {i}. {feature}")

print(f"\n🧠 Config file spatial units ({len(config_spatial_units)}):")
print(f"   Sample: {config_spatial_units[:10]}...")

# Compare with pickle data
hamza_file = 'lsd_closed_Hamza.pkl'
if hamza_file in results and results[hamza_file] is not None:
    data = results[hamza_file]
    
    if 'classification_baseline' in data:
        baseline_data = data['classification_baseline']
        
        # Get actual regions from pickle
        pickle_regions = list(baseline_data.keys())
        
        print(f"\n📊 PICKLE vs CONFIG COMPARISON:")
        print(f"   Config spatial units: {len(config_spatial_units)}")
        print(f"   Pickle brain regions: {len(pickle_regions)}")
        
        # Check overlap
        config_set = set(config_spatial_units)
        pickle_set = set(pickle_regions)
        
        overlap = config_set.intersection(pickle_set)
        config_only = config_set - pickle_set
        pickle_only = pickle_set - config_set
        
        print(f"\n🔍 OVERLAP ANALYSIS:")
        print(f"   Regions in both: {len(overlap)}")
        print(f"   Only in config: {len(config_only)}")
        print(f"   Only in pickle: {len(pickle_only)}")
        
        # Sets have no stable order, so samples are sorted to make the output
        # reproducible between runs (key=str tolerates non-string keys).
        if len(overlap) > 0:
            print(f"   Sample overlap: {sorted(overlap, key=str)[:10]}")
        
        if len(config_only) > 0:
            print(f"   Sample config-only: {sorted(config_only, key=str)[:10]}")
            
        if len(pickle_only) > 0:
            print(f"   Sample pickle-only: {sorted(pickle_only, key=str)[:10]}")
        
        # Compare the pickle's feature names with the config features, using
        # the first region's first model (skipped when there are no regions)
        first_region_data = baseline_data[pickle_regions[0]] if pickle_regions else None
        if isinstance(first_region_data, dict) and first_region_data:
            first_model_data = first_region_data[list(first_region_data.keys())[0]]
            if 'feature_importances' in first_model_data:
                pickle_features = list(first_model_data['feature_importances'].keys())
                
                print(f"\n🎯 FEATURE COMPARISON:")
                print(f"   Config features: {len(config_features)}")
                print(f"   Pickle features: {len(pickle_features)}")
                
                print(f"\n   Pickle feature names:")
                for i, feat in enumerate(pickle_features, 1):
                    print(f"      {i:2d}. {feat}")
                
                # Try to map pickle features to config features
                print(f"\n🔗 FEATURE MAPPING ANALYSIS:")
                # Tokens that identify nothing: every config name starts with
                # "feature-", so matching on "feature" would pair every pickle
                # feature with the first config feature. Bare indices and empty
                # tokens (an empty string is a substring of everything) are
                # dropped for the same reason.
                generic_tokens = {'feature'}
                for pickle_feat in pickle_features:
                    tokens = [
                        part for part in pickle_feat.lower().split('_')
                        if part and part not in generic_tokens and not part.isdigit()
                    ]
                    # Look for partial matches on the remaining tokens
                    matches = [config_feat for config_feat in config_features 
                               if any(part in config_feat.lower() for part in tokens)]
                    if matches:
                        print(f"   {pickle_feat} → potential match: {matches[0]}")
                    else:
                        print(f"   {pickle_feat} → no clear match")

print("\n" + "="*80)
