In [1]:
import pandas as pd
import pickle
import numpy as np
from pprint import pprint
import os

def load_and_analyze_pickle(file_path):
    """Load a pickle file and print a short summary of its root object.

    Prints a banner for the file, then either a not-found / load-error
    message, or the root object's type and (when it has a usable length)
    its length.

    Args:
        file_path: Path of the pickle file to load.

    Returns:
        The unpickled object, or None when the file does not exist or
        unpickling raised an exception.
    """
    print(f"🔍 Analyzing: {file_path}")
    print("="*60)

    if not os.path.exists(file_path):
        print(f"❌ File not found: {file_path}")
        return None

    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only point this at pickle files from a trusted source.
    # The try block covers the load only, so a failure while *reporting* on the
    # object is never misreported as a load error.
    try:
        with open(file_path, 'rb') as f:
            data = pickle.load(f)
    except Exception as e:
        print(f"❌ Error loading pickle: {e}")
        return None

    print("✅ Successfully loaded pickle file")
    print(f"📊 Root object type: {type(data)}")

    if hasattr(data, '__len__'):
        try:
            print(f"📏 Root object length: {len(data)}")
        except TypeError:
            # Some objects expose __len__ but are unsized (e.g. 0-d numpy
            # arrays raise "len() of unsized object"); skip the length line
            # rather than discarding successfully loaded data.
            pass

    return data

def _safe_len(obj):
    """Return len(obj), or None when obj has no usable length."""
    if not hasattr(obj, '__len__'):
        return None
    try:
        return len(obj)
    except TypeError:
        # e.g. 0-d numpy arrays define __len__ but raise "len() of unsized object"
        return None


def explore_dict_structure(data, max_items=10, max_depth=2, current_depth=0):
    """Print an indented outline of a dict, list, or other object.

    For a dict, prints each key with its value's type (and length, when the
    value is a sized non-string), echoes short scalar values, and recurses
    into nested dicts. For a list, prints the item count, the first item's
    type, and every item's type when there are at most 5 items. Anything
    else is reported as a single object.

    Args:
        data: Object to describe.
        max_items: Maximum number of dict keys printed per level; the
            remainder is summarized in one "... and N more keys" line.
        max_depth: Recursion only happens while current_depth < max_depth.
        current_depth: Current nesting level; controls indentation.

    Returns:
        None. All output goes to stdout.
    """
    indent = "  " * current_depth

    if isinstance(data, dict):
        print(f"{indent}📁 Dictionary with {len(data)} keys:")

        for i, (key, value) in enumerate(data.items()):
            if i >= max_items:
                print(f"{indent}   ... and {len(data) - max_items} more keys")
                break

            print(f"{indent}   🔑 {key}: {type(value)}", end="")

            # Strings are sized, but their length is not reported here
            length = None if isinstance(value, str) else _safe_len(value)
            if length is not None:
                print(f" (length: {length})")
            else:
                print()

            # Show sample values for simple types
            if isinstance(value, (str, int, float, bool)) and len(str(value)) < 100:
                print(f"{indent}      💡 Value: {value}")

            # Recurse only into small nested dicts (at most 5 keys) to keep
            # the outline short
            if current_depth < max_depth and isinstance(value, dict) and len(value) <= 5:
                explore_dict_structure(value, max_items, max_depth, current_depth + 1)

    elif isinstance(data, list):
        print(f"{indent}📋 List with {len(data)} items")
        if len(data) > 0:
            print(f"{indent}   First item type: {type(data[0])}")
            if len(data) <= 5:
                for i, item in enumerate(data):
                    print(f"{indent}   [{i}]: {type(item)}")
    else:
        print(f"{indent}📄 Single object: {type(data)}")

# Paths of the two result pickles to inspect, relative to the notebook's
# working directory
files_to_analyze = [
    '../results/lsd_closed_hamza/lsd_closed_hamza_binary_baseline_rs42.pkl',
    '../results/lsd_closed_hamza/lsd_closed_Hamza.pkl',
]

# Maps each file's base name to its loaded object; None marks a failed load
results = {}
for file_path in files_to_analyze:
    print(f"\n{'='*80}")
    file_name = os.path.basename(file_path)
    results[file_name] = load_and_analyze_pickle(file_path)
    if results[file_name] is None:
        # Nothing to outline for a file that could not be loaded
        continue
    explore_dict_structure(results[file_name], max_items=5, max_depth=2)



🔍 Analyzing: ../results/lsd_closed_hamza/lsd_closed_hamza_binary_baseline_rs42.pkl
✅ Successfully loaded pickle file
📊 Root object type: <class 'dict'>
📏 Root object length: 3
📁 Dictionary with 3 keys:
   🔑 Logistic Regression: <class 'dict'> (length: 6)
   🔑 Random Forest: <class 'dict'> (length: 6)
   🔑 SVC: <class 'dict'> (length: 6)

🔍 Analyzing: ../results/lsd_closed_hamza/lsd_closed_Hamza.pkl
✅ Successfully loaded pickle file
📊 Root object type: <class 'dict'>
📏 Root object length: 1
📁 Dictionary with 1 keys:
   🔑 classification_baseline: <class 'dict'> (length: 271)


In [None]:
# Per-model drill-down of the binary baseline pickle (the smaller of the two files)
print("🔬 DETAILED ANALYSIS: Binary Baseline File")
print("=" * 80)

binary_file = 'lsd_closed_hamza_binary_baseline_rs42.pkl'
if results.get(binary_file) is None:
    # Either the key is absent from `results` or None was stored for this file
    print("❌ Binary baseline file not loaded successfully")
else:
    data = results[binary_file]

    print(f"📋 Found {len(data)} models:")
    for model_name, model_data in data.items():
        print(f"\n🤖 Model: {model_name}")
        print(f"   📊 Data keys: {list(model_data.keys())}")

        # Metric scores: list the metric names, then any numeric sub-values
        if 'metric_scores' in model_data:
            metrics = model_data['metric_scores']
            print(f"   📈 Metrics available: {list(metrics.keys())}")

            for metric_name, metric_data in metrics.items():
                if not isinstance(metric_data, dict):
                    # Only dict-valued metrics are expanded
                    continue
                print(f"      {metric_name}: {list(metric_data.keys())}")
                for sub_key, sub_value in metric_data.items():
                    if not isinstance(sub_value, (int, float, np.number)):
                        continue
                    print(f"         {sub_key}: {sub_value:.4f}")

        # Feature importances: report keys and their count when stored as a dict
        if 'feature_importances' in model_data:
            fi = model_data['feature_importances']
            if isinstance(fi, dict):
                print(f"   🎯 Feature importance keys: {list(fi.keys())}")
                print(f"      Number of features: {len(fi)}")

        # Predictions: report keys when stored as a dict
        if 'predictions' in model_data:
            pred = model_data['predictions']
            if isinstance(pred, dict):
                print(f"   🎲 Prediction keys: {list(pred.keys())}")


In [None]:
# Detailed analysis of the large Hamza file: root keys, the layout of the first
# entry under 'classification_baseline', and how often each key appears across
# all entries.
print("🔬 DETAILED ANALYSIS: Large Hamza File")
print("="*80)


def _len_or_none(obj):
    """Return len(obj), or None when obj has no usable length."""
    if not hasattr(obj, '__len__'):
        return None
    try:
        return len(obj)
    except TypeError:
        # e.g. 0-d numpy arrays define __len__ but raise "len() of unsized object"
        return None


hamza_file = 'lsd_closed_Hamza.pkl'
if hamza_file in results and results[hamza_file] is not None:
    data = results[hamza_file]

    print(f"📋 Root keys: {list(data.keys())}")

    # Analyze the classification_baseline structure
    if 'classification_baseline' in data:
        baseline_data = data['classification_baseline']
        print(f"\n📊 Classification baseline contains {len(baseline_data)} subjects")

        # Show first few subject IDs
        subject_ids = list(baseline_data.keys())[:10]
        print(f"🏷️  First 10 subject IDs: {subject_ids}")

        # Analyze structure of first subject
        if len(baseline_data) > 0:
            # subject_ids keeps the mapping's iteration order, so element 0 is
            # the first key
            first_subject_id = subject_ids[0]
            first_subject_data = baseline_data[first_subject_id]

            print(f"\n🔍 Structure of first subject ({first_subject_id}):")
            print(f"   Type: {type(first_subject_data)}")

            first_subject_len = _len_or_none(first_subject_data)
            if first_subject_len is not None:
                print(f"   Length: {first_subject_len}")

            # If it's a dict, show its keys
            if isinstance(first_subject_data, dict):
                print(f"   Keys: {list(first_subject_data.keys())}")

                # Show type, length and (when small) the value for each key
                for key, value in first_subject_data.items():
                    # Strings are sized, but their length is not reported here
                    value_len = None if isinstance(value, str) else _len_or_none(value)
                    if value_len is not None:
                        print(f"      {key}: {type(value)} (length: {value_len})")
                    else:
                        print(f"      {key}: {type(value)}")

                    # Show actual values for simple types
                    if isinstance(value, (str, int, float, bool)) and len(str(value)) < 100:
                        print(f"         Value: {value}")
                    elif isinstance(value, (list, np.ndarray)) and (value_len is None or value_len <= 10):
                        # value_len is None only for an unsized (0-d) array,
                        # which holds a single value and is safe to print
                        print(f"         Sample values: {value}")

        # Analyze patterns across subjects
        print(f"\n📈 PATTERN ANALYSIS across all {len(baseline_data)} subjects:")

        # Single pass: count how many dict-valued entries contain each key.
        # The set of unique keys is exactly the set of counted keys.
        key_counts = {}
        for subject_data in baseline_data.values():
            if isinstance(subject_data, dict):
                for key in subject_data.keys():
                    key_counts[key] = key_counts.get(key, 0) + 1
        all_keys = set(key_counts)

        if all_keys:
            print(f"   🔑 Unique keys found across all subjects: {sorted(all_keys)}")

            # Consistency check - percentages are relative to *all* entries,
            # including any that are not dicts
            print(f"   📊 Key frequency across subjects:")
            for key, count in sorted(key_counts.items()):
                percentage = (count / len(baseline_data)) * 100
                print(f"      {key}: {count}/{len(baseline_data)} subjects ({percentage:.1f}%)")
else:
    print("❌ Large Hamza file not loaded successfully")


In [None]:
# Summary and comparison: on-disk sizes, a structural recap of each loaded
# pickle, and headline takeaways whose counts come from the loaded data.
print("📊 SUMMARY AND COMPARISON")
print("="*80)

# File size comparison
import os
files = [
    '../results/lsd_closed_hamza/lsd_closed_hamza_binary_baseline_rs42.pkl',
    '../results/lsd_closed_hamza/lsd_closed_Hamza.pkl'
]

for file_path in files:
    if os.path.exists(file_path):
        size_mb = os.path.getsize(file_path) / (1024 * 1024)
        print(f"📁 {os.path.basename(file_path)}: {size_mb:.2f} MB")

print("\n🔍 STRUCTURE COMPARISON:")
print("-" * 40)

binary_file = 'lsd_closed_hamza_binary_baseline_rs42.pkl'
hamza_file = 'lsd_closed_Hamza.pkl'

# Counts reported under KEY INSIGHTS; they stay None when the corresponding
# data is unavailable so the summary never states figures it did not observe.
model_count = None
subject_count = None

if binary_file in results and results[binary_file] is not None:
    binary_data = results[binary_file]
    model_count = len(binary_data)
    print(f"Binary Baseline File:")
    print(f"  • Structure: Dictionary with {model_count} ML models")
    print(f"  • Models: {list(binary_data.keys())}")
    # next(..., None) avoids StopIteration when the file holds an empty dict
    first_model_data = next(iter(binary_data.values()), None)
    if first_model_data is not None:
        print(f"  • Each model contains: {list(first_model_data.keys())}")

if hamza_file in results and results[hamza_file] is not None:
    hamza_data = results[hamza_file]
    print(f"\nLarge Hamza File:")
    print(f"  • Structure: Dictionary with {len(hamza_data)} top-level key(s)")
    if 'classification_baseline' in hamza_data:
        baseline_data = hamza_data['classification_baseline']
        subject_count = len(baseline_data)
        print(f"  • Contains: {subject_count} subject records")
        print(f"  • Subject ID pattern: {list(baseline_data.keys())[:3]}... (showing first 3)")

print(f"\n💡 KEY INSIGHTS:")
if model_count is not None:
    print(f"  • Binary baseline file: Contains ML model results ({model_count} models with metrics)")
else:
    print(f"  • Binary baseline file: not loaded")
if subject_count is not None:
    print(f"  • Large Hamza file: Contains individual subject data ({subject_count} subjects)")
else:
    print(f"  • Large Hamza file: no 'classification_baseline' data loaded")
print(f"  • These appear to be different types of ML analysis results")
print(f"  • Binary baseline: Model comparison results")
print(f"  • Large Hamza: Subject-level analysis results")


In [None]:
# Banner for the helper-function section: title line followed by an 80-char rule
print("🛠️  UTILITY FUNCTIONS FOR FURTHER EXPLORATION", "=" * 80, sep="\n")

def get_model_metrics(model_name, file_key='lsd_closed_hamza_binary_baseline_rs42.pkl'):
    """Return the 'metric_scores' entry stored for one model.

    Reads the module-level `results` mapping. Returns None when `file_key`
    is absent from `results`, its stored value is None, `model_name` is not
    present in the loaded data, or the model has no 'metric_scores' entry.
    """
    if file_key not in results:
        return None
    loaded = results[file_key]
    if loaded is None or model_name not in loaded:
        return None
    model_entry = loaded[model_name]
    if 'metric_scores' not in model_entry:
        return None
    return model_entry['metric_scores']

def get_subject_data(subject_id, file_key='lsd_closed_Hamza.pkl'):
    """Return the entry stored under `subject_id` in 'classification_baseline'.

    Reads the module-level `results` mapping. Returns None when `file_key`
    is absent from `results`, its stored value is None, the loaded data has
    no 'classification_baseline' key, or `subject_id` is not in that mapping.
    """
    if file_key not in results or results[file_key] is None:
        return None
    loaded = results[file_key]
    if 'classification_baseline' not in loaded:
        return None
    baseline = loaded['classification_baseline']
    return baseline[subject_id] if subject_id in baseline else None

def list_available_subjects(file_key='lsd_closed_Hamza.pkl'):
    """Return the keys of the 'classification_baseline' mapping as a list.

    Reads the module-level `results` mapping. Returns an empty list when
    `file_key` is absent from `results`, its stored value is None, or the
    loaded data has no 'classification_baseline' key.
    """
    if file_key not in results or results[file_key] is None:
        return []
    loaded = results[file_key]
    if 'classification_baseline' not in loaded:
        return []
    return list(loaded['classification_baseline'].keys())

def get_feature_importance(model_name, file_key='lsd_closed_hamza_binary_baseline_rs42.pkl'):
    """Return the 'feature_importances' entry stored for one model.

    Reads the module-level `results` mapping. Returns None when `file_key`
    is absent from `results`, its stored value is None, `model_name` is not
    present in the loaded data, or the model has no 'feature_importances'
    entry.
    """
    if file_key not in results:
        return None
    loaded = results[file_key]
    if loaded is None or model_name not in loaded:
        return None
    model_entry = loaded[model_name]
    if 'feature_importances' not in model_entry:
        return None
    return model_entry['feature_importances']

# Example usage: exercise the helper functions on the first model and the
# first subject, if the corresponding data was loaded.
print("🎯 EXAMPLE USAGE:")
print("-" * 30)

# Show available models. A failed load leaves None stored under the file's key,
# so test the stored value rather than mere key membership.
binary_results = results.get('lsd_closed_hamza_binary_baseline_rs42.pkl')
if binary_results is not None:
    models = list(binary_results.keys())
    print(f"Available models: {models}")

    # Get metrics for first model
    if models:
        first_model_metrics = get_model_metrics(models[0])
        print(f"\nMetrics for {models[0]}:")
        # Compare against None explicitly: truthiness would skip empty
        # containers and raises for array-like values.
        if first_model_metrics is not None:
            pprint(first_model_metrics)

# Show sample subjects
subjects = list_available_subjects()
if subjects:
    print(f"\nTotal subjects available: {len(subjects)}")
    print(f"Sample subject IDs: {subjects[:5]}")

    # Get data for first subject
    first_subject_data = get_subject_data(subjects[0])
    print(f"\nData for {subjects[0]}:")
    # Explicit None check for the same reason as above: the non-dict branch
    # below may receive array-like values whose truthiness is ambiguous.
    if first_subject_data is not None:
        if isinstance(first_subject_data, dict):
            print(f"Keys: {list(first_subject_data.keys())}")
        else:
            print(f"Type: {type(first_subject_data)}")

print(f"\n✅ Structure analysis complete!")
print(f"💡 You can now use the utility functions above to explore specific models or subjects.")
