In [1]:
# @title Cell 1: CASME II Multi-Frame Sampling Availability Verification

import os
import pandas as pd
import numpy as np
import json
from google.colab import drive
from pathlib import Path
from collections import defaultdict

# Mount Google Drive
print("=" * 75)
print("CASME II MULTI-FRAME SAMPLING VERIFICATION - PHASE 3")
print("=" * 75)
print("\n[1] Mounting Google Drive...")
drive.mount('/content/drive')
print("✓ Google Drive mounted successfully")

# Define paths
base_path = "/content/drive/MyDrive/RESEARCH-WORKSPACE/ACTIVE-PROJECTS/Thesis_MER_Project"
raw_path = f"{base_path}/datasets/raw/CASME2_RAW_selected"
metadata_path = f"{base_path}/datasets/metadata/CASME2-coding-20140508.xlsx"
phase1_split_metadata = f"{base_path}/datasets/processed_casme2/data_split/split_metadata.json"

print(f"\n[2] Loading Phase 1 split metadata for consistency...")
print(f"Phase 1 metadata: {phase1_split_metadata}")

try:
    with open(phase1_split_metadata, 'r') as f:
        phase1_splits = json.load(f)

    total_phase1_samples = sum(split['count'] for split in phase1_splits.values())
    print(f"✓ Phase 1 metadata loaded: {total_phase1_samples} total samples")
    print(f"  Train: {phase1_splits['train']['count']} samples")
    print(f"  Val: {phase1_splits['val']['count']} samples")
    print(f"  Test: {phase1_splits['test']['count']} samples")
except Exception as e:
    print(f"✗ Error loading Phase 1 metadata: {str(e)}")
    # Re-raise instead of calling exit(): in a notebook exit() targets the
    # kernel itself and hides the traceback, whereas re-raising stops the cell
    # at the real failure and keeps the session usable.
    raise

# Load CASME II metadata
print(f"\n[3] Loading CASME II metadata...")

try:
    df = pd.read_excel(metadata_path)

    # Clean ApexFrame column if needed: non-numeric entries become NaN and are
    # mapped to None when the sample mapping is built below.
    if df['ApexFrame'].dtype == 'object':
        df['ApexFrame'] = pd.to_numeric(df['ApexFrame'], errors='coerce')
        print(f"⚠ ApexFrame column contained non-numeric values, converted to numeric")

    print(f"✓ Metadata loaded: {len(df)} records")
    print(f"  Columns: Subject, Filename, OnsetFrame, ApexFrame, OffsetFrame, Estimated Emotion")

except Exception as e:
    print(f"✗ Error loading metadata: {str(e)}")
    # Same reasoning as above: surface the original exception to the reader.
    raise

# Create comprehensive sample mapping
print(f"\n[4] Creating comprehensive sample mapping from metadata...")

# Keyed by "<subject>_<sequence>", e.g. subject 1 becomes "sub01".
metadata_map = {}
for idx, row in df.iterrows():
    subject = f"sub{str(row['Subject']).zfill(2)}"
    sequence = row['Filename']
    sample_id = f"{subject}_{sequence}"

    metadata_map[sample_id] = {
        'subject': subject,
        'sequence': sequence,
        'emotion': row['Estimated Emotion'],
        'onset_frame': int(row['OnsetFrame']),
        # ApexFrame may be NaN after the numeric coercion above.
        'apex_frame': int(row['ApexFrame']) if pd.notna(row['ApexFrame']) else None,
        'offset_frame': int(row['OffsetFrame'])
    }

print(f"✓ Sample mapping created: {len(metadata_map)} samples indexed")

# Multi-frame window definition
print(f"\n[5] Defining multi-frame sampling windows...")

# Offsets are added to the key frame number to obtain the frames of a window.
WINDOW_CONFIG = {
    'onset': {'offsets': [0, 1, 2, 3], 'description': 'Forward: N+[0,1,2,3]'},
    'apex': {'offsets': [-2, -1, 0, 1, 2], 'description': 'Centered: N+[-2,-1,0,1,2]'},
    'offset': {'offsets': [-3, -2, -1, 0], 'description': 'Backward: N+[-3,-2,-1,0]'}
}

print(f"✓ Window configuration:")
for frame_type, config in WINDOW_CONFIG.items():
    print(f"  {frame_type.upper()}: {len(config['offsets'])} frames - {config['description']}")

total_frames_per_sample = sum(len(config['offsets']) for config in WINDOW_CONFIG.values())
print(f"  Total frames per sample: {total_frames_per_sample}")

# Frame availability verification function
def get_available_frames(sequence_path):
    """Return the ascending frame numbers of the image files in a sequence folder.

    Files are considered images when their name ends in .jpg, .jpeg or .png
    (case-insensitive). The frame number is formed from every digit found in
    the part of the file name before the first dot (e.g. img46.jpg -> 46).
    Files that yield no parsable number are ignored. A path that does not
    exist produces an empty list.
    """
    if not os.path.exists(sequence_path):
        return []

    image_suffixes = ('.jpg', '.jpeg', '.png')
    numbers = []

    for entry in os.listdir(sequence_path):
        if not entry.lower().endswith(image_suffixes):
            continue

        stem = entry.split('.')[0]
        digits = ''.join(ch for ch in stem if ch.isdigit())
        try:
            numbers.append(int(digits))
        except ValueError:
            # No usable digits in the name; skip this file.
            pass

    numbers.sort()
    return numbers

def verify_window_availability(key_frame, window_offsets, available_frames):
    """
    Split the frames of a window into those that exist and those that do not.

    The window is made of ``key_frame + offset`` for every entry of
    ``window_offsets``, in the given order.

    Returns: (available_frames_list, missing_frames_list, completeness_rate)
    where the rate is the fraction of requested frames that are available,
    or 0 when the window requests no frames.
    """
    present = []
    absent = []

    for delta in window_offsets:
        candidate = key_frame + delta
        if candidate in available_frames:
            present.append(candidate)
        else:
            absent.append(candidate)

    requested_count = len(present) + len(absent)
    if requested_count == 0:
        return present, absent, 0

    return present, absent, len(present) / requested_count

# Verify multi-frame availability for all splits
print(f"\n[6] Verifying multi-frame window availability...")
print(f"Strategy: Train=Multi-frame, Val/Test=Key-frames only")

# One list of per-sample verification records per split.
verification_results = {split_key: [] for split_key in ('train', 'val', 'test')}

# Training counters: sample-level tallies plus per-window-type tallies that
# also track how many frames the windows contain in total.
statistics = {
    'train': {
        'total_samples': 0,
        'perfect_samples': 0,
        'partial_samples': 0,
        'problematic_samples': 0,
        'frame_stats': {
            window_type: {'perfect': 0, 'partial': 0, 'missing': 0, 'total_frames': 0}
            for window_type in ('onset', 'apex', 'offset')
        }
    }
}

# Validation and test only track key-frame presence, so they carry fewer
# counters. Every split gets its own independent dictionaries.
statistics.update({
    split_key: {
        'total_samples': 0,
        'perfect_samples': 0,
        'partial_samples': 0,
        'frame_stats': {
            window_type: {'perfect': 0, 'partial': 0}
            for window_type in ('onset', 'apex', 'offset')
        }
    }
    for split_key in ('val', 'test')
})

for split_name, split_data in phase1_splits.items():
    print(f"\n  Verifying {split_name.upper()} set ({split_data['count']} samples)...")

    # Only the training split is checked against full multi-frame windows;
    # the other splits are checked for the three key frames only.
    is_train = (split_name == 'train')

    for sample in split_data['samples']:
        sample_id = sample['sample_id']

        # Skip if sample not in metadata
        if sample_id not in metadata_map:
            print(f"  ✗ Sample {sample_id} not found in metadata")
            continue

        meta = metadata_map[sample_id]
        sequence_path = os.path.join(raw_path, meta['subject'], meta['sequence'])

        # Get available frames in sequence
        available_frames = get_available_frames(sequence_path)

        if not available_frames:
            print(f"  ✗ No frames found for {sample_id}")
            continue

        statistics[split_name]['total_samples'] += 1

        # Prepare sample verification result
        sample_result = {
            'sample_id': sample_id,
            'subject': meta['subject'],
            'sequence': meta['sequence'],
            'emotion': meta['emotion'],
            'available_frame_range': [min(available_frames), max(available_frames)],
            'total_available_frames': len(available_frames),
            'windows': {}
        }

        # Key frame per window type. A missing apex annotation (None) falls
        # back to the onset frame; the check is explicit so that a legitimate
        # frame number of 0 is not mistaken for "missing".
        key_frame_map = {
            'onset': meta['onset_frame'],
            'apex': meta['apex_frame'] if meta['apex_frame'] is not None else meta['onset_frame'],
            'offset': meta['offset_frame']
        }

        # Sample-level flags, driven by the worst window of the sample.
        sample_all_perfect = True
        sample_has_problematic = False

        # Verify each frame type window
        for frame_type in ['onset', 'apex', 'offset']:
            key_frame = key_frame_map[frame_type]
            window_offsets = WINDOW_CONFIG[frame_type]['offsets']
            frame_stats = statistics[split_name]['frame_stats'][frame_type]

            if is_train:
                # Multi-frame verification for training
                available_list, missing_list, completeness = verify_window_availability(
                    key_frame, window_offsets, available_frames
                )

                if completeness == 1.0:
                    window_status = 'perfect'
                elif completeness >= 0.75:
                    window_status = 'partial'
                else:
                    window_status = 'problematic'

                sample_result['windows'][frame_type] = {
                    'key_frame': key_frame,
                    'window_offsets': window_offsets,
                    'requested_frames': [key_frame + offset for offset in window_offsets],
                    'available_frames': available_list,
                    'missing_frames': missing_list,
                    'completeness': completeness,
                    'status': window_status
                }

                # Update window-level statistics
                if window_status == 'perfect':
                    frame_stats['perfect'] += 1
                elif window_status == 'partial':
                    frame_stats['partial'] += 1
                    sample_all_perfect = False
                else:
                    frame_stats['missing'] += 1
                    sample_all_perfect = False
                    sample_has_problematic = True

                frame_stats['total_frames'] += len(available_list)

            else:
                # Key-frame only verification for val/test
                is_available = key_frame in available_frames

                sample_result['windows'][frame_type] = {
                    'key_frame': key_frame,
                    'is_available': is_available,
                    'status': 'perfect' if is_available else 'missing'
                }

                if is_available:
                    frame_stats['perfect'] += 1
                else:
                    # For val/test the 'partial' counter holds missing key frames.
                    frame_stats['partial'] += 1
                    sample_all_perfect = False

        # Update sample-level statistics. The three categories are mutually
        # exclusive: a sample with any window below 75% counts as problematic
        # only, not as partial too, so perfect + partial + problematic equals
        # total_samples. Val/test samples never set the problematic flag, so a
        # sample with a missing key frame is counted as partial there.
        split_counters = statistics[split_name]
        if sample_all_perfect:
            split_counters['perfect_samples'] += 1
        elif sample_has_problematic:
            split_counters['problematic_samples'] += 1
        else:
            split_counters['partial_samples'] += 1

        verification_results[split_name].append(sample_result)

    print(f"  ✓ {split_name.upper()} verification completed")

# Display comprehensive statistics
print(f"\n[7] Multi-Frame Sampling Availability Statistics:")
print(f"=" * 75)


def _percent(part, whole):
    """Return ``part / whole`` as a percentage, or 0.0 when ``whole`` is zero.

    A split with no verified samples would otherwise abort the report (and the
    export that follows it) with a ZeroDivisionError.
    """
    return part / whole * 100 if whole else 0.0


print(f"\nTRAIN SET (Multi-Frame Sampling):")
train_stats = statistics['train']
print(f"  Total samples verified: {train_stats['total_samples']}")
print(f"  ✓ Perfect (all windows 100%): {train_stats['perfect_samples']} ({_percent(train_stats['perfect_samples'], train_stats['total_samples']):.1f}%)")
print(f"  ⚠ Partial (some windows 75-99%): {train_stats['partial_samples']} ({_percent(train_stats['partial_samples'], train_stats['total_samples']):.1f}%)")
print(f"  ✗ Problematic (windows <75%): {train_stats['problematic_samples']} ({_percent(train_stats['problematic_samples'], train_stats['total_samples']):.1f}%)")

print(f"\n  Frame-wise availability (Multi-Frame Windows):")
for frame_type, stats in train_stats['frame_stats'].items():
    # Number of windows of this type that were classified at all.
    total = stats['perfect'] + stats['partial'] + stats['missing']
    print(f"    {frame_type.upper()} windows ({len(WINDOW_CONFIG[frame_type]['offsets'])} frames each):")
    print(f"      Perfect windows: {stats['perfect']}/{total} ({_percent(stats['perfect'], total):.1f}%)")
    print(f"      Partial windows: {stats['partial']}/{total} ({_percent(stats['partial'], total):.1f}%)")
    print(f"      Problematic: {stats['missing']}/{total} ({_percent(stats['missing'], total):.1f}%)")
    print(f"      Total frames extracted: {stats['total_frames']}")

# Both names are read again by the export and readiness sections.
val_stats = statistics['val']
test_stats = statistics['test']

# Validation and test share one report layout; only the heading differs.
for set_title, split_stats in (("VALIDATION", val_stats), ("TEST", test_stats)):
    print(f"\n{set_title} SET (Key-Frames Only):")
    print(f"  Total samples verified: {split_stats['total_samples']}")
    print(f"  ✓ Perfect (all key frames available): {split_stats['perfect_samples']} ({_percent(split_stats['perfect_samples'], split_stats['total_samples']):.1f}%)")
    print(f"  ⚠ Partial (some key frames missing): {split_stats['partial_samples']}")

    print(f"\n  Key-frame availability:")
    for frame_type, stats in split_stats['frame_stats'].items():
        # For these splits 'partial' counts key frames that were not found.
        total = stats['perfect'] + stats['partial']
        print(f"    {frame_type.upper()}: {stats['perfect']}/{total} available ({_percent(stats['perfect'], total):.1f}%)")

# Show sample problematic cases if any
if train_stats['problematic_samples'] > 0:
    print(f"\n[8] Problematic Training Samples Analysis (windows <75% complete):")

    window_types = ['onset', 'apex', 'offset']

    # Training samples with at least one window below the 75% threshold.
    flagged_samples = [
        entry for entry in verification_results['train']
        if any(entry['windows'][ft]['completeness'] < 0.75 for ft in window_types)
    ]

    # Only the first five are printed in detail.
    for entry in flagged_samples[:5]:
        first_frame, last_frame = entry['available_frame_range'][0], entry['available_frame_range'][1]
        print(f"\n  Sample: {entry['sample_id']}")
        print(f"    Available frames: {first_frame} to {last_frame} ({entry['total_available_frames']} frames)")
        for ft in window_types:
            window = entry['windows'][ft]
            print(f"    {ft.upper()}: {window['completeness']*100:.0f}% complete - Missing: {window['missing_frames']}")

    if train_stats['problematic_samples'] > 5:
        print(f"\n  ... and {train_stats['problematic_samples'] - 5} more problematic samples")

# Export verification results
print(f"\n[9] Exporting verification results...")

output_path = f"{base_path}/datasets/processed_casme2"
verification_file = f"{output_path}/multi_frame_verification.json"

# Calculate expected dataset sizes.
# Train: every frame found inside the verified windows.
expected_train_frames = sum(
    train_stats['frame_stats'][kind]['total_frames'] for kind in ('onset', 'apex', 'offset')
)

# Val/test: one image per key frame that was found on disk.
expected_val_frames = sum(
    val_stats['frame_stats'][kind]['perfect'] for kind in ('onset', 'apex', 'offset')
)
expected_test_frames = sum(
    test_stats['frame_stats'][kind]['perfect'] for kind in ('onset', 'apex', 'offset')
)

expected_dataset_sizes = {
    'train': {
        'samples': train_stats['total_samples'],
        'total_frames': expected_train_frames,
        'frames_per_sample': total_frames_per_sample
    },
    'val': {
        'samples': val_stats['total_samples'],
        'total_frames': expected_val_frames,
        'frames_per_sample': 3
    },
    'test': {
        'samples': test_stats['total_samples'],
        'total_frames': expected_test_frames,
        'frames_per_sample': 3
    }
}

# Everything the extraction cell reads back from disk.
verification_export = {
    'verification_date': pd.Timestamp.now().isoformat(),
    'phase': 'Phase 3 - Multi-Frame Sampling',
    'strategy': {
        'train': 'multi_frame_windows',
        'val': 'key_frames_only',
        'test': 'key_frames_only'
    },
    'window_configuration': WINDOW_CONFIG,
    'statistics': statistics,
    'expected_dataset_sizes': expected_dataset_sizes,
    'split_results': verification_results
}

with open(verification_file, 'w') as f:
    json.dump(verification_export, f, indent=2)

print(f"✓ Verification results saved to: multi_frame_verification.json")

# Final readiness assessment
print(f"\n[10] Phase 3 Dataset Readiness Assessment:")
print(f"=" * 75)

# A training sample is ready when none of its windows is problematic (<75%).
# Deriving this from the problematic count keeps the figure meaningful even if
# the partial counter also includes problematic samples; summing perfect and
# partial would then always give 100%.
train_total = train_stats['total_samples']
train_ready = train_total - train_stats['problematic_samples']

# An empty split is reported as 0% ready instead of raising ZeroDivisionError.
train_readiness = train_ready / train_total * 100 if train_total else 0.0
val_readiness = val_stats['perfect_samples'] / val_stats['total_samples'] * 100 if val_stats['total_samples'] else 0.0
test_readiness = test_stats['perfect_samples'] / test_stats['total_samples'] * 100 if test_stats['total_samples'] else 0.0

# Unweighted mean of the three per-split percentages.
overall_readiness = (train_readiness + val_readiness + test_readiness) / 3

if overall_readiness >= 95:
    status = "READY"
    message = "Dataset ready for multi-frame extraction"
elif overall_readiness >= 90:
    status = "READY WITH MINOR ISSUES"
    message = f"{train_stats['problematic_samples']} training samples may need fallback strategy"
else:
    status = "NEEDS ATTENTION"
    message = "Significant frame availability issues detected"

print(f"Readiness Status: {status}")
print(f"Overall Readiness: {overall_readiness:.1f}%")
print(f"Assessment: {message}")

print(f"\n✓ Expected Phase 3 dataset composition:")
print(f"  TRAIN (Multi-Frame):")
print(f"    Samples: {train_stats['total_samples']}")
print(f"    Total frames: {expected_train_frames}")
print(f"    Breakdown: {train_stats['frame_stats']['onset']['total_frames']} onset + {train_stats['frame_stats']['apex']['total_frames']} apex + {train_stats['frame_stats']['offset']['total_frames']} offset")
print(f"\n  VAL (Key-Frames):")
print(f"    Samples: {val_stats['total_samples']}")
print(f"    Total frames: {expected_val_frames} (3 per sample)")
print(f"\n  TEST (Key-Frames):")
print(f"    Samples: {test_stats['total_samples']}")
print(f"    Total frames: {expected_test_frames} (3 per sample)")
print(f"\n  GRAND TOTAL: {expected_train_frames + expected_val_frames + expected_test_frames} frames")

print(f"\n✓ Next steps:")
print(f"  - Review verification results in multi_frame_verification.json")
print(f"  - Proceed to Cell 2: Multi-frame extraction and dataset preparation")
if train_stats['problematic_samples'] > 0:
    print(f"  - Cell 2 will apply fallback strategy for {train_stats['problematic_samples']} problematic samples")
else:
    print(f"  - No fallback strategy needed (all windows sufficient)")

print("=" * 75)

CASME II MULTI-FRAME SAMPLING VERIFICATION - PHASE 3

[1] Mounting Google Drive...
Mounted at /content/drive
✓ Google Drive mounted successfully

[2] Loading Phase 1 split metadata for consistency...
Phase 1 metadata: /content/drive/MyDrive/RESEARCH-WORKSPACE/ACTIVE-PROJECTS/Thesis_MER_Project/datasets/processed_casme2/data_split/split_metadata.json
✓ Phase 1 metadata loaded: 255 total samples
  Train: 201 samples
  Val: 26 samples
  Test: 28 samples

[3] Loading CASME II metadata...
⚠ ApexFrame column contained non-numeric values, converted to numeric
✓ Metadata loaded: 255 records
  Columns: Subject, Filename, OnsetFrame, ApexFrame, OffsetFrame, Estimated Emotion

[4] Creating comprehensive sample mapping from metadata...
✓ Sample mapping created: 255 samples indexed

[5] Defining multi-frame sampling windows...
✓ Window configuration:
  ONSET: 4 frames - Forward: N+[0,1,2,3]
  APEX: 5 frames - Centered: N+[-2,-1,0,1,2]
  OFFSET: 4 frames - Backward: N+[-3,-2,-1,0]
  Total frames per

In [2]:
# @title Cell 2: CASME II Multi-Frame Sampling Extraction and Dataset Preparation

import os
import shutil
import json
import pandas as pd
from google.colab import drive
from pathlib import Path

# Mount Google Drive
print("=" * 75)
print("CASME II MULTI-FRAME SAMPLING DATASET PREPARATION - PHASE 3")
print("=" * 75)
print("\n[1] Mounting Google Drive...")
drive.mount('/content/drive')
print("✓ Google Drive mounted successfully")

# Define paths
base_path = "/content/drive/MyDrive/RESEARCH-WORKSPACE/ACTIVE-PROJECTS/Thesis_MER_Project"
processed_path = f"{base_path}/datasets/processed_casme2"
verification_file = f"{processed_path}/multi_frame_verification.json"

print(f"\n[2] Setting up Phase 3 directory structure...")
print(f"Target: data_split_v3 (Multi-Frame for Train, Key-Frames for Val/Test)")

# Create data_split_v3 directory structure
data_split_v3_path = f"{processed_path}/data_split_v3"
directories = [
    f"{data_split_v3_path}/train",
    f"{data_split_v3_path}/val",
    f"{data_split_v3_path}/test"
]

for directory in directories:
    # exist_ok=True makes re-running the cell safe when the folders exist.
    os.makedirs(directory, exist_ok=True)
    print(f"✓ Created directory: {directory}")

# Load verification results
print(f"\n[3] Loading verification results from Cell 1...")

try:
    with open(verification_file, 'r') as f:
        verification_data = json.load(f)

    print(f"✓ Verification data loaded")
    print(f"  Strategy: Train=Multi-frame, Val/Test=Key-frames")
    print(f"  Train samples: {verification_data['statistics']['train']['total_samples']}")
    print(f"  Expected train frames: {verification_data['expected_dataset_sizes']['train']['total_frames']}")
    print(f"  Problematic samples: {verification_data['statistics']['train']['problematic_samples']}")

except Exception as e:
    print(f"✗ Error loading verification data: {str(e)}")
    # Re-raise instead of calling exit(): in a notebook exit() targets the
    # kernel itself and hides the traceback, whereas re-raising stops the cell
    # at the real failure and keeps the session usable.
    raise

# Extract multi-frame sequences and copy to split directories
print(f"\n[4] Extracting multi-frame sequences with fallback strategy...")
print(f"Fallback: Missing frames will use nearest available frame (duplication)")

# Number of images copied per split and frame type; the training split also
# counts how many requested frames were served by a substitute frame.
copy_stats = {
    'train': {'onset': 0, 'apex': 0, 'offset': 0, 'duplicated_frames': 0},
    'val': {'onset': 0, 'apex': 0, 'offset': 0},
    'test': {'onset': 0, 'apex': 0, 'offset': 0}
}

# Human-readable messages for every frame that could not be copied.
copy_errors = []
metadata_v3 = {
    'train': {'count': 0, 'samples': [], 'class_distribution': {}},
    'val': {'count': 0, 'samples': [], 'class_distribution': {}},
    'test': {'count': 0, 'samples': [], 'class_distribution': {}}
}

for split_name, samples in verification_data['split_results'].items():
    print(f"\n  Processing {split_name.upper()} set ({len(samples)} samples)...")

    split_dir = f"{data_split_v3_path}/{split_name}"
    is_train = (split_name == 'train')

    # Shortcuts to this split's counters and output records.
    split_counts = copy_stats[split_name]
    split_records = metadata_v3[split_name]['samples']

    for sample in samples:
        sample_id = sample['sample_id']
        emotion = sample['emotion']
        subject = sample['subject']
        sequence = sample['sequence']

        # Folder holding the raw frames of this sequence.
        sequence_dir = os.path.join(
            base_path, 'datasets/raw/CASME2_RAW_selected', subject, sequence
        )

        # Process each frame type
        for frame_type in ['onset', 'apex', 'offset']:
            window = sample['windows'][frame_type]
            key_frame = window['key_frame']

            if not is_train:
                # Key-frame only extraction for val/test
                if not window['is_available']:
                    copy_errors.append(f"Key frame missing: {sample_id} {frame_type}")
                    continue

                source_path = os.path.join(sequence_dir, f"img{key_frame}.jpg")

                # Destination filename: {sample_id}_{frame_type}_{emotion}.jpg
                dest_filename = f"{sample_id}_{frame_type}_{emotion}.jpg"
                dest_path = os.path.join(split_dir, dest_filename)

                try:
                    if not os.path.exists(source_path):
                        copy_errors.append(f"Source not found: {source_path}")
                    else:
                        shutil.copy2(source_path, dest_path)
                        split_counts[frame_type] += 1

                        # Add to metadata
                        split_records.append({
                            'sample_id': f"{sample_id}_{frame_type}",
                            'original_sample_id': sample_id,
                            'frame_type': frame_type,
                            'key_frame': key_frame,
                            'subject': subject,
                            'sequence': sequence,
                            'emotion': emotion,
                            'image_filename': dest_filename
                        })
                except Exception as e:
                    copy_errors.append(f"Copy error for {sample_id} {frame_type}: {str(e)}")

                continue

            # Multi-frame extraction for training.
            # in_window holds the frames of this window that verification found.
            in_window = window['available_frames']

            # Map every requested frame to the frame that will be copied for it.
            frame_paths = {}
            for req_frame in window['requested_frames']:
                if req_frame in in_window:
                    # Frame available, use it
                    frame_paths[req_frame] = req_frame
                elif in_window:
                    # Frame missing, use the nearest frame available in this window
                    frame_paths[req_frame] = min(in_window, key=lambda x: abs(x - req_frame))
                    split_counts['duplicated_frames'] += 1
                else:
                    copy_errors.append(f"No available frames for {sample_id} {frame_type}")

            # Extract and copy frames
            for req_frame, actual_frame in frame_paths.items():
                # Position of the requested frame relative to the key frame
                position_offset = req_frame - key_frame

                source_path = os.path.join(sequence_dir, f"img{actual_frame}.jpg")

                # Destination filename: {sample_id}_{frame_type}_p{offset}_{emotion}.jpg
                # The "+" format flag writes an explicit sign for every offset.
                offset_str = f"p{position_offset:+d}"
                dest_filename = f"{sample_id}_{frame_type}_{offset_str}_{emotion}.jpg"
                dest_path = os.path.join(split_dir, dest_filename)

                try:
                    if not os.path.exists(source_path):
                        copy_errors.append(f"Source not found: {source_path}")
                    else:
                        shutil.copy2(source_path, dest_path)
                        split_counts[frame_type] += 1

                        # Add to metadata
                        split_records.append({
                            'sample_id': f"{sample_id}_{frame_type}_{offset_str}",
                            'original_sample_id': sample_id,
                            'frame_type': frame_type,
                            'frame_offset': position_offset,
                            'key_frame': key_frame,
                            'actual_frame': actual_frame,
                            'requested_frame': req_frame,
                            'is_duplicated': (req_frame != actual_frame),
                            'subject': subject,
                            'sequence': sequence,
                            'emotion': emotion,
                            'image_filename': dest_filename
                        })
                except Exception as e:
                    copy_errors.append(f"Copy error for {sample_id} {frame_type} frame {req_frame}: {str(e)}")

    # Update split statistics
    total_frames = sum(copy_stats[split_name].get(ft, 0) for ft in ['onset', 'apex', 'offset'])
    metadata_v3[split_name]['count'] = total_frames

    print(f"  ✓ {split_name.upper()} extraction completed:")
    if is_train:
        print(f"    Onset frames: {copy_stats[split_name]['onset']}")
        print(f"    Apex frames: {copy_stats[split_name]['apex']}")
        print(f"    Offset frames: {copy_stats[split_name]['offset']}")
        print(f"    Duplicated frames (fallback): {copy_stats[split_name]['duplicated_frames']}")
        print(f"    Total: {total_frames} images")
    else:
        print(f"    Key frames: {total_frames} images ({copy_stats[split_name]['onset']} onset + {copy_stats[split_name]['apex']} apex + {copy_stats[split_name]['offset']} offset)")

# Calculate class distribution per split
print(f"\n[5] Calculating class distribution for each split...")

for split_name in ['train', 'val', 'test']:
    split_meta = metadata_v3[split_name]

    # Tally copied images per emotion label, in first-seen order.
    emotion_counts = {}
    for sample in split_meta['samples']:
        emotion = sample['emotion']
        if emotion in emotion_counts:
            emotion_counts[emotion] += 1
        else:
            emotion_counts[emotion] = 1

    split_meta['class_distribution'] = emotion_counts

    print(f"\n  {split_name.upper()} set distribution ({split_meta['count']} images):")

    # Most frequent emotion first; ties keep their first-seen order.
    ranked = sorted(emotion_counts.items(), key=lambda pair: pair[1], reverse=True)
    for emotion, count in ranked:
        percentage = (count / split_meta['count']) * 100
        print(f"    {emotion}: {count} images ({percentage:.1f}%)")

# Display copy errors if any
if copy_errors:
    print(f"\n[6] Copy Errors Summary:")
    print(f"  Total errors: {len(copy_errors)}")

    # Show at most the first five messages, then summarise the remainder.
    for error in copy_errors[:5]:
        print(f"    {error}")

    if len(copy_errors) > 5:
        print(f"    ... and {len(copy_errors) - 5} more errors")

# Save enhanced metadata
print(f"\n[7] Saving enhanced metadata with frame tracking...")

metadata_file_v3 = f"{data_split_v3_path}/split_metadata_v3.json"

# Sampling strategy applied to each split.
split_strategy = {
    'train': 'multi_frame_windows',
    'val': 'key_frames_only',
    'test': 'key_frames_only'
}

# Top-level document: dataset description plus the per-split image records.
metadata_export = {
    'dataset': 'CASME2_MultiFrameSampling',
    'phase': 'Phase 3',
    'strategy': split_strategy,
    'window_configuration': verification_data['window_configuration'],
    'creation_date': pd.Timestamp.now().isoformat(),
    'splits': metadata_v3
}

with open(metadata_file_v3, 'w') as f:
    json.dump(metadata_export, f, indent=2)

print(f"✓ Enhanced metadata saved to: data_split_v3/split_metadata_v3.json")

# Save processing summary
print(f"\n[8] Generating processing summary...")

# Images copied per split, summed over the three frame types.
images_per_split = {
    name: sum(copy_stats[name].get(ft, 0) for ft in ['onset', 'apex', 'offset'])
    for name in ['train', 'val', 'test']
}

total_copied = sum(images_per_split.values())

# Image count predicted by the verification cell.
expected_total = sum(
    verification_data['expected_dataset_sizes'][name]['total_frames']
    for name in ['train', 'val', 'test']
)

# Samples that passed verification across all splits.
verified_sample_total = sum(
    verification_data['statistics'][name]['total_samples']
    for name in ['train', 'val', 'test']
)

train_images = images_per_split['train']
train_duplicates = copy_stats['train']['duplicated_frames']

# Share of copied training images relative to fallback substitutions, in percent.
if train_images > 0:
    train_duplication_rate = train_duplicates / train_images * 100
else:
    train_duplication_rate = 0

processing_summary_v3 = {
    'dataset': 'CASME2_MultiFrameSampling',
    'phase': 'Phase 3',
    'processing_date': pd.Timestamp.now().isoformat(),
    'source_phase1_samples': verified_sample_total,
    'extraction_strategy': {
        'train': 'multi_frame_windows_with_fallback',
        'val': 'key_frames_only',
        'test': 'key_frames_only',
        'fallback_method': 'nearest_frame_duplication'
    },
    'copy_statistics': {
        'train': {
            'total_images': train_images,
            'frame_breakdown': {
                'onset': copy_stats['train']['onset'],
                'apex': copy_stats['train']['apex'],
                'offset': copy_stats['train']['offset']
            },
            'duplicated_frames': train_duplicates,
            'duplication_rate': train_duplication_rate
        },
        'val': {
            'total_images': images_per_split['val'],
            'frame_breakdown': copy_stats['val']
        },
        'test': {
            'total_images': images_per_split['test'],
            'frame_breakdown': copy_stats['test']
        }
    },
    'total_images_copied': total_copied,
    'expected_images': expected_total,
    'success_rate': (total_copied / expected_total * 100) if expected_total > 0 else 0,
    'copy_errors': len(copy_errors),
    'class_preservation': {
        'train': metadata_v3['train']['class_distribution'],
        'val': metadata_v3['val']['class_distribution'],
        'test': metadata_v3['test']['class_distribution']
    }
}

summary_file_v3 = f"{data_split_v3_path}/processing_summary_v3.json"

with open(summary_file_v3, 'w') as f:
    json.dump(processing_summary_v3, f, indent=2)

print(f"✓ Processing summary saved to: data_split_v3/processing_summary_v3.json")

# [9] Re-scan the written split folders and report what is actually on disk.
print("\n[9] Final validation of data_split_v3 structure...")

frame_types = ['onset', 'apex', 'offset']
_image_suffixes = ('.jpg', '.jpeg', '.png')
validation_results = {}

for split_name in ['train', 'val', 'test']:
    split_dir = f"{data_split_v3_path}/{split_name}"

    # Missing folder: record an empty result and move on to the next split.
    if not os.path.exists(split_dir):
        validation_results[split_name] = {
            'directory_exists': False,
            'total_images': 0,
            'frame_breakdown': {},
            'sample_files': []
        }
        continue

    image_files = [entry for entry in os.listdir(split_dir)
                   if entry.lower().endswith(_image_suffixes)]

    # A file is attributed to the first frame type whose "_<type>_" tag occurs in its name;
    # files carrying none of the tags are counted in the total only.
    frame_counts = dict.fromkeys(frame_types, 0)
    for img_file in image_files:
        matched_type = next((ft for ft in frame_types if f"_{ft}_" in img_file), None)
        if matched_type is not None:
            frame_counts[matched_type] += 1

    validation_results[split_name] = {
        'directory_exists': True,
        'total_images': len(image_files),
        'frame_breakdown': frame_counts,
        'sample_files': image_files[:3]
    }

print("✓ Structure validation:")
for split_name, results in validation_results.items():
    split_has_images = results['directory_exists'] and results['total_images'] > 0
    status = "✓" if split_has_images else "✗"
    print(f"  {status} {split_name}: {results['total_images']} images")

    breakdown = results['frame_breakdown']
    if breakdown:
        print(f"    Onset: {breakdown['onset']}, "
              f"Apex: {breakdown['apex']}, "
              f"Offset: {breakdown['offset']}")

    samples = results['sample_files']
    if samples:
        print(f"    Sample: {samples[0]}")

# Final summary: console report of the finished Phase 3 dataset.
# All values are gathered first; the report is then emitted line by line from a single list.
success_rate = processing_summary_v3['success_rate']
status = "SUCCESS" if success_rate >= 99 else "PARTIAL"

_rule = "=" * 75
_report_splits = ('train', 'val', 'test')
_sample_totals = {
    name: verification_data['statistics'][name]['total_samples']
    for name in _report_splits
}
_frame_totals = {
    name: sum(copy_stats[name].get(ft, 0) for ft in ['onset', 'apex', 'offset'])
    for name in _report_splits
}
_train_copy = copy_stats['train']
_duplicated = _train_copy['duplicated_frames']
_duplication_rate = processing_summary_v3['copy_statistics']['train']['duplication_rate']

_report_lines = [
    "\n" + _rule,
    "PHASE 3 MULTI-FRAME DATASET PREPARATION SUMMARY",
    _rule,
    f"Processing Status: {status}",
    f"Total images copied: {total_copied}/{expected_total} ({success_rate:.1f}%)",
    "Dataset composition: Multi-frame train + Key-frame val/test",
    "\n✓ Split distribution:",
    "  TRAIN (Multi-Frame):",
    f"    Samples: {_sample_totals['train']}",
    f"    Total frames: {_frame_totals['train']}",
    f"    Breakdown: {_train_copy['onset']} onset + {_train_copy['apex']} apex + {_train_copy['offset']} offset",
    f"    Duplicated frames: {_duplicated} ({_duplication_rate:.2f}%)",
    "\n  VAL (Key-Frames):",
    f"    Samples: {_sample_totals['val']}",
    f"    Total frames: {_frame_totals['val']} (3 per sample)",
    "\n  TEST (Key-Frames):",
    f"    Samples: {_sample_totals['test']}",
    f"    Total frames: {_frame_totals['test']} (3 per sample)",
    "\n✓ Data quality:",
    f"  Fallback strategy applied: {_duplicated} frames duplicated",
    f"  Duplication rate: {_duplication_rate:.2f}% (acceptable threshold)",
    f"  Copy errors: {len(copy_errors)}",
    "  Split consistency: Maintained from Phase 1",
    "\n✓ Output files:",
    "  - Dataset: data_split_v3/train|val|test/",
    "  - Metadata: data_split_v3/split_metadata_v3.json",
    "  - Summary: data_split_v3/processing_summary_v3.json",
    "\n✓ Next steps:",
    "  - Phase 3 dataset ready for model training with temporal augmentation",
    "  - Train: 13 frames per sample (onset×4 + apex×5 + offset×4)",
    "  - Val/Test: 3 key frames per sample for fair evaluation",
    "  - Proceed to Phase 3 model experiments with multi-frame input",
    _rule,
]
print(*_report_lines, sep="\n")

CASME II MULTI-FRAME SAMPLING DATASET PREPARATION - PHASE 3

[1] Mounting Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✓ Google Drive mounted successfully

[2] Setting up Phase 3 directory structure...
Target: data_split_v3 (Multi-Frame for Train, Key-Frames for Val/Test)
✓ Created directory: /content/drive/MyDrive/RESEARCH-WORKSPACE/ACTIVE-PROJECTS/Thesis_MER_Project/datasets/processed_casme2/data_split_v3/train
✓ Created directory: /content/drive/MyDrive/RESEARCH-WORKSPACE/ACTIVE-PROJECTS/Thesis_MER_Project/datasets/processed_casme2/data_split_v3/val
✓ Created directory: /content/drive/MyDrive/RESEARCH-WORKSPACE/ACTIVE-PROJECTS/Thesis_MER_Project/datasets/processed_casme2/data_split_v3/test

[3] Loading verification results from Cell 1...
✓ Verification data loaded
  Strategy: Train=Multi-frame, Val/Test=Key-frames
  Train samples: 201
  Expected train frames: 2603
  Problematic sample

In [4]:
# @title Cell 3: CASME II Multi-Frame Sampling Visualization

import os
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import drive
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Figure styling shared by every plot in this cell: start from matplotlib's defaults,
# then override typography and strip the top/right spines and the grid.
plt.style.use('default')

_font_settings = {
    'font.family': 'DejaVu Sans',
    'font.size': 12,
    'axes.titlesize': 16,
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    'legend.fontsize': 11,
    'figure.titlesize': 18,
}
_canvas_settings = {
    'axes.spines.top': False,
    'axes.spines.right': False,
    'axes.grid': False,
    'figure.facecolor': 'white',
    'axes.facecolor': 'white',
}
plt.rcParams.update({**_font_settings, **_canvas_settings})

def convert_to_serializable(obj):
    """Recursively convert numpy/pandas values into plain Python objects that ``json.dump`` accepts.

    Containers are walked recursively; scalars are mapped as follows:

    * dict           -> dict (numpy scalar keys are unwrapped to native Python keys)
    * list / tuple   -> list
    * numpy.ndarray  -> (nested) list whose elements follow the same rules
    * missing scalar (None, NaN, pd.NA, pd.NaT, np.float64('nan')) -> None
    * numpy.bool_    -> bool
    * numpy.integer  -> int
    * numpy.floating -> float

    Any other object is returned unchanged.
    """
    if isinstance(obj, dict):
        # json.dump rejects numpy scalars as keys, so unwrap them as well as the values.
        return {
            (key.item() if isinstance(key, np.generic) else key): convert_to_serializable(value)
            for key, value in obj.items()
        }
    if isinstance(obj, (list, tuple)):
        return [convert_to_serializable(item) for item in obj]
    if isinstance(obj, np.ndarray):
        # tolist() yields native scalars but keeps NaN as float('nan'); recurse so those become None.
        return convert_to_serializable(obj.tolist())
    # The missing-value check runs before the numeric branches so that numpy NaN scalars map to
    # None exactly like Python float NaN does. The is_scalar guard is required because pd.isna
    # returns an array for array-likes (e.g. a Series), whose truth value is ambiguous.
    if pd.api.types.is_scalar(obj) and pd.isna(obj):
        return None
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    return obj

# Color palettes (hex codes): one color per emotion class and one per dataset split.
EMOTION_COLORS = dict(
    others='#1f77b4',
    disgust='#d62728',
    happiness='#2ca02c',
    repression='#9467bd',
    surprise='#ff7f0e',
    sadness='#8c564b',
    fear='#e377c2',
)

SPLIT_COLORS = dict(
    raw='#1f77b4',
    train='#ff7f0e',
    val='#d62728',
    test='#2ca02c',
)

# Banner for this cell's console output.
_banner_rule = "=" * 80
for _banner_line in (_banner_rule,
                     "CASME II MULTI-FRAME SAMPLING VISUALIZATION - PHASE 3",
                     _banner_rule):
    print(_banner_line)

# [1] Mount Google Drive so the project folders below are reachable.
print("\n[1] Environment setup and drive mounting...")
drive.mount('/content/drive')
print("✓ Google Drive mounted successfully")

# Project locations: raw annotation sheet, Phase 3 split folder with its metadata file,
# and the folder that receives the figures produced by this cell.
base_path = "/content/drive/MyDrive/RESEARCH-WORKSPACE/ACTIVE-PROJECTS/Thesis_MER_Project"
metadata_path = f"{base_path}/datasets/metadata/CASME2-coding-20140508.xlsx"
data_split_v3_path = f"{base_path}/datasets/processed_casme2/data_split_v3"
split_metadata_v3_path = f"{data_split_v3_path}/split_metadata_v3.json"
visualization_path = f"{base_path}/datasets/visualization/03_casme2-mfs"

# exist_ok=True makes re-running the cell safe when the folder is already there.
os.makedirs(visualization_path, exist_ok=True)
print(f"✓ Output directory created: {visualization_path}")

print("\n[2] Loading raw metadata and Phase 3 data...")

# Load the raw CASME II annotation sheet; it is the baseline for the comparisons below.
try:
    raw_metadata = pd.read_excel(metadata_path)
    # When ApexFrame was read as object dtype, coerce it to numeric (unparseable entries become NaN).
    if raw_metadata['ApexFrame'].dtype == 'object':
        raw_metadata['ApexFrame'] = pd.to_numeric(raw_metadata['ApexFrame'], errors='coerce')
    print(f"✓ Raw metadata loaded: {len(raw_metadata)} samples")
except Exception as e:
    print(f"✗ Error loading raw metadata: {str(e)}")
    # Re-raise instead of calling exit(): inside a notebook kernel exit() only requests a kernel
    # shutdown and does not stop the running cell, so the code below would continue with
    # `raw_metadata` undefined. Re-raising halts the cell here and keeps the kernel alive.
    raise

# Load the Phase 3 split metadata (JSON) describing the multi-frame dataset.
try:
    with open(split_metadata_v3_path, 'r') as f:
        metadata_v3 = json.load(f)

    print(f"✓ Phase 3 metadata loaded")
    print(f"  Dataset: {metadata_v3['dataset']}")
    print(f"  Strategy: Train={metadata_v3['strategy']['train']}, Val/Test={metadata_v3['strategy']['val']}")

except Exception as e:
    print(f"✗ Error loading metadata: {str(e)}")
    # Same reasoning as above: stop the cell with the original traceback rather than exit().
    raise

print("\n[3] File integrity validation...")

# Compare the number of image files in each split folder with the count recorded in the
# Phase 3 metadata. A missing folder or a count mismatch is reported with ✗ and also turns
# the closing summary line into ✗, so a broken dataset cannot pass unnoticed.
splits_info = metadata_v3['splits']
total_actual_files = 0
integrity_ok = True

for split_name in ['train', 'val', 'test']:
    split_dir = f"{data_split_v3_path}/{split_name}"
    expected_count = splits_info[split_name]['count']

    if not os.path.isdir(split_dir):
        # Previously this case was skipped without any message.
        integrity_ok = False
        print(f"  ✗ {split_name.upper()}: directory not found ({split_dir}), expected {expected_count} images")
        continue

    image_files = [f for f in os.listdir(split_dir)
                   if f.lower().endswith(('.jpg', '.jpeg', '.png'))]
    actual_count = len(image_files)
    total_actual_files += actual_count

    if actual_count == expected_count:
        status = '✓'
    else:
        status = '✗'
        integrity_ok = False
    print(f"  {status} {split_name.upper()}: {actual_count}/{expected_count} images")

# Unchanged wording; the marker is ✓ only when every split matched its expected count.
summary_status = '✓' if integrity_ok else '✗'
print(f"{summary_status} Total validated files: {total_actual_files}")

print("\n[4] Preparing visualization data...")

# Samples per emotion in the raw annotation sheet. value_counts() orders the classes from most
# to least frequent, and that order is reused for every chart and table below.
# (The x3 key-frame scaling is applied later, where the chart values are built.)
raw_emotion_dist = raw_metadata['Estimated Emotion'].value_counts()

# Class distribution of each split, taken from the Phase 3 metadata.
split_data = {
    name: splits_info[name]['class_distribution']
    for name in ('train', 'val', 'test')
}

# Emotion labels in raw-frequency order (highest first).
all_emotions = raw_emotion_dist.index.tolist()

print(
    "✓ Data preparation complete",
    f"  Emotion classes: {len(all_emotions)}",
    f"  Raw samples: {len(raw_metadata)}",
    f"  Total Phase 3 images: {total_actual_files}",
    sep="\n",
)

print("\n[5] Generating File 1: Multi-Frame vs Key-Frame Distribution Comparison...")

fig1, ax1 = plt.subplots(1, 1, figsize=(16, 9))

# One value per emotion, in all_emotions order.
# Raw sample counts are scaled by 3 (onset + apex + offset) to express them as key frames.
raw_counts = [raw_emotion_dist.get(emotion, 0) * 3 for emotion in all_emotions]
train_counts, val_counts, test_counts = (
    [split_data[name].get(emotion, 0) for emotion in all_emotions]
    for name in ('train', 'val', 'test')
)

x = np.arange(len(all_emotions))
width = 0.2

# Four bars per emotion group: (x positions, values, label, color).
_bar_series = [
    (x - width*1.5, raw_counts, 'Raw Dataset (Key Frames)', SPLIT_COLORS['raw']),
    (x - width/2, train_counts, 'Train Split (Multi Frame)', SPLIT_COLORS['train']),
    (x + width/2, val_counts, 'Validation Split (Key Frames)', SPLIT_COLORS['val']),
    (x + width*1.5, test_counts, 'Test Split (Key Frames)', SPLIT_COLORS['test']),
]
_bar_groups = [
    ax1.bar(positions, values, width, label=label, color=color, alpha=0.85)
    for positions, values, label, color in _bar_series
]
bars0, bars1, bars2, bars3 = _bar_groups

ax1.set_title('CASME II Multi-Frame Sampling Distribution - Phase 3\nTrain: Multi-Frame Windows | Val/Test: Key Frames',
              fontsize=18, fontweight='bold', pad=25)
ax1.set_xlabel('Emotion Classes (Sorted by Frequency)', fontsize=16, labelpad=20)
ax1.set_ylabel('Image Count', fontsize=16, labelpad=20)
ax1.set_xticks(x)
ax1.set_xticklabels(all_emotions, rotation=0)

# The legend text replaces the bar labels so the train entry can also show the frame windows.
legend_labels = [
    'Raw Dataset (Key Frames)',
    'Train Split (Multi Frame)\nOnset: +[0,1,2,3] | Apex: [-2,-1,0,1,2] | Offset: [-3,-2,-1,0]',
    'Validation Split (Key Frames)',
    'Test Split (Key Frames)'
]
ax1.legend(_bar_groups, legend_labels, loc='upper right', fontsize=10)
ax1.grid(False)

# Write each non-zero value just above its bar.
for bar_group, (_, values, _, _) in zip(_bar_groups, _bar_series):
    for bar, value in zip(bar_group, values):
        if not value > 0:
            continue
        label_x = bar.get_x() + bar.get_width()/2
        label_y = bar.get_height() + 5
        ax1.text(label_x, label_y, str(value), ha='center', va='bottom',
                 fontsize=9, fontweight='bold')

plt.tight_layout()
file1_path = f"{visualization_path}/1_multiframe_split_distribution.png"
plt.savefig(file1_path, dpi=300, bbox_inches='tight', facecolor='white')
plt.close()
print("✓ File 1 saved: 1_multiframe_split_distribution.png")

print("\n[6] Generating File 2: Multi-Frame Statistical Analysis Table...")

fig2 = plt.figure(figsize=(18, 10))
ax2 = fig2.add_subplot(1, 1, 1)
ax2.axis('off')

# Build one table row per emotion plus a TOTAL row.
# Row layout: [label, raw samples, raw %, train images, val images, test images,
#              total images, train expansion, imbalance ratio]
table_data = []
total_raw = 0
total_train = 0
total_val = 0
total_test = 0

# Size of the largest raw class; loop-invariant, so computed once.
max_count = raw_emotion_dist.max()

for emotion in all_emotions:
    raw_count = raw_emotion_dist.get(emotion, 0)
    train_count = split_data['train'].get(emotion, 0)
    val_count = split_data['val'].get(emotion, 0)
    test_count = split_data['test'].get(emotion, 0)

    # Share of this class among all raw samples.
    raw_percentage = (raw_count / len(raw_metadata)) * 100

    # Train expansion: train images relative to the class's raw key frames (3 per raw sample).
    expansion_factor = f"{train_count / (raw_count * 3) * 100:.0f}%" if raw_count > 0 else "N/A"

    # Imbalance ratio: largest raw class relative to this class.
    imbalance_ratio = f"{max_count / raw_count:.1f}:1" if raw_count > 0 else "∞:1"

    table_data.append([
        emotion.title(),
        raw_count,
        f"{raw_percentage:.1f}%",
        train_count,
        val_count,
        test_count,
        train_count + val_count + test_count,
        expansion_factor,
        imbalance_ratio
    ])

    total_raw += raw_count
    total_train += train_count
    total_val += val_count
    total_test += test_count

# Add total row
total_all = total_train + total_val + total_test

# The expansion column is train-only in every emotion row, so the TOTAL row uses total_train
# as well (it previously mixed val/test images into the figure). Guarded against an empty
# raw distribution.
total_expansion = f"{total_train / (total_raw * 3) * 100:.0f}%" if total_raw > 0 else "N/A"

table_data.append([
    'TOTAL',
    total_raw,
    '100.0%',
    total_train,
    total_val,
    total_test,
    total_all,
    total_expansion,
    '1.0:1'
])

# Convert to display format: integer columns get thousands separators, text columns pass through.
table_display_data = [
    [label, f"{raw:,}", raw_pct, f"{train:,}", f"{val:,}", f"{test:,}", f"{total:,}", expansion, imbalance]
    for label, raw, raw_pct, train, val, test, total, expansion, imbalance in table_data
]

# Render the prepared rows as a matplotlib table on the axis-less figure.
_column_headers = ['Emotion', 'Raw\nSamples', 'Raw %', 'Train\nImages\n(Multi-Frame)', 'Val\nImages\n(Key-Frame)',
                   'Test\nImages\n(Key-Frame)', 'Total\nImages', 'Train\nExpansion', 'Imbalance']
_column_widths = [0.10, 0.09, 0.08, 0.12, 0.10, 0.10, 0.09, 0.09, 0.09]

table = ax2.table(
    cellText=table_display_data,
    colLabels=_column_headers,
    cellLoc='center',
    loc='center',
    colWidths=_column_widths
)

table.auto_set_font_size(False)
table.set_fontsize(11)
table.scale(1, 2.8)

# Cell styling: row 0 is the header, the last row is the TOTAL row, and the rows in between
# are emotion rows. 'fear' and 'sadness' rows get a light red background
# (presumably to flag them; the reason is not stated in this cell).
num_rows = len(table_display_data)
_highlighted_emotions = ['fear', 'sadness']

for i in range(num_rows + 1):
    is_header_row = (i == 0)
    is_total_row = (not is_header_row) and (i == num_rows)
    for j in range(9):
        cell = table[(i, j)]
        if is_header_row:
            cell.set_facecolor('#1f77b4')
            cell.set_text_props(weight='bold', color='white')
        elif is_total_row:
            cell.set_facecolor('#f0f0f0')
            cell.set_text_props(weight='bold')
        else:
            emotion = table_display_data[i-1][0].lower()
            cell.set_facecolor('#ffe6e6' if emotion in _highlighted_emotions else '#ffffff')

_table_title = (
    'CASME II Multi-Frame Sampling Statistical Analysis (Phase 3)\n'
    'Train: Onset [+0,+1,+2,+3] + Apex [-2,-1,0,+1,+2] + Offset [-3,-2,-1,0] = 13 frames/sample\n'
    'Val/Test: Key-Frames Only (Onset + Apex + Offset = 3 frames/sample)'
)
ax2.set_title(_table_title, fontsize=16, fontweight='bold', pad=40)

plt.tight_layout()
file2_path = f"{visualization_path}/2_multiframe_statistical_table.png"
plt.savefig(file2_path, dpi=300, bbox_inches='tight', facecolor='white')
plt.close()
print("✓ File 2 saved: 2_multiframe_statistical_table.png")

print("\n[7] Generating visualization metadata...")

# Totals reused by the metadata below and by the closing console summary.
total_samples = len(raw_metadata)
total_images_all = total_train + total_val + total_test


def _split_entry(strategy, total_images, original_samples, frames_per_sample, class_distribution):
    """Build the metadata record of one split (key order is kept for the JSON output)."""
    return {
        'strategy': strategy,
        'total_images': total_images,
        'original_samples': original_samples,
        'frames_per_sample': frames_per_sample,
        'class_distribution': class_distribution
    }


_dataset_info = {
    'name': 'CASME II Multi-Frame Sampling Dataset',
    'extraction_strategy': 'train_multi_frame_val_test_keyframes',
    'original_samples': total_samples,
    'total_images': total_images_all,
    'window_configuration': {
        'onset': 'forward_window_+[0,1,2,3]',
        'apex': 'centered_window_[-2,-1,0,1,2]',
        'offset': 'backward_window_[-3,-2,-1,0]'
    },
    'emotion_classes': len(all_emotions)
}

# Sample counts are recovered by integer division of the image counts by frames-per-sample;
# the train figure is an approximation.
_split_distribution = convert_to_serializable({
    'train': _split_entry('multi_frame_windows', total_train,
                          splits_info['train']['count'] // 13, 13, split_data['train']),
    'validation': _split_entry('key_frames_only', total_val,
                               total_val // 3, 3, split_data['val']),
    'test': _split_entry('key_frames_only', total_test,
                         total_test // 3, 3, split_data['test'])
})

# Generate comprehensive metadata
analysis_metadata = {
    'analysis_timestamp': pd.Timestamp.now().isoformat(),
    'phase': 'Phase 3 - Multi-Frame Sampling',
    'dataset_info': _dataset_info,
    'split_distribution': _split_distribution,
    'visualization_files': {
        'split_distribution': '1_multiframe_split_distribution.png',
        'statistical_table': '2_multiframe_statistical_table.png'
    },
    'color_scheme': {
        'emotion_colors': EMOTION_COLORS,
        'split_colors': SPLIT_COLORS
    }
}

metadata_file = f"{visualization_path}/multiframe_visualization_metadata.json"
with open(metadata_file, 'w') as metadata_handle:
    json.dump(analysis_metadata, metadata_handle, indent=2)

print("✓ Metadata saved: multiframe_visualization_metadata.json")

# Closing console report for the visualization cell, emitted line by line from a single list.
_closing_rule = "=" * 80
_closing_lines = [
    "\n" + _closing_rule,
    "MULTI-FRAME SAMPLING VISUALIZATION COMPLETE - PHASE 3",
    _closing_rule,
    "Status: SUCCESS",
    f"Total images validated: {total_actual_files}",
    f"Output location: {visualization_path}",
    "\nGenerated files:",
    "  • 1_multiframe_split_distribution.png - Multi-frame vs key-frame comparison",
    "  • 2_multiframe_statistical_table.png - Statistical analysis with expansion info",
    "  • multiframe_visualization_metadata.json - Comprehensive metadata",
    "\nDataset Summary:",
    f"  Original samples: {total_samples}",
    f"  Total images (Phase 3): {total_images_all}",
    f"  Train: {total_train} images (multi-frame windows)",
    f"  Val: {total_val} images (key-frames)",
    f"  Test: {total_test} images (key-frames)",
    "\nWindow Configuration:",
    "  Train multi-frame strategy:",
    "    • Onset: +[0,1,2,3] = 4 frames per sample",
    "    • Apex: [-2,-1,0,1,2] = 5 frames per sample",
    "    • Offset: [-3,-2,-1,0] = 4 frames per sample",
    "    • Total: 13 frames per training sample",
    "  Val/Test key-frame strategy:",
    "    • 3 frames per sample (onset, apex, offset)",
    "\n✓ Phase 3 visualization ready for thesis documentation",
    _closing_rule,
]
print(*_closing_lines, sep="\n")

CASME II MULTI-FRAME SAMPLING VISUALIZATION - PHASE 3

[1] Environment setup and drive mounting...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✓ Google Drive mounted successfully
✓ Output directory created: /content/drive/MyDrive/RESEARCH-WORKSPACE/ACTIVE-PROJECTS/Thesis_MER_Project/datasets/visualization/03_casme2-mfs

[2] Loading raw metadata and Phase 3 data...
✓ Raw metadata loaded: 255 samples
✓ Phase 3 metadata loaded
  Dataset: CASME2_MultiFrameSampling
  Strategy: Train=multi_frame_windows, Val/Test=key_frames_only

[3] File integrity validation...
  ✓ TRAIN: 2613/2613 images
  ✓ VAL: 78/78 images
  ✓ TEST: 83/83 images
✓ Total validated files: 2774

[4] Preparing visualization data...
✓ Data preparation complete
  Emotion classes: 7
  Raw samples: 255
  Total Phase 3 images: 2774

[5] Generating File 1: Multi-Frame vs Key-Frame Distribution Comparison...
✓ File 1 saved: 1_multiframe_split_dist