In [None]:
# @title Cell 1: CASME II Key Frames Availability Verification

import os
import pandas as pd
import numpy as np
import json
from google.colab import drive
from pathlib import Path
from collections import defaultdict

# Mount Google Drive
print("=" * 75)
print("CASME II KEY FRAMES DATASET VERIFICATION - PHASE 2")
print("=" * 75)
print("\n[1] Mounting Google Drive...")
drive.mount('/content/drive')
print("✓ Google Drive mounted successfully")

# Define paths (all rooted at the project folder on the mounted drive)
base_path = "/content/drive/MyDrive/RESEARCH-WORKSPACE/ACTIVE-PROJECTS/Thesis_MER_Project"
raw_path = f"{base_path}/datasets/raw/CASME2_RAW_selected"
metadata_path = f"{base_path}/datasets/metadata/CASME2-coding-20140508.xlsx"
phase1_split_metadata = f"{base_path}/datasets/processed_casme2/data_split/split_metadata.json"

print(f"\n[2] Loading Phase 1 split metadata for consistency...")
print(f"Phase 1 metadata: {phase1_split_metadata}")

try:
    with open(phase1_split_metadata, 'r', encoding='utf-8') as f:
        phase1_splits = json.load(f)

    # Every split entry carries its own 'count'; the sum is the Phase 1 total.
    total_phase1_samples = sum(split['count'] for split in phase1_splits.values())
    print(f"✓ Phase 1 metadata loaded: {total_phase1_samples} total samples")
    print(f"  Train: {phase1_splits['train']['count']} samples")
    print(f"  Val: {phase1_splits['val']['count']} samples")
    print(f"  Test: {phase1_splits['test']['count']} samples")
except Exception as e:
    print(f"✗ Error loading Phase 1 metadata: {str(e)}")
    # Re-raise so the cell stops here with the original traceback. exit() is an
    # interactive-shell helper; in a notebook it asks the kernel to shut down
    # rather than simply aborting the cell, and later code needs phase1_splits.
    raise

# Load CASME II metadata
print(f"\n[3] Loading CASME II metadata...")

try:
    df = pd.read_excel(metadata_path)

    # Fail early, with a clear message, if a column used further down is absent.
    required_columns = ['Subject', 'Filename', 'OnsetFrame', 'ApexFrame',
                        'OffsetFrame', 'Estimated Emotion']
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        raise KeyError(f"Metadata is missing required columns: {missing_columns}")

    # Clean ApexFrame column if needed. Any non-numeric dtype (object or a
    # dedicated string dtype) is coerced; unparseable entries become NaN.
    if not pd.api.types.is_numeric_dtype(df['ApexFrame']):
        df['ApexFrame'] = pd.to_numeric(df['ApexFrame'], errors='coerce')
        print(f"⚠ ApexFrame column contained non-numeric values, converted to numeric")

    print(f"✓ Metadata loaded: {len(df)} records")
    print(f"  Columns: Subject, Filename, OnsetFrame, ApexFrame, OffsetFrame, Estimated Emotion")

except Exception as e:
    print(f"✗ Error loading metadata: {str(e)}")
    # Same reasoning as above: stop the cell instead of calling exit().
    raise

# Build a lookup from "<subject>_<sequence>" to the annotated key frames
print(f"\n[4] Creating comprehensive sample mapping from metadata...")

metadata_map = {}
for _, record in df.iterrows():
    # Subject ids are zero-padded to two characters, e.g. 1 -> "sub01".
    subject = "sub" + str(record['Subject']).zfill(2)
    sequence = record['Filename']
    sample_id = f"{subject}_{sequence}"

    emotion_label = record['Estimated Emotion']
    onset_value = int(record['OnsetFrame'])

    # ApexFrame may be NaN after numeric coercion; keep that as None.
    apex_raw = record['ApexFrame']
    if pd.notna(apex_raw):
        apex_value = int(apex_raw)
    else:
        apex_value = None

    offset_value = int(record['OffsetFrame'])

    metadata_map[sample_id] = dict(
        subject=subject,
        sequence=sequence,
        emotion=emotion_label,
        onset_frame=onset_value,
        apex_frame=apex_value,
        offset_frame=offset_value,
    )

print(f"✓ Sample mapping created: {len(metadata_map)} samples indexed")

# Frame availability verification function
def find_nearest_frame(sequence_path, target_frame, frame_type):
    """
    Find the available frame image closest to ``target_frame``.

    Args:
        sequence_path: Directory that holds the frame images of one sequence.
        target_frame: Frame number being requested.
        frame_type: Label of the key frame being looked up (e.g. 'onset').
            It does not influence the lookup; it is kept so existing callers
            keep working.

    Returns:
        Tuple ``(actual_frame_path, frame_number, status)`` where status is
        'exact', 'fallback_before', 'fallback_after' or 'missing'. For
        'missing' the first two items are None. When the nearest frame before
        and the nearest frame after are equally distant, the earlier frame is
        chosen.
    """
    # isdir rather than exists: a path pointing at a regular file would pass an
    # existence check and then make os.listdir raise.
    if not os.path.isdir(sequence_path):
        return None, None, 'missing'

    # Map frame number -> image path. Files are visited in sorted order, so if
    # two files yield the same number the later name wins.
    frame_map = {}
    for img_file in sorted(os.listdir(sequence_path)):
        if not img_file.lower().endswith(('.jpg', '.jpeg', '.png')):
            continue
        try:
            # Extract number from filename (e.g., img46.jpg -> 46); all digits
            # found before the first '.' are concatenated.
            frame_num = int(''.join(filter(str.isdigit, img_file.split('.')[0])))
        except ValueError:
            # No usable digits in this filename.
            continue
        frame_map[frame_num] = os.path.join(sequence_path, img_file)

    if not frame_map:
        return None, None, 'missing'

    # Check exact match
    if target_frame in frame_map:
        return frame_map[target_frame], target_frame, 'exact'

    # Nearest frame by absolute distance; the frame number itself is the
    # secondary key so that a tie resolves to the earlier frame.
    best_frame = min(frame_map, key=lambda num: (abs(num - target_frame), num))
    status = 'fallback_before' if best_frame < target_frame else 'fallback_after'
    return frame_map[best_frame], best_frame, status

# Verify frame availability for all Phase 1 samples
print(f"\n[5] Verifying key frames availability (onset, apex, offset)...")
print(f"Strategy: Use nearest frame if exact frame not available")

# Per-split list of per-sample verification records.
verification_results = {
    'train': [],
    'val': [],
    'test': []
}

statistics = {
    'total_samples': 0,
    'perfect_samples': 0,  # All 3 frames exact match
    'fallback_samples': 0,  # At least 1 frame needs fallback, none missing
    'problematic_samples': 0,  # At least 1 frame missing
    'frame_statistics': {
        'onset': {'exact': 0, 'fallback': 0, 'missing': 0},
        'apex': {'exact': 0, 'fallback': 0, 'missing': 0},
        'offset': {'exact': 0, 'fallback': 0, 'missing': 0}
    }
}

for split_name, split_data in phase1_splits.items():
    print(f"\n  Verifying {split_name.upper()} set ({split_data['count']} samples)...")

    for sample in split_data['samples']:
        sample_id = sample['sample_id']

        # Skip if sample not in metadata
        if sample_id not in metadata_map:
            print(f"  ✗ Sample {sample_id} not found in metadata")
            continue

        meta = metadata_map[sample_id]
        sequence_path = os.path.join(raw_path, meta['subject'], meta['sequence'])

        # A sample without an annotated apex falls back to its onset frame.
        apex_frame = meta['apex_frame'] if meta['apex_frame'] is not None else meta['onset_frame']
        targets = {
            'onset': meta['onset_frame'],
            'apex': apex_frame,
            'offset': meta['offset_frame']
        }

        # Verify each frame type and update the per-frame counters.
        frames = {}
        for frame_type, target in targets.items():
            frame_path, frame_num, frame_status = find_nearest_frame(
                sequence_path, target, frame_type
            )
            frames[frame_type] = {
                'target_frame': target,
                'actual_frame': frame_num,
                'path': frame_path,
                'status': frame_status
            }
            # Normalize status for statistics (fallback_before/fallback_after -> fallback)
            stat_key = 'fallback' if 'fallback' in frame_status else frame_status
            statistics['frame_statistics'][frame_type][stat_key] += 1

        statistics['total_samples'] += 1

        # Decide the sample status once and use it for both the counters and
        # the stored record, so the two can never disagree. A missing frame
        # takes precedence over a fallback.
        statuses = [info['status'] for info in frames.values()]
        if all(status == 'exact' for status in statuses):
            sample_status = 'perfect'
            statistics['perfect_samples'] += 1
        elif any(status == 'missing' for status in statuses):
            sample_status = 'problematic'
            statistics['problematic_samples'] += 1
        else:
            sample_status = 'fallback'
            statistics['fallback_samples'] += 1

        # Store verification result
        verification_results[split_name].append({
            'sample_id': sample_id,
            'subject': meta['subject'],
            'sequence': meta['sequence'],
            'emotion': meta['emotion'],
            'frames': frames,
            'sample_status': sample_status
        })

    print(f"  ✓ {split_name.upper()} verification completed")

# Display comprehensive statistics
def _pct(part, whole):
    """Return part/whole as a percentage, or 0.0 when whole is zero."""
    return part / whole * 100 if whole else 0.0


print(f"\n[6] Frame Availability Statistics:")
print(f"=" * 75)

# total_samples stays 0 when no Phase 1 sample id was found in the metadata;
# _pct keeps the report printable in that case instead of dividing by zero.
total_verified = statistics['total_samples']

print(f"\nSample Status Summary:")
print(f"  Total samples verified: {total_verified}")
print(f"  ✓ Perfect (all exact): {statistics['perfect_samples']} ({_pct(statistics['perfect_samples'], total_verified):.1f}%)")
print(f"  ⚠ Fallback needed: {statistics['fallback_samples']} ({_pct(statistics['fallback_samples'], total_verified):.1f}%)")
print(f"  ✗ Problematic (missing): {statistics['problematic_samples']} ({_pct(statistics['problematic_samples'], total_verified):.1f}%)")

print(f"\nFrame-wise Availability:")
for frame_type, stats in statistics['frame_statistics'].items():
    total = sum(stats.values())
    print(f"  {frame_type.upper()} frame:")
    print(f"    Exact match: {stats['exact']} ({_pct(stats['exact'], total):.1f}%)")
    print(f"    Fallback used: {stats['fallback']} ({_pct(stats['fallback'], total):.1f}%)")
    print(f"    Missing: {stats['missing']} ({_pct(stats['missing'], total):.1f}%)")

# Show sample problematic cases
if statistics['problematic_samples'] > 0:
    print(f"\n[7] Problematic Samples Analysis:")
    problematic = [
        sample
        for samples in verification_results.values()
        for sample in samples
        if sample['sample_status'] == 'problematic'
    ]
    for sample in problematic[:5]:  # Show first 5
        print(f"  Sample: {sample['sample_id']}")
        print(f"    Onset: {sample['frames']['onset']['status']}")
        print(f"    Apex: {sample['frames']['apex']['status']}")
        print(f"    Offset: {sample['frames']['offset']['status']}")

    if statistics['problematic_samples'] > 5:
        print(f"  ... and {statistics['problematic_samples'] - 5} more problematic samples")

# Export verification results
print(f"\n[8] Exporting verification results...")

output_path = f"{base_path}/datasets/processed_casme2"
verification_file = f"{output_path}/key_frames_verification.json"

# Everything the next cell needs: aggregate statistics plus per-split records.
verification_export = {
    'verification_date': pd.Timestamp.now().isoformat(),
    'phase': 'Phase 2 - Key Frames Dataset',
    'strategy': 'nearest_frame_fallback',
    'statistics': statistics,
    'split_results': verification_results
}

with open(verification_file, 'w', encoding='utf-8') as f:
    json.dump(verification_export, f, indent=2)

print(f"✓ Verification results saved to: key_frames_verification.json")

# Final readiness assessment
print(f"\n[9] Phase 2 Dataset Readiness Assessment:")
print(f"=" * 75)

# Share of samples whose three key frames are all usable (exact or fallback).
# With nothing verified the score is 0 instead of a ZeroDivisionError.
usable_samples = statistics['perfect_samples'] + statistics['fallback_samples']
if statistics['total_samples'] > 0:
    readiness_score = usable_samples / statistics['total_samples'] * 100
else:
    readiness_score = 0.0

if readiness_score == 100:
    status = "READY"
    message = "All samples have complete key frames (exact or fallback)"
elif readiness_score >= 95:
    status = "READY WITH CAUTION"
    message = f"{statistics['problematic_samples']} samples may need manual review"
else:
    status = "NEEDS ATTENTION"
    message = f"{statistics['problematic_samples']} problematic samples require investigation"

print(f"Readiness Status: {status}")
print(f"Readiness Score: {readiness_score:.1f}%")
print(f"Assessment: {message}")

print(f"\n✓ Expected Phase 2 dataset size:")
print(f"  Train: {phase1_splits['train']['count']} samples × 3 frames = {phase1_splits['train']['count']*3} images")
print(f"  Val: {phase1_splits['val']['count']} samples × 3 frames = {phase1_splits['val']['count']*3} images")
print(f"  Test: {phase1_splits['test']['count']} samples × 3 frames = {phase1_splits['test']['count']*3} images")
print(f"  Total: {statistics['total_samples']} samples × 3 frames = {statistics['total_samples']*3} images")

print(f"\n✓ Next steps:")
print(f"  - Review verification results in key_frames_verification.json")
print(f"  - Proceed to Cell 2: Key frames extraction and dataset preparation")
print(f"  - Fallback strategy will be applied automatically for missing frames")

print("=" * 75)

CASME II KEY FRAMES DATASET VERIFICATION - PHASE 2

[1] Mounting Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✓ Google Drive mounted successfully

[2] Loading Phase 1 split metadata for consistency...
Phase 1 metadata: /content/drive/MyDrive/RESEARCH-WORKSPACE/ACTIVE-PROJECTS/Thesis_MER_Project/datasets/processed_casme2/data_split/split_metadata.json
✓ Phase 1 metadata loaded: 255 total samples
  Train: 201 samples
  Val: 26 samples
  Test: 28 samples

[3] Loading CASME II metadata...
⚠ ApexFrame column contained non-numeric values, converted to numeric
✓ Metadata loaded: 255 records
  Columns: Subject, Filename, OnsetFrame, ApexFrame, OffsetFrame, Estimated Emotion

[4] Creating comprehensive sample mapping from metadata...
✓ Sample mapping created: 255 samples indexed

[5] Verifying key frames availability (onset, apex, offset)...
Strategy: Use nearest frame if exact frame not availabl

In [None]:
# @title Cell 2: CASME II Key Frames Extraction and Dataset Preparation

import os
import shutil
import json
import pandas as pd
from google.colab import drive
from pathlib import Path

# Mount Google Drive
print("=" * 75)
print("CASME II KEY FRAMES DATASET PREPARATION - PHASE 2")
print("=" * 75)
print("\n[1] Mounting Google Drive...")
drive.mount('/content/drive')
print("✓ Google Drive mounted successfully")

# Define paths
base_path = "/content/drive/MyDrive/RESEARCH-WORKSPACE/ACTIVE-PROJECTS/Thesis_MER_Project"
processed_path = f"{base_path}/datasets/processed_casme2"
verification_file = f"{processed_path}/key_frames_verification.json"

print(f"\n[2] Setting up Phase 2 directory structure...")
print(f"Target: data_split_v2 (key frames: onset + apex + offset)")

# Create data_split_v2 directory structure (one folder per split)
data_split_v2_path = f"{processed_path}/data_split_v2"
directories = [
    f"{data_split_v2_path}/train",
    f"{data_split_v2_path}/val",
    f"{data_split_v2_path}/test"
]

for directory in directories:
    # exist_ok=True makes re-running the cell harmless.
    os.makedirs(directory, exist_ok=True)
    print(f"✓ Created directory: {directory}")

# Load verification results
print(f"\n[3] Loading verification results from Cell 1...")

try:
    with open(verification_file, 'r', encoding='utf-8') as f:
        verification_data = json.load(f)

    print(f"✓ Verification data loaded")
    print(f"  Strategy: {verification_data['strategy']}")
    print(f"  Total samples: {verification_data['statistics']['total_samples']}")
    print(f"  Perfect samples: {verification_data['statistics']['perfect_samples']}")
    print(f"  Fallback samples: {verification_data['statistics']['fallback_samples']}")

except Exception as e:
    print(f"✗ Error loading verification data: {str(e)}")
    # Re-raise so the cell stops here with the original traceback. exit() is an
    # interactive-shell helper; in a notebook it asks the kernel to shut down
    # rather than simply aborting the cell, and later code needs
    # verification_data.
    raise

# Copy every verified key frame into its split folder and record it
print(f"\n[4] Extracting and copying key frames to data_split_v2...")

frame_types = ['onset', 'apex', 'offset']

# Number of images copied, per split and per frame type.
copy_stats = {
    split: {kind: 0 for kind in frame_types}
    for split in ('train', 'val', 'test')
}

# Human-readable descriptions of every frame that could not be copied.
copy_errors = []

# Per-split metadata; 'count' and 'class_distribution' are filled in later.
metadata_v2 = {
    split: {'count': 0, 'samples': [], 'class_distribution': {}}
    for split in ('train', 'val', 'test')
}

for split_name, samples in verification_data['split_results'].items():
    print(f"\n  Processing {split_name.upper()} set ({len(samples)} samples × 3 frames)...")

    split_dir = f"{data_split_v2_path}/{split_name}"

    for sample in samples:
        sample_id, emotion = sample['sample_id'], sample['emotion']
        subject, sequence = sample['subject'], sample['sequence']

        for frame_type in frame_types:
            frame_info = sample['frames'][frame_type]
            source_path = frame_info['path']

            # Verification found no usable image for this frame.
            if source_path is None:
                copy_errors.append(f"Missing {frame_type} frame for {sample_id}")
                continue

            # Destination filename: {sample_id}_{frame_type}_{emotion}.jpg
            dest_filename = f"{sample_id}_{frame_type}_{emotion}.jpg"
            dest_path = os.path.join(split_dir, dest_filename)

            try:
                if not os.path.exists(source_path):
                    copy_errors.append(f"Source not found: {source_path}")
                    continue

                shutil.copy2(source_path, dest_path)
                copy_stats[split_name][frame_type] += 1

                # One metadata record per copied image.
                metadata_v2[split_name]['samples'].append(dict(
                    sample_id=f"{sample_id}_{frame_type}",
                    original_sample_id=sample_id,
                    frame_type=frame_type,
                    subject=subject,
                    sequence=sequence,
                    emotion=emotion,
                    image_filename=dest_filename,
                    target_frame=frame_info['target_frame'],
                    actual_frame=frame_info['actual_frame'],
                    frame_status=frame_info['status'],
                ))
            except Exception as e:
                copy_errors.append(f"Copy error for {sample_id} {frame_type}: {str(e)}")

    # The split's image count is the sum over its three frame-type counters.
    split_counts = copy_stats[split_name]
    total_frames = sum(split_counts.values())
    metadata_v2[split_name]['count'] = total_frames

    print(f"  ✓ {split_name.upper()} extraction completed:")
    print(f"    Onset frames: {split_counts['onset']}")
    print(f"    Apex frames: {split_counts['apex']}")
    print(f"    Offset frames: {split_counts['offset']}")
    print(f"    Total: {total_frames} images")

# Tally images per emotion label in each split
print(f"\n[5] Calculating class distribution for each split...")

for split_name in ['train', 'val', 'test']:
    split_meta = metadata_v2[split_name]

    emotion_counts = {}
    for sample in split_meta['samples']:
        emotion = sample['emotion']
        if emotion not in emotion_counts:
            emotion_counts[emotion] = 0
        emotion_counts[emotion] += 1

    split_meta['class_distribution'] = emotion_counts

    print(f"\n  {split_name.upper()} set distribution ({split_meta['count']} images):")
    # Most frequent emotion first; ties keep their first-seen order.
    ranked = sorted(emotion_counts.items(), key=lambda item: -item[1])
    for emotion, count in ranked:
        percentage = (count / split_meta['count']) * 100
        print(f"    {emotion}: {count} images ({percentage:.1f}%)")

# Report copy errors, showing at most the first five
if copy_errors:
    print(f"\n[6] Copy Errors Summary:")
    print(f"  Total errors: {len(copy_errors)}")
    for error in copy_errors[:5]:
        print(f"    {error}")
    if len(copy_errors) > 5:
        print(f"    ... and {len(copy_errors) - 5} more errors")

# Write the per-image metadata for data_split_v2
print(f"\n[7] Saving enhanced metadata with frame type tracking...")

metadata_file_v2 = f"{data_split_v2_path}/split_metadata_v2.json"

metadata_export = dict(
    dataset='CASME2_KeyFrames',
    phase='Phase 2',
    frame_types=['onset', 'apex', 'offset'],
    expansion_factor=3,
    creation_date=pd.Timestamp.now().isoformat(),
    splits=metadata_v2,
)

with open(metadata_file_v2, 'w') as f:
    json.dump(metadata_export, f, indent=2)

print(f"✓ Enhanced metadata saved to: data_split_v2/split_metadata_v2.json")

# Write a compact summary of what was copied
print(f"\n[8] Generating processing summary...")

# Images actually copied versus three per verified sample.
total_copied = sum(sum(stats.values()) for stats in copy_stats.values())
expected_total = verification_data['statistics']['total_samples'] * 3

if expected_total > 0:
    success_rate_value = total_copied / expected_total * 100
else:
    success_rate_value = 0

processing_summary_v2 = {
    'dataset': 'CASME2_KeyFrames',
    'phase': 'Phase 2',
    'processing_date': pd.Timestamp.now().isoformat(),
    'source_phase1_samples': verification_data['statistics']['total_samples'],
    'expansion_strategy': 'onset_apex_offset_extraction',
    'frame_types': ['onset', 'apex', 'offset'],
    'copy_statistics': {
        split: {
            'total_images': sum(copy_stats[split].values()),
            'frame_breakdown': copy_stats[split]
        }
        for split in ('train', 'val', 'test')
    },
    'total_images_copied': total_copied,
    'expected_images': expected_total,
    'success_rate': success_rate_value,
    'copy_errors': len(copy_errors),
    'fallback_frames_used': verification_data['statistics']['fallback_samples'],
    'class_preservation': {
        split: metadata_v2[split]['class_distribution']
        for split in ('train', 'val', 'test')
    }
}

summary_file_v2 = f"{data_split_v2_path}/processing_summary_v2.json"

with open(summary_file_v2, 'w') as f:
    json.dump(processing_summary_v2, f, indent=2)

print(f"✓ Processing summary saved to: data_split_v2/processing_summary_v2.json")

# Re-read the split folders from disk and count what is really there
print(f"\n[9] Final validation of data_split_v2 structure...")

validation_results = {}
for split_name in ['train', 'val', 'test']:
    split_dir = f"{data_split_v2_path}/{split_name}"

    if not os.path.exists(split_dir):
        validation_results[split_name] = {
            'directory_exists': False,
            'total_images': 0,
            'frame_breakdown': {},
            'sample_files': []
        }
        continue

    image_files = [
        name for name in os.listdir(split_dir)
        if name.lower().endswith(('.jpg', '.jpeg', '.png'))
    ]

    # Attribute each file to the first frame type whose "_<type>_" marker
    # appears in its name; files with no marker are not counted.
    frame_counts = {'onset': 0, 'apex': 0, 'offset': 0}
    for img_file in image_files:
        matched = next(
            (kind for kind in frame_types if f"_{kind}_" in img_file),
            None
        )
        if matched is not None:
            frame_counts[matched] += 1

    validation_results[split_name] = {
        'directory_exists': True,
        'total_images': len(image_files),
        'frame_breakdown': frame_counts,
        'sample_files': image_files[:3]
    }

print(f"✓ Structure validation:")
for split_name, results in validation_results.items():
    split_ok = results['directory_exists'] and results['total_images'] > 0
    status = "✓" if split_ok else "✗"
    print(f"  {status} {split_name}: {results['total_images']} images")

    breakdown = results['frame_breakdown']
    if breakdown:
        print(f"    Onset: {breakdown['onset']}, "
              f"Apex: {breakdown['apex']}, "
              f"Offset: {breakdown['offset']}")
    if results['sample_files']:
        print(f"    Sample: {results['sample_files'][0]}")

# Final summary
print(f"\n" + "=" * 75)
print("PHASE 2 DATASET PREPARATION SUMMARY")
print("=" * 75)

success_rate = processing_summary_v2['success_rate']
status = "SUCCESS" if success_rate >= 99 else "PARTIAL"

# Sample counts and the expansion factor are derived from the loaded
# verification data rather than hard-coded, so the report stays correct if the
# Phase 1 split changes or some frames could not be copied.
source_samples = verification_data['statistics']['total_samples']
split_sample_counts = {
    name: len(samples)
    for name, samples in verification_data['split_results'].items()
}
expansion = f"{total_copied / source_samples:g}x" if source_samples else "n/a"

# Images copied per frame type, summed over all splits.
frame_totals = {
    kind: sum(stats[kind] for stats in copy_stats.values())
    for kind in ('onset', 'apex', 'offset')
}

print(f"Processing Status: {status}")
print(f"Total images copied: {total_copied}/{expected_total} ({success_rate:.1f}%)")
print(f"Dataset expansion: {source_samples} samples → {total_copied} images ({expansion})")
print(f"Frame distribution: {frame_totals['onset']} onset, "
      f"{frame_totals['apex']} apex, "
      f"{frame_totals['offset']} offset")

print(f"\n✓ Split distribution:")
for split_name, label in (('train', 'Train'), ('val', 'Val'), ('test', 'Test')):
    print(f"  {label}: {sum(copy_stats[split_name].values())} images "
          f"({split_sample_counts.get(split_name, 0)} samples × 3)")

print(f"\n✓ Data quality:")
print(f"  Fallback frames used: {verification_data['statistics']['fallback_samples']}")
print(f"  Copy errors: {len(copy_errors)}")
print(f"  Split consistency: Maintained from Phase 1")

print(f"\n✓ Output files:")
print(f"  - Dataset: data_split_v2/train|val|test/")
print(f"  - Metadata: data_split_v2/split_metadata_v2.json")
print(f"  - Summary: data_split_v2/processing_summary_v2.json")

print(f"\n✓ Next steps:")
print(f"  - Phase 2 dataset ready for model training")
print(f"  - Each sample now represented by 3 temporal frames")
print(f"  - Proceed to model experiments with key frames approach")

print("=" * 75)

CASME II KEY FRAMES DATASET PREPARATION - PHASE 2

[1] Mounting Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✓ Google Drive mounted successfully

[2] Setting up Phase 2 directory structure...
Target: data_split_v2 (key frames: onset + apex + offset)
✓ Created directory: /content/drive/MyDrive/RESEARCH-WORKSPACE/ACTIVE-PROJECTS/Thesis_MER_Project/datasets/processed_casme2/data_split_v2/train
✓ Created directory: /content/drive/MyDrive/RESEARCH-WORKSPACE/ACTIVE-PROJECTS/Thesis_MER_Project/datasets/processed_casme2/data_split_v2/val
✓ Created directory: /content/drive/MyDrive/RESEARCH-WORKSPACE/ACTIVE-PROJECTS/Thesis_MER_Project/datasets/processed_casme2/data_split_v2/test

[3] Loading verification results from Cell 1...
✓ Verification data loaded
  Strategy: nearest_frame_fallback
  Total samples: 255
  Perfect samples: 254
  Fallback samples: 1

[4] Extracting and copying key frames to da

In [2]:
# @title Cell 3: CASME II Key Frames Visualization

import os
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import drive
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Plot styling: start from matplotlib's defaults, then override per rc group
plt.style.use('default')
plt.rc('font', family='DejaVu Sans', size=12)
plt.rc('axes', titlesize=16, labelsize=14, grid=False, facecolor='white')
# Hide the top and right axes borders.
plt.rc('axes.spines', top=False, right=False)
plt.rc('xtick', labelsize=12)
plt.rc('ytick', labelsize=12)
plt.rc('legend', fontsize=12)
plt.rc('figure', titlesize=18, facecolor='white')

def convert_to_serializable(obj):
    """Recursively convert numpy/pandas values to native Python types for JSON.

    Args:
        obj: Any value. Dicts, lists and tuples are walked recursively.

    Returns:
        A structure in which:
          * dict values are converted, and numpy bool/int/float dict keys are
            replaced by their native Python equivalents (``json`` rejects
            numpy integer keys);
          * lists and tuples become lists of converted items;
          * numpy bool/integer/floating scalars become ``bool``/``int``/``float``;
          * numpy arrays become (nested) lists via ``ndarray.tolist()``;
          * scalar missing values recognised by ``pd.isna`` (``None``, a
            Python ``float('nan')``, ``pd.NaT``, ``pd.NA``) become ``None``;
          * anything else is returned unchanged.
    """
    numpy_scalar_types = (np.bool_, np.integer, np.floating)

    if isinstance(obj, dict):
        return {
            (key.item() if isinstance(key, numpy_scalar_types) else key):
                convert_to_serializable(value)
            for key, value in obj.items()
        }
    # Tuples are walked too; json would emit them as arrays anyway, but any
    # numpy scalars nested inside must be converted first.
    if isinstance(obj, (list, tuple)):
        return [convert_to_serializable(item) for item in obj]
    # np.bool_ is neither np.integer nor np.floating, so it needs its own case.
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    # pd.isna returns an array for array-like input, whose truth value is
    # ambiguous; only ask it about scalars.
    if pd.api.types.is_scalar(obj) and pd.isna(obj):
        return None
    return obj

# Color palettes
# EMOTION_COLORS maps a lower-case emotion label to a hex color. Within this
# cell it is only written into the visualization metadata JSON
# ('color_scheme' -> 'emotion_colors').
# NOTE(review): keys presumably mirror the 'Estimated Emotion' labels in the
# CASME II coding sheet - confirm against the metadata file.
EMOTION_COLORS = {
    'others': '#1f77b4',
    'disgust': '#d62728',
    'happiness': '#2ca02c',
    'repression': '#9467bd',
    'surprise': '#ff7f0e',
    'sadness': '#8c564b',
    'fear': '#e377c2'
}

# SPLIT_COLORS maps a data source ('raw' or a split name) to the bar color
# used in the Figure 1 grouped bar chart; it is also written into the
# visualization metadata JSON ('color_scheme' -> 'split_colors').
SPLIT_COLORS = {
    'raw': '#1f77b4',
    'train': '#ff7f0e',
    'val': '#d62728',
    'test': '#2ca02c'
}

# Opening banner for the visualization cell.
rule = "=" * 80
print(rule)
print("CASME II KEY FRAMES DATASET VISUALIZATION - PHASE 2")
print(rule)

print("\n[1] Environment setup and drive mounting...")
drive.mount('/content/drive')
print("✓ Google Drive mounted successfully")

# Define paths: every input and output lives under the project's datasets folder.
base_path = "/content/drive/MyDrive/RESEARCH-WORKSPACE/ACTIVE-PROJECTS/Thesis_MER_Project"
datasets_root = f"{base_path}/datasets"
metadata_path = f"{datasets_root}/metadata/CASME2-coding-20140508.xlsx"
data_split_v2_path = f"{datasets_root}/processed_casme2/data_split_v2"
split_metadata_v2_path = f"{data_split_v2_path}/split_metadata_v2.json"
visualization_path = f"{datasets_root}/visualization/02_casme2-kfs"

# exist_ok=True makes re-running the cell safe when the folder already exists.
os.makedirs(visualization_path, exist_ok=True)
print(f"✓ Output directory created: {visualization_path}")

print("\n[2] Loading raw metadata and Phase 2 data...")

# Load raw metadata for comparison
try:
    raw_metadata = pd.read_excel(metadata_path)
    # An object-dtype ApexFrame column is coerced to numeric; values that
    # cannot be parsed become NaN (errors='coerce').
    if raw_metadata['ApexFrame'].dtype == 'object':
        raw_metadata['ApexFrame'] = pd.to_numeric(raw_metadata['ApexFrame'], errors='coerce')
    print(f"✓ Raw metadata loaded: {len(raw_metadata)} samples")
except Exception as e:
    print(f"✗ Error loading raw metadata: {str(e)}")
    # Re-raise instead of calling exit(): exit() is the interactive-shell
    # helper and is not a reliable way to stop a notebook cell, whereas an
    # exception always halts the cell and keeps the original traceback.
    raise

# Load Phase 2 metadata

try:
    with open(split_metadata_v2_path, 'r') as f:
        metadata_v2 = json.load(f)

    # These lookups double as a minimal schema check: a missing key raises
    # KeyError, which is reported by the handler below.
    print(f"✓ Phase 2 metadata loaded")
    print(f"  Dataset: {metadata_v2['dataset']}")
    print(f"  Frame types: {', '.join(metadata_v2['frame_types'])}")
    print(f"  Expansion factor: {metadata_v2['expansion_factor']}x")

except Exception as e:
    print(f"✗ Error loading metadata: {str(e)}")
    # Same reasoning as above: stop the cell here rather than continuing
    # with metadata_v2 undefined or incomplete.
    raise

print("\n[3] File integrity validation...")

# Compare the number of image files on disk with the count recorded in the
# Phase 2 split metadata, one split at a time.
splits_info = metadata_v2['splits']
total_actual_files = 0
all_splits_valid = True  # flips to False on any missing directory or count mismatch
image_extensions = ('.jpg', '.jpeg', '.png')

for split_name in ['train', 'val', 'test']:
    split_dir = f"{data_split_v2_path}/{split_name}"
    expected_count = splits_info[split_name]['count']

    # A missing split directory is a validation failure, not something to skip.
    if not os.path.isdir(split_dir):
        all_splits_valid = False
        print(f"  ✗ {split_name.upper()}: directory not found (expected {expected_count} images)")
        continue

    image_files = [f for f in os.listdir(split_dir)
                   if f.lower().endswith(image_extensions)]
    actual_count = len(image_files)
    total_actual_files += actual_count

    counts_match = actual_count == expected_count
    all_splits_valid = all_splits_valid and counts_match
    status = '✓' if counts_match else '✗'
    print(f"  {status} {split_name.upper()}: {actual_count}/{expected_count} images")

# The summary marker reflects the per-split results instead of always showing success.
summary_status = '✓' if all_splits_valid else '✗'
print(f"{summary_status} Total validated files: {total_actual_files}")

print("\n[4] Preparing visualization data...")

split_names = ('train', 'val', 'test')

# Per-emotion sample counts from the raw coding sheet; value_counts() orders
# them from most to least frequent.
raw_emotion_dist = raw_metadata['Estimated Emotion'].value_counts()

# Per-split class distributions as recorded in the Phase 2 split metadata.
split_data = {
    name: splits_info[name]['class_distribution']
    for name in split_names
}

# Emotion labels in raw-frequency order (highest to lowest).
all_emotions = list(raw_emotion_dist.index)

phase2_image_total = sum(splits_info[name]['count'] for name in split_names)

print(f"✓ Data preparation complete")
print(f"  Emotion classes: {len(all_emotions)}")
print(f"  Raw samples: {len(raw_metadata)}")
print(f"  Total Phase 2 images: {phase2_image_total}")

print("\n[5] Generating File 1: Key Frames vs Raw Distribution Comparison...")

fig1, ax1 = plt.subplots(1, 1, figsize=(16, 8))

# Counts aligned with the emotion order.
# Raw counts are multiplied by 3 to show potential frames (onset+apex+offset).
raw_counts = [raw_emotion_dist.get(emotion, 0) * 3 for emotion in all_emotions]
train_counts, val_counts, test_counts = (
    [split_data[name].get(emotion, 0) for emotion in all_emotions]
    for name in ('train', 'val', 'test')
)

x = np.arange(len(all_emotions))
width = 0.2

# One entry per bar group: (horizontal offset, heights, legend label, color key).
# The four groups sit side by side, centred on each emotion's tick.
bar_groups = [
    (-1.5 * width, raw_counts, 'Raw Dataset (Key Frames)', 'raw'),
    (-0.5 * width, train_counts, 'Train Split (Key Frames)', 'train'),
    (0.5 * width, val_counts, 'Validation Split (Key Frames)', 'val'),
    (1.5 * width, test_counts, 'Test Split (Key Frames)', 'test'),
]
bars0, bars1, bars2, bars3 = [
    ax1.bar(x + offset, heights, width, label=legend_label,
            color=SPLIT_COLORS[color_key], alpha=0.85)
    for offset, heights, legend_label, color_key in bar_groups
]

ax1.set_title('CASME II Key Frames Distribution - Raw vs Split Comparison',
              fontsize=18, fontweight='bold', pad=25)
ax1.set_xlabel('Emotion Classes (Sorted by Frequency)', fontsize=16, labelpad=20)
ax1.set_ylabel('Image Count', fontsize=16, labelpad=20)
ax1.set_xticks(x)
ax1.set_xticklabels(all_emotions, rotation=0)
ax1.legend(loc='upper right', fontsize=13)
ax1.grid(False)

# Write each non-zero count just above its bar.
for container, heights in zip((bars0, bars1, bars2, bars3),
                              (raw_counts, train_counts, val_counts, test_counts)):
    for rect, count in zip(container, heights):
        if count <= 0:
            continue
        ax1.text(rect.get_x() + rect.get_width() / 2, rect.get_height() + 2,
                 str(count), ha='center', va='bottom', fontsize=9,
                 fontweight='bold')

fig1.tight_layout()
file1_path = f"{visualization_path}/1_keyframes_split_distribution.png"
fig1.savefig(file1_path, dpi=300, bbox_inches='tight', facecolor='white')
plt.close(fig1)
print(f"✓ File 1 saved: 1_keyframes_split_distribution.png")

print("\n[6] Generating File 2: Key Frames Statistical Analysis Table...")

fig2 = plt.figure(figsize=(18, 10))
ax2 = fig2.add_subplot(1, 1, 1)
ax2.axis('off')


def _format_expansion(image_count, sample_count):
    """Return the images-per-sample ratio as text, e.g. '3x'.

    An exact multiple is shown as an integer; any other ratio is shown with
    two decimals so an incomplete expansion is not rounded down to look
    correct. Returns 'N/A' when there are no samples to divide by.
    """
    if sample_count <= 0:
        return "N/A"
    if image_count % sample_count == 0:
        return f"{image_count // sample_count}x"
    return f"{image_count / sample_count:.2f}x"


# Prepare table data: one row per emotion, then a TOTAL row.
table_data = []
total_raw = 0
total_train = 0
total_val = 0
total_test = 0

# Size of the largest raw class; every imbalance ratio is relative to it.
# It does not change per emotion, so it is computed once.
max_count = raw_emotion_dist.max()

for emotion in all_emotions:
    raw_count = raw_emotion_dist.get(emotion, 0)
    train_count = split_data['train'].get(emotion, 0)
    val_count = split_data['val'].get(emotion, 0)
    test_count = split_data['test'].get(emotion, 0)
    total_images = train_count + val_count + test_count

    # Expansion verification (should be 3x: onset + apex + offset)
    expansion_check = _format_expansion(total_images, raw_count)

    # Share of all raw samples that carry this emotion
    raw_percentage = (raw_count / len(raw_metadata)) * 100

    # Imbalance ratio: largest class size relative to this class
    imbalance_ratio = f"{max_count / raw_count:.1f}:1" if raw_count > 0 else "∞:1"

    table_data.append([
        emotion.title(),
        raw_count,
        f"{raw_percentage:.1f}%",
        train_count,
        val_count,
        test_count,
        total_images,
        expansion_check,
        imbalance_ratio
    ])

    total_raw += raw_count
    total_train += train_count
    total_val += val_count
    total_test += test_count

# Add total row
total_images_all = total_train + total_val + total_test

table_data.append([
    'TOTAL',
    total_raw,
    '100.0%',
    total_train,
    total_val,
    total_test,
    total_images_all,
    # Computed from the totals rather than assumed to be 3x.
    _format_expansion(total_images_all, total_raw),
    '1.0:1'
])

# Convert to display format: numeric columns get thousands separators, the
# already-formatted text columns pass through untouched.
table_display_data = [
    [
        label,
        f"{raw_n:,}",
        raw_pct,
        f"{train_n:,}",
        f"{val_n:,}",
        f"{test_n:,}",
        f"{images_n:,}",
        expansion,
        imbalance,
    ]
    for (label, raw_n, raw_pct, train_n, val_n,
         test_n, images_n, expansion, imbalance) in table_data
]

# Create table
column_headers = ['Emotion', 'Raw\nSamples', 'Raw %', 'Train\nImages', 'Val\nImages',
                  'Test\nImages', 'Total\nImages', 'Expansion', 'Imbalance']
table = ax2.table(
    cellText=table_display_data,
    colLabels=column_headers,
    cellLoc='center',
    loc='center',
    colWidths=[0.11, 0.10, 0.09, 0.10, 0.10, 0.10, 0.10, 0.09, 0.10]
)

table.auto_set_font_size(False)
table.set_fontsize(11)
table.scale(1, 2.8)

# Professional table styling: decide the look once per row, then apply it to
# each of the nine cells in that row. Table row 0 is the header, so body row i
# corresponds to table_display_data[i - 1]; the last row is the TOTAL row.
num_rows = len(table_display_data)
highlighted_emotions = ('fear', 'sadness')
for row_idx in range(num_rows + 1):
    if row_idx == 0:
        face_color, text_style = '#1f77b4', {'weight': 'bold', 'color': 'white'}
    elif row_idx == num_rows:
        face_color, text_style = '#f0f0f0', {'weight': 'bold'}
    else:
        row_emotion = table_display_data[row_idx - 1][0].lower()
        face_color = '#ffe6e6' if row_emotion in highlighted_emotions else '#ffffff'
        text_style = None

    for col_idx in range(9):
        cell = table[(row_idx, col_idx)]
        cell.set_facecolor(face_color)
        if text_style is not None:
            cell.set_text_props(**text_style)

ax2.set_title('CASME II Key Frames Dataset Statistical Analysis (Phase 2)\n'
              'Temporal Expansion: Each Sample → 3 Frames (Onset + Apex + Offset)',
              fontsize=18, fontweight='bold', pad=40)

fig2.tight_layout()
file2_path = f"{visualization_path}/2_keyframes_statistical_table.png"
fig2.savefig(file2_path, dpi=300, bbox_inches='tight', facecolor='white')
plt.close(fig2)
print(f"✓ File 2 saved: 2_keyframes_statistical_table.png")

print("\n[7] Generating visualization metadata...")

# Totals recorded in the metadata file.
total_samples = len(raw_metadata)
total_images_all = sum((total_train, total_val, total_test))

# One record per split. The output key for the validation split is
# 'validation', while its class distribution is stored under 'val'.
split_records = {
    output_key: {
        'total_images': image_total,
        'original_samples': image_total // 3,
        'class_distribution': split_data[source_key],
    }
    for output_key, source_key, image_total in (
        ('train', 'train', total_train),
        ('validation', 'val', total_val),
        ('test', 'test', total_test),
    )
}

# Generate comprehensive metadata
analysis_metadata = {
    'analysis_timestamp': pd.Timestamp.now().isoformat(),
    'phase': 'Phase 2 - Key Frames Dataset',
    'dataset_info': {
        'name': 'CASME II Key Frames Dataset',
        'expansion_strategy': 'onset_apex_offset_extraction',
        'original_samples': total_samples,
        'total_images': total_images_all,
        'expansion_factor': 3,
        'frame_types': ['onset', 'apex', 'offset'],
        'emotion_classes': len(all_emotions),
    },
    # Only this section is passed through the numpy/pandas -> native converter.
    'split_distribution': convert_to_serializable(split_records),
    'visualization_files': {
        'split_distribution': '1_keyframes_split_distribution.png',
        'statistical_table': '2_keyframes_statistical_table.png',
    },
    'color_scheme': {
        'emotion_colors': EMOTION_COLORS,
        'split_colors': SPLIT_COLORS,
    },
}

metadata_file = f"{visualization_path}/keyframes_visualization_metadata.json"
with open(metadata_file, 'w') as handle:
    handle.write(json.dumps(analysis_metadata, indent=2))

print(f"✓ Metadata saved: keyframes_visualization_metadata.json")

# Final console report: status, generated artifacts, and per-split totals.
closing_rule = "=" * 80

print(f"\n{closing_rule}")
print("KEY FRAMES VISUALIZATION COMPLETE - PHASE 2")
print(closing_rule)
print(f"Status: SUCCESS")
print(f"Total images validated: {total_actual_files}")
print(f"Output location: {visualization_path}")

print("\nGenerated files:")
generated_files = (
    ("1_keyframes_split_distribution.png", "Split comparison with 3x expansion"),
    ("2_keyframes_statistical_table.png", "Statistical analysis table"),
    ("keyframes_visualization_metadata.json", "Comprehensive metadata"),
)
for file_name, description in generated_files:
    print(f"  • {file_name} - {description}")

print("\nDataset Summary:")
print(f"  Original samples: {total_samples}")
print(f"  Total images (3x): {total_images_all}")
for split_label, image_total in (('Train', total_train),
                                 ('Val', total_val),
                                 ('Test', total_test)):
    print(f"  {split_label}: {image_total} images ({image_total//3} samples × 3)")

print("\n✓ Phase 2 visualization ready for thesis documentation")
print(closing_rule)

CASME II KEY FRAMES DATASET VISUALIZATION - PHASE 2

[1] Environment setup and drive mounting...
Mounted at /content/drive
✓ Google Drive mounted successfully
✓ Output directory created: /content/drive/MyDrive/RESEARCH-WORKSPACE/ACTIVE-PROJECTS/Thesis_MER_Project/datasets/visualization/02_casme2-kfs

[2] Loading raw metadata and Phase 2 data...
✓ Raw metadata loaded: 255 samples
✓ Phase 2 metadata loaded
  Dataset: CASME2_KeyFrames
  Frame types: onset, apex, offset
  Expansion factor: 3x

[3] File integrity validation...
  ✓ TRAIN: 603/603 images
  ✓ VAL: 78/78 images
  ✓ TEST: 84/84 images
✓ Total validated files: 765

[4] Preparing visualization data...
✓ Data preparation complete
  Emotion classes: 7
  Raw samples: 255
  Total Phase 2 images: 765

[5] Generating File 1: Key Frames vs Raw Distribution Comparison...
✓ File 1 saved: 1_keyframes_split_distribution.png

[6] Generating File 2: Key Frames Statistical Analysis Table...
✓ File 2 saved: 2_keyframes_statistical_table.png

[7]