In [None]:
# @title Cell 1: CASME II ZIP Extraction

import os
import zipfile
import time
from google.colab import drive

# Mount Google Drive so the dataset archive stored there is reachable
print("=" * 60)
print("CASME II ROBUST ZIP EXTRACTION")
print("=" * 60)
print("\n[1] Mounting Google Drive...")
drive.mount('/content/drive')
print("✓ Google Drive mounted successfully")

# Define paths - extract directly to raw directory
base_path = "/content/drive/MyDrive/RESEARCH-WORKSPACE/ACTIVE-PROJECTS/Thesis_MER_Project"
raw_path = f"{base_path}/datasets/raw"
zip_file_path = f"{raw_path}/CASME2_RAW_selected.zip"

print(f"\n[2] Verification and cleanup...")
print(f"ZIP file location: {zip_file_path}")
print(f"Extract destination: {raw_path}")

# Check ZIP file existence and size
if not os.path.exists(zip_file_path):
    print("✗ ERROR: CASME2_RAW_selected.zip not found")
    print("Please ensure ZIP file is uploaded to datasets/raw/ directory")
    # Raise SystemExit instead of calling exit(): inside a notebook kernel,
    # exit() acts on the kernel/runtime rather than just stopping this cell,
    # whereas SystemExit aborts the cell (and still ends a plain script).
    raise SystemExit(1)

# Size is reported in MiB, rounded to two decimals
zip_size_mb = round(os.path.getsize(zip_file_path) / (1024 * 1024), 2)
print(f"✓ ZIP file found (Size: {zip_size_mb} MB)")

# Remove leftovers of earlier (possibly partial) extraction runs so the
# archive is always unpacked into a clean destination.
casme2_folder = os.path.join(raw_path, "CASME2")
casme2_raw_selected = os.path.join(raw_path, "CASME2_RAW_selected")

print(f"\n[3] Cleanup previous extraction attempts...")

# Both candidate output folders are checked; each one is either deleted
# recursively or reported as absent.
cleanup_paths = [casme2_folder, casme2_raw_selected]
for path in cleanup_paths:
    folder_name = os.path.basename(path)
    if not os.path.exists(path):
        print(f"- No existing directory: {folder_name}")
        continue

    import shutil
    shutil.rmtree(path)
    print(f"✓ Removed existing directory: {folder_name}")

# Inspect ZIP contents first: count entries and collect the root folders and
# the second-level "sub*" folders found in the archive member names.
print(f"\n[4] Analyzing ZIP file structure...")
try:
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        file_list = zip_ref.namelist()
        total_files = len(file_list)

        # Analyze directory structure in ZIP
        directories = set()
        subject_folders = set()

        for file_path in file_list:
            # ZIP member names always use '/' as separator
            parts = file_path.split('/')
            if len(parts) > 1:
                directories.add(parts[0])  # Root level folders
                if len(parts) > 2 and parts[1].startswith('sub'):
                    subject_folders.add(parts[1])

        print(f"✓ ZIP analysis complete:")
        print(f"  - Total files: {total_files}")
        print(f"  - Root directories: {len(directories)}")
        print(f"  - Subject folders detected: {len(subject_folders)}")
        print(f"  - Root structure: {sorted(directories)}")

        if subject_folders:
            sorted_subjects = sorted(subject_folders)
            # Short lists are printed in full, long ones as a range
            if len(sorted_subjects) <= 10:
                print(f"  - Subject folders: {sorted_subjects}")
            else:
                print(f"  - Subject range: {sorted_subjects[0]} to {sorted_subjects[-1]} ({len(sorted_subjects)} total)")

except Exception as e:
    print(f"✗ ERROR analyzing ZIP: {str(e)}")
    # Abort the cell with SystemExit; exit() in a notebook kernel acts on the
    # kernel itself instead of cleanly stopping execution here.
    raise SystemExit(1) from e

# Robust extraction with progress tracking: members are extracted one by one
# so a single bad entry is counted as an error instead of aborting the run.
print(f"\n[5] Starting robust extraction...")
start_time = time.time()

try:
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        extracted_count = 0
        error_count = 0

        for file_info in zip_ref.infolist():
            try:
                # Extract individual file
                zip_ref.extract(file_info, raw_path)
                extracted_count += 1

                # Progress tracking every 1000 files
                if extracted_count % 1000 == 0:
                    elapsed = time.time() - start_time
                    # Guard against a zero elapsed time (coarse clocks)
                    rate = extracted_count / elapsed if elapsed > 0 else 0
                    remaining = total_files - extracted_count
                    eta = remaining / rate if rate > 0 else 0

                    print(f"  Progress: {extracted_count}/{total_files} ({(extracted_count/total_files)*100:.1f}%) | "
                          f"Rate: {rate:.1f} files/sec | ETA: {eta/60:.1f} min")

            except Exception as file_error:
                error_count += 1
                if error_count <= 5:  # Show first 5 errors only
                    print(f"  ⚠ Error extracting {file_info.filename}: {str(file_error)}")

    extraction_time = time.time() - start_time
    # An empty archive can finish in ~0 s; avoid ZeroDivisionError in that case
    average_rate = extracted_count / extraction_time if extraction_time > 0 else 0.0
    print(f"\n✓ Extraction completed:")
    print(f"  - Files extracted: {extracted_count}/{total_files}")
    print(f"  - Errors encountered: {error_count}")
    print(f"  - Total time: {extraction_time:.2f} seconds")
    print(f"  - Average rate: {average_rate:.1f} files/second")

except Exception as e:
    print(f"✗ EXTRACTION FAILED: {str(e)}")
    # SystemExit stops the cell; exit() in a notebook kernel would act on the
    # kernel rather than cleanly aborting here.
    raise SystemExit(1) from e

# Post-extraction check: list every directory now present next to the ZIP
# and report how many files / subject-like folders each one holds.
print(f"\n[6] Verifying extraction results...")

# Everything in raw_path that is a directory (the archive itself is skipped)
extracted_items = [
    entry
    for entry in os.listdir(raw_path)
    if entry != "CASME2_RAW_selected.zip"
    and os.path.isdir(os.path.join(raw_path, entry))
]

print(f"✓ Extracted directory structure:")
for item in sorted(extracted_items):
    item_path = os.path.join(raw_path, item)

    total_contents = 0
    subject_dirs = 0

    # os.walk yields nothing for a path that vanished in the meantime, so the
    # counters simply stay at zero in that case.
    for root, dirs, files in os.walk(item_path):
        total_contents += len(files)
        # Folders named "sub…" (at least 4 characters) count as subjects
        subject_dirs += sum(1 for d in dirs if d.startswith('sub') and len(d) >= 4)

    print(f"  {item}/: {total_contents} files, {subject_dirs} subject directories")

# Overall verdict: at least one file extracted and fewer than 10% of the
# archive entries failed.
error_budget = total_files * 0.1
extraction_success = extracted_count > 0 and error_count < error_budget

status_label = 'SUCCESS' if extraction_success else 'PARTIAL/FAILED'
if total_files > 0:
    error_rate_line = f"Error rate: {(error_count/total_files)*100:.2f}%"
else:
    error_rate_line = "N/A"

print(f"\n" + "=" * 60)
print("EXTRACTION SUMMARY")
print("=" * 60)
print(f"Status: {status_label}")
print(f"Files processed: {extracted_count}/{total_files}")
print(error_rate_line)
print(f"Extraction time: {extraction_time/60:.1f} minutes")

# Closing hint for the notebook user
if not extraction_success:
    print(f"⚠ Partial extraction - may need to retry or investigate ZIP file integrity")
else:
    print(f"✓ Ready for Cell 2: Structure validation and analysis")

print("=" * 60)

CASME II ROBUST ZIP EXTRACTION

[1] Mounting Google Drive...
Mounted at /content/drive
✓ Google Drive mounted successfully

[2] Verification and cleanup...
ZIP file location: /content/drive/MyDrive/RESEARCH-WORKSPACE/ACTIVE-PROJECTS/Thesis_MER_Project/datasets/raw/CASME2_RAW_selected.zip
Extract destination: /content/drive/MyDrive/RESEARCH-WORKSPACE/ACTIVE-PROJECTS/Thesis_MER_Project/datasets/raw
✓ ZIP file found (Size: 967.34 MB)

[3] Cleanup previous extraction attempts...
- No existing directory: CASME2
- No existing directory: CASME2_RAW_selected

[4] Analyzing ZIP file structure...
✓ ZIP analysis complete:
  - Total files: 17406
  - Root directories: 1
  - Subject folders detected: 26
  - Root structure: ['CASME2_RAW_selected']
  - Subject range: sub01 to sub26 (26 total)

[5] Starting robust extraction...
  Progress: 1000/17406 (5.7%) | Rate: 88.0 files/sec | ETA: 3.1 min
  Progress: 2000/17406 (11.5%) | Rate: 95.3 files/sec | ETA: 2.7 min
  Progress: 3000/17406 (17.2%) | Rate: 9

In [None]:
# @title Cell 2: CASME II Metadata Analysis and Structure Mapping

import pandas as pd
import numpy as np
from google.colab import drive
from collections import Counter
import os

# Mount Google Drive so the metadata workbook stored there is reachable
print("=" * 70)
print("CASME II METADATA ANALYSIS AND STRUCTURE MAPPING")
print("=" * 70)
print("\n[1] Mounting Google Drive...")
drive.mount('/content/drive')
print("✓ Google Drive mounted successfully")

# Define metadata path
base_path = "/content/drive/MyDrive/RESEARCH-WORKSPACE/ACTIVE-PROJECTS/Thesis_MER_Project"
metadata_path = f"{base_path}/datasets/metadata/CASME2-coding-20140508.xlsx"

print(f"\n[2] Loading metadata file...")
print(f"File path: {metadata_path}")

# A missing file is reported separately from a read failure. In both cases
# the cell is aborted with SystemExit: exit() in a notebook kernel acts on
# the kernel itself instead of cleanly stopping execution at this point.
if not os.path.exists(metadata_path):
    print("✗ Metadata file not found")
    print("Please ensure CASME2-coding-20140508.xlsx is in datasets/metadata/")
    raise SystemExit(1)

try:
    file_size_mb = round(os.path.getsize(metadata_path) / (1024 * 1024), 3)
    print(f"✓ Metadata file found (Size: {file_size_mb} MB)")

    # Load Excel file
    df = pd.read_excel(metadata_path)
    print(f"✓ Excel file loaded successfully")
    print(f"  - Shape: {df.shape[0]} rows × {df.shape[1]} columns")

except Exception as e:
    print(f"✗ Error loading metadata: {str(e)}")
    raise SystemExit(1) from e

# Show the raw header as loaded, one line per column with its dtype
print(f"\n[3] Column structure analysis...")
print(f"Raw column names:")
for position, raw_name in enumerate(df.columns):
    print(f"  [{position}] '{raw_name}' - {df[raw_name].dtype}")

# Build cleaned names: real headers are stripped of surrounding whitespace,
# pandas' auto-generated "Unnamed" headers become positional placeholders.
print(f"\n[4] Column cleaning and mapping...")
cleaned_columns = []
column_mapping = {}

for position, raw_name in enumerate(df.columns):
    if 'Unnamed' not in str(raw_name):
        cleaned_name = str(raw_name).strip()
    else:
        # Print up to three non-null values to help identify the column
        sample_values = df[raw_name].dropna().head(3).tolist()
        print(f"  Unnamed column [{position}]: Sample values = {sample_values}")
        cleaned_name = f"Unknown_Col_{position}"

    column_mapping[raw_name] = cleaned_name
    cleaned_columns.append(cleaned_name)

# Work on a copy so the originally loaded frame keeps its raw header
df_clean = df.copy()
df_clean.columns = cleaned_columns

print(f"\n✓ Cleaned column mapping:")
for orig, clean in column_mapping.items():
    non_null_count = df[orig].count()
    print(f"  '{orig}' → '{clean}' ({non_null_count} non-null values)")

# Per-column completeness report: non-null count, null count, percentage of
# filled rows and dtype, rendered as one table.
print(f"\n[5] Data completeness analysis...")
total_values = len(df_clean)
completeness_stats = []

for col in df_clean.columns:
    column_data = df_clean[col]
    non_null_values = column_data.count()
    null_values = column_data.isnull().sum()
    completeness_pct = (non_null_values / total_values) * 100

    row = {
        'Column': col,
        'Non_Null': non_null_values,
        'Null': null_values,
        'Completeness': f"{completeness_pct:.1f}%",
        'Data_Type': str(column_data.dtype)
    }
    completeness_stats.append(row)

# Display completeness table
completeness_df = pd.DataFrame(completeness_stats)
print(f"\n✓ Data completeness summary:")
print(completeness_df.to_string(index=False))

# Map logical MER fields to actual column names by keyword search
print(f"\n[6] Core field identification for micro-expression recognition...")

# For every logical field: substrings that, when found in a lower-cased
# column name, mark that column as a match.
key_fields = {}
potential_mappings = {
    'subject': ['subject', 'participant', 'person', 'id'],
    'filename': ['filename', 'file', 'video', 'clip'],
    'onset_frame': ['onset', 'start', 'begin'],
    'apex_frame': ['apex', 'peak', 'max'],
    'offset_frame': ['offset', 'end', 'finish'],
    'emotion': ['emotion', 'expression', 'feeling', 'estimated'],
    'action_units': ['action', 'au', 'units', 'facs']
}

for field_type, keywords in potential_mappings.items():
    # The first column (in frame order) containing any keyword wins
    first_match = next(
        (
            candidate
            for candidate in df_clean.columns
            if any(keyword in candidate.lower() for keyword in keywords)
        ),
        None,
    )
    if first_match is not None:
        key_fields[field_type] = first_match

print(f"✓ Identified key fields:")
for field_type, column_name in key_fields.items():
    print(f"  {field_type.upper()}: '{column_name}'")

# Required fields that no column could be matched to
required_fields = ['subject', 'filename', 'onset_frame', 'apex_frame', 'offset_frame', 'emotion']
missing_fields = []
for field in required_fields:
    if field not in key_fields:
        missing_fields.append(field)

if missing_fields:
    print(f"\n⚠ Missing critical fields: {missing_fields}")
    print("Manual column inspection may be required")

# Per-subject sample statistics (relevant for leave-one-subject-out splits)
print(f"\n[7] Subject analysis for LOSO cross-validation...")

if 'subject' in key_fields:
    subject_col = key_fields['subject']
    subject_series = df_clean[subject_col]
    subject_analysis = subject_series.describe()
    unique_subject_count = subject_series.nunique()

    print(f"✓ Subject statistics:")
    print(f"  Total unique subjects: {unique_subject_count}")
    print(f"  Subject range: {subject_series.min()} to {subject_series.max()}")
    print(f"  Mean samples per subject: {len(df_clean) / unique_subject_count:.1f}")

    # Number of records per subject, ordered by subject id
    samples_per_subject = subject_series.value_counts().sort_index()

    print(f"\n✓ Samples per subject distribution:")
    print(f"  Min samples: {samples_per_subject.min()}")
    print(f"  Max samples: {samples_per_subject.max()}")
    print(f"  Median samples: {samples_per_subject.median()}")

    # Subjects at both extremes: fewer than 5 or more than 20 records
    low_sample_subjects = samples_per_subject[samples_per_subject < 5]
    high_sample_subjects = samples_per_subject[samples_per_subject > 20]

    if not low_sample_subjects.empty:
        print(f"  ⚠ Subjects with <5 samples: {len(low_sample_subjects)}")
        print(f"    {dict(low_sample_subjects)}")

    if not high_sample_subjects.empty:
        print(f"  ✓ Subjects with >20 samples: {len(high_sample_subjects)}")
        print(f"    Top 3: {dict(high_sample_subjects.head(3))}")

# Class distribution of the emotion label and a simple imbalance metric
print(f"\n[8] Emotion class distribution analysis...")

if 'emotion' in key_fields:
    emotion_col = key_fields['emotion']
    emotion_counts = df_clean[emotion_col].value_counts()
    total_samples = len(df_clean)

    print(f"✓ Emotion class distribution:")
    print(f"  Total classes: {len(emotion_counts)}")
    print(f"  Class distribution:")

    for emotion, count in emotion_counts.items():
        percentage = (count / total_samples) * 100
        print(f"    {emotion}: {count} samples ({percentage:.1f}%)")

    # Ratio between the largest and the smallest class
    max_samples = emotion_counts.max()
    min_samples = emotion_counts.min()
    imbalance_ratio = max_samples / min_samples

    # value_counts() orders by descending frequency, so the first and last
    # entries are the most and least frequent classes.
    top_label, top_count = emotion_counts.index[0], emotion_counts.iloc[0]
    bottom_label, bottom_count = emotion_counts.index[-1], emotion_counts.iloc[-1]

    print(f"\n✓ Class balance metrics:")
    print(f"  Most frequent class: {top_label} ({top_count} samples)")
    print(f"  Least frequent class: {bottom_label} ({bottom_count} samples)")
    print(f"  Imbalance ratio: {imbalance_ratio:.2f}:1")

    balance_note = (
        f"  ⚠ Significant class imbalance detected - focal loss recommended"
        if imbalance_ratio > 3
        else f"  ✓ Moderate class balance - standard loss functions applicable"
    )
    print(balance_note)

# Sequence-length statistics derived from the onset/offset frame columns
print(f"\n[9] Frame sequence analysis...")

frame_fields = ['onset_frame', 'apex_frame', 'offset_frame']
available_frame_fields = []
for field in frame_fields:
    if field in key_fields:
        available_frame_fields.append(field)

if len(available_frame_fields) >= 2:
    print(f"✓ Frame sequence fields available: {available_frame_fields}")

    # Lengths need both ends of the sequence; apex alone is not enough
    if 'onset_frame' in key_fields and 'offset_frame' in key_fields:
        onset_col = key_fields['onset_frame']
        offset_col = key_fields['offset_frame']

        # Inclusive frame count: offset - onset + 1
        sequence_lengths = df_clean[offset_col] - df_clean[onset_col] + 1

        mean_length = sequence_lengths.mean()
        median_length = sequence_lengths.median()
        shortest = sequence_lengths.min()
        longest = sequence_lengths.max()
        spread = sequence_lengths.std()

        print(f"\n✓ Sequence length statistics:")
        print(f"  Mean length: {mean_length:.1f} frames")
        print(f"  Median length: {median_length:.1f} frames")
        print(f"  Min length: {shortest} frames")
        print(f"  Max length: {longest} frames")
        print(f"  Standard deviation: {spread:.1f} frames")

# Print the first five records restricted to the identified key columns
print(f"\n[10] Sample metadata records...")

if len(key_fields) >= 3:
    # Preferred display order; fields without a mapped column are left out
    preferred_order = ('subject', 'filename', 'onset_frame',
                       'apex_frame', 'offset_frame', 'emotion')
    display_cols = []
    for wanted in preferred_order:
        mapped = key_fields.get(wanted)
        if mapped:
            display_cols.append(mapped)

    sample_df = df_clean[display_cols].head(5)
    print(f"✓ Sample records (first 5 rows):")
    print(sample_df.to_string(index=False))

# Key information summary for next cells
print(f"\n[11] Key information for Cell 3 cross-validation...")

# Only required fields count towards the "x/y required fields" figure.
# key_fields can also hold optional entries (e.g. 'action_units'), which
# previously inflated the numerator beyond the denominator.
identified_required = [field for field in required_fields if field in key_fields]
unresolved_required = [field for field in required_fields if field not in key_fields]

subject_known = 'subject' in key_fields
emotion_known = 'emotion' in key_fields
unique_subjects = df_clean[key_fields['subject']].nunique() if subject_known else None
unique_emotions = df_clean[key_fields['emotion']].nunique() if emotion_known else None

print(f"✓ Critical metadata summary:")
print(f"  - Total records: {len(df_clean)}")
print(f"  - Unique subjects: {unique_subjects if subject_known else 'Unknown'}")
print(f"  - Emotion classes: {unique_emotions if emotion_known else 'Unknown'}")
print(f"  - Key columns identified: {len(identified_required)}/{len(required_fields)} required fields")

# Export summary for next cells
field_summary = {
    'subject_column': key_fields.get('subject'),
    'filename_column': key_fields.get('filename'),
    'emotion_column': key_fields.get('emotion'),
    'total_subjects': unique_subjects if subject_known else 0,
    'total_samples': len(df_clean),
    'emotion_classes': unique_emotions if emotion_known else 0
}

print(f"\n✓ Field mapping for Cell 3:")
for key, value in field_summary.items():
    print(f"  {key}: {value}")

# Final summary
print(f"\n" + "=" * 70)
print("METADATA ANALYSIS SUMMARY")
print("=" * 70)

# COMPLETE means every required field was mapped; a bare count of key_fields
# could reach the old threshold while a required column was still missing.
analysis_status = "COMPLETE" if not unresolved_required else "PARTIAL"
print(f"Analysis Status: {analysis_status}")
print(f"Records processed: {len(df_clean)}")
print(f"Key fields identified: {len(key_fields)}")
print(f"Ready for cross-validation: {'Yes' if analysis_status == 'COMPLETE' else 'Needs manual review'}")

if analysis_status == "COMPLETE":
    print(f"✓ Metadata structure fully mapped - proceed to Cell 3 for dataset cross-validation")
else:
    print(f"⚠ Some fields require manual identification - review column mappings before Cell 3")

print("=" * 70)

CASME II METADATA ANALYSIS AND STRUCTURE MAPPING

[1] Mounting Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✓ Google Drive mounted successfully

[2] Loading metadata file...
File path: /content/drive/MyDrive/RESEARCH-WORKSPACE/ACTIVE-PROJECTS/Thesis_MER_Project/datasets/metadata/CASME2-coding-20140508.xlsx
✓ Metadata file found (Size: 0.024 MB)
✓ Excel file loaded successfully
  - Shape: 255 rows × 9 columns

[3] Column structure analysis...
Raw column names:
  [0] 'Subject' - int64
  [1] 'Filename' - object
  [2] 'Unnamed: 2' - float64
  [3] 'OnsetFrame' - int64
  [4] 'ApexFrame' - object
  [5] 'OffsetFrame' - int64
  [6] 'Unnamed: 6' - float64
  [7] 'Action Units' - object
  [8] 'Estimated Emotion' - object

[4] Column cleaning and mapping...
  Unnamed column [2]: Sample values = []
  Unnamed column [6]: Sample values = []

✓ Cleaned column mapping:
  'Subject' → 'Subject' (255 non-nul

In [None]:
# @title Cell 3: CASME II Dataset-Metadata Cross-Validation

import os
import pandas as pd
import numpy as np
from google.colab import drive
from collections import defaultdict
import glob

# Mount Google Drive so the extracted dataset and metadata are reachable
print("=" * 75)
print("CASME II DATASET-METADATA CROSS-VALIDATION")
print("=" * 75)
print("\n[1] Mounting Google Drive...")
drive.mount('/content/drive')
print("✓ Google Drive mounted successfully")

# Define paths
base_path = "/content/drive/MyDrive/RESEARCH-WORKSPACE/ACTIVE-PROJECTS/Thesis_MER_Project"
raw_path = f"{base_path}/datasets/raw"
metadata_path = f"{base_path}/datasets/metadata/CASME2-coding-20140508.xlsx"

# Candidate dataset roots, checked in order; the first existing one is used
dataset_paths = [
    f"{raw_path}/CASME2_RAW_selected",  # Direct extraction
    f"{raw_path}/CASME2/CASME2_RAW_selected",  # Nested extraction
]

dataset_root = next(
    (candidate for candidate in dataset_paths if os.path.exists(candidate)),
    None,
)

print(f"\n[2] Locating dataset structure...")
print(f"Dataset root: {dataset_root}")

if dataset_root is None:
    print("✗ Dataset not found in expected locations")
    print("Available paths:")
    # raw_path itself may be missing; listing it unguarded would replace the
    # intended diagnostic with a FileNotFoundError traceback.
    if os.path.isdir(raw_path):
        for item in os.listdir(raw_path):
            if os.path.isdir(os.path.join(raw_path, item)):
                print(f"  - {item}/")
    else:
        print(f"  (raw directory does not exist: {raw_path})")
    # SystemExit aborts the cell; exit() in a notebook kernel acts on the
    # kernel itself instead of cleanly stopping execution here.
    raise SystemExit(1)

print(f"✓ Dataset found at: {dataset_root}")

# Load metadata with key field mappings from Cell 2
print(f"\n[3] Loading metadata with field mappings...")

try:
    df = pd.read_excel(metadata_path)

    # Logical field name -> workbook column name (as found by Cell 2)
    field_mapping = {
        'subject': 'Subject',
        'filename': 'Filename',
        'onset_frame': 'OnsetFrame',
        'apex_frame': 'ApexFrame',
        'offset_frame': 'OffsetFrame',
        'emotion': 'Estimated Emotion'
    }

    print(f"✓ Metadata loaded: {len(df)} records")
    print(f"✓ Field mappings applied:")
    for key, col in field_mapping.items():
        print(f"  {key} → '{col}'")

except Exception as e:
    print(f"✗ Error loading metadata: {str(e)}")
    # SystemExit aborts the cell; exit() in a notebook kernel acts on the
    # kernel itself instead of cleanly stopping execution here.
    raise SystemExit(1) from e

# Walk every subject folder below the dataset root and record its
# sub-folders and image count.
print(f"\n[4] Analyzing dataset directory structure...")

subject_folders = []
dataset_structure = {}
image_suffixes = ('.jpg', '.jpeg', '.png', '.bmp')

for item in sorted(os.listdir(dataset_root)):
    item_path = os.path.join(dataset_root, item)
    if not os.path.isdir(item_path):
        continue

    # Accepted subject folder names: "sub…", short "s…" names (<= 4 chars)
    # or purely numeric names.
    looks_like_subject = (
        item.startswith('sub')
        or (item.startswith('s') and len(item) <= 4)
        or item.isdigit()
    )
    if not looks_like_subject:
        continue

    subject_folders.append(item)

    video_folders = []
    total_images = 0

    for root, dirs, files in os.walk(item_path):
        # Count image files (extension match is case-insensitive)
        total_images += sum(1 for f in files if f.lower().endswith(image_suffixes))

        # Every directory below the subject folder is tracked by its
        # path relative to that folder.
        if root != item_path:
            video_folders.append(os.path.relpath(root, item_path))

    dataset_structure[item] = {
        'path': item_path,
        'video_folders': sorted(video_folders),
        'total_images': total_images,
        'video_count': len(video_folders)
    }

print(f"✓ Found {len(subject_folders)} subject folders in dataset")

# Print per-subject counts for the first 15 subjects, an aggregate line for
# the rest, and grand totals over all subjects.
if subject_folders:
    print(f"\n[5] Dataset structure analysis:")

    head_subjects = subject_folders[:15]
    for subject in sorted(head_subjects):  # Show first 15
        data = dataset_structure[subject]
        print(f"  {subject}: {data['total_images']} images, {data['video_count']} video sequences")

    total_images_all = sum(dataset_structure[s]['total_images'] for s in head_subjects)
    total_videos_all = sum(dataset_structure[s]['video_count'] for s in head_subjects)

    if len(subject_folders) > 15:
        remaining = subject_folders[15:]
        remaining_images = sum(dataset_structure[s]['total_images'] for s in remaining)
        remaining_videos = sum(dataset_structure[s]['video_count'] for s in remaining)
        print(f"  ... and {len(remaining)} more subjects ({remaining_images} images, {remaining_videos} videos)")
        total_images_all += remaining_images
        total_videos_all += remaining_videos

    print(f"\n✓ Dataset totals: {total_images_all} images, {total_videos_all} video sequences")

# Compare the subjects listed in the metadata with the subject folders
# found on disk, using "subNN" as the common naming scheme.
print(f"\n[6] Cross-validation: Subject mapping...")


def _normalize_subject_folder(folder):
    """Return the ``subNN`` form of a folder name; unknown shapes pass through."""
    if folder.startswith('sub'):
        return folder
    if folder.startswith('s') and folder[1:].isdigit():
        return f"sub{int(folder[1:]):02d}"
    if folder.isdigit():
        return f"sub{int(folder):02d}"
    return folder


# Subjects as listed in the metadata, rendered as zero-padded folder names
metadata_subjects = sorted(df[field_mapping['subject']].unique())
metadata_subject_folders = [f"sub{str(s).zfill(2)}" for s in metadata_subjects]

# Normalized name -> folder name actually present on disk
subject_name_mapping = {}
normalized_dataset_subjects = []
for folder in subject_folders:
    canonical = _normalize_subject_folder(folder)
    normalized_dataset_subjects.append(canonical)
    subject_name_mapping[canonical] = folder

normalized_dataset_subjects = sorted(set(normalized_dataset_subjects))

# Set comparison between both sides
metadata_side = set(metadata_subject_folders)
dataset_side = set(normalized_dataset_subjects)
missing_in_dataset = metadata_side - dataset_side
extra_in_dataset = dataset_side - metadata_side
matched_subjects = metadata_side & dataset_side

print(f"✓ Subject mapping analysis:")
print(f"  Metadata subjects: {len(metadata_subject_folders)} ({min(metadata_subjects)} to {max(metadata_subjects)})")
print(f"  Dataset subjects: {len(normalized_dataset_subjects)}")
print(f"  Matched subjects: {len(matched_subjects)}")

if missing_in_dataset:
    print(f"  ⚠ Missing in dataset: {len(missing_in_dataset)}")
    print(f"    {sorted(missing_in_dataset)}")

if extra_in_dataset:
    print(f"  ⚠ Extra in dataset: {len(extra_in_dataset)}")
    print(f"    {sorted(extra_in_dataset)}")

# Share of metadata subjects that have a folder on disk
subject_coverage = (len(matched_subjects) / len(metadata_subject_folders)) * 100
print(f"  Coverage: {subject_coverage:.1f}%")

# Video sequence validation: for every matched subject, compare the sequence
# names listed in the metadata with the sub-folders found on disk.
print(f"\n[7] Cross-validation: Video sequence mapping...")

sequence_validation = {}
total_matched_sequences = 0
total_metadata_sequences = len(df)

for subject_num in metadata_subjects:
    subject_folder = f"sub{str(subject_num).zfill(2)}"
    if subject_folder not in matched_subjects:
        continue

    # Get metadata sequences for this subject
    subject_metadata = df[df[field_mapping['subject']] == subject_num]
    metadata_filenames = set(subject_metadata[field_mapping['filename']].values)

    # Get actual sequences in dataset
    actual_folder = subject_name_mapping[subject_folder]
    dataset_sequences = set(dataset_structure[actual_folder]['video_folders'])

    # Compare sequences
    matched_sequences = metadata_filenames & dataset_sequences
    missing_sequences = metadata_filenames - dataset_sequences
    extra_sequences = dataset_sequences - metadata_filenames

    sequence_validation[subject_folder] = {
        'metadata_count': len(metadata_filenames),
        'dataset_count': len(dataset_sequences),
        'matched_count': len(matched_sequences),
        'missing_in_dataset': list(missing_sequences),
        'extra_in_dataset': list(extra_sequences)
    }

    total_matched_sequences += len(matched_sequences)

# Guard the percentage against an empty metadata table
if total_metadata_sequences > 0:
    matched_pct = (total_matched_sequences / total_metadata_sequences) * 100
else:
    matched_pct = 0.0

print(f"✓ Sequence validation results:")
print(f"  Total metadata sequences: {total_metadata_sequences}")
print(f"  Total matched sequences: {total_matched_sequences}")
print(f"  Sequence coverage: {matched_pct:.1f}%")

# A subject is problematic when at least one of its metadata sequences has
# no folder on disk. The former extra clause `len(missing) > 2` could never
# change the outcome and was dropped.
# NOTE(review): that clause may have been meant to test 'extra_in_dataset';
# confirm whether extra folders should also flag a subject.
problematic_subjects = sum(
    1 for validation in sequence_validation.values()
    if validation['missing_in_dataset']
)

if problematic_subjects > 0:
    print(f"  ⚠ Subjects with sequence mismatches: {problematic_subjects}")

    # Show first few problematic subjects
    shown_count = 0
    for subject, validation in sequence_validation.items():
        if shown_count >= 3:
            break
        if validation['missing_in_dataset']:
            print(f"    {subject}: {len(validation['missing_in_dataset'])} missing sequences")
            shown_count += 1

# Spot check: take the first matched subject, look up its first metadata
# record and inspect the corresponding image folder on disk.
print(f"\n[8] Sample file structure validation...")

if matched_subjects:
    # Alphabetically first matched subject serves as the sample
    sample_subject = min(matched_subjects)
    actual_folder = subject_name_mapping[sample_subject]
    subject_rows = df[df[field_mapping['subject']] == int(sample_subject[3:])]
    sample_metadata = subject_rows.iloc[0]

    sample_sequence = sample_metadata[field_mapping['filename']]
    onset_frame = sample_metadata[field_mapping['onset_frame']]
    offset_frame = sample_metadata[field_mapping['offset_frame']]

    print(f"✓ Sample validation using {sample_subject}:")
    print(f"  Subject folder: {actual_folder}")
    print(f"  Sample sequence: {sample_sequence}")
    print(f"  Expected frames: {onset_frame} to {offset_frame}")

    # Folder that should hold the frames of the sample sequence
    sequence_path = os.path.join(dataset_structure[actual_folder]['path'], sample_sequence)

    if not os.path.exists(sequence_path):
        print(f"  ✗ Sample sequence not found at expected path")
    else:
        # Image files directly inside the sequence folder
        image_files = [
            name for name in os.listdir(sequence_path)
            if name.lower().endswith(('.jpg', '.jpeg', '.png', '.bmp'))
        ]
        image_count = len(image_files)

        # First few names (sorted) reveal the file naming pattern
        sample_files = sorted(image_files)[:5]

        print(f"  ✓ Sequence exists with {image_count} images")
        print(f"  ✓ Sample filenames: {sample_files}")

        # Inclusive onset..offset span, compared with a tolerance of 5 frames
        expected_frames = offset_frame - onset_frame + 1
        frame_match = abs(image_count - expected_frames) <= 5

        match_marker = '✓' if frame_match else '⚠'
        print(f"  Frame count match: {match_marker} (expected ~{expected_frames}, found {image_count})")

# Data integrity summary: four pass/fail checks, each contributing one point.
# Every printed label is derived from the same condition as its ✓/⚠ marker so
# that marker and text cannot contradict each other.
print(f"\n[9] Data integrity assessment...")

integrity_score = 0
max_score = 4

# Check 1: at least 95% of the metadata subjects have a folder on disk
subject_ok = subject_coverage >= 95
subject_status = "✓" if subject_ok else "⚠"

# Check 2: at least 90% of the metadata sequences were matched
# (guarded against an empty metadata table)
if total_metadata_sequences > 0:
    sequence_coverage = (total_matched_sequences/total_metadata_sequences)*100
else:
    sequence_coverage = 0.0
sequence_ok = sequence_coverage >= 90
sequence_status = "✓" if sequence_ok else "⚠"

# Check 3: the sample sequence folder was found and contains images.
# NOTE(review): image_count is only assigned when the sample folder exists;
# in a notebook a value from an earlier run may still be present.
file_structure_ok = 'image_count' in locals() and image_count > 0
file_status = "✓" if file_structure_ok else "⚠"

# Check 4: subject sets of metadata and dataset are identical
subjects_complete = not missing_in_dataset and not extra_in_dataset
completeness_status = "✓" if subjects_complete else "⚠"

integrity_score = sum([subject_ok, sequence_ok, file_structure_ok, subjects_complete])

print(f"✓ Data integrity summary:")
print(f"  {subject_status} Subject coverage: {subject_coverage:.1f}%")
print(f"  {sequence_status} Sequence coverage: {sequence_coverage:.1f}%")
print(f"  {file_status} File structure: {'Valid' if file_structure_ok else 'Needs validation'}")
print(f"  {completeness_status} Data completeness: {'Complete' if subjects_complete else 'Partial'}")

integrity_percentage = (integrity_score / max_score) * 100
print(f"  Overall integrity: {integrity_percentage:.0f}% ({integrity_score}/{max_score} checks passed)")

# Count how many matched subjects have enough samples (>= 3) to serve as a
# held-out fold in leave-one-subject-out validation.
print(f"\n[10] LOSO cross-validation readiness...")

loso_ready_subjects = 0
loso_issues = []

for subject_num in metadata_subjects:
    subject_folder = f"sub{str(subject_num).zfill(2)}"
    if subject_folder not in matched_subjects:
        continue

    subject_metadata = df[df[field_mapping['subject']] == subject_num]
    sample_count = len(subject_metadata)

    if sample_count < 3:  # Below the minimum for meaningful validation
        loso_issues.append(f"{subject_folder}({sample_count} samples)")
    else:
        loso_ready_subjects += 1

print(f"✓ LOSO readiness assessment:")
print(f"  Total subjects: {len(metadata_subjects)}")
print(f"  LOSO-ready subjects: {loso_ready_subjects}")
print(f"  Subjects with issues: {len(loso_issues)}")

if loso_issues:
    issue_list = ', '.join(loso_issues)
    print(f"  ⚠ Low-sample subjects: {issue_list}")
    print(f"  Recommendation: Consider excluding subjects with <3 samples")

# Share of all metadata subjects that are usable as a LOSO fold
loso_readiness = (loso_ready_subjects / len(metadata_subjects)) * 100
print(f"  LOSO readiness: {loso_readiness:.1f}%")

# Final summary and recommendations
print(f"\n" + "=" * 75)
print("CROSS-VALIDATION SUMMARY")
print("=" * 75)

# READY requires at least 3 of the 4 integrity checks and >= 80% of the
# subjects to be usable for LOSO.
overall_status = "READY" if integrity_score >= 3 and loso_readiness >= 80 else "NEEDS_ATTENTION"
print(f"Overall Status: {overall_status}")
print(f"Data Integrity: {integrity_percentage:.0f}%")
print(f"Subject Coverage: {subject_coverage:.1f}%")
print(f"LOSO Readiness: {loso_readiness:.1f}%")

# Class imbalance (largest / smallest emotion class) computed from the loaded
# metadata instead of a hard-coded figure, so the hint stays correct if the
# metadata file changes.
class_counts = df[field_mapping['emotion']].value_counts()
if len(class_counts) > 0 and class_counts.min() > 0:
    class_imbalance_ratio = class_counts.max() / class_counts.min()
else:
    class_imbalance_ratio = None

print(f"\n✓ Next steps:")
if overall_status == "READY":
    print(f"  - Dataset validated and ready for preprocessing")
    print(f"  - Proceed to Cell 4: Data preprocessing pipeline")
    if class_imbalance_ratio is not None:
        print(f"  - Consider focal loss for class imbalance ({class_imbalance_ratio:.1f}:1 ratio)")
    else:
        print(f"  - Consider focal loss for class imbalance")
else:
    print(f"  - Resolve missing sequences and subjects")
    print(f"  - Manual verification of problematic subjects recommended")
    print(f"  - Consider subset analysis if full dataset unavailable")

print("=" * 75)

CASME II DATASET-METADATA CROSS-VALIDATION

[1] Mounting Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✓ Google Drive mounted successfully

[2] Locating dataset structure...
Dataset root: /content/drive/MyDrive/RESEARCH-WORKSPACE/ACTIVE-PROJECTS/Thesis_MER_Project/datasets/raw/CASME2_RAW_selected
✓ Dataset found at: /content/drive/MyDrive/RESEARCH-WORKSPACE/ACTIVE-PROJECTS/Thesis_MER_Project/datasets/raw/CASME2_RAW_selected

[3] Loading metadata with field mappings...
✓ Metadata loaded: 255 records
✓ Field mappings applied:
  subject → 'Subject'
  filename → 'Filename'
  onset_frame → 'OnsetFrame'
  apex_frame → 'ApexFrame'
  offset_frame → 'OffsetFrame'
  emotion → 'Estimated Emotion'

[4] Analyzing dataset directory structure...
✓ Found 26 subject folders in dataset

[5] Dataset structure analysis:
  sub01: 497 images, 9 video sequences
  sub02: 1096 images, 13 video sequences
  sub03: 

In [None]:
# @title Cell 4: CASME II Data Preprocessing with Fixed JSON Serialization

import os
import pandas as pd
import numpy as np
import shutil
from google.colab import drive
from collections import Counter
from sklearn.model_selection import train_test_split
import json
from pathlib import Path

# Banner, then make Google Drive available to this runtime
header_rule = "=" * 75
print(header_rule)
print("CASME II DATA PREPROCESSING AND STRATIFIED SPLIT")
print(header_rule)
print("\n[1] Mounting Google Drive...")
drive.mount('/content/drive')
print("✓ Google Drive mounted successfully")

# Project locations on Drive
base_path = "/content/drive/MyDrive/RESEARCH-WORKSPACE/ACTIVE-PROJECTS/Thesis_MER_Project"
raw_path = f"{base_path}/datasets/raw/CASME2_RAW_selected"
metadata_path = f"{base_path}/datasets/metadata/CASME2-coding-20140508.xlsx"
processed_path = f"{base_path}/datasets/processed_casme2"

print("\n[2] Setting up processed dataset structure...")
print(f"Raw data source: {raw_path}")
print(f"Processed destination: {processed_path}")

# One output folder per split underneath the split root.
# NOTE(review): the visualization cell of this notebook reads from
# ".../processed_casme2/data_split" (no "_v1" suffix) — confirm which
# location is the intended one.
data_split_path = f"{processed_path}/data_split_v1"
directories = [f"{data_split_path}/{name}" for name in ("train", "val", "test")]

for target_dir in directories:
    # exist_ok makes re-running the cell harmless
    os.makedirs(target_dir, exist_ok=True)
    print(f"✓ Created directory: {target_dir}")

# Read the coding spreadsheet and summarise how many samples each emotion has
print("\n[3] Loading metadata and analyzing class distribution...")

try:
    df = pd.read_excel(metadata_path)

    # An object dtype means at least one ApexFrame cell is not a plain number;
    # coerce the column so those cells become NaN.
    apex_column = df['ApexFrame']
    if apex_column.dtype == 'object':
        df['ApexFrame'] = pd.to_numeric(apex_column, errors='coerce')
        print("⚠ ApexFrame column contained non-numeric values, converted to numeric")

    print(f"✓ Metadata loaded: {len(df)} records")

except Exception as load_error:
    print(f"✗ Error loading metadata: {str(load_error)}")
    exit()

# Per-class sample counts drive the stratification decisions below
emotion_distribution = df['Estimated Emotion'].value_counts()
total_samples = len(df)

print("\n[4] Emotion class analysis for stratification:")
print(f"Total classes: {len(emotion_distribution)}")

# A class needs one sample per split (train/val/test) to be stratifiable
min_for_split = 3
stratification_feasible = {
    emotion: count >= min_for_split
    for emotion, count in emotion_distribution.items()
}

for emotion, count in emotion_distribution.items():
    marker = "✓" if stratification_feasible[emotion] else "⚠"
    share = (count / total_samples) * 100
    print(f"  {marker} {emotion}: {count} samples ({share:.1f}%)")

problematic_classes = [
    emotion for emotion, is_feasible in stratification_feasible.items()
    if not is_feasible
]
if problematic_classes:
    print(f"\n⚠ Classes with <3 samples: {problematic_classes}")
    print("  Strategy: Use random assignment for these classes")

# Create sample inventory for processing
print(f"\n[5] Creating sample inventory for apex frame extraction...")

# Column order of the inventory. Passing it explicitly to the DataFrame
# constructor keeps the schema intact even when no sequence could be
# resolved, so later column lookups do not raise KeyError on an empty frame.
INVENTORY_COLUMNS = [
    'sample_id', 'subject', 'sequence', 'emotion', 'apex_frame_num',
    'source_apex_path', 'total_frames', 'onset_frame', 'offset_frame',
]
# Upper bound on how many error messages are echoed to the console
MAX_ERRORS_SHOWN = 5


def _select_apex_image(image_files, apex_frame):
    """Choose the image that represents the apex of one sequence.

    Args:
        image_files: Non-empty, sorted list of image file names in the sequence.
        apex_frame: Annotated apex frame number, or None when it is missing.

    Returns:
        Tuple ``(file_name, frame_label)``. ``frame_label`` is the annotated
        frame number when ``img<apex_frame>.jpg`` exists in ``image_files``;
        otherwise the middle file of the sequence is used and the label is
        the string ``"middle_frame"``.
    """
    if apex_frame is not None:
        apex_img = f"img{apex_frame}.jpg"
        if apex_img in image_files:
            return apex_img, apex_frame
    return image_files[len(image_files) // 2], "middle_frame"


sample_inventory = []
processing_errors = []

for idx, row in df.iterrows():
    try:
        subject = f"sub{str(row['Subject']).zfill(2)}"
        sequence = row['Filename']

        # Frame information
        onset_frame = int(row['OnsetFrame'])
        apex_frame = int(row['ApexFrame']) if pd.notna(row['ApexFrame']) else None
        offset_frame = int(row['OffsetFrame'])
        emotion = row['Estimated Emotion']

        # Source path for sequence
        source_sequence_path = os.path.join(raw_path, subject, sequence)
        if not os.path.exists(source_sequence_path):
            processing_errors.append(f"Sequence not found: {source_sequence_path}")
            continue

        # Sorted so that the "middle" fallback is deterministic
        image_files = sorted([f for f in os.listdir(source_sequence_path)
                              if f.lower().endswith(('.jpg', '.jpeg', '.png'))])
        if not image_files:
            processing_errors.append(f"No images in {source_sequence_path}")
            continue

        apex_name, apex_frame = _select_apex_image(image_files, apex_frame)

        sample_inventory.append({
            'sample_id': f"{subject}_{sequence}",
            'subject': subject,
            'sequence': sequence,
            'emotion': emotion,
            'apex_frame_num': apex_frame,
            'source_apex_path': os.path.join(source_sequence_path, apex_name),
            'total_frames': len(image_files),
            'onset_frame': onset_frame,
            'offset_frame': offset_frame
        })

    except Exception as e:
        # Best effort: a malformed row is recorded and skipped, not fatal
        processing_errors.append(f"Row {idx}: {str(e)}")

print(f"✓ Sample inventory created:")
print(f"  Valid samples: {len(sample_inventory)}")
print(f"  Processing errors: {len(processing_errors)}")

# Always surface some errors; previously nothing was shown once the
# error count exceeded five.
if processing_errors:
    shown_errors = processing_errors[:MAX_ERRORS_SHOWN]
    print(f"  Sample errors: {shown_errors}")
    hidden_errors = len(processing_errors) - len(shown_errors)
    if hidden_errors > 0:
        print(f"  ... and {hidden_errors} more")

# Convert to DataFrame for easier manipulation
inventory_df = pd.DataFrame(sample_inventory, columns=INVENTORY_COLUMNS)

# Split every emotion class on its own so each split keeps the class mix;
# classes with fewer than three samples are placed by hand.
print("\n[6] Performing stratified dataset splitting...")

train_samples, val_samples, test_samples = [], [], []

for emotion in emotion_distribution.index:
    emotion_samples = inventory_df[inventory_df['emotion'] == emotion].copy()
    n_samples = len(emotion_samples)

    print(f"\n  Processing {emotion} class ({n_samples} samples):")

    if n_samples == 0:
        # Nothing could be resolved on disk for this class
        train_data = val_data = test_data = pd.DataFrame()
        print("    ✗ No samples available")
    elif n_samples == 1:
        train_data = emotion_samples
        val_data = pd.DataFrame()
        test_data = pd.DataFrame()
        print("    ⚠ Single sample: 1 train, 0 val, 0 test")
    elif n_samples == 2:
        train_data = emotion_samples.iloc[:1]
        val_data = emotion_samples.iloc[1:2]
        test_data = pd.DataFrame()
        print("    ⚠ Manual split: 1 train, 1 val, 0 test")
    else:
        # 80% train / 20% held out; the frame holds a single class, so no
        # stratify argument is needed.
        train_data, temp_data = train_test_split(
            emotion_samples, test_size=0.2, random_state=42
        )

        if len(temp_data) < 2:
            # A lone held-out sample goes to validation; test stays empty
            val_data = temp_data
            test_data = pd.DataFrame()
        else:
            # Halve the held-out part: 10% val and 10% test of the original
            val_data, test_data = train_test_split(
                temp_data, test_size=0.5, random_state=42
            )

        train_count, val_count, test_count = (
            len(train_data), len(val_data), len(test_data)
        )
        print(f"    ✓ Split: {train_count} train, {val_count} val, {test_count} test")

    # Collect the non-empty parts of this class
    for bucket, part in (
        (train_samples, train_data),
        (val_samples, val_data),
        (test_samples, test_data),
    ):
        if not part.empty:
            bucket.append(part)

# Stitch the per-class parts into one frame per split
train_df, val_df, test_df = (
    pd.concat(parts, ignore_index=True) if parts else pd.DataFrame()
    for parts in (train_samples, val_samples, test_samples)
)

split_sizes = (len(train_df), len(val_df), len(test_df))
print("\n✓ Final split summary:")
print(f"  Training set: {split_sizes[0]} samples")
print(f"  Validation set: {split_sizes[1]} samples")
print(f"  Test set: {split_sizes[2]} samples")
print(f"  Total: {sum(split_sizes)} samples")

# Report how the emotion classes are spread over each split
print("\n[7] Class distribution verification:")

splits = {'train': train_df, 'val': val_df, 'test': test_df}
split_stats = {}

for split_name, split_data in splits.items():
    if split_data.empty:
        continue

    class_counts = split_data['emotion'].value_counts()
    split_stats[split_name] = dict(class_counts)
    split_size = len(split_data)

    print(f"\n  {split_name.upper()} set distribution:")
    for emotion, count in class_counts.items():
        print(f"    {emotion}: {count} ({(count / split_size) * 100:.1f}%)")

# Materialise the splits on disk: one apex image per sample
print("\n[8] Copying apex frames to split directories...")

copy_stats = dict.fromkeys(('train', 'val', 'test'), 0)
copy_errors = []

for split_name, split_data in splits.items():
    if split_data.empty:
        continue

    split_dir = f"{data_split_path}/{split_name}"

    for _, sample in split_data.iterrows():
        try:
            source_path = sample['source_apex_path']

            # Destination name encodes subject, sequence and emotion label
            dest_path = os.path.join(
                split_dir, f"{sample['sample_id']}_{sample['emotion']}.jpg"
            )

            if not os.path.exists(source_path):
                copy_errors.append(f"Source not found: {source_path}")
                continue

            shutil.copy2(source_path, dest_path)
            copy_stats[split_name] += 1

        except Exception as copy_error:
            # Keep going; failures are summarised after the loop
            copy_errors.append(f"Copy error for {sample['sample_id']}: {str(copy_error)}")

print("✓ Apex frame copying completed:")
for split_name, count in copy_stats.items():
    print(f"  {split_name}: {count} images copied")

if copy_errors:
    print(f"  ⚠ Copy errors: {len(copy_errors)} (showing first 3)")
    print("\n".join(f"    {message}" for message in copy_errors[:3]))

# JSON Serialization Helper Function
def make_json_serializable(obj):
    """Recursively convert numpy/pandas types to Python built-ins for json.

    Args:
        obj: Any value; containers are walked recursively.

    Returns:
        An equivalent structure in which
        - dict keys and values are converted,
        - lists and tuples become lists of converted items,
        - ``np.ndarray`` becomes a (nested) list of converted items,
        - ``pd.Timestamp`` becomes its ISO-8601 string,
        - numpy integer / floating / bool scalars become ``int`` / ``float`` /
          ``bool``.
        Every other value is returned unchanged.
    """
    # dict -> convert keys/values
    if isinstance(obj, dict):
        return {make_json_serializable(k): make_json_serializable(v) for k, v in obj.items()}
    # list/tuple -> convert items
    if isinstance(obj, (list, tuple)):
        return [make_json_serializable(x) for x in obj]
    # numpy array -> nested list; json cannot encode ndarray directly
    if isinstance(obj, np.ndarray):
        return [make_json_serializable(x) for x in obj.tolist()]
    # pandas timestamp
    if isinstance(obj, pd.Timestamp):
        return obj.isoformat()
    # numpy / pandas integer types
    if isinstance(obj, np.integer):
        return int(obj)
    # numpy / pandas floating types
    if isinstance(obj, np.floating):
        return float(obj)
    # pandas / numpy booleans
    if isinstance(obj, np.bool_):
        return bool(obj)
    # fallback: return as-is (strings, ints, floats are fine)
    return obj

# Persist per-split sample listings and an overall processing summary as JSON
print("\n[9] Saving split metadata and statistics...")

metadata_export = {}

for split_name, split_data in splits.items():
    if split_data.empty:
        continue

    # One plain-Python record per sample; explicit str()/int() casts keep
    # numpy scalars out of the export.
    split_metadata = [
        {
            'sample_id': str(row['sample_id']),
            'subject': str(row['subject']),
            'sequence': str(row['sequence']),
            'emotion': str(row['emotion']),
            'apex_frame_num': str(row['apex_frame_num']),
            'total_frames': int(row['total_frames']),
            'onset_frame': int(row['onset_frame']),
            'offset_frame': int(row['offset_frame']),
            'image_filename': f"{row['sample_id']}_{row['emotion']}.jpg"
        }
        for _, row in split_data.iterrows()
    ]

    class_dist = {
        str(label): int(n)
        for label, n in split_data['emotion'].value_counts().items()
    }

    metadata_export[split_name] = {
        'count': int(len(split_data)),
        'class_distribution': class_dist,
        'samples': split_metadata
    }

# NOTE(review): this file lands directly in processed_path, while the
# visualization cell of this notebook looks for split_metadata.json inside
# its data_split directory — confirm the intended location.
metadata_file = f"{processed_path}/split_metadata.json"
metadata_serializable = make_json_serializable(metadata_export)

with open(metadata_file, 'w') as f:
    f.write(json.dumps(metadata_serializable, indent=2))

print("✓ Split metadata saved to: split_metadata.json")

# Overall run summary
processing_summary = {
    'dataset': 'CASME2',
    'total_samples': len(inventory_df),
    'processing_date': pd.Timestamp.now().isoformat(),
    'split_strategy': 'stratified_80_10_10',
    'problematic_classes': problematic_classes,
    'split_statistics': split_stats,
    'copy_statistics': copy_stats,
    'processing_errors': len(processing_errors) + len(copy_errors)
}

# split_stats still carries numpy integers, so run the helper over everything
processing_summary_serializable = make_json_serializable(processing_summary)

summary_file = f"{processed_path}/processing_summary.json"
with open(summary_file, 'w') as f:
    f.write(json.dumps(processing_summary_serializable, indent=2))

print("✓ Processing summary saved to: processing_summary.json")

# Re-read the split folders from disk and report what is actually there
print("\n[10] Final validation of processed structure...")

validation_results = {}
for split_name in ('train', 'val', 'test'):
    split_dir = f"{data_split_path}/{split_name}"
    directory_found = os.path.exists(split_dir)

    if directory_found:
        image_files = [
            name for name in os.listdir(split_dir)
            if name.lower().endswith(('.jpg', '.jpeg', '.png'))
        ]
    else:
        image_files = []

    validation_results[split_name] = {
        'directory_exists': directory_found,
        'image_count': len(image_files),
        'sample_files': image_files[:3]  # first three names as a spot check
    }

print("✓ Structure validation:")
for split_name, results in validation_results.items():
    healthy = results['directory_exists'] and results['image_count'] > 0
    status = "✓" if healthy else "⚠"
    print(f"  {status} {split_name}: {results['image_count']} images")
    if results['sample_files']:
        print(f"    Sample files: {results['sample_files']}")

# Final summary: share of inventoried samples whose apex frame was copied
total_processed = sum(copy_stats.values())
inventory_size = len(inventory_df)
# Guard against an empty inventory to avoid dividing by zero
success_rate = (total_processed / inventory_size) * 100 if inventory_size > 0 else 0

print(f"\n" + "=" * 75)
print("PREPROCESSING SUMMARY")
print("=" * 75)
print(f"Processing Status: {'SUCCESS' if success_rate >= 95 else 'PARTIAL'}")
print(f"Total samples processed: {total_processed}/{inventory_size} ({success_rate:.1f}%)")
print(f"Split distribution: {len(train_df)} train, {len(val_df)} val, {len(test_df)} test")
print(f"Class balance handling: {'Applied' if problematic_classes else 'Standard stratification'}")
print(f"Apex frames extracted: {total_processed}")

# Derive the reported location from the paths actually used in this cell,
# so the message cannot drift from the real output directory.
split_dir_label = f"{os.path.basename(processed_path)}/{os.path.basename(data_split_path)}/"

print(f"\n✓ Next steps:")
print(f"  - Processed data ready in: {split_dir_label}")
print(f"  - Metadata available in: split_metadata.json")
# NOTE(review): both follow-ups below are labelled "Cell 5" — confirm the
# intended cell numbers.
print(f"  - Proceed to Cell 5: Model baseline implementation")
print(f"  - Or proceed to Cell 5: LOSO split generation (optional)")

if problematic_classes:
    print(f"\n⚠ Note: Classes {problematic_classes} have limited samples")
    print(f"  Consider focal loss for class imbalance mitigation")

print("=" * 75)

CASME II DATA PREPROCESSING AND STRATIFIED SPLIT

[1] Mounting Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✓ Google Drive mounted successfully

[2] Setting up processed dataset structure...
Raw data source: /content/drive/MyDrive/RESEARCH-WORKSPACE/ACTIVE-PROJECTS/Thesis_MER_Project/datasets/raw/CASME2_RAW_selected
Processed destination: /content/drive/MyDrive/RESEARCH-WORKSPACE/ACTIVE-PROJECTS/Thesis_MER_Project/datasets/processed_casme2
✓ Created directory: /content/drive/MyDrive/RESEARCH-WORKSPACE/ACTIVE-PROJECTS/Thesis_MER_Project/datasets/processed_casme2/data_split/train
✓ Created directory: /content/drive/MyDrive/RESEARCH-WORKSPACE/ACTIVE-PROJECTS/Thesis_MER_Project/datasets/processed_casme2/data_split/val
✓ Created directory: /content/drive/MyDrive/RESEARCH-WORKSPACE/ACTIVE-PROJECTS/Thesis_MER_Project/datasets/processed_casme2/data_split/test

[3] Loading metadata and analyzing 

In [None]:
# @title Cell 5: CASME II Visualization

import os
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import drive
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Global matplotlib look: default style, then project-wide overrides
plt.style.use('default')

# Typeface and font sizes for titles, labels, ticks and legends
typography_settings = {
    'font.family': 'DejaVu Sans',
    'font.size': 12,
    'axes.titlesize': 16,
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    'legend.fontsize': 12,
    'figure.titlesize': 18,
}

# White canvas, no grid, no top/right spines
canvas_settings = {
    'axes.spines.top': False,
    'axes.spines.right': False,
    'axes.grid': False,
    'figure.facecolor': 'white',
    'axes.facecolor': 'white',
}

plt.rcParams.update({**typography_settings, **canvas_settings})

def convert_to_serializable(obj):
    """Convert numpy/pandas types to native Python types for JSON serialization.

    Args:
        obj: Any value; containers are walked recursively.

    Returns:
        An equivalent structure in which
        - dict values are converted and numpy-scalar keys are unwrapped to
          their Python equivalents,
        - lists and tuples become lists of converted items,
        - ``np.ndarray``, ``pd.Series`` and ``pd.Index`` become (nested) lists,
        - numpy bool / integer / floating scalars become ``bool`` / ``int`` /
          ``float``,
        - missing scalars (``None``, NaN of any float type, ``pd.NA``,
          ``pd.NaT``) become ``None``,
        - ``pd.Timestamp`` becomes its ISO-8601 string.
        Every other value is returned unchanged.
    """
    if isinstance(obj, dict):
        # json only accepts str/int/float/bool/None keys, so unwrap numpy scalars
        return {
            (key.item() if isinstance(key, np.generic) else key): convert_to_serializable(value)
            for key, value in obj.items()
        }
    if isinstance(obj, (list, tuple)):
        return [convert_to_serializable(item) for item in obj]
    if isinstance(obj, (np.ndarray, pd.Series, pd.Index)):
        # tolist() yields Python scalars; recurse so NaN entries still map to None
        return [convert_to_serializable(item) for item in obj.tolist()]
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        # NaN is not valid JSON; treat it like any other missing value
        return None if np.isnan(obj) else float(obj)
    # pd.isna returns an array for array-likes, so only ask it about scalars
    if pd.api.types.is_scalar(obj) and pd.isna(obj):
        return None
    if isinstance(obj, pd.Timestamp):
        return obj.isoformat()
    return obj

# Fixed color per emotion class so every chart uses the same mapping
EMOTION_COLORS = dict(
    others='#1f77b4',      # blue
    disgust='#d62728',     # red
    happiness='#2ca02c',   # green
    repression='#9467bd',  # purple
    surprise='#ff7f0e',    # orange
    sadness='#8c564b',     # brown
    fear='#e377c2',        # pink
)

# Fixed color per data source (raw dataset and the three splits)
SPLIT_COLORS = dict(zip(
    ('raw', 'train', 'val', 'test'),
    ('#1f77b4', '#ff7f0e', '#d62728', '#2ca02c'),
))

# Banner, Drive mount and path definitions for the visualization cell
banner = "=" * 80
print(banner)
print("CASME II PROFESSIONAL VISUALIZATION - MULTI-FILE CLEAN GENERATION")
print(banner)

print("\n[1] Environment setup and drive mounting...")
drive.mount('/content/drive')
print("Drive mounted successfully")

# Inputs (coding spreadsheet, split folders, split metadata) and the
# folder that receives the generated figures
base_path = "/content/drive/MyDrive/RESEARCH-WORKSPACE/ACTIVE-PROJECTS/Thesis_MER_Project"
metadata_path = f"{base_path}/datasets/metadata/CASME2-coding-20140508.xlsx"
data_split_path = f"{base_path}/datasets/processed_casme2/data_split"
split_metadata_path = f"{data_split_path}/split_metadata.json"
visualization_path = f"{base_path}/datasets/visualization/01_casme2-af"

# Create the output folder up front; harmless when it already exists
os.makedirs(visualization_path, exist_ok=True)
print(f"Output directory: {visualization_path}")

print("\n[2] Loading and processing metadata...")

# Load original metadata
try:
    raw_metadata = pd.read_excel(metadata_path)
    print(f"Raw metadata loaded: {len(raw_metadata)} records")

    # An object dtype means some ApexFrame cells are not plain numbers;
    # coerce so those cells become NaN.
    if raw_metadata['ApexFrame'].dtype == 'object':
        raw_metadata['ApexFrame'] = pd.to_numeric(raw_metadata['ApexFrame'], errors='coerce')
        print("ApexFrame column normalized to numeric")

except Exception as e:
    print(f"Error loading raw metadata: {str(e)}")
    exit()

# Load split metadata.
# The preprocessing cell of this notebook writes split_metadata.json into the
# parent of the split directory, so that location is tried as a fallback when
# the file is not found next to the split folders.
split_metadata_candidates = [
    split_metadata_path,
    os.path.join(os.path.dirname(data_split_path), os.path.basename(split_metadata_path)),
]

try:
    # When no candidate exists, fall through to the configured path so the
    # resulting FileNotFoundError names the location the user expects.
    resolved_metadata_path = next(
        (candidate for candidate in split_metadata_candidates if os.path.exists(candidate)),
        split_metadata_path,
    )
    with open(resolved_metadata_path, 'r') as f:
        split_metadata = json.load(f)
    if resolved_metadata_path != split_metadata_path:
        print(f"Split metadata not found at {split_metadata_path}; using {resolved_metadata_path}")
    print("Split metadata loaded successfully")

    # Keep only the splits that are actually present in the file
    splits_info = {
        split_name: split_metadata[split_name]
        for split_name in ['train', 'val', 'test']
        if split_name in split_metadata
    }

except Exception as e:
    print(f"Error loading split metadata: {str(e)}")
    exit()

print("\n[3] File integrity validation...")

# Compare the number of images on disk with the count recorded in the split
# metadata. A missing split directory is reported and counted as zero images
# instead of being skipped without any output.
total_actual_files = 0
for split_name in ['train', 'val', 'test']:
    split_dir = f"{data_split_path}/{split_name}"
    expected_count = splits_info.get(split_name, {}).get('count', 0)

    if os.path.isdir(split_dir):
        image_files = [f for f in os.listdir(split_dir) if f.lower().endswith(('.jpg', '.jpeg', '.png'))]
    else:
        image_files = []
        print(f"  {split_name.upper()}: directory not found: {split_dir}")

    actual_count = len(image_files)
    total_actual_files += actual_count
    status = 'PASS' if actual_count == expected_count else 'FAIL'
    print(f"  {split_name.upper()}: {status} ({actual_count}/{expected_count})")

print(f"Total validated files: {total_actual_files}")

print("\n[4] Preparing visualization data...")

# Per-emotion counts of the raw annotations, largest class first
emotion_counts = raw_metadata['Estimated Emotion'].value_counts()
raw_emotion_dist = emotion_counts.sort_values(ascending=False)
raw_total = len(raw_metadata)

# Per-split class distributions, taken from the split metadata for every
# split that is present there
split_data = {
    name: splits_info[name]['class_distribution']
    for name in ('train', 'val', 'test')
    if name in splits_info
}

print("Data preparation complete")

print("\n[5] Generating File 1: Raw Distribution Bar Chart...")

# FILE 1: bar chart of raw per-emotion sample counts
fig1, ax1 = plt.subplots(figsize=(14, 8))

emotion_labels = raw_emotion_dist.index
emotion_values = raw_emotion_dist.values

# Unknown emotion labels fall back to a neutral grey
colors_ordered = [EMOTION_COLORS.get(label, '#666666') for label in emotion_labels]
bars = ax1.bar(
    emotion_labels,
    emotion_values,
    color=colors_ordered,
    alpha=0.8,
    edgecolor='white',
    linewidth=1.2,
    width=0.7,
)

ax1.set_title('CASME II Raw Dataset Distribution',
              fontsize=18, fontweight='bold', pad=25)
ax1.set_xlabel('Emotion Classes', fontsize=16, labelpad=15)
ax1.set_ylabel('Sample Count', fontsize=16, labelpad=15)

# Print the count just above the top of each bar
for bar, value in zip(bars, emotion_values):
    label_x = bar.get_x() + bar.get_width() / 2
    label_y = bar.get_height() + 2
    ax1.text(label_x, label_y, str(value),
             ha='center', va='bottom', fontsize=13, fontweight='bold')

# 20% headroom so the labels above the tallest bar stay inside the axes
ax1.set_ylim(0, max(emotion_values) * 1.2)
ax1.grid(False)

plt.tight_layout()
file1_path = f"{visualization_path}/1_raw_distribution_bar.png"
plt.savefig(file1_path, dpi=300, bbox_inches='tight', facecolor='white')
plt.close()
print("File 1 saved: 1_raw_distribution_bar.png")

print("\n[6] Generating File 2: Raw Distribution Pie Chart...")

# FILE 2: pie chart of the raw per-emotion percentage shares
fig2, ax2 = plt.subplots(figsize=(12, 12))

percentages = [(count/raw_total)*100 for count in raw_emotion_dist.values]
pie_colors = [EMOTION_COLORS.get(emotion, '#666666') for emotion in raw_emotion_dist.index]

wedges, texts, autotexts = ax2.pie(
    percentages,
    labels=raw_emotion_dist.index,
    autopct='%1.1f%%',
    colors=pie_colors,
    startangle=90,
    textprops={'fontsize': 12},
    pctdistance=0.75,
    labeldistance=1.15,
)

ax2.set_title('CASME II Raw Dataset Percentage Distribution',
              fontsize=18, fontweight='bold', pad=30)

# Percent labels are bold white by default. The 'fear' percentage is the
# exception: it is pushed outward and drawn in black, while its class-name
# label stays where matplotlib put it.
for autotext, emotion in zip(autotexts, raw_emotion_dist.index):
    autotext.set_color('white')
    autotext.set_fontweight('bold')
    autotext.set_fontsize(12)

    if emotion != 'fear':
        continue

    x_pos, y_pos = autotext.get_position()
    autotext.set_position((x_pos * 1.35, y_pos * 1.35))
    autotext.set_color('black')

plt.tight_layout()
file2_path = f"{visualization_path}/2_raw_distribution_pie.png"
plt.savefig(file2_path, dpi=300, bbox_inches='tight', facecolor='white')
plt.close()
print("File 2 saved: 2_raw_distribution_pie.png")

print("\n[7] Generating File 3: Split Distribution Comparison...")

# FILE 3: grouped bars comparing raw counts with each split, per emotion
fig3, ax_main = plt.subplots(figsize=(16, 8))

# Emotions keep the order of the raw distribution (most frequent first)
all_emotions = raw_emotion_dist.index.tolist()
raw_counts = [raw_emotion_dist.get(emotion, 0) for emotion in all_emotions]
train_counts, val_counts, test_counts = (
    [split_data.get(name, {}).get(emotion, 0) for emotion in all_emotions]
    for name in ('train', 'val', 'test')
)

x = np.arange(len(all_emotions))
width = 0.2

# (offset in bar widths, counts, legend label, color) for each bar group;
# the offsets place the four bars side by side around every tick.
bar_series = [
    (-1.5, raw_counts, 'Raw Dataset (Apex Frame)', SPLIT_COLORS['raw']),
    (-0.5, train_counts, 'Train Split (Apex Frame)', SPLIT_COLORS['train']),
    (0.5, val_counts, 'Validation Split (Apex Frame)', SPLIT_COLORS['val']),
    (1.5, test_counts, 'Test Split (Apex Frame)', SPLIT_COLORS['test']),
]
all_bars_data = [
    (ax_main.bar(x + offset * width, counts, width, label=label, color=color, alpha=0.85), counts)
    for offset, counts, label, color in bar_series
]

ax_main.set_title('CASME II Dataset Split Distribution Comparison',
                  fontsize=18, fontweight='bold', pad=25)
ax_main.set_xlabel('Emotion Classes (Sorted by Frequency)', fontsize=16, labelpad=20)
ax_main.set_ylabel('Image Count', fontsize=16, labelpad=20)
ax_main.set_xticks(x)
ax_main.set_xticklabels(all_emotions, rotation=0)
ax_main.legend(loc='upper right', fontsize=13)
ax_main.grid(False)

# Label every non-empty bar with its count
for bars, values in all_bars_data:
    for bar, value in zip(bars, values):
        if value <= 0:
            continue
        ax_main.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 1,
                     str(value), ha='center', va='bottom', fontsize=9,
                     fontweight='bold')

plt.tight_layout()
file3_path = f"{visualization_path}/3_split_distribution_comparison.png"
plt.savefig(file3_path, dpi=300, bbox_inches='tight', facecolor='white')
plt.close()
print("File 3 saved: 3_split_distribution_comparison.png")

print("\n[8] Generating File 4: Dataset Split Ratios...")

# FILE 4: Dataset Split Ratios - Large pie chart
fig4, ax_ratio = plt.subplots(1, 1, figsize=(10, 10))

split_totals = [sum(train_counts), sum(val_counts), sum(test_counts)]
grand_total = sum(split_totals)
# Nominal ratios follow the 'stratified_80_10_10' strategy recorded by the
# preprocessing cell; the measured share is printed inside each wedge.
split_labels = ['Train (80%)', 'Validation (10%)', 'Test (10%)']
split_colors_pie = [SPLIT_COLORS['train'], SPLIT_COLORS['val'], SPLIT_COLORS['test']]


def _format_wedge_label(pct):
    """Return the wedge text: percentage plus the matching sample count."""
    # pct is derived from the counts in floating point, so the product can
    # land just below the true integer; round instead of truncating.
    sample_count = int(round(float(pct) / 100 * grand_total))
    return f'{pct:.1f}%\n({sample_count} samples)'


wedges, texts, autotexts = ax_ratio.pie(split_totals, labels=split_labels,
                                        autopct=_format_wedge_label,
                                        colors=split_colors_pie, startangle=90,
                                        textprops={'fontsize': 13},
                                        labeldistance=1.1, pctdistance=0.8)

ax_ratio.set_title('CASME II Final Dataset Split Ratios',
                   fontsize=18, fontweight='bold', pad=30)

# Enhanced autotext for better readability
for autotext in autotexts:
    autotext.set_color('white')
    autotext.set_fontweight('bold')
    autotext.set_fontsize(12)

plt.tight_layout()
file4_path = f"{visualization_path}/4_dataset_split_ratios.png"
plt.savefig(file4_path, dpi=300, bbox_inches='tight', facecolor='white')
plt.close()
print(f"File 4 saved: 4_dataset_split_ratios.png")

print("\n[9] Generating File 5: Statistical Analysis Table...")

# FILE 5: statistics table rendered as a figure (axes hidden)
fig5 = plt.figure(figsize=(16, 8))
ax_table = fig5.add_subplot(1, 1, 1)
ax_table.axis('off')

# One row per emotion with raw count, split counts and derived ratios;
# running totals feed the TOTAL row.
table_data = []
total_raw_count = total_train_count = total_val_count = total_test_count = 0

for emotion in all_emotions:
    raw_count = raw_emotion_dist.get(emotion, 0)
    train_count, val_count, test_count = (
        split_data.get(name, {}).get(emotion, 0)
        for name in ('train', 'val', 'test')
    )
    split_total = train_count + val_count + test_count

    raw_pct = f"{(raw_count/raw_total)*100:.1f}%" if raw_total > 0 else "0%"

    # "Preserved" is the share of raw samples that ended up in any split;
    # "Imbalance" relates the largest class to this one.
    max_raw = raw_emotion_dist.max()
    if raw_count > 0:
        preservation_rate = f"{(split_total/raw_count)*100:.1f}%"
        imbalance_ratio = f"{max_raw/raw_count:.1f}:1"
    else:
        preservation_rate = "0%"
        imbalance_ratio = "∞:1"

    table_data.append([
        emotion.title(), raw_count, raw_pct,
        train_count, val_count, test_count,
        split_total, preservation_rate, imbalance_ratio,
    ])

    total_raw_count += raw_count
    total_train_count += train_count
    total_val_count += val_count
    total_test_count += test_count

# TOTAL row closes the table
total_split_count = total_train_count + total_val_count + total_test_count
table_data.append([
    'TOTAL', total_raw_count, '100.0%',
    total_train_count, total_val_count, total_test_count,
    total_split_count,
    f"{(total_split_count/total_raw_count)*100:.1f}%",
    '1.0:1',
])

# Display strings: counts get thousands separators, the rest passes through
table_display_data = [
    [
        name,
        f"{raw_n:,}",
        raw_share,
        f"{train_n:,}",
        f"{val_n:,}",
        f"{test_n:,}",
        f"{split_n:,}",
        preserved,
        imbalance,
    ]
    for name, raw_n, raw_share, train_n, val_n, test_n, split_n, preserved, imbalance
    in table_data
]

table = ax_table.table(
    cellText=table_display_data,
    colLabels=['Emotion Class', 'Raw Count', 'Raw %', 'Train', 'Val', 'Test', 'Split Total', 'Preserved', 'Imbalance'],
    cellLoc='center',
    loc='center',
    colWidths=[0.15, 0.10, 0.08, 0.08, 0.08, 0.08, 0.10, 0.10, 0.10]
)

table.auto_set_font_size(False)
table.set_fontsize(12)
table.scale(1, 2.8)

# Row styling: table row 0 is the header, the last row is TOTAL, and the
# 'fear' / 'sadness' rows get a light red background.
num_rows = len(table_display_data)
for i in range(num_rows + 1):
    if i == 0:
        face_color, text_props = '#1f77b4', {'weight': 'bold', 'color': 'white'}
    elif i == num_rows:
        face_color, text_props = '#f0f0f0', {'weight': 'bold'}
    else:
        row_label = table_display_data[i-1][0].lower()
        face_color = '#ffe6e6' if row_label in ['fear', 'sadness'] else '#ffffff'
        text_props = None

    for j in range(9):
        cell = table[(i, j)]
        cell.set_facecolor(face_color)
        if text_props is not None:
            cell.set_text_props(**text_props)

ax_table.set_title('CASME II Dataset Statistical Analysis Summary',
                   fontsize=18, fontweight='bold', pad=40)

plt.tight_layout()
file5_path = f"{visualization_path}/5_statistical_analysis_table.png"
plt.savefig(file5_path, dpi=300, bbox_inches='tight', facecolor='white')
plt.close()
print("File 5 saved: 5_statistical_analysis_table.png")

print("\n[10] Generating comprehensive JSON metadata...")

# Assemble the sections of the export one by one, then write a single JSON file
analysis_timestamp = pd.Timestamp.now().isoformat()

dataset_info = {
    'name': 'CASME II Micro-Expression Dataset',
    'total_subjects': 26,
    'total_original_samples': int(raw_total),
    'emotion_classes': len(raw_emotion_dist),
    'severe_imbalance_ratio': f"{raw_emotion_dist.max()}:{raw_emotion_dist.min()}"
}

# Raw per-emotion counts and their percentage shares (two decimals)
raw_distribution = convert_to_serializable({
    'emotion_counts': {label: int(n) for label, n in raw_emotion_dist.items()},
    'emotion_percentages': {
        label: round((n/raw_total)*100, 2) for label, n in raw_emotion_dist.items()
    }
})

# Export key -> key used in split_data ('val' is exported as 'validation')
split_distribution = convert_to_serializable({
    export_key: {
        'total_samples': sum(split_data.get(split_key, {}).values()),
        'class_distribution': split_data.get(split_key, {})
    }
    for export_key, split_key in (
        ('train', 'train'),
        ('validation', 'val'),
        ('test', 'test'),
    )
})

analysis_metadata = {
    'analysis_timestamp': analysis_timestamp,
    'dataset_info': dataset_info,
    'raw_distribution': raw_distribution,
    'split_distribution': split_distribution,
    'visualization_files': {
        'raw_distribution_bar': '1_raw_distribution_bar.png',
        'raw_distribution_pie': '2_raw_distribution_pie.png',
        'split_comparison': '3_split_distribution_comparison.png',
        'split_ratios': '4_dataset_split_ratios.png',
        'statistical_table': '5_statistical_analysis_table.png'
    },
    'color_scheme': {
        'emotion_psychology_mapping': EMOTION_COLORS,
        'split_comparison_colors': SPLIT_COLORS,
        'design_principle': 'Medical visualization with consistent color psychology'
    }
}

metadata_file = f"{visualization_path}/professional_analysis_metadata.json"
with open(metadata_file, 'w') as f:
    json.dump(analysis_metadata, f, indent=2)

print("Comprehensive metadata saved: professional_analysis_metadata.json")

# Final console report: banner, run status, then the list of produced files
# and the presentation improvements of this multi-file layout.
banner = "=" * 80

print("\n" + banner)
print("MULTI-FILE PROFESSIONAL VISUALIZATION COMPLETE")
print(banner)
print("Status: SUCCESS - 5 separate publication-ready files generated")
print(f"Total samples validated: {total_actual_files} files")
print(f"Output location: {visualization_path}")

print("\nGenerated files:")
for summary_line in (
    "  • 1_raw_distribution_bar.png - Clean bar chart only",
    "  • 2_raw_distribution_pie.png - Large pie chart with proper spacing",
    "  • 3_split_distribution_comparison.png - Split comparison chart",
    "  • 4_dataset_split_ratios.png - Large ratio pie chart",
    "  • 5_statistical_analysis_table.png - Clean table without extras",
    "  • professional_analysis_metadata.json - Comprehensive metadata",
):
    print(summary_line)

print("\nKey improvements:")
for summary_line in (
    "  • Consistent color scheme across all charts",
    "  • Proper spacing for small percentages in pie charts",
    "  • Separated files for optimal sizing",
    "  • Clean single titles without overlap",
    "  • Focused content without extra text boxes",
):
    print(summary_line)

print(banner)

CASME II PROFESSIONAL VISUALIZATION - MULTI-FILE CLEAN GENERATION

[1] Environment setup and drive mounting...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Drive mounted successfully
Output directory: /content/drive/MyDrive/RESEARCH-WORKSPACE/ACTIVE-PROJECTS/Thesis_MER_Project/datasets/visualization/01_casme2-af

[2] Loading and processing metadata...
Raw metadata loaded: 255 records
ApexFrame column normalized to numeric
Split metadata loaded successfully

[3] File integrity validation...
  TRAIN: PASS (201/201)
  VAL: PASS (26/26)
  TEST: PASS (28/28)
Total validated files: 255

[4] Preparing visualization data...
Data preparation complete

[5] Generating File 1: Raw Distribution Bar Chart...
File 1 saved: 1_raw_distribution_bar.png

[6] Generating File 2: Raw Distribution Pie Chart...
File 2 saved: 2_raw_distribution_pie.png

[7] Generating File 3: Split Distribution Comparison...
File 3 saved: 3_spl