In [None]:
# @title Cell 1: RAF-DB ZIP Extraction

# File: 01_04_EDA_RAF-DB.ipynb - Cell 1
# Location: RESEARCH-WORKSPACE/ACTIVE-PROJECTS/Thesis_MER_Project/notebooks/
# Purpose: Robust RAF-DB Balanced dataset ZIP extraction with validation

import os
import zipfile
import time
from google.colab import drive

# --- Banner and Google Drive mount -----------------------------------------
banner_rule = "=" * 70
print(banner_rule)
print("RAF-DB BALANCED DATASET EXTRACTION")
print(banner_rule)
print("\n[1] Mounting Google Drive...")
drive.mount('/content/drive')
print("    Google Drive mounted successfully")

# --- Locations of the archive and of the extraction target -----------------
base_path = "/content/drive/MyDrive/RESEARCH-WORKSPACE/ACTIVE-PROJECTS/Thesis_MER_Project"
raw_path = base_path + "/datasets/raw"
processed_path = base_path + "/datasets/processed_raf"
zip_file_path = raw_path + "/RAF-DB_balanced.zip"

print("\n[2] Path Configuration")
print("    ZIP file location: " + zip_file_path)
print("    Extract destination: " + processed_path)

# --- Stop early when the archive is not where it is expected ---------------
zip_present = os.path.exists(zip_file_path)
if not zip_present:
    print("\n" + banner_rule)
    print("ERROR: RAF-DB_balanced.zip not found")
    print("Please ensure ZIP file is uploaded to datasets/raw/ directory")
    print(banner_rule)
    raise FileNotFoundError("ZIP file not found")

bytes_per_mb = 1024 * 1024
zip_size_mb = round(os.path.getsize(zip_file_path) / bytes_per_mb, 2)
print("    ZIP file verified (Size: {} MB)".format(zip_size_mb))

# --- Start from an empty target directory ----------------------------------
# Any earlier extraction is deleted so stale files cannot mix with new ones.
print("\n[3] Cleanup Previous Extractions")
if not os.path.exists(processed_path):
    print("    No existing directory found")
else:
    import shutil
    shutil.rmtree(processed_path)
    print("    Removed existing directory: processed_raf/")

os.makedirs(processed_path, exist_ok=True)
print("    Created fresh extraction directory")

# Inspect the archive listing (no extraction yet) and summarise its layout:
# top-level folders, which of them are dataset splits, and the class folders
# found under each split.
print("\n[4] Analyzing ZIP Structure")
try:
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        file_list = zip_ref.namelist()
        total_files = len(file_list)

        directories = set()
        split_folders = set()
        class_folders = {}
        known_splits = ('train', 'val', 'test')

        for file_path in file_list:
            parts = file_path.split('/')

            # Entries without any '/' sit at the archive root and carry no
            # folder information.
            if len(parts) < 2:
                continue

            top_level = parts[0]
            directories.add(top_level)

            if top_level not in known_splits:
                continue
            split_folders.add(top_level)

            # A class folder needs at least "<split>/<class>/<something>".
            if len(parts) < 3:
                continue

            class_name = parts[1]
            if not class_name or class_name.startswith('.'):
                continue
            class_folders.setdefault(top_level, set()).add(class_name)

        print(f"    Total files in ZIP: {total_files}")
        print(f"    Root directories: {sorted(directories)}")
        print(f"    Split folders detected: {sorted(split_folders)}")

        if class_folders:
            print("    Class structure:")
            for split in sorted(class_folders):
                classes = sorted(class_folders[split])
                print(f"      {split}: {len(classes)} classes - {classes}")

except Exception as e:
    # Report in the cell output, then let the original error propagate.
    print(f"\n    ERROR analyzing ZIP: {e}")
    raise

# Robust extraction with progress tracking.
# Every archive member is extracted on its own so that one bad entry does not
# abort the whole run. Failures are counted, and the first few are reported
# together with their cause so they can actually be diagnosed.
print(f"\n[5] Extracting ZIP Contents")
start_time = time.time()
max_reported_errors = 3  # cap per-file warnings to keep the cell output readable

try:
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        extracted_count = 0
        error_count = 0
        last_progress = 0

        for file_info in zip_ref.infolist():
            try:
                # Extract to processed_raf directory
                zip_ref.extract(file_info, processed_path)
            except Exception as file_error:
                # Deliberate best-effort: keep going, but say what went wrong.
                error_count += 1
                if error_count <= max_reported_errors:
                    print(f"    Warning: Error extracting {file_info.filename[:50]}... "
                          f"({type(file_error).__name__}: {file_error})")
                elif error_count == max_reported_errors + 1:
                    print("    Warning: Further extraction errors will not be listed individually")
                continue

            extracted_count += 1

            # Progress tracking every 10%. total_files comes from the ZIP
            # analysis step; the guard avoids a division by zero if it is 0.
            progress = int((extracted_count / total_files) * 100) if total_files > 0 else 0
            if progress >= last_progress + 10:
                elapsed = time.time() - start_time
                rate = extracted_count / elapsed if elapsed > 0 else 0
                remaining = total_files - extracted_count
                eta = remaining / rate if rate > 0 else 0

                print(f"    Progress: {extracted_count}/{total_files} ({progress}%) | "
                      f"Rate: {rate:.1f} files/sec | ETA: {eta:.1f}s")
                last_progress = progress

        extraction_time = time.time() - start_time
        # A coarse clock can report zero elapsed time for a tiny archive.
        overall_rate = extracted_count / extraction_time if extraction_time > 0 else 0.0
        print(f"\n    Extraction completed:")
        print(f"      Files extracted: {extracted_count}/{total_files}")
        print(f"      Errors: {error_count}")
        print(f"      Time: {extraction_time:.2f}s ({overall_rate:.1f} files/sec)")

except Exception as e:
    # Failure to open/read the archive itself is fatal: report and re-raise.
    print(f"\n    EXTRACTION FAILED: {str(e)}")
    raise

# Verify what actually landed on disk and print the final status.
# The short pause gives the mounted Drive a moment to reflect the new files
# before the directory tree is walked.
print(f"\n[6] Verifying Extraction Results")
print(f"    Waiting for Google Drive sync (3 seconds)...")
time.sleep(3)

# Collect the top-level directories of the extraction target.
extracted_items = []
total_extracted_files = 0

for item in os.listdir(processed_path):
    item_path = os.path.join(processed_path, item)
    if os.path.isdir(item_path):
        extracted_items.append(item)

print(f"    Extracted structure:")
for item in sorted(extracted_items):
    item_path = os.path.join(processed_path, item)

    # Count files in split folder
    split_file_count = 0
    split_class_count = 0

    if os.path.exists(item_path):
        for root, dirs, files in os.walk(item_path):
            split_file_count += len(files)
            # Class directories are the non-hidden direct children of the split.
            if root == item_path:
                split_class_count = len([d for d in dirs if not d.startswith('.')])

        total_extracted_files += split_file_count
        print(f"      {item}/: {split_file_count} files, {split_class_count} classes")

# Final status summary: success requires at least one extracted file, fewer
# than 10% failed members, and at least one file found on disk.
extraction_success = (extracted_count > 0 and
                     error_count < (total_files * 0.1) and
                     total_extracted_files > 0)

# Build the error-rate text first so the "Error rate:" label is always
# printed, including the empty-archive case where the rate is undefined.
error_rate_text = f"{(error_count / total_files) * 100:.2f}%" if total_files > 0 else "N/A"

print(f"\n" + "=" * 70)
print("EXTRACTION SUMMARY")
print("=" * 70)
print(f"Status: {'SUCCESS' if extraction_success else 'PARTIAL/FAILED'}")
print(f"Files processed: {extracted_count}/{total_files}")
print(f"Files verified in directory: {total_extracted_files}")
print(f"Error rate: {error_rate_text}")
print(f"Extraction time: {extraction_time:.1f} seconds")
print(f"Output location: {processed_path}")

if extraction_success:
    print(f"\nReady for Cell 2: Structure validation and metadata generation")
else:
    print(f"\nWarning: Partial extraction detected - review errors before proceeding")

print("=" * 70)

RAF-DB BALANCED DATASET EXTRACTION

[1] Mounting Google Drive...
Mounted at /content/drive
    Google Drive mounted successfully

[2] Path Configuration
    ZIP file location: /content/drive/MyDrive/RESEARCH-WORKSPACE/ACTIVE-PROJECTS/Thesis_MER_Project/datasets/raw/RAF-DB_balanced.zip
    Extract destination: /content/drive/MyDrive/RESEARCH-WORKSPACE/ACTIVE-PROJECTS/Thesis_MER_Project/datasets/processed_raf
    ZIP file verified (Size: 124.0 MB)

[3] Cleanup Previous Extractions
    No existing directory found
    Created fresh extraction directory

[4] Analyzing ZIP Structure
    Total files in ZIP: 41692
    Root directories: ['test', 'train', 'val']
    Split folders detected: ['test', 'train', 'val']
    Class structure:
      test: 7 classes - ['angry', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise']
      train: 7 classes - ['angry', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise']
      val: 7 classes - ['angry', 'disgust', 'fear', 'happy', 'neutral', 'sad', '

In [1]:
# @title Cell 2: RAF-DB Validation & Metadata Generation

# File: 01_04_EDA_RAF-DB.ipynb - Cell 2
# Location: RESEARCH-WORKSPACE/ACTIVE-PROJECTS/Thesis_MER_Project/notebooks/
# Purpose: RAF-DB extraction validation and metadata CSV generation

import os
import pandas as pd
from PIL import Image
import random
from pathlib import Path
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Define paths
base_path = "/content/drive/MyDrive/RESEARCH-WORKSPACE/ACTIVE-PROJECTS/Thesis_MER_Project"
processed_path = f"{base_path}/datasets/processed_raf"
metadata_path = f"{base_path}/datasets/metadata"

print("=" * 70)
print("RAF-DB VALIDATION & METADATA GENERATION")
print("=" * 70)

# Expected structure
expected_splits = ['train', 'val', 'test']
expected_classes = ['angry', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise']

# Step 1: Directory Structure Validation
# Checks that every expected split directory exists and counts the non-hidden
# entries of every expected class directory inside it.
print("\n[1] Directory Structure Validation")

structure_valid = True
split_summary = {}

for split in expected_splits:
    split_path = os.path.join(processed_path, split)

    if not os.path.exists(split_path):
        print(f"    ERROR: Missing split directory: {split}/")
        structure_valid = False
        # Keep an (empty) entry so the count report below can still run and
        # show 0 files instead of failing with a KeyError.
        split_summary[split] = {}
        continue

    split_summary[split] = {}
    found_classes = [c for c in os.listdir(split_path) if not c.startswith('.')]

    # Surface entries that are not part of the expected class list; they are
    # ignored by the rest of this cell.
    unexpected_entries = sorted(set(found_classes) - set(expected_classes))
    if unexpected_entries:
        print(f"    WARNING: Unexpected entries in {split}/: {unexpected_entries}")

    for class_name in expected_classes:
        class_path = os.path.join(split_path, class_name)

        if not os.path.exists(class_path):
            print(f"    WARNING: Missing class {split}/{class_name}/")
            split_summary[split][class_name] = 0
        else:
            files = [f for f in os.listdir(class_path) if not f.startswith('.')]
            split_summary[split][class_name] = len(files)

if structure_valid:
    print(f"    Structure validation: PASSED")
    print(f"    Splits found: {len(expected_splits)} ({', '.join(expected_splits)})")
    print(f"    Classes found: {len(expected_classes)} ({', '.join(expected_classes)})")
else:
    print(f"    Structure validation: FAILED")

# Print initial counts
print(f"\n    Initial file counts:")
for split in expected_splits:
    per_class_counts = split_summary.get(split, {})
    total = sum(per_class_counts.values())
    print(f"      {split}: {total} files across {len(per_class_counts)} classes")

# Step 2: Filename Pattern Analysis
# Takes up to three filenames per split/class and reports simple naming
# characteristics (split prefix, digits, extensions).
print(f"\n[2] Filename Pattern Analysis")

sample_filenames = {}
for split in expected_splits:
    sample_filenames[split] = {}
    for class_name in expected_classes:
        class_path = os.path.join(processed_path, split, class_name)
        if os.path.exists(class_path):
            files = [f for f in os.listdir(class_path) if not f.startswith('.')]
            # Slicing already copes with fewer than three entries.
            sample_filenames[split][class_name] = files[:3]

# Print samples from train split only (to keep output clean)
print(f"    Sample filenames from train split:")
for class_name in expected_classes:
    samples = sample_filenames.get('train', {}).get(class_name)
    if samples:
        print(f"      {class_name}: {samples[0]}")

# Assess pattern
print(f"\n    Pattern assessment:")
all_samples = []
for split in sample_filenames:
    for class_name in sample_filenames[split]:
        all_samples.extend(sample_filenames[split][class_name])

if all_samples:
    # A "prefix" must be at the start of the name; a substring test would
    # also match names such as "latest.png" (contains "test").
    split_prefixes = tuple(expected_splits)
    prefixed_flags = [fname.startswith(split_prefixes) for fname in all_samples]
    has_split_prefix = any(prefixed_flags)
    has_numbers = any(any(char.isdigit() for char in fname) for fname in all_samples)
    extensions = {os.path.splitext(f)[1] for f in all_samples}

    # Derive the classification from the samples instead of always reporting
    # "Irregular": mixed means several extensions, or only some of the names
    # carrying a split prefix.
    mixed_naming = has_split_prefix and not all(prefixed_flags)
    if len(extensions) > 1 or mixed_naming:
        pattern_classification = "Irregular (mixed formats detected)"
    else:
        pattern_classification = "Regular (single format detected)"

    print(f"      Contains split prefix: {has_split_prefix}")
    print(f"      Contains numeric sequences: {has_numbers}")
    print(f"      File extensions: {extensions}")
    print(f"      Pattern classification: {pattern_classification}")

# Step 3: Image Integrity Validation
# Decodes a random sample of images and tallies their resolutions and colour
# modes. Files that cannot be opened or fully decoded count as corrupted.
print(f"\n[3] Image Integrity Validation")

# Sample 50 random images for validation (efficient approach)
validation_sample_size = 50
image_extensions = ('.jpg', '.jpeg', '.png')
all_image_paths = []

for split in expected_splits:
    for class_name in expected_classes:
        class_path = os.path.join(processed_path, split, class_name)
        if os.path.exists(class_path):
            files = [os.path.join(class_path, f) for f in os.listdir(class_path)
                    if not f.startswith('.') and f.lower().endswith(image_extensions)]
            all_image_paths.extend(files)

if len(all_image_paths) > validation_sample_size:
    sample_paths = random.sample(all_image_paths, validation_sample_size)
else:
    sample_paths = all_image_paths

print(f"    Validating {len(sample_paths)} random samples...")

resolutions = {}
color_modes = {}
corrupted_count = 0
validated_count = 0
corrupted_files = []  # (path, reason) pairs, reported below

for img_path in sample_paths:
    try:
        with Image.open(img_path) as img:
            resolution = f"{img.size[0]}x{img.size[1]}"
            mode = img.mode
            # Image.open only parses the header; load() forces the pixel data
            # to be decoded so truncated/corrupt files are actually detected.
            img.load()
    except Exception as e:
        # Any failure to open or decode marks the file as corrupted; keep the
        # reason so the problem can be investigated.
        corrupted_count += 1
        corrupted_files.append((img_path, f"{type(e).__name__}: {e}"))
        continue

    resolutions[resolution] = resolutions.get(resolution, 0) + 1
    color_modes[mode] = color_modes.get(mode, 0) + 1
    validated_count += 1

# Print validation results
print(f"    Validation results:")
print(f"      Successfully validated: {validated_count}/{len(sample_paths)}")
print(f"      Corrupted files: {corrupted_count}")

# List a handful of the failing files (relative to the dataset root).
for bad_path, reason in corrupted_files[:5]:
    print(f"        {os.path.relpath(bad_path, processed_path)} - {reason}")

if resolutions:
    print(f"      Resolution distribution:")
    for res, count in sorted(resolutions.items(), key=lambda x: x[1], reverse=True):
        percentage = (count/validated_count)*100
        print(f"        {res}: {count} images ({percentage:.1f}%)")

if color_modes:
    print(f"      Color mode distribution:")
    for mode, count in sorted(color_modes.items(), key=lambda x: x[1], reverse=True):
        percentage = (count/validated_count)*100
        print(f"        {mode}: {count} images ({percentage:.1f}%)")

# Step 4: Generate Metadata CSV
# Writes one row per non-hidden entry of every split/class directory with its
# project-relative path, emotion label and split name.
print(f"\n[4] Generating Metadata CSV")

metadata_columns = ['filepath', 'emotion_label', 'split']
metadata_records = []
total_processed = 0

for split in expected_splits:
    for class_name in expected_classes:
        class_path = os.path.join(processed_path, split, class_name)

        if os.path.exists(class_path):
            files = [f for f in os.listdir(class_path) if not f.startswith('.')]

            # os.listdir returns entries in arbitrary order; sorting makes the
            # CSV row order reproducible between runs.
            for filename in sorted(files):
                # Create relative filepath from base research directory
                relative_path = f"datasets/processed_raf/{split}/{class_name}/{filename}"

                metadata_records.append({
                    'filepath': relative_path,
                    'emotion_label': class_name,
                    'split': split
                })
                total_processed += 1

# Create DataFrame and save. Passing the columns explicitly keeps the schema
# (and the CSV header) intact even when no files were found.
df_metadata = pd.DataFrame(metadata_records, columns=metadata_columns)

if df_metadata.empty:
    print(f"    WARNING: No files found under {processed_path}; metadata CSV will be empty")

# Create metadata directory if not exists
os.makedirs(metadata_path, exist_ok=True)

# Save CSV
csv_path = os.path.join(metadata_path, "rafdb_metadata.csv")
df_metadata.to_csv(csv_path, index=False)

print(f"    Total images processed: {total_processed}")
print(f"    Metadata records created: {len(metadata_records)}")
print(f"    CSV saved: datasets/metadata/rafdb_metadata.csv")

# Step 5: Distribution Summary
# Builds a per-class / per-split count table from the metadata, assesses class
# balance over ALL expected classes, and prints the closing summary.
print(f"\n[5] Distribution Summary")

dataset_size = len(df_metadata)

# Count every (emotion, split) pair in one pass. An empty DataFrame may have
# no columns at all, so it is handled without touching them.
if dataset_size > 0:
    pair_counts = df_metadata.groupby(['emotion_label', 'split']).size().to_dict()
else:
    pair_counts = {}

# Create distribution table
distribution_data = []
class_totals = {}

for class_name in expected_classes:
    row = {'Emotion': class_name}
    total = 0

    for split in expected_splits:
        count = int(pair_counts.get((class_name, split), 0))
        row[split.capitalize()] = count
        total += count

    row['Total'] = total

    # Calculate percentage of total dataset (guarded for an empty dataset)
    row['Percentage'] = f"{(total/dataset_size)*100:.2f}%" if dataset_size > 0 else "0.00%"

    class_totals[class_name] = total
    distribution_data.append(row)

# Add totals row
totals_row = {'Emotion': 'TOTAL'}
for split in expected_splits:
    totals_row[split.capitalize()] = sum(r[split.capitalize()] for r in distribution_data)
totals_row['Total'] = dataset_size
totals_row['Percentage'] = "100.00%"
distribution_data.append(totals_row)

df_distribution = pd.DataFrame(distribution_data)

print("\n    Class Distribution Table:")
print(df_distribution.to_string(index=False))

# Balance analysis
# The counts are indexed by the expected classes so that a class with zero
# samples is part of the comparison instead of silently dropping out.
print(f"\n    Balance Analysis:")
class_counts = pd.Series(class_totals, dtype='int64')
max_count = int(class_counts.max())
min_count = int(class_counts.min())
imbalance_ratio = max_count / min_count if min_count > 0 else 0

print(f"      Most frequent class: {class_counts.idxmax()} ({max_count} samples)")
print(f"      Least frequent class: {class_counts.idxmin()} ({min_count} samples)")

if min_count <= 0:
    # A ratio is undefined when a class is empty; do not report it as balanced.
    print(f"      Imbalance ratio: N/A")
    print(f"      Status: INCOMPLETE (at least one expected class has no samples)")
else:
    print(f"      Imbalance ratio: {imbalance_ratio:.2f}:1")
    if imbalance_ratio > 3:
        print(f"      Status: HIGHLY IMBALANCED (consider augmentation or weighting)")
    elif imbalance_ratio > 2:
        print(f"      Status: MODERATELY IMBALANCED (monitor during training)")
    else:
        print(f"      Status: RELATIVELY BALANCED")

# Final Summary
print(f"\n" + "=" * 70)
print("VALIDATION & METADATA SUMMARY")
print("=" * 70)
print(f"Structure: {'VALID' if structure_valid else 'INVALID'}")
print(f"Total images: {dataset_size}")
print(f"Splits: Train={totals_row.get('Train', 0)}, "
      f"Val={totals_row.get('Val', 0)}, "
      f"Test={totals_row.get('Test', 0)}")
print(f"Classes: {len(expected_classes)} emotion categories")
print(f"Image integrity: {validated_count}/{len(sample_paths)} samples validated")
print(f"Metadata CSV: {csv_path}")
print(f"\nReady for Cell 3: Distribution visualization")
print("=" * 70)

Mounted at /content/drive
RAF-DB VALIDATION & METADATA GENERATION

[1] Directory Structure Validation
    Structure validation: PASSED
    Splits found: 3 (train, val, test)
    Classes found: 7 (angry, disgust, fear, happy, neutral, sad, surprise)

    Initial file counts:
      train: 30023 files across 7 classes
      val: 7504 files across 7 classes
      test: 4165 files across 7 classes

[2] Filename Pattern Analysis
    Sample filenames from train split:
      angry: aug_881535.png
      disgust: aug_888266.png
      fear: aug_822167.png
      happy: train_06001_aligned.jpg
      neutral: train_10861_aligned.jpg
      sad: train_03671_aligned.jpg
      surprise: aug_975984.png

    Pattern assessment:
      Contains split prefix: True
      Contains numeric sequences: True
      File extensions: {'.jpg', '.png'}
      Pattern classification: Irregular (mixed formats detected)

[3] Image Integrity Validation
    Validating 50 random samples...
    Validation results:
      Succes

In [2]:
# @title Cell 3: RAF-DB Distribution Visualization

# File: 01_04_EDA_RAF-DB.ipynb - Cell 3
# Location: RESEARCH-WORKSPACE/ACTIVE-PROJECTS/Thesis_MER_Project/notebooks/
# Purpose: Generate professional visualizations for RAF-DB balanced dataset

import os
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import drive
import warnings
# Silence all warnings for the remainder of the session.
warnings.filterwarnings('ignore')

# Shared matplotlib look for every figure produced by this cell: start from
# the library defaults, then apply font sizes and a clean white frame.
plt.style.use('default')

text_settings = {
    'font.family': 'DejaVu Sans',
    'font.size': 12,
    'axes.titlesize': 16,
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    'legend.fontsize': 11,
    'figure.titlesize': 18,
}

frame_settings = {
    'axes.spines.top': False,
    'axes.spines.right': False,
    'axes.grid': False,
    'figure.facecolor': 'white',
    'axes.facecolor': 'white',
}

plt.rcParams.update({**text_settings, **frame_settings})

def convert_to_serializable(obj):
    """Convert numpy/pandas values to native Python types for JSON serialization.

    Containers are converted recursively:

    * ``dict`` -> ``dict`` (numpy scalar keys become native Python scalars)
    * ``list`` / ``tuple`` / ``set`` / ``frozenset`` -> ``list``
    * ``numpy.ndarray`` -> nested ``list``

    Scalars:

    * numpy booleans, integers and floats -> ``bool`` / ``int`` / ``float``
    * missing values (``None``, NaN, ``pd.NA``, ``pd.NaT``) -> ``None``

    Anything else is returned unchanged.
    """
    if isinstance(obj, dict):
        return {
            (key.item() if isinstance(key, np.generic) else key):
                convert_to_serializable(value)
            for key, value in obj.items()
        }
    if isinstance(obj, (list, tuple, set, frozenset)):
        return [convert_to_serializable(item) for item in obj]
    if isinstance(obj, np.ndarray):
        # tolist() yields native scalars; recurse so NaN entries become None.
        return convert_to_serializable(obj.tolist())
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        obj = float(obj)  # fall through so NaN is handled like a Python float

    # pd.isna returns an array for array-likes, which has no single truth
    # value; only ask it about scalars.
    if pd.api.types.is_scalar(obj) and pd.isna(obj):
        return None
    return obj

# Color palettes
SPLIT_COLORS = {
    'train': '#ff7f0e',
    'val': '#d62728',
    'test': '#2ca02c'
}

EMOTION_COLORS = {
    'angry': '#d62728',
    'disgust': '#8c564b',
    'fear': '#e377c2',
    'happy': '#2ca02c',
    'neutral': '#7f7f7f',
    'sad': '#17becf',
    'surprise': '#ff7f0e'
}

print("=" * 80)
print("RAF-DB BALANCED DATASET VISUALIZATION")
print("=" * 80)

print("\n[1] Environment setup and drive mounting...")
drive.mount('/content/drive')
print("    Google Drive mounted successfully")

# Define paths
base_path = "/content/drive/MyDrive/RESEARCH-WORKSPACE/ACTIVE-PROJECTS/Thesis_MER_Project"
metadata_csv_path = f"{base_path}/datasets/metadata/rafdb_metadata.csv"
visualization_path = f"{base_path}/datasets/visualization/04_raf-db"

os.makedirs(visualization_path, exist_ok=True)
print(f"    Output directory created: {visualization_path}")

print("\n[2] Loading RAF-DB metadata...")

try:
    df_metadata = pd.read_csv(metadata_csv_path)
except Exception as e:
    # Report, then re-raise. exit() is an interactive-shell helper: in a
    # notebook it asks the kernel to shut down and hides the traceback,
    # whereas re-raising stops the cell with the real error visible.
    print(f"    ERROR loading metadata: {str(e)}")
    raise
else:
    print(f"    Metadata loaded: {len(df_metadata)} images")
    print(f"    Columns: {list(df_metadata.columns)}")

print("\n[3] Analyzing distribution...")

# Get unique emotions and splits
all_emotions = sorted(df_metadata['emotion_label'].unique())
all_splits = ['train', 'val', 'test']

print(f"    Emotion classes: {len(all_emotions)}")
print(f"    Classes: {all_emotions}")
print(f"    Splits: {all_splits}")

# Calculate distribution: split name -> {emotion label: image count}
split_data = {}
for split in all_splits:
    split_df = df_metadata[df_metadata['split'] == split]
    split_data[split] = split_df['emotion_label'].value_counts().to_dict()
    print(f"    {split.upper()}: {len(split_df)} images")
print("\n[4] Generating File 1: Split Distribution Bar Chart...")

# One group of three bars (train / validation / test) per emotion class.
fig1, ax1 = plt.subplots(1, 1, figsize=(16, 9))

# Per-split image counts, ordered like all_emotions.
train_counts, val_counts, test_counts = (
    [split_data[split_name].get(emotion, 0) for emotion in all_emotions]
    for split_name in ('train', 'val', 'test')
)

x = np.arange(len(all_emotions))
width = 0.25

# (bar positions, heights, legend label, colour) for each split.
bar_specs = [
    (x - width, train_counts, 'Train Split', SPLIT_COLORS['train']),
    (x, val_counts, 'Validation Split', SPLIT_COLORS['val']),
    (x + width, test_counts, 'Test Split', SPLIT_COLORS['test']),
]
bars1, bars2, bars3 = [
    ax1.bar(positions, heights, width, label=legend_text, color=fill, alpha=0.85)
    for positions, heights, legend_text, fill in bar_specs
]

ax1.set_title('RAF-DB Balanced Dataset Distribution\nPre-Split Macro-Expression Dataset (75x75 Resolution)',
              fontsize=18, fontweight='bold', pad=25)
ax1.set_xlabel('Emotion Classes', fontsize=16, labelpad=20)
ax1.set_ylabel('Image Count', fontsize=16, labelpad=20)
ax1.set_xticks(x)
ax1.set_xticklabels(list(map(str.title, all_emotions)), rotation=0)

ax1.legend(loc='upper right', fontsize=12)
ax1.grid(False)

# Write the count above every non-empty bar.
all_bars_data = [(bars1, train_counts), (bars2, val_counts), (bars3, test_counts)]
for bars, values in all_bars_data:
    for bar, value in zip(bars, values):
        if value <= 0:
            continue
        label_x = bar.get_x() + bar.get_width()/2
        label_y = bar.get_height() + 100
        ax1.text(label_x, label_y, f'{value:,}',
                 ha='center', va='bottom', fontsize=10, fontweight='bold')

plt.tight_layout()
file1_path = visualization_path + "/1_rafdb_split_distribution.png"
plt.savefig(file1_path, dpi=300, bbox_inches='tight', facecolor='white')
plt.close()
print("    File 1 saved: 1_rafdb_split_distribution.png")

print("\n[5] Generating File 2: Statistical Analysis Table...")

# Renders a per-emotion statistics table (counts per split, share of the
# dataset, train share, class-balance ratio) as a PNG image.
fig2 = plt.figure(figsize=(18, 10))
ax2 = fig2.add_subplot(1, 1, 1)
ax2.axis('off')

# First pass: collect the raw counts and the grand totals.
emotion_rows = []
total_train = 0
total_val = 0
total_test = 0
total_all = 0

for emotion in all_emotions:
    train_count = split_data['train'].get(emotion, 0)
    val_count = split_data['val'].get(emotion, 0)
    test_count = split_data['test'].get(emotion, 0)
    emotion_total = train_count + val_count + test_count

    total_train += train_count
    total_val += val_count
    total_test += test_count
    total_all += emotion_total

    emotion_rows.append((emotion, train_count, val_count, test_count, emotion_total))

# Class balance is a comparison BETWEEN classes: each class total is related
# to the smallest class, so a balanced dataset shows 1.00:1 in every row.
# (Comparing the splits of one class would only restate the split ratio.)
class_totals = [row[4] for row in emotion_rows]
smallest_class = min(class_totals, default=0)
largest_class = max(class_totals, default=0)
dataset_size = len(df_metadata)

# Second pass: derive the percentage and ratio columns.
table_data = []
for emotion, train_count, val_count, test_count, emotion_total in emotion_rows:
    # Calculate percentage of total
    percentage = (emotion_total / dataset_size) * 100 if dataset_size > 0 else 0

    # Calculate train percentage (for split ratio verification)
    train_pct = (train_count / emotion_total) * 100 if emotion_total > 0 else 0

    # Balance ratio relative to the smallest class (1.00:1 when balanced)
    balance_ratio = f"{emotion_total / smallest_class:.2f}:1" if smallest_class > 0 else "N/A"

    table_data.append([
        emotion.title(),
        train_count,
        val_count,
        test_count,
        emotion_total,
        f"{percentage:.2f}%",
        f"{train_pct:.1f}%",
        balance_ratio
    ])

# Add total row; its ratio is the overall largest-to-smallest class ratio.
overall_train_pct = (total_train / total_all) * 100 if total_all > 0 else 0
overall_balance = f"{largest_class / smallest_class:.2f}:1" if smallest_class > 0 else "N/A"
table_data.append([
    'TOTAL',
    total_train,
    total_val,
    total_test,
    total_all,
    '100.00%',
    f"{overall_train_pct:.1f}%",
    overall_balance
])

# Convert to display format with thousand separators
table_display_data = []
for row in table_data:
    display_row = [
        row[0],  # emotion
        f"{row[1]:,}",  # train
        f"{row[2]:,}",  # val
        f"{row[3]:,}",  # test
        f"{row[4]:,}",  # total
        row[5],  # percentage
        row[6],  # train %
        row[7]   # balance ratio
    ]
    table_display_data.append(display_row)

# Create table
column_labels = ['Emotion', 'Train', 'Validation', 'Test', 'Total', 'Dataset %', 'Train %', 'Balance\nRatio']
table = ax2.table(
    cellText=table_display_data,
    colLabels=column_labels,
    cellLoc='center',
    loc='center',
    colWidths=[0.12, 0.11, 0.11, 0.11, 0.11, 0.11, 0.11, 0.11]
)

table.auto_set_font_size(False)
table.set_fontsize(12)
table.scale(1, 3.0)

# Professional table styling. Row 0 is the header; the last row is TOTAL.
num_rows = len(table_display_data)
for i in range(num_rows + 1):
    for j in range(len(column_labels)):
        cell = table[(i, j)]
        if i == 0:  # Header
            cell.set_facecolor('#1f77b4')
            cell.set_text_props(weight='bold', color='white')
        elif i == num_rows:  # Total row
            cell.set_facecolor('#f0f0f0')
            cell.set_text_props(weight='bold')
        else:
            # Alternate row colors for readability
            if i % 2 == 0:
                cell.set_facecolor('#ffffff')
            else:
                cell.set_facecolor('#f9f9f9')

# NOTE(review): resolution and colour format in the title are fixed text, not
# measured here - confirm against the image validation results.
ax2.set_title('RAF-DB Balanced Dataset Statistical Analysis\n' +
              'Macro-Expression Pre-Training Dataset | Resolution: 75x75 | Format: Grayscale',
              fontsize=16, fontweight='bold', pad=40)

plt.tight_layout()
file2_path = f"{visualization_path}/2_rafdb_statistical_table.png"
plt.savefig(file2_path, dpi=300, bbox_inches='tight', facecolor='white')
plt.close()
print(f"    File 2 saved: 2_rafdb_statistical_table.png")

print("\n[6] Generating visualization metadata...")

# Writes a JSON description of the dataset and the generated figures, then
# prints the closing summary. The balance label is derived from the measured
# class counts rather than assumed.

# Calculate class balance metrics
class_counts = df_metadata['emotion_label'].value_counts()
if class_counts.empty:
    max_class_count = 0
    min_class_count = 0
    most_frequent_class = None
    least_frequent_class = None
else:
    max_class_count = int(class_counts.max())
    min_class_count = int(class_counts.min())
    most_frequent_class = class_counts.idxmax()
    least_frequent_class = class_counts.idxmin()
imbalance_ratio = max_class_count / min_class_count if min_class_count > 0 else 0

# Balance label; the 2x / 3x thresholds match the balance analysis printed by
# the validation cell of this notebook.
if min_class_count <= 0:
    overall_balance = 'unknown'
elif max_class_count == min_class_count:
    overall_balance = 'perfectly_balanced'
elif imbalance_ratio > 3:
    overall_balance = 'highly_imbalanced'
elif imbalance_ratio > 2:
    overall_balance = 'moderately_imbalanced'
else:
    overall_balance = 'relatively_balanced'
balance_text = overall_balance.replace('_', ' ').capitalize()

# Split ratios (guarded so an empty dataset does not divide by zero)
train_ratio = (total_train / total_all) * 100 if total_all > 0 else 0.0
val_ratio = (total_val / total_all) * 100 if total_all > 0 else 0.0
test_ratio = (total_test / total_all) * 100 if total_all > 0 else 0.0

# Generate comprehensive metadata
# NOTE(review): 'resolution' and 'color_mode' are fixed values, not measured
# in this cell - confirm against the image validation results.
analysis_metadata = {
    'analysis_timestamp': pd.Timestamp.now().isoformat(),
    'dataset_info': {
        'name': 'RAF-DB Balanced Dataset',
        'purpose': 'macro_expression_transfer_learning',
        'total_images': int(total_all),
        'resolution': '75x75',
        'color_mode': 'grayscale',
        'emotion_classes': len(all_emotions),
        'class_list': all_emotions
    },
    'balance_analysis': convert_to_serializable({
        'overall_balance': overall_balance,
        'imbalance_ratio': f"{imbalance_ratio:.2f}:1",
        'most_frequent_class': most_frequent_class,
        'least_frequent_class': least_frequent_class,
        # Kept for compatibility: this is the size of the largest class.
        'samples_per_class': int(max_class_count),
        'min_samples_per_class': int(min_class_count),
        'max_samples_per_class': int(max_class_count)
    }),
    'split_distribution': convert_to_serializable({
        'train': {
            'count': int(total_train),
            'percentage': f"{train_ratio:.2f}%",
            'class_distribution': {k: int(v) for k, v in split_data['train'].items()}
        },
        'validation': {
            'count': int(total_val),
            'percentage': f"{val_ratio:.2f}%",
            'class_distribution': {k: int(v) for k, v in split_data['val'].items()}
        },
        'test': {
            'count': int(total_test),
            'percentage': f"{test_ratio:.2f}%",
            'class_distribution': {k: int(v) for k, v in split_data['test'].items()}
        }
    }),
    'visualization_files': {
        'split_distribution': '1_rafdb_split_distribution.png',
        'statistical_table': '2_rafdb_statistical_table.png'
    },
    'color_scheme': {
        'emotion_colors': EMOTION_COLORS,
        'split_colors': SPLIT_COLORS
    },
    'transfer_learning_notes': {
        'casme2_compatible_classes': ['disgust', 'fear', 'surprise', 'sad/sadness', 'happy/happiness'],
        'incompatible_mappings': {
            'neutral': 'no_direct_casme2_equivalent',
            'angry': 'semantically_different_from_repression'
        },
        'recommendation': 'use_5_overlapping_classes_for_transfer_learning'
    }
}

metadata_file = f"{visualization_path}/rafdb_visualization_metadata.json"
# Explicit encoding so the file does not depend on the platform default.
with open(metadata_file, 'w', encoding='utf-8') as f:
    json.dump(analysis_metadata, f, indent=2)

print(f"    Metadata saved: rafdb_visualization_metadata.json")

print("\n" + "=" * 80)
print("RAF-DB VISUALIZATION COMPLETE")
print("=" * 80)
print(f"Status: SUCCESS")
print(f"Output location: {visualization_path}")

print("\nGenerated files:")
print("  1. 1_rafdb_split_distribution.png - Bar chart comparison across splits")
print("  2. 2_rafdb_statistical_table.png - Detailed statistical breakdown")
print("  3. rafdb_visualization_metadata.json - Comprehensive dataset metadata")

print("\nDataset Summary:")
print(f"  Total images: {total_all:,}")
print(f"  Train: {total_train:,} images ({train_ratio:.1f}%)")
print(f"  Validation: {total_val:,} images ({val_ratio:.1f}%)")
print(f"  Test: {total_test:,} images ({test_ratio:.1f}%)")
print(f"  Classes: {len(all_emotions)} emotions")
print(f"  Balance status: {balance_text} ({imbalance_ratio:.2f}:1)")
print(f"  Resolution: 75x75 grayscale")

print("\nTransfer Learning Notes:")
print("  Compatible with CASME II: 5 overlapping classes")
print("  Recommended strategy: Use disgust, fear, surprise, sad, happy")
print("  Exclude: neutral and angry due to semantic mismatch")

print("\n  Ready for Phase 4: Transfer Learning preprocessing")
print("=" * 80)

RAF-DB BALANCED DATASET VISUALIZATION

[1] Environment setup and drive mounting...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
    Google Drive mounted successfully
    Output directory created: /content/drive/MyDrive/RESEARCH-WORKSPACE/ACTIVE-PROJECTS/Thesis_MER_Project/datasets/visualization/04_raf-db

[2] Loading RAF-DB metadata...
    Metadata loaded: 41692 images
    Columns: ['filepath', 'emotion_label', 'split']

[3] Analyzing distribution...
    Emotion classes: 7
    Classes: ['angry', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise']
    Splits: ['train', 'val', 'test']
    TRAIN: 30023 images
    VAL: 7504 images
    TEST: 4165 images

[4] Generating File 1: Split Distribution Bar Chart...
    File 1 saved: 1_rafdb_split_distribution.png

[5] Generating File 2: Statistical Analysis Table...
    File 2 saved: 2_rafdb_statistical_table.png

[6] Generating visualization metadata...
    