In [None]:
# @title Cell 1: CASME2 Grayscale + Center Crop Preprocessing Pipeline
# File: preprocess_casme2_grayscale_centercrop.py
# Location: Thesis_MER_Project/scripts/preprocessing/
# Purpose: Transform v1/v2/v3 datasets to grayscale 224x224 center-cropped versions (v4/v5/v6)

import os
import json
import cv2
import numpy as np
from pathlib import Path
from datetime import datetime
from collections import defaultdict
from google.colab import drive

# ============================================================================
# CONFIGURATION
# ============================================================================

# Mount Google Drive
# Colab-only call; every path defined below lives under this mount point.
drive.mount('/content/drive')

# Define base paths
# BASE_PATH is the project root on Drive; PROCESSED_PATH holds the source
# splits (data_split_v1/v2/v3) that this cell reads from.
BASE_PATH = "/content/drive/MyDrive/RESEARCH-WORKSPACE/ACTIVE-PROJECTS/Thesis_MER_Project"
PROCESSED_PATH = f"{BASE_PATH}/datasets/processed_casme2"

# Dataset mapping: source -> target
# Each entry describes one conversion job:
#   'source'      - directory containing train/val/test sub-folders to read
#   'target'      - directory the converted images and summary JSON are written to
#   'description' - human-readable label printed in the run log
#   'variant'     - short code (AF/KFS/MFS) stored in the summary
# NOTE(review): targets here are "{BASE_PATH}/datasets/preprocessed_casme2_vN",
# whereas the validation cell in this notebook reads
# "{PROCESSED_PATH}/preprocessed_vN" -- confirm which location is current.
DATASET_MAPPING = {
    'v1_to_v4': {
        'source': f"{PROCESSED_PATH}/data_split_v1",
        'target': f"{BASE_PATH}/datasets/preprocessed_casme2_v4",
        'description': 'AF - Apex Frame (grayscale 224x224)',
        'variant': 'AF'
    },
    'v2_to_v5': {
        'source': f"{PROCESSED_PATH}/data_split_v2",
        'target': f"{BASE_PATH}/datasets/preprocessed_casme2_v5",
        'description': 'KFS - Key Frame Sequence (grayscale 224x224)',
        'variant': 'KFS'
    },
    'v3_to_v6': {
        'source': f"{PROCESSED_PATH}/data_split_v3",
        'target': f"{BASE_PATH}/datasets/preprocessed_casme2_v6",
        'description': 'MFS - Multi-Frame Sequence (grayscale 224x224)',
        'variant': 'MFS'
    }
}

# Processing parameters
# Images are first brought to INTERMEDIATE_SIZE x INTERMEDIATE_SIZE and then
# center-cropped to FINAL_SIZE x FINAL_SIZE.
INTERMEDIATE_SIZE = 256
FINAL_SIZE = 224
# (256 - 224) // 2 = 16 px removed per side. Not referenced by the functions
# visible in this cell; center_crop() recomputes the offset itself.
CROP_MARGIN = (INTERMEDIATE_SIZE - FINAL_SIZE) // 2

# ============================================================================
# PREPROCESSING FUNCTIONS
# ============================================================================

def smart_resize_preserve_aspect(image, target_size=256):
    """
    Cut the largest centered square out of an image and scale it.

    The square's side equals the shorter image dimension, so only the longer
    axis is trimmed, equally on both ends (e.g. a 640x480 frame loses 80px on
    the left and on the right). The square is then resampled to
    target_size x target_size with Lanczos interpolation, so no stretching
    is introduced by this step.

    Args:
        image: Input image (H, W, C) or (H, W)
        target_size: Output square dimension

    Returns:
        Resized square image
    """
    height, width = image.shape[:2]

    # The shorter side defines the square; the offset along that axis is 0.
    side = min(height, width)
    top = (height - side) // 2
    left = (width - side) // 2

    square = image[top:top + side, left:left + side]

    return cv2.resize(
        square,
        (target_size, target_size),
        interpolation=cv2.INTER_LANCZOS4,
    )

def convert_to_grayscale(image):
    """
    Return a single-channel version of the image.

    Arrays that are not 3-dimensional are handed back untouched (same
    object); 3-dimensional arrays are converted with OpenCV's BGR->gray
    conversion.

    Args:
        image: Input image, (H, W) or (H, W, C) in BGR channel order

    Returns:
        Grayscale image
    """
    # Guard clause: nothing to convert unless there is a channel axis.
    if len(image.shape) != 3:
        return image

    return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

def center_crop(image, crop_size=224):
    """
    Extract a crop_size x crop_size window from the center of an image.

    Args:
        image: Input image (H, W) or (H, W, C)
        crop_size: Side length of the square center crop, in pixels

    Returns:
        Center-cropped image (a view on the input array, not a copy)

    Raises:
        ValueError: If crop_size is not positive or exceeds the image height
            or width. Without this check the start offsets become negative,
            and negative slice indices wrap around, silently yielding a
            wrong-sized, off-center region.
    """
    h, w = image.shape[:2]

    if crop_size <= 0 or crop_size > h or crop_size > w:
        raise ValueError(
            f"Cannot center-crop {crop_size}x{crop_size} from image of size {w}x{h}"
        )

    start_y = (h - crop_size) // 2
    start_x = (w - crop_size) // 2

    return image[start_y:start_y + crop_size, start_x:start_x + crop_size]

def preprocess_image(image_path):
    """
    Run the full single-image pipeline and return the result.

    Pipeline:
    1. Load the file with cv2.imread
    2. Center-crop to a square and resize to INTERMEDIATE_SIZE (256x256)
    3. Convert to grayscale
    4. Center crop to FINAL_SIZE (224x224)

    NOTE(review): if the stored inputs are already square, step 2's square
    crop is a no-op and only the resize takes effect -- confirm the source
    image geometry.

    Args:
        image_path: Path to input image

    Returns:
        Preprocessed grayscale 224x224 image, or None if the file could not
        be read or any step raised
    """
    try:
        loaded = cv2.imread(image_path)

        # imread signals an unreadable file by returning None, not by raising.
        if loaded is None:
            return None

        squared = smart_resize_preserve_aspect(loaded, INTERMEDIATE_SIZE)
        return center_crop(convert_to_grayscale(squared), FINAL_SIZE)

    except Exception as e:
        # Best-effort: report and let the caller count this file as an error.
        print(f"Error processing {image_path}: {str(e)}")
        return None

# ============================================================================
# DATASET PROCESSING
# ============================================================================

def process_dataset(source_dir, target_dir, variant_name):
    """
    Process entire dataset: transform all images in train/val/test splits.

    For every split directory that exists under source_dir, each image file
    (.jpg/.jpeg/.png, case-insensitive) is run through preprocess_image() and
    written under the same filename to the matching split directory in
    target_dir. A file counts as processed only if preprocessing succeeded
    AND the write succeeded; otherwise it is counted as an error.

    Args:
        source_dir: Source dataset directory (v1/v2/v3)
        target_dir: Target directory (v4/v5/v6)
        variant_name: Dataset variant (AF/KFS/MFS)

    Returns:
        Processing statistics dictionary with per-split counts, a per-split
        emotion distribution (label = last '_'-separated token of the
        filename stem), and overall processed/error totals
    """
    stats = {
        'variant': variant_name,
        'processing_date': datetime.now().isoformat(),
        'source_directory': source_dir,
        'target_directory': target_dir,
        'preprocessing_steps': [
            'Smart resize 640x480 -> 256x256 (aspect preserved)',
            'Grayscale conversion (RGB -> Gray)',
            'Center crop 256x256 -> 224x224'
        ],
        'splits': {},
        'total_processed': 0,
        'total_errors': 0
    }

    splits = ['train', 'val', 'test']

    for split in splits:
        source_split_dir = os.path.join(source_dir, split)
        target_split_dir = os.path.join(target_dir, split)

        if not os.path.exists(source_split_dir):
            print(f"  Warning: {split} split not found in source directory")
            continue

        # Create target directory
        os.makedirs(target_split_dir, exist_ok=True)

        # Get all image files; sorted because os.listdir order is arbitrary
        # and a stable order makes logs and summaries reproducible.
        image_files = sorted(
            f for f in os.listdir(source_split_dir)
            if f.lower().endswith(('.jpg', '.jpeg', '.png'))
        )

        split_stats = {
            'total_images': len(image_files),
            'processed': 0,
            'errors': 0,
            'emotion_distribution': defaultdict(int)
        }

        print(f"  Processing {split} split: {len(image_files)} images")

        # Process each image
        for idx, img_filename in enumerate(image_files, 1):
            source_path = os.path.join(source_split_dir, img_filename)
            target_path = os.path.join(target_split_dir, img_filename)

            # Preprocess image
            processed_img = preprocess_image(source_path)

            # cv2.imwrite reports a failed write through its boolean return
            # value, so it must be checked before counting the file as done.
            saved = False
            if processed_img is not None:
                saved = bool(cv2.imwrite(target_path, processed_img))
                if not saved:
                    print(f"Error saving {target_path}")

            if saved:
                split_stats['processed'] += 1

                # Extract emotion from filename for statistics. splitext
                # strips whatever extension the file has (.jpg/.jpeg/.png,
                # any case) rather than only a literal '.jpg'.
                stem = os.path.splitext(img_filename)[0]
                emotion = stem.split('_')[-1]
                split_stats['emotion_distribution'][emotion] += 1
            else:
                split_stats['errors'] += 1
                stats['total_errors'] += 1

            # Progress indicator (every 100 images)
            if idx % 100 == 0:
                print(f"    Progress: {idx}/{len(image_files)}")

        # Convert to plain dicts so the result can be dumped to JSON.
        stats['splits'][split] = dict(split_stats)
        stats['splits'][split]['emotion_distribution'] = dict(split_stats['emotion_distribution'])
        stats['total_processed'] += split_stats['processed']

        print(f"    Completed: {split_stats['processed']} processed, {split_stats['errors']} errors")

    return stats

# ============================================================================
# MAIN EXECUTION
# ============================================================================

# Driver for Cell 1: converts every dataset listed in DATASET_MAPPING,
# writes a per-dataset preprocessing_summary.json, then prints a recap.
print("=" * 80)
print("CASME2 PREPROCESSING PIPELINE: GRAYSCALE + CENTER CROP (224x224)")
print("=" * 80)
print()

# variant code (AF/KFS/MFS) -> statistics dict returned by process_dataset()
all_results = {}

for mapping_name, config in DATASET_MAPPING.items():
    source_dir = config['source']
    target_dir = config['target']
    variant = config['variant']
    description = config['description']

    print(f"[{variant}] {description}")
    print(f"Source: {source_dir}")
    print(f"Target: {target_dir}")
    print()

    # Check if source exists
    # A missing source is reported and skipped; it gets no entry in
    # all_results and therefore does not appear in the final summary.
    if not os.path.exists(source_dir):
        print(f"  Error: Source directory not found")
        print()
        continue

    # Create target base directory
    os.makedirs(target_dir, exist_ok=True)

    # Process dataset
    processing_stats = process_dataset(source_dir, target_dir, variant)

    # Save processing summary
    # Written next to the train/val/test folders inside the target directory.
    summary_path = os.path.join(target_dir, 'preprocessing_summary.json')
    with open(summary_path, 'w') as f:
        json.dump(processing_stats, f, indent=2)

    all_results[variant] = processing_stats

    print(f"  Summary saved to: preprocessing_summary.json")
    print(f"  Total processed: {processing_stats['total_processed']}")
    print(f"  Total errors: {processing_stats['total_errors']}")
    print()
    print("-" * 80)
    print()

# ============================================================================
# FINAL SUMMARY
# ============================================================================

print("=" * 80)
print("PREPROCESSING COMPLETE - SUMMARY")
print("=" * 80)
print()

# Recap per processed variant, with a per-split breakdown for splits that
# were actually present in the source.
for variant, stats in all_results.items():
    print(f"{variant} Dataset:")
    print(f"  Total processed: {stats['total_processed']} images")
    print(f"  Errors: {stats['total_errors']}")

    for split in ['train', 'val', 'test']:
        if split in stats['splits']:
            split_data = stats['splits'][split]
            print(f"  {split.upper()}: {split_data['processed']} images")

    print()

# Lists every configured target, including those whose source was skipped.
print("All preprocessed datasets saved to:")
for config in DATASET_MAPPING.values():
    print(f"  - {config['target']}")

print()
print("=" * 80)

Mounted at /content/drive
CASME2 PREPROCESSING PIPELINE: GRAYSCALE + CENTER CROP (224x224)

[AF] AF - Apex Frame (grayscale 224x224)
Source: /content/drive/MyDrive/RESEARCH-WORKSPACE/ACTIVE-PROJECTS/Thesis_MER_Project/datasets/processed_casme2/data_split_v1
Target: /content/drive/MyDrive/RESEARCH-WORKSPACE/ACTIVE-PROJECTS/Thesis_MER_Project/datasets/preprocessed_casme2_v4

  Processing train split: 201 images
    Progress: 100/201
    Progress: 200/201
    Completed: 201 processed, 0 errors
  Processing val split: 26 images
    Completed: 26 processed, 0 errors
  Processing test split: 28 images
    Completed: 28 processed, 0 errors
  Summary saved to: preprocessing_summary.json
  Total processed: 255
  Total errors: 0

--------------------------------------------------------------------------------

[KFS] KFS - Key Frame Sequence (grayscale 224x224)
Source: /content/drive/MyDrive/RESEARCH-WORKSPACE/ACTIVE-PROJECTS/Thesis_MER_Project/datasets/processed_casme2/data_split_v2
Target: /con

In [None]:
# @title Cell 2: CASME2 Preprocessing Validation + Face Detection Analysis
# File: validate_preprocessed_casme2.py
# Location: Thesis_MER_Project/scripts/preprocessing/
# Purpose: Validate v4/v5/v6 preprocessing quality and face detection analysis

import os
import json
import cv2
import dlib
import numpy as np
from pathlib import Path
from datetime import datetime
from collections import defaultdict
import matplotlib.pyplot as plt
from google.colab import drive

# ============================================================================
# CONFIGURATION
# ============================================================================

# Mount Google Drive
# Colab-only call; every path defined below lives under this mount point.
drive.mount('/content/drive')

# Define base paths
BASE_PATH = "/content/drive/MyDrive/RESEARCH-WORKSPACE/ACTIVE-PROJECTS/Thesis_MER_Project"
PROCESSED_PATH = f"{BASE_PATH}/datasets/processed_casme2"

# Preprocessed dataset paths
# Each entry describes one dataset to validate:
#   'path'        - preprocessed dataset to inspect (train/val/test inside)
#   'source'      - original split directory used for the image-count check
#   'variant'     - short code (AF/KFS/MFS) stored in the report
#   'description' - human-readable label printed in the run log
# NOTE(review): 'path' points at "{PROCESSED_PATH}/preprocessed_vN", while the
# preprocessing cell in this notebook writes to
# "{BASE_PATH}/datasets/preprocessed_casme2_vN" -- confirm these refer to the
# data you intend to validate.
PREPROCESSED_DATASETS = {
    'v4': {
        'path': f"{PROCESSED_PATH}/preprocessed_v4",
        'source': f"{PROCESSED_PATH}/data_split_v1",
        'variant': 'AF',
        'description': 'Apex Frame'
    },
    'v5': {
        'path': f"{PROCESSED_PATH}/preprocessed_v5",
        'source': f"{PROCESSED_PATH}/data_split_v2",
        'variant': 'KFS',
        'description': 'Key Frame Sequence'
    },
    'v6': {
        'path': f"{PROCESSED_PATH}/preprocessed_v6",
        'source': f"{PROCESSED_PATH}/data_split_v3",
        'variant': 'MFS',
        'description': 'Multi-Frame Sequence'
    }
}

# Validation parameters
EXPECTED_SIZE = (224, 224)  # (height, width) compared against img.shape
EXPECTED_CHANNELS = 1  # Grayscale
# Compared against the Euclidean norm of the normalized x/y offsets of the
# face-box center from the image center (see detect_face_centrality).
CENTRALITY_THRESHOLD = 0.20  # Face off-center by >20% flagged
MIN_FACE_DETECTION_RATE = 0.90  # 90% minimum acceptable

# Initialize Dlib face detector
# Module-level detector instance shared by detect_face_centrality().
print("Initializing Dlib face detector...")
detector = dlib.get_frontal_face_detector()
print("Face detector loaded")
print()

# ============================================================================
# VALIDATION FUNCTIONS
# ============================================================================

def verify_image_counts(source_dir, target_dir):
    """
    Compare the number of image files per split between two dataset roots.

    Files are counted by extension (.jpg/.jpeg/.png, case-insensitive) in
    the train, val and test sub-directories. A split directory that does
    not exist contributes a count of zero.

    Args:
        source_dir: Original dataset directory
        target_dir: Preprocessed dataset directory

    Returns:
        Dictionary keyed by split name; each value holds the 'source' count,
        the 'target' count and a 'match' flag that is True when they are equal
    """
    image_suffixes = ('.jpg', '.jpeg', '.png')

    def count_images(folder):
        # A missing split directory counts as zero images.
        if not os.path.exists(folder):
            return 0
        return sum(
            1 for name in os.listdir(folder)
            if name.lower().endswith(image_suffixes)
        )

    counts = {}
    for split in ['train', 'val', 'test']:
        n_source = count_images(os.path.join(source_dir, split))
        n_target = count_images(os.path.join(target_dir, split))
        counts[split] = {
            'source': n_source,
            'target': n_target,
            'match': n_source == n_target,
        }

    return counts

def check_image_dimensions(image_path):
    """
    Verify image has correct dimensions and channels.

    The file is loaded with cv2.IMREAD_UNCHANGED so that the array reflects
    the channel layout actually stored on disk. Loading with
    IMREAD_GRAYSCALE would always yield a 2-D array and make the channel
    check pass unconditionally.

    Args:
        image_path: Path to image file

    Returns:
        Dictionary with dimension check results. On success it contains
        'valid', 'dimensions' (height, width), 'channels', 'correct_size'
        and 'correct_channels'; on failure it contains 'valid': False and
        an 'error' message.
    """
    try:
        img = cv2.imread(image_path, cv2.IMREAD_UNCHANGED)

        if img is None:
            return {'valid': False, 'error': 'Failed to load'}

        # shape is (H, W) for single-channel data and (H, W, C) otherwise.
        h, w = img.shape[:2]
        channels = 1 if len(img.shape) == 2 else img.shape[2]

        correct_size = (h, w) == EXPECTED_SIZE
        correct_channels = channels == EXPECTED_CHANNELS

        return {
            'valid': correct_size and correct_channels,
            'dimensions': (h, w),
            'channels': channels,
            'correct_size': correct_size,
            'correct_channels': correct_channels
        }

    except Exception as e:
        return {'valid': False, 'error': str(e)}

def detect_face_centrality(image_path):
    """
    Locate the (largest) face in an image and measure how centered it is.

    The centrality score is the Euclidean norm of the horizontal and
    vertical offsets between the face-box center and the image center,
    each offset normalized by the image width / height respectively.
    A score at or below CENTRALITY_THRESHOLD counts as well centered.

    Args:
        image_path: Path to image file

    Returns:
        Dictionary with 'detected': True plus score, offsets, relative box
        size, 'well_centered' flag and 'bbox' when a face is found;
        otherwise 'detected': False with a 'reason' or 'error' entry
    """
    try:
        gray = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
        if gray is None:
            return {'detected': False, 'error': 'Failed to load image'}

        # Second argument is passed through to the dlib detector
        # (upsampling count per the dlib API).
        detections = detector(gray, 1)
        if len(detections) == 0:
            return {'detected': False, 'reason': 'No face detected'}

        # With several detections keep the one with the largest box area
        # (first one wins on ties).
        box = max(detections, key=lambda r: r.width() * r.height())

        img_h = gray.shape[0]
        img_w = gray.shape[1]

        # Box center vs. image center, each axis normalized to [0, 1] units.
        box_cx = (box.left() + box.right()) / 2
        box_cy = (box.top() + box.bottom()) / 2
        dx = abs(box_cx - img_w / 2) / img_w
        dy = abs(box_cy - img_h / 2) / img_h

        score = np.sqrt(dx**2 + dy**2)

        result = {'detected': True}
        result['centrality_score'] = score
        result['offset_x'] = dx
        result['offset_y'] = dy
        result['face_width_ratio'] = box.width() / img_w
        result['face_height_ratio'] = box.height() / img_h
        result['well_centered'] = score <= CENTRALITY_THRESHOLD
        result['bbox'] = {
            'left': box.left(),
            'top': box.top(),
            'right': box.right(),
            'bottom': box.bottom(),
        }
        return result

    except Exception as e:
        return {'detected': False, 'error': str(e)}

def analyze_image_quality(image_path):
    """
    Compute simple intensity statistics for an image loaded as grayscale.

    Args:
        image_path: Path to image file

    Returns:
        Dictionary with 'valid': True and mean / standard deviation /
        minimum / maximum intensity plus 'contrast' (std divided by mean,
        or 0 when the mean is 0); {'valid': False} if the file cannot be
        loaded, with an added 'error' message if an exception occurred
    """
    try:
        pixels = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
        if pixels is None:
            return {'valid': False}

        mean_val = np.mean(pixels)
        std_val = np.std(pixels)

        # Coefficient of variation; an all-black image would divide by zero.
        if mean_val > 0:
            contrast = std_val / mean_val
        else:
            contrast = 0

        # Cast to built-in types so the values serialize cleanly.
        return {
            'valid': True,
            'mean_intensity': float(mean_val),
            'std_intensity': float(std_val),
            'min_intensity': int(np.min(pixels)),
            'max_intensity': int(np.max(pixels)),
            'contrast': float(contrast),
        }

    except Exception as e:
        return {'valid': False, 'error': str(e)}

# ============================================================================
# DATASET VALIDATION
# ============================================================================

def validate_dataset(dataset_path, source_path, variant_name):
    """
    Comprehensive validation of preprocessed dataset.

    Step 1 compares per-split image counts against the source dataset.
    Step 2 inspects a sample of each existing split: image dimensions,
    face detection / centrality, and basic intensity statistics. Overall
    rates are computed over the number of images actually sampled, and a
    recommendation string is derived from the overall face-detection and
    off-center rates.

    Args:
        dataset_path: Path to preprocessed dataset
        source_path: Path to source dataset
        variant_name: Dataset variant (AF/KFS/MFS)

    Returns:
        Validation report dictionary with per-split statistics ('splits'),
        aggregated statistics ('overall_statistics'), a list of
        human-readable 'issues_found' and a 'recommendation' string
    """
    report = {
        'variant': variant_name,
        'validation_date': datetime.now().isoformat(),
        'dataset_path': dataset_path,
        'source_path': source_path,
        'splits': {},
        'overall_statistics': {},
        'issues_found': [],
        'recommendation': ''
    }

    # Step 1: Verify image counts
    print("  Step 1: Verifying image counts...")
    count_verification = verify_image_counts(source_path, dataset_path)

    for split, counts in count_verification.items():
        if not counts['match']:
            report['issues_found'].append(
                f"{split}: Count mismatch (source: {counts['source']}, target: {counts['target']})"
            )

    # Step 2: Validate each split
    splits = ['train', 'val', 'test']

    overall_stats = {
        'total_images': 0,
        'total_sampled': 0,
        'dimension_correct': 0,
        'dimension_errors': 0,
        'faces_detected': 0,
        'faces_well_centered': 0,
        'faces_off_center': 0,
        'face_detection_failed': 0,
        'centrality_scores': []
    }

    for split in splits:
        split_path = os.path.join(dataset_path, split)

        if not os.path.exists(split_path):
            continue

        print(f"  Step 2: Analyzing {split} split...")

        # Sorted so the sampled subset is the same on every run
        # (os.listdir order is arbitrary).
        image_files = sorted(
            f for f in os.listdir(split_path)
            if f.lower().endswith(('.jpg', '.jpeg', '.png'))
        )

        split_stats = {
            'total_images': len(image_files),
            'source_count': count_verification[split]['source'],
            'count_match': count_verification[split]['match'],
            'sampled_images': 0,
            'dimension_correct': 0,
            'dimension_errors': 0,
            'faces_detected': 0,
            'faces_well_centered': 0,
            'faces_off_center': 0,
            'face_detection_failed': 0,
            'centrality_scores': [],
            'quality_metrics': {
                'mean_intensities': [],
                'contrasts': []
            },
            'problematic_images': []
        }

        # Sample validation for speed: step = n // 50, i.e. splits with fewer
        # than 100 images are checked in full, larger ones are thinned to
        # roughly 50-100 evenly spaced images.
        sample_indices = range(0, len(image_files), max(1, len(image_files) // 50))

        for idx in sample_indices:
            img_file = image_files[idx]
            img_path = os.path.join(split_path, img_file)

            # Check dimensions
            dim_check = check_image_dimensions(img_path)
            if dim_check['valid']:
                split_stats['dimension_correct'] += 1
            else:
                split_stats['dimension_errors'] += 1

            # Face detection
            face_result = detect_face_centrality(img_path)

            if face_result['detected']:
                split_stats['faces_detected'] += 1
                split_stats['centrality_scores'].append(face_result['centrality_score'])

                if face_result['well_centered']:
                    split_stats['faces_well_centered'] += 1
                else:
                    split_stats['faces_off_center'] += 1
                    split_stats['problematic_images'].append({
                        'filename': img_file,
                        'centrality_score': float(face_result['centrality_score']),
                        'offset_x': float(face_result['offset_x']),
                        'offset_y': float(face_result['offset_y'])
                    })
            else:
                split_stats['face_detection_failed'] += 1
                split_stats['problematic_images'].append({
                    'filename': img_file,
                    'issue': 'Face detection failed'
                })

            # Quality metrics
            quality = analyze_image_quality(img_path)
            if quality['valid']:
                split_stats['quality_metrics']['mean_intensities'].append(
                    quality['mean_intensity']
                )
                split_stats['quality_metrics']['contrasts'].append(
                    quality['contrast']
                )

        # Calculate rates
        sample_count = len(sample_indices)
        split_stats['sampled_images'] = sample_count
        split_stats['face_detection_rate'] = (
            split_stats['faces_detected'] / sample_count if sample_count > 0 else 0
        )
        split_stats['well_centered_rate'] = (
            split_stats['faces_well_centered'] / split_stats['faces_detected']
            if split_stats['faces_detected'] > 0 else 0
        )

        # Surface dimension/channel failures; previously they were counted
        # but never reported.
        if split_stats['dimension_errors'] > 0:
            report['issues_found'].append(
                f"{split}: {split_stats['dimension_errors']} of {sample_count} "
                f"sampled images failed the dimension/channel check"
            )

        # Aggregate quality metrics
        if split_stats['quality_metrics']['mean_intensities']:
            split_stats['quality_metrics']['avg_mean_intensity'] = float(
                np.mean(split_stats['quality_metrics']['mean_intensities'])
            )
            split_stats['quality_metrics']['avg_contrast'] = float(
                np.mean(split_stats['quality_metrics']['contrasts'])
            )

        # Update overall statistics
        overall_stats['total_images'] += split_stats['total_images']
        overall_stats['total_sampled'] += sample_count
        overall_stats['dimension_correct'] += split_stats['dimension_correct']
        overall_stats['dimension_errors'] += split_stats['dimension_errors']
        overall_stats['faces_detected'] += split_stats['faces_detected']
        overall_stats['faces_well_centered'] += split_stats['faces_well_centered']
        overall_stats['faces_off_center'] += split_stats['faces_off_center']
        overall_stats['face_detection_failed'] += split_stats['face_detection_failed']
        overall_stats['centrality_scores'].extend(split_stats['centrality_scores'])

        # Clean up before saving: plain floats for the scores, and drop the
        # raw per-image lists now that their averages are stored.
        split_stats['centrality_scores'] = [float(x) for x in split_stats['centrality_scores']]
        split_stats['quality_metrics']['mean_intensities'] = []
        split_stats['quality_metrics']['contrasts'] = []

        report['splits'][split] = split_stats

        print(f"    Images analyzed: {sample_count}")
        print(f"    Face detection rate: {split_stats['face_detection_rate']:.1%}")
        print(f"    Well-centered rate: {split_stats['well_centered_rate']:.1%}")

    # Calculate overall metrics.
    # The denominator must be the number of images sampled. Using the number
    # of centrality scores (one per *detected* face) would make the detection
    # rate 100% whenever any face was found.
    total_sampled = overall_stats['total_sampled']

    if total_sampled > 0:
        detected = overall_stats['faces_detected']
        overall_stats['face_detection_rate'] = detected / total_sampled
        overall_stats['well_centered_rate'] = (
            overall_stats['faces_well_centered'] / detected if detected > 0 else 0
        )
        overall_stats['off_center_rate'] = (
            overall_stats['faces_off_center'] / detected if detected > 0 else 0
        )
        # np.mean of an empty list is NaN (with a warning); skip it instead.
        if overall_stats['centrality_scores']:
            overall_stats['avg_centrality_score'] = float(
                np.mean(overall_stats['centrality_scores'])
            )

    overall_stats['centrality_scores'] = [float(x) for x in overall_stats['centrality_scores']]

    report['overall_statistics'] = overall_stats

    # Generate recommendation
    face_det_rate = overall_stats.get('face_detection_rate', 0)
    off_center_rate = overall_stats.get('off_center_rate', 0)

    if face_det_rate < MIN_FACE_DETECTION_RATE:
        report['issues_found'].append(
            f"Low face detection rate: {face_det_rate:.1%} (threshold: {MIN_FACE_DETECTION_RATE:.1%})"
        )
        report['recommendation'] = "NEEDS_IMPROVEMENT - Consider face-aware preprocessing"
    elif off_center_rate > 0.10:
        report['issues_found'].append(
            f"High off-center rate: {off_center_rate:.1%}"
        )
        report['recommendation'] = "NEEDS_IMPROVEMENT - Consider face-aware preprocessing"
    else:
        report['recommendation'] = "ACCEPTABLE - Proceed to training"

    return report

# ============================================================================
# MAIN EXECUTION
# ============================================================================

# Driver for Cell 2: validates every dataset listed in PREPROCESSED_DATASETS,
# writes a validation_report.json into each dataset directory, then prints a
# summary and an overall verdict.
print("=" * 80)
print("CASME2 PREPROCESSING VALIDATION + FACE DETECTION ANALYSIS")
print("=" * 80)
print()

# version key (v4/v5/v6) -> report dict returned by validate_dataset()
validation_results = {}
# Versions whose dataset directory was missing; tracked so the final verdict
# cannot claim success for data that was never inspected.
skipped_versions = []

for version, config in PREPROCESSED_DATASETS.items():
    dataset_path = config['path']
    source_path = config['source']
    variant = config['variant']
    description = config['description']

    print(f"[{version.upper()}] {variant} - {description}")
    print(f"Path: {dataset_path}")
    print()

    if not os.path.exists(dataset_path):
        print(f"  Warning: Dataset not found, skipping validation")
        print()
        skipped_versions.append(version)
        continue

    # Validate dataset
    validation_report = validate_dataset(dataset_path, source_path, variant)
    validation_results[version] = validation_report

    # Save validation report
    report_path = os.path.join(dataset_path, 'validation_report.json')
    with open(report_path, 'w') as f:
        json.dump(validation_report, f, indent=2)

    print(f"  Validation report saved: validation_report.json")
    print()
    print("-" * 80)
    print()

# ============================================================================
# FINAL SUMMARY
# ============================================================================

print("=" * 80)
print("VALIDATION SUMMARY")
print("=" * 80)
print()

for version, report in validation_results.items():
    variant = report['variant']
    stats = report['overall_statistics']

    print(f"{version.upper()} ({variant}):")
    print(f"  Total images: {stats.get('total_images', 0)}")
    print(f"  Face detection rate: {stats.get('face_detection_rate', 0):.1%}")
    print(f"  Well-centered rate: {stats.get('well_centered_rate', 0):.1%}")
    print(f"  Off-center rate: {stats.get('off_center_rate', 0):.1%}")

    if report['issues_found']:
        print(f"  Issues found: {len(report['issues_found'])}")
        # Only the first three issues are echoed; the full list is in the
        # saved report.
        for issue in report['issues_found'][:3]:
            print(f"    - {issue}")

    print(f"  Recommendation: {report['recommendation']}")
    print()

# Overall recommendation
# all() over an empty iterable is True, so require at least one validated
# dataset before declaring success.
all_acceptable = bool(validation_results) and all(
    r['recommendation'] == "ACCEPTABLE - Proceed to training"
    for r in validation_results.values()
)

print("-" * 80)
if not validation_results:
    print("OVERALL: No datasets were validated (no configured dataset path exists)")
    print("Next step: Check PREPROCESSED_DATASETS paths and run preprocessing first")
elif all_acceptable:
    print("OVERALL: All datasets passed validation")
    print("Next step: Proceed to model training")
else:
    print("OVERALL: Some datasets need improvement")
    print("Next step: Consider implementing face-aware preprocessing (Cell 3)")

if validation_results and skipped_versions:
    print(f"Note: not validated (dataset not found): {', '.join(v.upper() for v in skipped_versions)}")

print()
print("=" * 80)

Mounted at /content/drive
Initializing Dlib face detector...
Face detector loaded

CASME2 PREPROCESSING VALIDATION + FACE DETECTION ANALYSIS

[V4] AF - Apex Frame
Path: /content/drive/MyDrive/RESEARCH-WORKSPACE/ACTIVE-PROJECTS/Thesis_MER_Project/datasets/processed_casme2/preprocessed_v4

  Step 1: Verifying image counts...
  Step 2: Analyzing train split...
    Images analyzed: 51
    Face detection rate: 100.0%
    Well-centered rate: 86.3%
  Step 2: Analyzing val split...
    Images analyzed: 26
    Face detection rate: 100.0%
    Well-centered rate: 96.2%
  Step 2: Analyzing test split...
    Images analyzed: 28
    Face detection rate: 100.0%
    Well-centered rate: 82.1%
  Validation report saved: validation_report.json

--------------------------------------------------------------------------------

[V5] KFS - Key Frame Sequence
Path: /content/drive/MyDrive/RESEARCH-WORKSPACE/ACTIVE-PROJECTS/Thesis_MER_Project/datasets/processed_casme2/preprocessed_v5

  Step 1: Verifying image 

In [None]:
# @title Cell 3: CASME2 Face-Aware Preprocessing
# File: preprocess_casme2_faceaware_v3.py
# Location: Thesis_MER_Project/scripts/preprocessing/
# Purpose: Face-centered crop with forehead inclusion for complete micro-expression coverage

import os
import json
import cv2
import dlib
import numpy as np
from pathlib import Path
from datetime import datetime
from collections import defaultdict
from google.colab import drive

# ============================================================================
# CONFIGURATION
# ============================================================================

# Mount Google Drive
# Colab-only call; every path defined below lives under this mount point.
drive.mount('/content/drive')

# Define base paths
BASE_PATH = "/content/drive/MyDrive/RESEARCH-WORKSPACE/ACTIVE-PROJECTS/Thesis_MER_Project"
PROCESSED_PATH = f"{BASE_PATH}/datasets/processed_casme2"

# Dataset mapping: source -> target (face-aware with forehead versions)
# Same entry layout as the grayscale/center-crop cell ('source', 'target',
# 'description', 'variant'), but targeting v7/v8/v9 under PROCESSED_PATH.
# NOTE(review): this rebinds the DATASET_MAPPING name also assigned in the
# first cell; if both cells run in one session, the later assignment wins.
DATASET_MAPPING = {
    'v1_to_v7': {
        'source': f"{PROCESSED_PATH}/data_split_v1",
        'target': f"{PROCESSED_PATH}/preprocessed_v7",
        'description': 'AF - Apex Frame (face-aware with forehead, grayscale 224x224)',
        'variant': 'AF'
    },
    'v2_to_v8': {
        'source': f"{PROCESSED_PATH}/data_split_v2",
        'target': f"{PROCESSED_PATH}/preprocessed_v8",
        'description': 'KFS - Key Frame Sequence (face-aware with forehead, grayscale 224x224)',
        'variant': 'KFS'
    },
    'v3_to_v9': {
        'source': f"{PROCESSED_PATH}/data_split_v3",
        'target': f"{PROCESSED_PATH}/preprocessed_v9",
        'description': 'MFS - Multi-Frame Sequence (face-aware with forehead, grayscale 224x224)',
        'variant': 'MFS'
    }
}

# Processing parameters
TARGET_SIZE = 224  # presumably the output side length in pixels -- usage is outside the visible part of this cell
# Read by detect_face_with_expansion(), which clamps the expanded box to the
# image bounds.
BBOX_EXPANSION = 20  # Expand face bbox by 20px in all directions for complete coverage

# Initialize Dlib face detector
# Module-level detector instance used by detect_face_with_expansion().
print("Initializing Dlib face detector...")
detector = dlib.get_frontal_face_detector()
print("Face detector loaded")
print()

# ============================================================================
# FACE-AWARE PREPROCESSING FUNCTIONS
# ============================================================================

def detect_face_with_expansion(image):
    """
    Locate the dominant face and return its bounding box padded on every side.

    The module-level ``detector`` is invoked as ``detector(image, 1)``. When
    several faces come back, the one with the largest bbox area is kept. The
    box is then grown by BBOX_EXPANSION pixels to the left, top, right and
    bottom and clamped to the image borders.

    Args:
        image: Image array handed straight to the detector; its first two
            shape entries are read as (height, width)

    Returns:
        Tuple (expanded_bbox, face_info) on success. expanded_bbox is a dict
        with left/top/right/bottom/width/height; face_info holds the raw
        detection box, the expanded box, the padding used and the raw face
        area. (None, None) when no face is found or any step raises.
    """
    try:
        detections = detector(image, 1)

        if len(detections) == 0:
            return None, None

        # Largest detection wins; on equal areas the earliest detection is kept.
        face = max(detections, key=lambda rect: rect.width() * rect.height())

        frame_h, frame_w = image.shape[:2]

        # Raw detector output, recorded untouched for the processing report
        raw_box = {
            'left': face.left(),
            'top': face.top(),
            'right': face.right(),
            'bottom': face.bottom(),
            'width': face.width(),
            'height': face.height()
        }

        # Pad each edge, never stepping outside the image
        pad = BBOX_EXPANSION
        x_min = max(0, raw_box['left'] - pad)
        y_min = max(0, raw_box['top'] - pad)
        x_max = min(frame_w, raw_box['right'] + pad)
        y_max = min(frame_h, raw_box['bottom'] + pad)

        expanded_bbox = {
            'left': x_min,
            'top': y_min,
            'right': x_max,
            'bottom': y_max,
            'width': x_max - x_min,
            'height': y_max - y_min
        }

        face_info = {
            'original_bbox': raw_box,
            'expanded_bbox': expanded_bbox,
            'expansion_applied': pad,
            'face_area': raw_box['width'] * raw_box['height']
        }

        return expanded_bbox, face_info

    except Exception:
        # Any detector or indexing failure is reported as "no face"
        return None, None

def ensure_minimum_size(image, min_size=224):
    """
    Upscale an image whose shorter side is below ``min_size``.

    Images that already measure at least ``min_size`` on both sides are
    returned untouched (same object). Otherwise both sides are multiplied by
    ``min_size / shorter_side`` (truncated to int) and the image is resized
    with Lanczos interpolation.

    Args:
        image: Input image array; first two shape entries are (height, width)
        min_size: Minimum side length required

    Returns:
        Tuple (image, resize_info). resize_info always has 'resized' and
        'original_size' as (width, height); when a resize happened it also
        has 'new_size' as (width, height) and 'scale_factor'.
    """
    height, width = image.shape[:2]
    shortest_side = min(height, width)

    if shortest_side < min_size:
        ratio = min_size / shortest_side
        scaled_w = int(width * ratio)
        scaled_h = int(height * ratio)

        # High-quality interpolation for the upscale
        enlarged = cv2.resize(
            image,
            (scaled_w, scaled_h),
            interpolation=cv2.INTER_LANCZOS4,
        )

        return enlarged, {
            'resized': True,
            'original_size': (width, height),
            'new_size': (scaled_w, scaled_h),
            'scale_factor': ratio
        }

    # Already large enough: hand the input back as-is
    return image, {'resized': False, 'original_size': (width, height)}

def convert_to_grayscale(image):
    """
    Return a single-channel version of ``image``.

    Args:
        image: Image array; a 3-dimensional array is converted with
            cv2.COLOR_BGR2GRAY, anything else is treated as already grayscale

    Returns:
        Grayscale image (the input object itself when it is not 3-dimensional)
    """
    has_channel_axis = len(image.shape) == 3
    if not has_channel_axis:
        return image
    return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

def preprocess_image_faceaware_v3(image_path):
    """
    Produce a TARGET_SIZE x TARGET_SIZE grayscale face crop from one image file.

    Pipeline:
    1. Load the image with cv2.imread
    2. Upscale if either side is below TARGET_SIZE
    3. Detect the face and pad its bbox by BBOX_EXPANSION pixels
    4. Crop the padded bbox (or a centered TARGET_SIZE window when no face
       was found)
    5. Resize the crop to TARGET_SIZE x TARGET_SIZE if it is not already
    6. Convert to grayscale

    Args:
        image_path: Path to input image

    Returns:
        Tuple (preprocessed_image, processing_info). The image is None when
        loading fails or any step raises; processing_info['method'] is then
        'error' and processing_info['error'] holds the reason. Otherwise
        'method' is 'face_expanded_bbox' or 'fallback_center'.
    """
    info = {
        'face_detected': False,
        'bbox_expanded': False,
        'resize_applied': False,
        'method': 'unknown',
        'face_coverage': {}
    }

    try:
        # Step 1: read from disk; imread signals failure by returning None
        frame = cv2.imread(image_path)
        if frame is None:
            info['method'] = 'error'
            info['error'] = 'Failed to load image'
            return None, info

        # Step 2: guarantee both sides reach TARGET_SIZE
        frame, resize_info = ensure_minimum_size(frame, TARGET_SIZE)
        was_upscaled = resize_info['resized']
        info['resize_applied'] = was_upscaled
        if was_upscaled:
            info['resize_info'] = resize_info

        frame_h, frame_w = frame.shape[:2]

        # Step 3: face detection with padded bbox
        bbox, face_info = detect_face_with_expansion(frame)

        # Step 4: choose the crop window
        if bbox is None:
            # No face: take a TARGET_SIZE window around the image center
            half = TARGET_SIZE // 2
            mid_x = frame_w // 2
            mid_y = frame_h // 2
            rows = slice(max(0, mid_y - half), min(frame_h, mid_y + half))
            cols = slice(max(0, mid_x - half), min(frame_w, mid_x + half))
            chosen_method = 'fallback_center'
        else:
            info['face_detected'] = True
            info['bbox_expanded'] = True
            info['face_coverage'] = {
                'expansion_applied': BBOX_EXPANSION,
                'original_face_area': face_info['face_area'],
                'expanded_width': bbox['width'],
                'expanded_height': bbox['height']
            }
            rows = slice(bbox['top'], bbox['bottom'])
            cols = slice(bbox['left'], bbox['right'])
            chosen_method = 'face_expanded_bbox'

        patch = frame[rows, cols]
        info['method'] = chosen_method

        # Step 5: force the exact output size
        if patch.shape[:2] != (TARGET_SIZE, TARGET_SIZE):
            patch = cv2.resize(
                patch,
                (TARGET_SIZE, TARGET_SIZE),
                interpolation=cv2.INTER_LANCZOS4,
            )
            info['final_resize_applied'] = True

        # Step 6: drop color
        return convert_to_grayscale(patch), info

    except Exception as exc:
        info['method'] = 'error'
        info['error'] = str(exc)
        return None, info

# ============================================================================
# DATASET PROCESSING
# ============================================================================

def _emotion_from_filename(img_filename):
    """Return the last '_'-separated token of the file name, extension removed."""
    stem, _ext = os.path.splitext(img_filename)
    return stem.split('_')[-1]


def process_dataset_faceaware_v3(source_dir, target_dir, variant_name):
    """
    Process entire dataset with face-aware preprocessing v3 (forehead included).

    For each of the train/val/test sub-folders found in ``source_dir``, every
    .jpg/.jpeg/.png file is run through preprocess_image_faceaware_v3 and the
    result is written under the same file name in the matching sub-folder of
    ``target_dir``. Missing split folders are reported and skipped.

    An image counts as processed only if preprocessing succeeded AND the
    output file was written; otherwise it is counted as an error.

    Args:
        source_dir: Source dataset directory
        target_dir: Target directory (split sub-folders are created as needed)
        variant_name: Dataset variant tag stored in the returned statistics

    Returns:
        Processing statistics dictionary (JSON-serializable) with per-split
        counts, detection/expansion rates, emotion distribution and overall
        totals.
    """
    stats = {
        'variant': variant_name,
        'processing_date': datetime.now().isoformat(),
        'source_directory': source_dir,
        'target_directory': target_dir,
        'preprocessing_method': 'face_bbox_expansion_all_directions',
        'preprocessing_parameters': {
            'target_size': TARGET_SIZE,
            'bbox_expansion': BBOX_EXPANSION
        },
        'preprocessing_steps': [
            'Load image',
            'Ensure minimum size 224x224',
            'Face detection with Dlib',
            f'Expand bbox by {BBOX_EXPANSION}px in all directions',
            'Crop expanded bbox region',
            'Resize to 224x224',
            'Grayscale conversion'
        ],
        'splits': {},
        'total_processed': 0,
        'total_errors': 0,
        'face_detection_stats': {
            'total_images': 0,
            'faces_detected': 0,
            'detection_rate': 0,
            'bbox_expanded': 0,
            'fallback_used': 0,
            'resize_applied': 0
        }
    }

    splits = ['train', 'val', 'test']

    for split in splits:
        source_split_dir = os.path.join(source_dir, split)
        target_split_dir = os.path.join(target_dir, split)

        if not os.path.exists(source_split_dir):
            print(f"  Warning: {split} split not found in source")
            continue

        # Create target directory
        os.makedirs(target_split_dir, exist_ok=True)

        # Get all image files
        image_files = [f for f in os.listdir(source_split_dir)
                      if f.lower().endswith(('.jpg', '.jpeg', '.png'))]

        split_stats = {
            'total_images': len(image_files),
            'processed': 0,
            'errors': 0,
            'faces_detected': 0,
            'bbox_expanded': 0,
            'fallback_used': 0,
            'resize_applied': 0,
            'emotion_distribution': defaultdict(int)
        }

        print(f"  Processing {split} split: {len(image_files)} images")

        # Process each image
        for idx, img_filename in enumerate(image_files, 1):
            source_path = os.path.join(source_split_dir, img_filename)
            target_path = os.path.join(target_split_dir, img_filename)

            # Face-aware preprocessing v3
            processed_img, proc_info = preprocess_image_faceaware_v3(source_path)

            # cv2.imwrite reports a failed write through its boolean return
            # value, so it has to be checked before counting the image.
            saved = (
                processed_img is not None
                and cv2.imwrite(target_path, processed_img)
            )

            if saved:
                split_stats['processed'] += 1

                # Track statistics
                if proc_info['face_detected']:
                    split_stats['faces_detected'] += 1

                if proc_info['bbox_expanded']:
                    split_stats['bbox_expanded'] += 1

                if proc_info['method'] == 'fallback_center':
                    split_stats['fallback_used'] += 1

                if proc_info['resize_applied']:
                    split_stats['resize_applied'] += 1

                # Extract emotion from filename; the extension is stripped
                # generically because .jpeg/.png files are accepted above too
                emotion = _emotion_from_filename(img_filename)
                split_stats['emotion_distribution'][emotion] += 1
            else:
                split_stats['errors'] += 1
                stats['total_errors'] += 1

            # Progress indicator
            if idx % 200 == 0:
                print(f"    Progress: {idx}/{len(image_files)}")

        # Calculate rates (relative to all images listed for the split)
        if split_stats['total_images'] > 0:
            split_stats['face_detection_rate'] = (
                split_stats['faces_detected'] / split_stats['total_images']
            )
            split_stats['bbox_expansion_rate'] = (
                split_stats['bbox_expanded'] / split_stats['total_images']
            )

        # Store split results (clean for JSON: defaultdict -> plain dict)
        stats['splits'][split] = {
            'total_images': split_stats['total_images'],
            'processed': split_stats['processed'],
            'errors': split_stats['errors'],
            'faces_detected': split_stats['faces_detected'],
            'face_detection_rate': split_stats.get('face_detection_rate', 0),
            'bbox_expanded': split_stats['bbox_expanded'],
            'bbox_expansion_rate': split_stats.get('bbox_expansion_rate', 0),
            'fallback_used': split_stats['fallback_used'],
            'resize_applied': split_stats['resize_applied'],
            'emotion_distribution': dict(split_stats['emotion_distribution'])
        }

        stats['total_processed'] += split_stats['processed']

        # Update overall stats
        stats['face_detection_stats']['total_images'] += split_stats['total_images']
        stats['face_detection_stats']['faces_detected'] += split_stats['faces_detected']
        stats['face_detection_stats']['bbox_expanded'] += split_stats['bbox_expanded']
        stats['face_detection_stats']['fallback_used'] += split_stats['fallback_used']
        stats['face_detection_stats']['resize_applied'] += split_stats['resize_applied']

        print(f"    Completed: {split_stats['processed']} processed")
        print(f"    Face detection: {split_stats['faces_detected']}/{split_stats['total_images']} ({split_stats.get('face_detection_rate', 0):.1%})")
        print(f"    BBox expanded: {split_stats['bbox_expanded']} ({split_stats.get('bbox_expansion_rate', 0):.1%})")

    # Calculate overall rates
    total_imgs = stats['face_detection_stats']['total_images']
    if total_imgs > 0:
        stats['face_detection_stats']['detection_rate'] = (
            stats['face_detection_stats']['faces_detected'] / total_imgs
        )
        stats['face_detection_stats']['bbox_expansion_rate'] = (
            stats['face_detection_stats']['bbox_expanded'] / total_imgs
        )

    return stats

# ============================================================================
# MAIN EXECUTION
# ============================================================================

# Run the face-aware pipeline over every dataset in DATASET_MAPPING, write a
# preprocessing_summary.json next to each output, then print a recap.
# Module-level names (all_results, processing_stats, ...) are kept unchanged
# because notebook cells share one global namespace.
print("=" * 80)
print("CASME2 FACE-AWARE PREPROCESSING V3: BBOX EXPANSION (COMPLETE COVERAGE)")
print("=" * 80)
print()

# variant tag -> statistics dict, filled only for datasets that were processed
all_results = {}

for mapping_name, config in DATASET_MAPPING.items():
    source_dir = config['source']
    target_dir = config['target']
    variant = config['variant']
    description = config['description']

    print(f"[{variant}] {description}")
    print(f"Source: {source_dir}")
    print(f"Target: {target_dir}")
    print()

    # Check if source exists; a missing dataset is skipped, not fatal
    if not os.path.exists(source_dir):
        print("  Error: Source directory not found")
        print()
        continue

    # Create target base directory
    os.makedirs(target_dir, exist_ok=True)

    # Process dataset
    processing_stats = process_dataset_faceaware_v3(source_dir, target_dir, variant)

    # Save processing summary
    summary_path = os.path.join(target_dir, 'preprocessing_summary.json')
    with open(summary_path, 'w') as f:
        json.dump(processing_stats, f, indent=2)

    all_results[variant] = processing_stats

    print(f"  Summary saved to: preprocessing_summary.json")
    print(f"  Total processed: {processing_stats['total_processed']}")
    print(f"  Face detection rate: {processing_stats['face_detection_stats']['detection_rate']:.1%}")
    print(f"  BBox expansion rate: {processing_stats['face_detection_stats'].get('bbox_expansion_rate', 0):.1%}")
    print()
    print("-" * 80)
    print()

# ============================================================================
# FINAL SUMMARY
# ============================================================================

print("=" * 80)
print("FACE-AWARE PREPROCESSING V3 COMPLETE - SUMMARY")
print("=" * 80)
print()

for variant, stats in all_results.items():
    print(f"{variant} Dataset:")
    print(f"  Total processed: {stats['total_processed']} images")
    print(f"  Face detection: {stats['face_detection_stats']['detection_rate']:.1%}")
    print(f"  BBox expanded: {stats['face_detection_stats'].get('bbox_expansion_rate', 0):.1%}")
    print(f"  Fallback used: {stats['face_detection_stats']['fallback_used']} times")
    print(f"  Errors: {stats['total_errors']}")

    for split in ['train', 'val', 'test']:
        if split in stats['splits']:
            split_data = stats['splits'][split]
            print(f"  {split.upper()}: {split_data['processed']} images")

    print()

# Report the padding actually configured instead of a hard-coded figure
print(f"Preprocessing complete with {BBOX_EXPANSION}px bbox expansion in all directions")
print("Includes forehead, sides, and chin for complete micro-expression coverage")
print()
print("All datasets saved to:")
for config in DATASET_MAPPING.values():
    # Only list targets that were actually written; datasets skipped above
    # (missing source directory) never reach all_results.
    if config['variant'] in all_results:
        print(f"  - {config['target']}")

print()
print("Next step: Cell 4 - Comprehensive validation and comparison")
print()
print("=" * 80)

Mounted at /content/drive
Initializing Dlib face detector...
Face detector loaded

CASME2 FACE-AWARE PREPROCESSING V3: BBOX EXPANSION (COMPLETE COVERAGE)

[AF] AF - Apex Frame (face-aware with forehead, grayscale 224x224)
Source: /content/drive/MyDrive/RESEARCH-WORKSPACE/ACTIVE-PROJECTS/Thesis_MER_Project/datasets/processed_casme2/data_split_v1
Target: /content/drive/MyDrive/RESEARCH-WORKSPACE/ACTIVE-PROJECTS/Thesis_MER_Project/datasets/processed_casme2/preprocessed_v7

  Processing train split: 201 images
    Progress: 200/201
    Completed: 201 processed
    Face detection: 201/201 (100.0%)
    BBox expanded: 201 (100.0%)
  Processing val split: 26 images
    Completed: 26 processed
    Face detection: 26/26 (100.0%)
    BBox expanded: 26 (100.0%)
  Processing test split: 28 images
    Completed: 28 processed
    Face detection: 28/28 (100.0%)
    BBox expanded: 28 (100.0%)
  Summary saved to: preprocessing_summary.json
  Total processed: 255
  Face detection rate: 100.0%
  BBox expa

In [None]:
# @title Cell 4: CASME2 Comprehensive Validation
# File: validate_comprehensive_casme2.py
# Location: Thesis_MER_Project/scripts/preprocessing/
# Purpose: Full validation and comparison of baseline (v4/v5/v6) vs face-aware (v7/v8/v9)

import os
import json
import cv2
import dlib
import numpy as np
from pathlib import Path
from datetime import datetime
from collections import defaultdict
import matplotlib.pyplot as plt
from google.colab import drive

# ============================================================================
# CONFIGURATION
# ============================================================================

# Mount Google Drive so the preprocessed datasets are reachable from Colab
drive.mount('/content/drive')

# Define base paths
BASE_PATH = "/content/drive/MyDrive/RESEARCH-WORKSPACE/ACTIVE-PROJECTS/Thesis_MER_Project"
PROCESSED_PATH = f"{BASE_PATH}/datasets/processed_casme2"

# Datasets to validate
# Two groups keyed by preprocessing type; each dataset entry provides:
#   'path'        - dataset root directory
#   'variant'     - AF / KFS / MFS tag
#   'description' - human-readable label of the preprocessing strategy
# NOTE(review): Cell 1 at the top of this notebook writes v4-v6 to
# f"{BASE_PATH}/datasets/preprocessed_casme2_v4" (..v5, ..v6), which differs
# from the baseline paths below - confirm the baseline folders exist here.
DATASETS_TO_VALIDATE = {
    'baseline': {
        'v4': {
            'path': f"{PROCESSED_PATH}/preprocessed_v4",
            'variant': 'AF',
            'description': 'Baseline - Center Crop'
        },
        'v5': {
            'path': f"{PROCESSED_PATH}/preprocessed_v5",
            'variant': 'KFS',
            'description': 'Baseline - Center Crop'
        },
        'v6': {
            'path': f"{PROCESSED_PATH}/preprocessed_v6",
            'variant': 'MFS',
            'description': 'Baseline - Center Crop'
        }
    },
    'face_aware': {
        'v7': {
            'path': f"{PROCESSED_PATH}/preprocessed_v7",
            'variant': 'AF',
            'description': 'Face-Aware - BBox Expansion'
        },
        'v8': {
            'path': f"{PROCESSED_PATH}/preprocessed_v8",
            'variant': 'KFS',
            'description': 'Face-Aware - BBox Expansion'
        },
        'v9': {
            'path': f"{PROCESSED_PATH}/preprocessed_v9",
            'variant': 'MFS',
            'description': 'Face-Aware - BBox Expansion'
        }
    }
}

# Validation parameters
# EXPECTED_SIZE is compared against the (height, width) of each loaded image.
EXPECTED_SIZE = (224, 224)
EXPECTED_CHANNELS = 1
# Largest allowed distance between face center and image center, measured as
# the Euclidean norm of the x/y offsets each divided by the image width/height,
# for a face to count as well centered.
CENTRALITY_THRESHOLD = 0.20
# NOTE(review): presumably the pass mark for the share of well-centered faces;
# it is not used by the validation helpers defined right below - confirm usage.
TARGET_WELL_CENTERED_RATE = 0.95

# Initialize Dlib face detector
# Created once at module level and shared by every face analysis call.
print("Initializing Dlib face detector...")
detector = dlib.get_frontal_face_detector()
print("Face detector loaded")
print()

# ============================================================================
# VALIDATION FUNCTIONS
# ============================================================================

def check_image_properties(image_path):
    """
    Check basic image properties: dimensions, channels, loadability.

    The file is decoded with cv2.IMREAD_UNCHANGED so the channel count that is
    reported is the one actually stored in the file. (Decoding with
    IMREAD_GRAYSCALE would always yield one channel and make the channel
    check meaningless.)

    Args:
        image_path: Path to image file

    Returns:
        Dictionary with property check results. On success: 'valid' True,
        'dimensions' as (height, width), 'channels', 'correct_size',
        'correct_channels' and 'file_size_bytes'. On failure: 'valid' False
        and 'error' with the reason.
    """
    try:
        img = cv2.imread(image_path, cv2.IMREAD_UNCHANGED)

        if img is None:
            return {
                'valid': False,
                'error': 'Failed to load image'
            }

        # shape is (h, w) for single-channel data and (h, w, c) otherwise
        h, w = img.shape[:2]
        channels = 1 if len(img.shape) == 2 else img.shape[2]

        return {
            'valid': True,
            'dimensions': (h, w),
            'channels': channels,
            'correct_size': (h, w) == EXPECTED_SIZE,
            'correct_channels': channels == EXPECTED_CHANNELS,
            'file_size_bytes': os.path.getsize(image_path)
        }

    except Exception as e:
        return {
            'valid': False,
            'error': str(e)
        }

def analyze_face_centrality(image_path):
    """
    Analyze face detection and centrality in preprocessed image.

    The image is loaded as grayscale and passed to the module-level
    ``detector``. When several faces are found the one with the largest bbox
    area is analyzed. All values in the result are plain Python types so the
    dictionary can be serialized to JSON as-is.

    Args:
        image_path: Path to image file

    Returns:
        Dictionary with face analysis results. 'detected' is always present.
        When a face was found it also contains the centrality score and x/y
        offsets (fractions of image width/height), 'well_centered', face
        width/height/area ratios, 'forehead_visible', 'chin_visible' and
        'face_bbox'. Otherwise it contains 'reason' (no face) or 'error'
        (load failure or exception).
    """
    try:
        img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)

        if img is None:
            return {
                'detected': False,
                'error': 'Failed to load image'
            }

        # Detect faces
        faces = detector(img, 1)

        if len(faces) == 0:
            return {
                'detected': False,
                'reason': 'No face detected'
            }

        # Use largest face if multiple detected (first one on equal areas)
        face = max(faces, key=lambda r: r.width() * r.height())

        img_h, img_w = img.shape[0], img.shape[1]

        # Calculate face properties
        face_center_x = (face.left() + face.right()) / 2
        face_center_y = (face.top() + face.bottom()) / 2

        img_center_x = img_w / 2
        img_center_y = img_h / 2

        # Normalized offset from center
        offset_x = abs(face_center_x - img_center_x) / img_w
        offset_y = abs(face_center_y - img_center_y) / img_h

        # Centrality score (Euclidean distance from center). Converted to a
        # Python float right away so the threshold comparison below yields a
        # Python bool instead of a numpy.bool_, which json cannot serialize.
        centrality_score = float(np.sqrt(offset_x**2 + offset_y**2))

        # Face coverage
        face_width = face.width()
        face_height = face.height()
        face_area_ratio = (face_width * face_height) / (img_h * img_w)

        # Check if forehead visible (top of face bbox should not be at image edge)
        forehead_visible = face.top() > 10

        # Check if chin visible (bottom of face bbox should not be at image edge)
        chin_visible = face.bottom() < img_h - 10

        return {
            'detected': True,
            'centrality_score': centrality_score,
            'offset_x': float(offset_x),
            'offset_y': float(offset_y),
            'well_centered': centrality_score <= CENTRALITY_THRESHOLD,
            'face_width_ratio': float(face_width / img_w),
            'face_height_ratio': float(face_height / img_h),
            'face_area_ratio': float(face_area_ratio),
            'forehead_visible': forehead_visible,
            'chin_visible': chin_visible,
            'face_bbox': {
                'left': face.left(),
                'top': face.top(),
                'right': face.right(),
                'bottom': face.bottom()
            }
        }

    except Exception as e:
        return {
            'detected': False,
            'error': str(e)
        }

def analyze_image_quality(image_path):
    """
    Compute simple intensity statistics for one image.

    The image is loaded as grayscale. Contrast is the standard deviation
    divided by the mean (0 when the mean is not positive); dynamic range is
    the difference between the brightest and darkest pixel.

    Args:
        image_path: Path to image file

    Returns:
        Dictionary with quality metrics. 'valid' is True together with
        mean/std/min/max intensity, 'contrast' and 'dynamic_range' on
        success; 'valid' is False when the image cannot be loaded, with an
        'error' message if an exception was raised.
    """
    try:
        pixels = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
        if pixels is None:
            return {'valid': False}

        # Intensity distribution
        avg = np.mean(pixels)
        spread = np.std(pixels)
        darkest = np.min(pixels)
        brightest = np.max(pixels)

        # Relative contrast, guarded against an all-black image
        if avg > 0:
            contrast = spread / avg
        else:
            contrast = 0

        metrics = {'valid': True}
        metrics['mean_intensity'] = float(avg)
        metrics['std_intensity'] = float(spread)
        metrics['min_intensity'] = int(darkest)
        metrics['max_intensity'] = int(brightest)
        metrics['contrast'] = float(contrast)
        metrics['dynamic_range'] = int(brightest - darkest)
        return metrics

    except Exception as exc:
        return {
            'valid': False,
            'error': str(exc)
        }

# ============================================================================
# COMPREHENSIVE DATASET VALIDATION
# ============================================================================

def validate_dataset_comprehensive(dataset_path, variant_name, preprocessing_type):
    """
    Comprehensive validation of entire dataset with full scan.

    Args:
        dataset_path: Path to preprocessed dataset
        variant_name: Dataset variant (AF/KFS/MFS)
        preprocessing_type: Type (baseline/face_aware)

    Returns:
        Comprehensive validation report
    """
    report = {
        'variant': variant_name,
        'preprocessing_type': preprocessing_type,
        'validation_date': datetime.now().isoformat(),
        'dataset_path': dataset_path,
        'validation_mode': 'full_scan',
        'splits': {},
        'overall_statistics': {},
        'quality_assessment': '',
        'issues_found': []
    }

    splits = ['train', 'val', 'test']

    overall_stats = {
        'total_images': 0,
        'valid_images': 0,
        'dimension_correct': 0,
        'faces_detected': 0,
        'faces_well_centered': 0,
        'faces_off_center': 0,
        'face_detection_failed': 0,
        'forehead_visible_count': 0,
        'chin_visible_count': 0,
        'centrality_scores': [],
        'face_area_ratios': [],
        'quality_metrics': {
            'mean_intensities': [],
            'contrasts': [],
            'dynamic_ranges': []
        }
    }

    print(f"  Starting comprehensive validation...")

    for split in splits:
        split_path = os.path.join(dataset_path, split)

        if not os.path.exists(split_path):
            continue

        print(f"  Validating {split} split (full scan)...")

        image_files = [f for f in os.listdir(split_path)
                      if f.lower().endswith(('.jpg', '.jpeg', '.png'))]

        split_stats = {
            'total_images': len(image_files),
            'valid_images': 0,
            'dimension_correct': 0,
            'dimension_errors': 0,
            'faces_detected': 0,
            'faces_well_centered': 0,
            'faces_off_center': 0,
            'face_detection_failed': 0,
            'forehead_visible_count': 0,
            'chin_visible_count': 0,
            'centrality_scores': [],
            'face_area_ratios': [],
            'quality_metrics': {
                'mean_intensities': [],
                'contrasts': [],
                'dynamic_ranges': []
            },
            'problematic_images': []
        }

        # Process ALL images (full scan)
        for idx, img_file in enumerate(image_files, 1):
            img_path = os.path.join(split_path, img_file)

            # Check properties
            props = check_image_properties(img_path)
            if props['valid']:
                split_stats['valid_images'] += 1
                if props['correct_size'] and props['correct_channels']:
                    split_stats['dimension_correct'] += 1
                else:
                    split_stats['dimension_errors'] += 1

            # Face analysis
            face_result = analyze_face_centrality(img_path)

            if face_result['detected']:
                split_stats['faces_detected'] += 1
                split_stats['centrality_scores'].append(face_result['centrality_score'])
                split_stats['face_area_ratios'].append(face_result['face_area_ratio'])

                if face_result['well_centered']:
                    split_stats['faces_well_centered'] += 1
                else:
                    split_stats['faces_off_center'] += 1
                    split_stats['problematic_images'].append({
                        'filename': img_file,
                        'issue': 'off_center',
                        'centrality_score': face_result['centrality_score']
                    })

                if face_result['forehead_visible']:
                    split_stats['forehead_visible_count'] += 1

                if face_result['chin_visible']:
                    split_stats['chin_visible_count'] += 1
            else:
                split_stats['face_detection_failed'] += 1
                split_stats['problematic_images'].append({
                    'filename': img_file,
                    'issue': 'face_detection_failed'
                })

            # Quality analysis
            quality = analyze_image_quality(img_path)
            if quality['valid']:
                split_stats['quality_metrics']['mean_intensities'].append(
                    quality['mean_intensity']
                )
                split_stats['quality_metrics']['contrasts'].append(
                    quality['contrast']
                )
                split_stats['quality_metrics']['dynamic_ranges'].append(
                    quality['dynamic_range']
                )

            # Progress indicator (every 500 images)
            if idx % 500 == 0:
                print(f"    Progress: {idx}/{len(image_files)} images")

        # Calculate rates
        if split_stats['total_images'] > 0:
            split_stats['face_detection_rate'] = (
                split_stats['faces_detected'] / split_stats['total_images']
            )
            split_stats['well_centered_rate'] = (
                split_stats['faces_well_centered'] / split_stats['faces_detected']
                if split_stats['faces_detected'] > 0 else 0
            )
            split_stats['off_center_rate'] = (
                split_stats['faces_off_center'] / split_stats['faces_detected']
                if split_stats['faces_detected'] > 0 else 0
            )
            split_stats['forehead_visible_rate'] = (
                split_stats['forehead_visible_count'] / split_stats['faces_detected']
                if split_stats['faces_detected'] > 0 else 0
            )
            split_stats['chin_visible_rate'] = (
                split_stats['chin_visible_count'] / split_stats['faces_detected']
                if split_stats['faces_detected'] > 0 else 0
            )

        # Calculate averages
        if split_stats['centrality_scores']:
            split_stats['avg_centrality_score'] = float(
                np.mean(split_stats['centrality_scores'])
            )
            split_stats['median_centrality_score'] = float(
                np.median(split_stats['centrality_scores'])
            )

        if split_stats['face_area_ratios']:
            split_stats['avg_face_area_ratio'] = float(
                np.mean(split_stats['face_area_ratios'])
            )

        if split_stats['quality_metrics']['mean_intensities']:
            split_stats['quality_summary'] = {
                'avg_mean_intensity': float(
                    np.mean(split_stats['quality_metrics']['mean_intensities'])
                ),
                'avg_contrast': float(
                    np.mean(split_stats['quality_metrics']['contrasts'])
                ),
                'avg_dynamic_range': float(
                    np.mean(split_stats['quality_metrics']['dynamic_ranges'])
                )
            }

        # Update overall statistics
        overall_stats['total_images'] += split_stats['total_images']
        overall_stats['valid_images'] += split_stats['valid_images']
        overall_stats['dimension_correct'] += split_stats['dimension_correct']
        overall_stats['faces_detected'] += split_stats['faces_detected']
        overall_stats['faces_well_centered'] += split_stats['faces_well_centered']
        overall_stats['faces_off_center'] += split_stats['faces_off_center']
        overall_stats['face_detection_failed'] += split_stats['face_detection_failed']
        overall_stats['forehead_visible_count'] += split_stats['forehead_visible_count']
        overall_stats['chin_visible_count'] += split_stats['chin_visible_count']
        overall_stats['centrality_scores'].extend(split_stats['centrality_scores'])
        overall_stats['face_area_ratios'].extend(split_stats['face_area_ratios'])
        overall_stats['quality_metrics']['mean_intensities'].extend(
            split_stats['quality_metrics']['mean_intensities']
        )
        overall_stats['quality_metrics']['contrasts'].extend(
            split_stats['quality_metrics']['contrasts']
        )
        overall_stats['quality_metrics']['dynamic_ranges'].extend(
            split_stats['quality_metrics']['dynamic_ranges']
        )

        # Clean up for JSON storage
        split_stats['centrality_scores'] = []
        split_stats['face_area_ratios'] = []
        split_stats['quality_metrics'] = split_stats.get('quality_summary', {})

        # Store limited problematic images (top 10)
        if len(split_stats['problematic_images']) > 10:
            split_stats['problematic_images'] = split_stats['problematic_images'][:10]

        report['splits'][split] = split_stats

        print(f"    Completed: {split_stats['total_images']} images scanned")
        print(f"    Face detection: {split_stats['face_detection_rate']:.1%}")
        print(f"    Well-centered: {split_stats['well_centered_rate']:.1%}")
        print(f"    Forehead visible: {split_stats['forehead_visible_rate']:.1%}")
        print(f"    Chin visible: {split_stats['chin_visible_rate']:.1%}")

    # Calculate overall metrics
    if overall_stats['total_images'] > 0:
        overall_stats['face_detection_rate'] = (
            overall_stats['faces_detected'] / overall_stats['total_images']
        )
        overall_stats['well_centered_rate'] = (
            overall_stats['faces_well_centered'] / overall_stats['faces_detected']
            if overall_stats['faces_detected'] > 0 else 0
        )
        overall_stats['off_center_rate'] = (
            overall_stats['faces_off_center'] / overall_stats['faces_detected']
            if overall_stats['faces_detected'] > 0 else 0
        )
        overall_stats['forehead_visible_rate'] = (
            overall_stats['forehead_visible_count'] / overall_stats['faces_detected']
            if overall_stats['faces_detected'] > 0 else 0
        )
        overall_stats['chin_visible_rate'] = (
            overall_stats['chin_visible_count'] / overall_stats['faces_detected']
            if overall_stats['faces_detected'] > 0 else 0
        )

    if overall_stats['centrality_scores']:
        overall_stats['avg_centrality_score'] = float(
            np.mean(overall_stats['centrality_scores'])
        )
        overall_stats['median_centrality_score'] = float(
            np.median(overall_stats['centrality_scores'])
        )
        overall_stats['std_centrality_score'] = float(
            np.std(overall_stats['centrality_scores'])
        )

    if overall_stats['face_area_ratios']:
        overall_stats['avg_face_area_ratio'] = float(
            np.mean(overall_stats['face_area_ratios'])
        )

    if overall_stats['quality_metrics']['mean_intensities']:
        overall_stats['quality_summary'] = {
            'avg_mean_intensity': float(
                np.mean(overall_stats['quality_metrics']['mean_intensities'])
            ),
            'avg_contrast': float(
                np.mean(overall_stats['quality_metrics']['contrasts'])
            ),
            'avg_dynamic_range': float(
                np.mean(overall_stats['quality_metrics']['dynamic_ranges'])
            )
        }

    # Clean up large arrays
    overall_stats['centrality_scores'] = []
    overall_stats['face_area_ratios'] = []
    overall_stats['quality_metrics'] = overall_stats.get('quality_summary', {})

    report['overall_statistics'] = overall_stats

    # Quality assessment
    face_det_rate = overall_stats.get('face_detection_rate', 0)
    well_centered_rate = overall_stats.get('well_centered_rate', 0)
    forehead_rate = overall_stats.get('forehead_visible_rate', 0)

    if face_det_rate < 0.95:
        report['issues_found'].append(
            f"Low face detection rate: {face_det_rate:.1%}"
        )

    if well_centered_rate < TARGET_WELL_CENTERED_RATE:
        report['issues_found'].append(
            f"Well-centered rate below target: {well_centered_rate:.1%} < {TARGET_WELL_CENTERED_RATE:.1%}"
        )

    if forehead_rate < 0.90:
        report['issues_found'].append(
            f"Low forehead visibility: {forehead_rate:.1%}"
        )

    # Overall assessment
    if not report['issues_found']:
        report['quality_assessment'] = 'EXCELLENT - Ready for training'
    elif well_centered_rate >= TARGET_WELL_CENTERED_RATE:
        report['quality_assessment'] = 'GOOD - Acceptable for training'
    else:
        report['quality_assessment'] = 'NEEDS_REVIEW - Consider improvements'

    return report

# ============================================================================
# MAIN EXECUTION
# ============================================================================

_heavy_rule = "=" * 80
_light_rule = "-" * 80

print(_heavy_rule)
print("CASME2 COMPREHENSIVE VALIDATION - FULL SCAN")
print("Baseline (v4/v5/v6) vs Face-Aware (v7/v8/v9) Comparison")
print(_heavy_rule)
print()

# Container for everything this run produces. 'baseline' and 'face_aware'
# are filled per dataset version below; 'comparison' is filled later.
all_validation_results = {
    'validation_date': datetime.now().isoformat(),
    'validation_mode': 'comprehensive_full_scan',
    'baseline': {},
    'face_aware': {},
    'comparison': {}
}

# Phase 1 (baseline) and Phase 2 (face-aware) perform the same scan on
# different dataset groups, so both are driven from this table. The group
# name doubles as the key in DATASETS_TO_VALIDATE / all_validation_results
# and as the third argument passed to validate_dataset_comprehensive.
_validation_phases = (
    ('baseline', "PHASE 1: Validating Baseline Preprocessing (v4/v5/v6)"),
    ('face_aware', "PHASE 2: Validating Face-Aware Preprocessing (v7/v8/v9)"),
)

for group, phase_title in _validation_phases:
    print(phase_title)
    print(_light_rule)
    print()

    for version, config in DATASETS_TO_VALIDATE[group].items():
        dataset_path = config['path']
        variant = config['variant']
        description = config['description']

        print(f"[{version.upper()}] {variant} - {description}")
        print(f"Path: {dataset_path}")
        print()

        # A missing dataset is reported and skipped, not treated as fatal.
        if not os.path.exists(dataset_path):
            print("  Warning: Dataset not found, skipping")
            print()
            continue

        validation_report = validate_dataset_comprehensive(
            dataset_path, variant, group
        )
        all_validation_results[group][version] = validation_report

        # Each dataset also gets its own report file next to its data.
        report_path = os.path.join(
            dataset_path, 'comprehensive_validation_report.json'
        )
        with open(report_path, 'w') as f:
            json.dump(validation_report, f, indent=2)

        print("  Report saved: comprehensive_validation_report.json")
        print()
        print(_light_rule)
        print()

# ============================================================================
# COMPARISON ANALYSIS
# ============================================================================

print("PHASE 3: Comparison Analysis")
print("=" * 80)
print()


def _first_version_for(reports, wanted_variant):
    """Return the first version key whose report is for `wanted_variant`, else None."""
    return next(
        (ver for ver, rep in reports.items() if rep['variant'] == wanted_variant),
        None,
    )


def _metric_entry(baseline_stats, faceaware_stats, key, invert=False):
    """
    Build one baseline / face_aware / improvement record for `key`.

    Missing statistics count as 0. With invert=True the improvement is
    baseline minus face-aware instead of face-aware minus baseline.
    """
    base_value = baseline_stats.get(key, 0)
    aware_value = faceaware_stats.get(key, 0)
    gain = base_value - aware_value if invert else aware_value - base_value
    return {
        'baseline': base_value,
        'face_aware': aware_value,
        'improvement': gain
    }


comparison = {}

for variant in ['AF', 'KFS', 'MFS']:
    # Pair up the baseline and face-aware runs that share this variant.
    baseline_version = _first_version_for(
        all_validation_results['baseline'], variant
    )
    faceaware_version = _first_version_for(
        all_validation_results['face_aware'], variant
    )

    # A variant is compared only when both sides were validated.
    if not (baseline_version and faceaware_version):
        continue

    baseline_stats = all_validation_results['baseline'][baseline_version]['overall_statistics']
    faceaware_stats = all_validation_results['face_aware'][faceaware_version]['overall_statistics']

    metrics = {
        'face_detection_rate': _metric_entry(
            baseline_stats, faceaware_stats, 'face_detection_rate'
        ),
        'well_centered_rate': _metric_entry(
            baseline_stats, faceaware_stats, 'well_centered_rate'
        ),
        'forehead_visible_rate': _metric_entry(
            baseline_stats, faceaware_stats, 'forehead_visible_rate'
        ),
        # Sign is flipped for this metric (baseline - face_aware);
        # presumably a lower centrality score is better - confirm against
        # the scoring code.
        'avg_centrality_score': _metric_entry(
            baseline_stats, faceaware_stats, 'avg_centrality_score', invert=True
        )
    }

    comparison[variant] = {
        'baseline_version': baseline_version,
        'faceaware_version': faceaware_version,
        'metrics': metrics
    }

    print(f"{variant} Comparison ({baseline_version} vs {faceaware_version}):")
    for heading, metric_key in (
        ("  Well-centered rate:", 'well_centered_rate'),
        ("  Forehead visible:", 'forehead_visible_rate'),
    ):
        entry = metrics[metric_key]
        print(heading)
        print(f"    Baseline: {entry['baseline']:.1%}")
        print(f"    Face-aware: {entry['face_aware']:.1%}")
        print(f"    Improvement: {entry['improvement']:+.1%}")
    print()

all_validation_results['comparison'] = comparison

# Save comprehensive comparison report
comparison_report_path = os.path.join(
    PROCESSED_PATH, 'comprehensive_validation_comparison.json'
)
with open(comparison_report_path, 'w') as f:
    json.dump(all_validation_results, f, indent=2)

print("-" * 80)
print("Comprehensive comparison report saved:")
print(f"  {comparison_report_path}")
print()

# ============================================================================
# FINAL SUMMARY & RECOMMENDATION
# ============================================================================

print("=" * 80)
print("FINAL SUMMARY & RECOMMENDATION")
print("=" * 80)
print()

# Per-dataset summary, one section per preprocessing family. Rates that a
# report does not contain are shown as 0.
for section_title, group in (
    ("Baseline Preprocessing (v4/v5/v6):", 'baseline'),
    ("Face-Aware Preprocessing (v7/v8/v9):", 'face_aware'),
):
    print(section_title)
    for version, report in all_validation_results[group].items():
        stats = report['overall_statistics']
        print(f"  {version.upper()} ({report['variant']}):")
        print(f"    Well-centered: {stats.get('well_centered_rate', 0):.1%}")
        print(f"    Forehead visible: {stats.get('forehead_visible_rate', 0):.1%}")
        print(f"    Assessment: {report['quality_assessment']}")
    print()

print("-" * 80)

# Overall recommendation.
# all() is True for an empty iterable, so the check must also require that
# at least one face-aware dataset was actually validated; otherwise a run in
# which every face-aware dataset was skipped would still recommend them.
acceptable_assessments = (
    'EXCELLENT - Ready for training',
    'GOOD - Acceptable for training',
)
face_aware_reports = all_validation_results['face_aware']
all_face_aware_excellent = bool(face_aware_reports) and all(
    r['quality_assessment'] in acceptable_assessments
    for r in face_aware_reports.values()
)

if all_face_aware_excellent:
    print("RECOMMENDATION: Use Face-Aware preprocessing (v7/v8/v9) for final training")
    print("  - Superior face centering achieved (>95% well-centered)")
    print("  - Complete facial coverage (forehead + chin included)")
    print("  - Ready for model training experiments")
else:
    print("RECOMMENDATION: Review preprocessing results before proceeding")
    if not face_aware_reports:
        print("  - No face-aware datasets were validated")
    print("  - Check comprehensive validation reports for details")
    print("  - Address identified issues before training")

print()
print("=" * 80)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Initializing Dlib face detector...
Face detector loaded

CASME2 COMPREHENSIVE VALIDATION - FULL SCAN
Baseline (v4/v5/v6) vs Face-Aware (v7/v8/v9) Comparison

PHASE 1: Validating Baseline Preprocessing (v4/v5/v6)
--------------------------------------------------------------------------------

[V4] AF - Baseline - Center Crop
Path: /content/drive/MyDrive/RESEARCH-WORKSPACE/ACTIVE-PROJECTS/Thesis_MER_Project/datasets/processed_casme2/preprocessed_v4

  Starting comprehensive validation...
  Validating train split (full scan)...
    Completed: 201 images scanned
    Face detection: 100.0%
    Well-centered: 82.6%
    Forehead visible: 100.0%
    Chin visible: 34.8%
  Validating val split (full scan)...
    Completed: 26 images scanned
    Face detection: 100.0%
    Well-centered: 96.2%
    Forehead visible: 100.0%
    Chin visible: 26.9%
  Validating test split 

In [None]:
# @title [cancelled] Cell 5: CASME2 Face Alignment + Eye Masking Preprocessing
# File: preprocess_casme2_aligned_eyemask.py
# Location: Thesis_MER_Project/scripts/preprocessing/
# Purpose: Face alignment (horizontal) + eye masking for noise reduction (v1/v2/v3 to v10/v11/v12)

import os
import json
import cv2
import dlib
import numpy as np
from pathlib import Path
from datetime import datetime
from collections import defaultdict
from google.colab import drive

# ============================================================================
# CONFIGURATION
# ============================================================================

# Mount Google Drive so the project folders below are reachable
drive.mount('/content/drive')

# Project root on Drive and the folder holding the processed CASME2 splits
BASE_PATH = "/content/drive/MyDrive/RESEARCH-WORKSPACE/ACTIVE-PROJECTS/Thesis_MER_Project"
PROCESSED_PATH = f"{BASE_PATH}/datasets/processed_casme2"

# Dataset mapping: source split -> aligned + eye-masked target.
# Each spec row is (source version, target version, variant tag, variant name).
DATASET_MAPPING = {
    f"v{src}_to_v{dst}": {
        'source': f"{PROCESSED_PATH}/data_split_v{src}",
        'target': f"{PROCESSED_PATH}/preprocessed_v{dst}",
        'description': f"{tag} - {title} (aligned + eye masked, grayscale 224x224)",
        'variant': tag
    }
    for src, dst, tag, title in (
        (1, 10, 'AF', 'Apex Frame'),
        (2, 11, 'KFS', 'Key Frame Sequence'),
        (3, 12, 'MFS', 'Multi-Frame Sequence'),
    )
}

# Processing parameters
TARGET_SIZE = 224                    # Side length (px) of the square output image
BBOX_EXPANSION = 20                  # Pixels added to each side of the detected face box
EYE_MASK_STRATEGY = 'gaussian_blur'  # Options: 'black', 'gray', 'gaussian_blur'
EYE_MASK_MARGIN = 5                  # Extra pixels around the eye landmark extent

# Initialize Dlib detectors
print("Initializing Dlib face detector and landmark predictor...")
detector = dlib.get_frontal_face_detector()

# Download and load shape predictor if not exists.
# The archive is written next to predictor_path (not into whatever the
# current working directory happens to be) so that the decompressed file
# ends up exactly where the existence check and the loader expect it.
predictor_path = "/content/shape_predictor_68_face_landmarks.dat"
if not os.path.exists(predictor_path):
    print("Downloading shape predictor model...")
    predictor_archive = f"{predictor_path}.bz2"

    if os.system(
        f'wget -O "{predictor_archive}" '
        'http://dlib.net/files/shape_predictor_68_face_landmarks.dat.bz2'
    ) != 0:
        raise RuntimeError("Failed to download the dlib shape predictor model")

    # bzip2 -d replaces "<name>.bz2" with "<name>", i.e. predictor_path.
    if os.system(f'bzip2 -d "{predictor_archive}"') != 0:
        raise RuntimeError("Failed to decompress the dlib shape predictor model")

predictor = dlib.shape_predictor(predictor_path)
print("Face detector and landmark predictor loaded")
print()

# ============================================================================
# FACE ALIGNMENT FUNCTIONS
# ============================================================================

def get_eye_landmarks(shape):
    """
    Extract eye centre coordinates from dlib shape predictor output.

    Each centre is the mean of the six landmark points of one eye,
    truncated to whole pixels.

    Args:
        shape: Dlib shape predictor output (68 landmarks); only
            ``shape.part(i).x`` / ``.y`` for i in 36..47 are read

    Returns:
        Tuple (left_eye_center, right_eye_center), each an (x, y) tuple of
        plain Python ints. Plain ints (not numpy scalars) are returned on
        purpose: the centres are later used to build the rotation centre
        passed to OpenCV, which may reject numpy scalar types.
    """
    # NOTE(review): "left"/"right" follow this file's naming (36-41 = left,
    # 42-47 = right). In the usual 68-point annotation 36-41 is the eye that
    # appears on the left side of the image - confirm if subject-relative
    # naming is ever needed.
    def _center(first, last):
        points = np.array(
            [(shape.part(i).x, shape.part(i).y) for i in range(first, last)]
        )
        cx, cy = points.mean(axis=0).astype(int)
        return int(cx), int(cy)

    left_eye_center = _center(36, 42)    # landmarks 36-41
    right_eye_center = _center(42, 48)   # landmarks 42-47

    return left_eye_center, right_eye_center

def calculate_rotation_angle(left_eye, right_eye):
    """
    Angle of the line running from the left eye to the right eye.

    Args:
        left_eye: (x, y) coordinates of left eye center
        right_eye: (x, y) coordinates of right eye center

    Returns:
        Angle in degrees, measured with arctan2 of the y-offset over the
        x-offset; 0 means the two eye centres share the same y coordinate.
    """
    run = right_eye[0] - left_eye[0]
    rise = right_eye[1] - left_eye[1]
    return np.rad2deg(np.arctan2(rise, run))

def rotate_image_with_landmarks(image, angle, center):
    """
    Rotate image about a point, keeping the original canvas size.

    Args:
        image: Input image
        angle: Rotation angle in degrees
        center: Rotation center point as (x, y); numpy scalars are accepted

    Returns:
        Rotated image with the same width and height as the input. Pixels
        that fall outside the source are filled by replicating the border.
    """
    h, w = image.shape[:2]

    # Coerce to plain Python floats: centre coordinates derived from numpy
    # arrays are numpy scalars, which some OpenCV builds refuse to parse as
    # a point ("Can't parse 'center'").
    center = (float(center[0]), float(center[1]))
    angle = float(angle)

    # Get rotation matrix (scale 1.0 = no zoom)
    rotation_matrix = cv2.getRotationMatrix2D(center, angle, 1.0)

    # Rotate image
    rotated = cv2.warpAffine(image, rotation_matrix, (w, h),
                             flags=cv2.INTER_LANCZOS4,
                             borderMode=cv2.BORDER_REPLICATE)

    return rotated

def align_face_horizontal(image):
    """
    Rotate the image so the line between the eye centres becomes level.

    The largest detected face is used. The image is returned unchanged
    when no face is found, when the measured tilt is not greater than
    1 degree, or when any step raises.

    Args:
        image: Input image as accepted by the dlib detector

    Returns:
        Tuple (aligned_image, alignment_info). alignment_info holds
        'aligned', 'angle' and 'landmarks_detected', plus 'error' with the
        exception text if a step failed.
    """
    alignment_info = {
        'aligned': False,
        'angle': 0,
        'landmarks_detected': False
    }

    try:
        detections = detector(image, 1)
        if len(detections) == 0:
            return image, alignment_info

        # Largest face by bounding-box area (first one wins on ties)
        face = max(detections, key=lambda r: r.width() * r.height())

        landmarks = predictor(image, face)
        left_eye, right_eye = get_eye_landmarks(landmarks)
        alignment_info['landmarks_detected'] = True

        angle = calculate_rotation_angle(left_eye, right_eye)

        # Tilts of at most 1 degree are recorded but not corrected
        if not abs(angle) > 1.0:
            alignment_info['aligned'] = False
            alignment_info['angle'] = float(angle)
            return image, alignment_info

        # Rotate about the midpoint between the two eye centres
        eyes_midpoint = ((left_eye[0] + right_eye[0]) // 2,
                         (left_eye[1] + right_eye[1]) // 2)
        rotated = rotate_image_with_landmarks(image, angle, eyes_midpoint)

        alignment_info['aligned'] = True
        alignment_info['angle'] = float(angle)
        return rotated, alignment_info

    except Exception as e:
        # Best effort: report the failure and hand back the input image
        alignment_info['error'] = str(e)
        return image, alignment_info

# ============================================================================
# EYE MASKING FUNCTIONS
# ============================================================================

def get_eye_regions(image):
    """
    Detect eye regions for masking.

    For the largest detected face, each eye region is the bounding box of
    its six landmarks grown by EYE_MASK_MARGIN pixels and clamped to the
    image on all four sides.

    Args:
        image: Input image

    Returns:
        List of eye regions [(x1, y1, x2, y2), ...] with plain int
        coordinates usable as image[y1:y2, x1:x2] (left-eye landmarks
        36-41 first, then 42-47), or None when no face is found, when no
        region with positive area remains, or when detection raises.
    """
    try:
        # Detect faces
        faces = detector(image, 1)

        if len(faces) == 0:
            return None

        # Use largest face
        face = max(faces, key=lambda r: r.width() * r.height())

        # Get facial landmarks
        shape = predictor(image, face)

        img_h, img_w = image.shape[:2]
        eye_regions = []

        for first, last in ((36, 42), (42, 48)):
            xs = [shape.part(i).x for i in range(first, last)]
            ys = [shape.part(i).y for i in range(first, last)]

            # Clamp BOTH ends into the image. Landmarks can fall outside the
            # frame; an unclamped negative end coordinate would be read by
            # array slicing as "count from the far edge" and select the
            # wrong pixels.
            x1 = min(img_w, max(0, min(xs) - EYE_MASK_MARGIN))
            y1 = min(img_h, max(0, min(ys) - EYE_MASK_MARGIN))
            x2 = min(img_w, max(0, max(xs) + EYE_MASK_MARGIN))
            y2 = min(img_h, max(0, max(ys) + EYE_MASK_MARGIN))

            # Drop regions that collapsed to zero area after clamping
            if x2 > x1 and y2 > y1:
                eye_regions.append((int(x1), int(y1), int(x2), int(y2)))

        return eye_regions or None

    except Exception:
        # Best effort: any detector/predictor failure means "no regions"
        return None

def apply_eye_mask(image, strategy='gaussian_blur'):
    """
    Apply eye masking to reduce eye movement noise.

    Args:
        image: Input grayscale image
        strategy: Masking strategy ('black', 'gray', 'gaussian_blur')

    Returns:
        Tuple (masked_image, masking_info). The input image itself is
        returned when no eye regions are found; otherwise a modified copy.
        masking_info holds 'masked' (True when at least one region was
        masked), 'strategy' and 'regions_masked'.

    Raises:
        ValueError: If strategy is not one of the supported options
            (previously such a call left the image untouched while still
            reporting it as masked).
    """
    if strategy not in ('black', 'gray', 'gaussian_blur'):
        raise ValueError(
            f"Unknown eye mask strategy: {strategy!r} "
            "(expected 'black', 'gray' or 'gaussian_blur')"
        )

    masking_info = {
        'masked': False,
        'strategy': strategy,
        'regions_masked': 0
    }

    # Get eye regions
    eye_regions = get_eye_regions(image)

    if eye_regions is None:
        return image, masking_info

    masked = image.copy()

    # The gray fill is the mean of the *unmasked* input, so compute it once
    gray_fill = np.mean(image) if strategy == 'gray' else None

    for x1, y1, x2, y2 in eye_regions:
        # Skip degenerate boxes: nothing to mask, and blurring an empty
        # array would raise
        if x2 <= x1 or y2 <= y1:
            continue

        if strategy == 'black':
            # Black mask (intensity 0)
            masked[y1:y2, x1:x2] = 0

        elif strategy == 'gray':
            # Gray mask (mean intensity of image)
            masked[y1:y2, x1:x2] = gray_fill

        else:
            # Gaussian blur (softer approach)
            eye_region = masked[y1:y2, x1:x2]
            masked[y1:y2, x1:x2] = cv2.GaussianBlur(eye_region, (15, 15), 0)

        masking_info['regions_masked'] += 1

    masking_info['masked'] = masking_info['regions_masked'] > 0

    return masked, masking_info

# ============================================================================
# FACE DETECTION & EXPANSION (from Cell 3)
# ============================================================================

def detect_face_with_expansion(image):
    """
    Locate the largest face and grow its box by BBOX_EXPANSION pixels.

    The grown box is clipped so that it never leaves the image.

    Args:
        image: Input image

    Returns:
        Tuple (expanded_bbox, face_info); (None, None) when no face is
        detected or when detection raises. expanded_bbox has 'left', 'top',
        'right', 'bottom', 'width' and 'height'; face_info additionally
        records the detector's original box, the expansion used and the
        original face area.
    """
    try:
        detections = detector(image, 1)
        if len(detections) == 0:
            return None, None

        # Largest detection by area; the first one wins on ties
        face = max(detections, key=lambda r: r.width() * r.height())

        img_h, img_w = image.shape[:2]

        original_bbox = {
            'left': face.left(),
            'top': face.top(),
            'right': face.right(),
            'bottom': face.bottom(),
            'width': face.width(),
            'height': face.height()
        }

        # Grow on every side, clipped to the image bounds
        grown_left = max(0, original_bbox['left'] - BBOX_EXPANSION)
        grown_top = max(0, original_bbox['top'] - BBOX_EXPANSION)
        grown_right = min(img_w, original_bbox['right'] + BBOX_EXPANSION)
        grown_bottom = min(img_h, original_bbox['bottom'] + BBOX_EXPANSION)

        expanded_bbox = {
            'left': grown_left,
            'top': grown_top,
            'right': grown_right,
            'bottom': grown_bottom,
            'width': grown_right - grown_left,
            'height': grown_bottom - grown_top
        }

        face_info = {
            'original_bbox': original_bbox,
            'expanded_bbox': expanded_bbox,
            'expansion_applied': BBOX_EXPANSION,
            'face_area': original_bbox['width'] * original_bbox['height']
        }

        return expanded_bbox, face_info

    except Exception:
        # Best effort: callers treat (None, None) as "no face"
        return None, None

def ensure_minimum_size(image, min_size=224):
    """
    Ensure image is at least min_size x min_size.

    Images that already satisfy the minimum are returned as-is. Smaller
    images are upscaled uniformly (aspect ratio kept) so that the shorter
    side reaches min_size.

    Args:
        image: Input image
        min_size: Minimum dimension required

    Returns:
        Tuple (resized_image, resize_info). resize_info always has
        'resized' and 'original_size' as (width, height); when a resize
        happened it also has 'new_size' and 'scale_factor'.
    """
    h, w = image.shape[:2]

    if h >= min_size and w >= min_size:
        return image, {'resized': False, 'original_size': (w, h)}

    scale_factor = min_size / min(h, w)

    # int() truncates, and dim * (min_size / dim) can land a hair below
    # min_size in floating point; the max() keeps the guarantee that both
    # sides end up >= min_size.
    new_width = max(min_size, int(w * scale_factor))
    new_height = max(min_size, int(h * scale_factor))

    resized = cv2.resize(image, (new_width, new_height),
                        interpolation=cv2.INTER_LANCZOS4)

    resize_info = {
        'resized': True,
        'original_size': (w, h),
        'new_size': (new_width, new_height),
        'scale_factor': scale_factor
    }

    return resized, resize_info

def convert_to_grayscale(image):
    """
    Return a single-channel version of the image.

    Args:
        image: Input image; a 3-dimensional array is converted with
            OpenCV's BGR-to-gray conversion, anything else is assumed to
            be grayscale already

    Returns:
        Grayscale image (the input object itself when no conversion is
        needed)
    """
    if len(image.shape) != 3:
        return image
    return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

# ============================================================================
# COMPLETE PREPROCESSING PIPELINE
# ============================================================================

def preprocess_image_aligned_eyemask(image_path):
    """
    Complete preprocessing pipeline with face alignment and eye masking.

    Pipeline:
    1. Load image
    2. Ensure minimum size
    3. Face alignment (horizontal eye alignment)
    4. Detect face and expand bbox
    5. Crop expanded bbox (or a centered TARGET_SIZE crop if no face)
    6. Resize to TARGET_SIZE x TARGET_SIZE
    7. Convert to grayscale
    8. Apply eye masking

    Args:
        image_path: Path to input image

    Returns:
        Tuple (preprocessed_image, processing_info). preprocessed_image is
        None when the image cannot be loaded or any step raises; in that
        case processing_info['method'] is 'error' and
        processing_info['error'] holds the reason. Otherwise 'method' is
        'aligned_expanded_eyemask' (face found) or 'fallback_aligned'
        (center crop). All boolean flags, including 'final_resize_applied',
        are always present so callers can aggregate them without key
        checks.
    """
    processing_info = {
        'face_detected': False,
        'face_aligned': False,
        'bbox_expanded': False,
        'eye_masked': False,
        'resize_applied': False,
        # Present from the start so the returned schema does not depend on
        # whether the final resize was needed
        'final_resize_applied': False,
        'method': 'unknown'
    }

    try:
        # Step 1: Load image (cv2.imread returns None instead of raising)
        image = cv2.imread(image_path)

        if image is None:
            processing_info['method'] = 'error'
            processing_info['error'] = 'Failed to load image'
            return None, processing_info

        # Step 2: Ensure minimum size
        image, resize_info = ensure_minimum_size(image, TARGET_SIZE)
        processing_info['resize_applied'] = resize_info['resized']

        # Step 3: Face alignment
        aligned_image, alignment_info = align_face_horizontal(image)
        processing_info['face_aligned'] = alignment_info['aligned']
        processing_info['alignment_angle'] = alignment_info.get('angle', 0)

        # Step 4: Detect face and get expanded bbox (on the aligned image,
        # so the box matches the rotated pixels)
        expanded_bbox, face_info = detect_face_with_expansion(aligned_image)

        if expanded_bbox is not None:
            processing_info['face_detected'] = True
            processing_info['bbox_expanded'] = True

            # Step 5: Crop expanded bbox
            x1 = expanded_bbox['left']
            y1 = expanded_bbox['top']
            x2 = expanded_bbox['right']
            y2 = expanded_bbox['bottom']

            processing_info['method'] = 'aligned_expanded_eyemask'

        else:
            # Fallback: TARGET_SIZE center crop, clipped to the image
            h, w = aligned_image.shape[:2]
            center_x = w // 2
            center_y = h // 2
            half_size = TARGET_SIZE // 2

            x1 = max(0, center_x - half_size)
            y1 = max(0, center_y - half_size)
            x2 = min(w, center_x + half_size)
            y2 = min(h, center_y + half_size)

            processing_info['method'] = 'fallback_aligned'

        cropped = aligned_image[y1:y2, x1:x2]

        # Step 6: Resize to exact TARGET_SIZE x TARGET_SIZE
        if cropped.shape[0] != TARGET_SIZE or cropped.shape[1] != TARGET_SIZE:
            cropped = cv2.resize(cropped, (TARGET_SIZE, TARGET_SIZE),
                               interpolation=cv2.INTER_LANCZOS4)
            processing_info['final_resize_applied'] = True

        # Step 7: Convert to grayscale
        grayscale = convert_to_grayscale(cropped)

        # Step 8: Apply eye masking (eye regions are re-detected on the
        # final grayscale crop)
        masked, masking_info = apply_eye_mask(grayscale, EYE_MASK_STRATEGY)
        processing_info['eye_masked'] = masking_info['masked']
        processing_info['eye_mask_strategy'] = masking_info['strategy']
        processing_info['eye_regions_masked'] = masking_info['regions_masked']

        return masked, processing_info

    except Exception as e:
        # Any failure marks this image as an error; the caller decides
        # how to count it
        processing_info['method'] = 'error'
        processing_info['error'] = str(e)
        return None, processing_info

# ============================================================================
# DATASET PROCESSING
# ============================================================================

def process_dataset_aligned_eyemask(source_dir, target_dir, variant_name):
    """
    Process entire dataset with alignment and eye masking.

    For each of the train/val/test splits found under ``source_dir``, every
    .jpg/.jpeg/.png file is run through ``preprocess_image_aligned_eyemask``
    and the result is written under the same file name in the matching split
    folder of ``target_dir``. Splits missing from the source are skipped with
    a warning.

    Args:
        source_dir: Source dataset directory
        target_dir: Target directory (split sub-directories are created on demand)
        variant_name: Dataset variant

    Returns:
        Processing statistics dictionary with per-split counts under
        'splits' and overall totals/rates under 'total_processed',
        'total_errors', 'alignment_stats' and 'eye_masking_stats'.
        An image counts as an error when preprocessing returns no image
        or when the processed image cannot be written to disk.
    """
    stats = {
        'variant': variant_name,
        'processing_date': datetime.now().isoformat(),
        'source_directory': source_dir,
        'target_directory': target_dir,
        'preprocessing_method': 'face_aligned_bbox_expansion_eye_masked',
        'preprocessing_parameters': {
            'target_size': TARGET_SIZE,
            'bbox_expansion': BBOX_EXPANSION,
            'eye_mask_strategy': EYE_MASK_STRATEGY,
            'eye_mask_margin': EYE_MASK_MARGIN
        },
        'preprocessing_steps': [
            'Load image',
            'Ensure minimum size 224x224',
            'Face alignment (horizontal eye alignment)',
            'Face detection with Dlib',
            f'Expand bbox by {BBOX_EXPANSION}px in all directions',
            'Crop expanded bbox region',
            'Resize to 224x224',
            'Grayscale conversion',
            f'Eye masking ({EYE_MASK_STRATEGY})'
        ],
        'splits': {},
        'total_processed': 0,
        'total_errors': 0,
        'alignment_stats': {
            'total_images': 0,
            'faces_aligned': 0,
            'alignment_rate': 0,
            'avg_rotation_angle': 0
        },
        'eye_masking_stats': {
            'total_images': 0,
            'eyes_masked': 0,
            'masking_rate': 0
        }
    }

    image_extensions = ('.jpg', '.jpeg', '.png')

    # Rotation angles of every aligned image across all splits, so the
    # overall average can be reported alongside the per-split averages.
    all_rotation_angles = []

    for split in ('train', 'val', 'test'):
        source_split_dir = os.path.join(source_dir, split)
        target_split_dir = os.path.join(target_dir, split)

        if not os.path.exists(source_split_dir):
            print(f"  Warning: {split} split not found in source")
            continue

        os.makedirs(target_split_dir, exist_ok=True)

        # Sorted so processing order (and the key order of the emotion
        # distribution) is reproducible; os.listdir order is arbitrary.
        image_files = sorted(
            f for f in os.listdir(source_split_dir)
            if f.lower().endswith(image_extensions)
        )

        split_stats = {
            'total_images': len(image_files),
            'processed': 0,
            'errors': 0,
            'faces_aligned': 0,
            'alignment_rate': 0,
            'avg_rotation_angle': 0,
            'eyes_masked': 0,
            'eye_masking_rate': 0,
            'rotation_angles': [],
            'emotion_distribution': defaultdict(int)
        }

        print(f"  Processing {split} split: {len(image_files)} images")

        for idx, img_filename in enumerate(image_files, 1):
            source_path = os.path.join(source_split_dir, img_filename)
            target_path = os.path.join(target_split_dir, img_filename)

            processed_img, proc_info = preprocess_image_aligned_eyemask(source_path)

            # cv2.imwrite signals a failed write through its return value,
            # so a False result must not be counted as a processed image.
            saved = processed_img is not None and cv2.imwrite(target_path, processed_img)

            if saved:
                split_stats['processed'] += 1

                if proc_info.get('face_aligned'):
                    split_stats['faces_aligned'] += 1
                    split_stats['rotation_angles'].append(
                        abs(proc_info.get('alignment_angle', 0))
                    )

                if proc_info.get('eye_masked'):
                    split_stats['eyes_masked'] += 1

                # The label is the text after the last underscore of the file
                # stem; splitext drops whichever accepted extension is present
                # (presumably names look like "<...>_<emotion>.<ext>").
                emotion = os.path.splitext(img_filename)[0].split('_')[-1]
                split_stats['emotion_distribution'][emotion] += 1
            else:
                split_stats['errors'] += 1
                stats['total_errors'] += 1

            if idx % 200 == 0:
                print(f"    Progress: {idx}/{len(image_files)}")

        # Calculate rates (relative to all images found, including failures)
        if split_stats['total_images'] > 0:
            split_stats['alignment_rate'] = (
                split_stats['faces_aligned'] / split_stats['total_images']
            )
            split_stats['eye_masking_rate'] = (
                split_stats['eyes_masked'] / split_stats['total_images']
            )

        if split_stats['rotation_angles']:
            # float() keeps the value a plain Python float rather than a
            # numpy scalar.
            split_stats['avg_rotation_angle'] = float(
                np.mean(split_stats['rotation_angles'])
            )
            all_rotation_angles.extend(split_stats['rotation_angles'])

        # Store split results
        stats['splits'][split] = {
            'total_images': split_stats['total_images'],
            'processed': split_stats['processed'],
            'errors': split_stats['errors'],
            'faces_aligned': split_stats['faces_aligned'],
            'alignment_rate': split_stats['alignment_rate'],
            'avg_rotation_angle': split_stats['avg_rotation_angle'],
            'eyes_masked': split_stats['eyes_masked'],
            'eye_masking_rate': split_stats['eye_masking_rate'],
            'emotion_distribution': dict(split_stats['emotion_distribution'])
        }

        stats['total_processed'] += split_stats['processed']

        # Update overall stats
        stats['alignment_stats']['total_images'] += split_stats['total_images']
        stats['alignment_stats']['faces_aligned'] += split_stats['faces_aligned']

        stats['eye_masking_stats']['total_images'] += split_stats['total_images']
        stats['eye_masking_stats']['eyes_masked'] += split_stats['eyes_masked']

        print(f"    Completed: {split_stats['processed']} processed")
        print(f"    Face aligned: {split_stats['faces_aligned']} ({split_stats['alignment_rate']:.1%})")
        print(f"    Eyes masked: {split_stats['eyes_masked']} ({split_stats['eye_masking_rate']:.1%})")
        if split_stats['avg_rotation_angle'] > 0:
            print(f"    Avg rotation: {split_stats['avg_rotation_angle']:.2f} degrees")

    # Calculate overall rates
    if stats['alignment_stats']['total_images'] > 0:
        stats['alignment_stats']['alignment_rate'] = (
            stats['alignment_stats']['faces_aligned'] /
            stats['alignment_stats']['total_images']
        )

    # Overall mean rotation over every aligned image in all splits
    if all_rotation_angles:
        stats['alignment_stats']['avg_rotation_angle'] = float(
            np.mean(all_rotation_angles)
        )

    if stats['eye_masking_stats']['total_images'] > 0:
        stats['eye_masking_stats']['masking_rate'] = (
            stats['eye_masking_stats']['eyes_masked'] /
            stats['eye_masking_stats']['total_images']
        )

    return stats

# ============================================================================
# MAIN EXECUTION
# ============================================================================

# Opening banner for the run (one call, same four output lines).
print(
    "=" * 80,
    "CASME2 FACE ALIGNMENT + EYE MASKING PREPROCESSING",
    "=" * 80,
    "",
    sep="\n",
)

# Statistics of every successfully processed dataset, keyed by variant.
all_results = {}

for mapping_name in DATASET_MAPPING:
    config = DATASET_MAPPING[mapping_name]
    source_dir, target_dir = config['source'], config['target']
    variant, description = config['variant'], config['description']

    print(
        f"[{variant}] {description}",
        f"Source: {source_dir}",
        f"Target: {target_dir}",
        "",
        sep="\n",
    )

    # A dataset whose source folder is missing is reported and skipped.
    if not os.path.exists(source_dir):
        print("  Error: Source directory not found", "", sep="\n")
        continue

    os.makedirs(target_dir, exist_ok=True)

    processing_stats = process_dataset_aligned_eyemask(
        source_dir, target_dir, variant
    )

    # Persist the statistics next to the processed images.
    summary_path = os.path.join(target_dir, 'preprocessing_summary.json')
    with open(summary_path, 'w') as f:
        json.dump(processing_stats, f, indent=2)

    all_results[variant] = processing_stats

    print(
        "  Summary saved to: preprocessing_summary.json",
        f"  Total processed: {processing_stats['total_processed']}",
        f"  Alignment rate: {processing_stats['alignment_stats']['alignment_rate']:.1%}",
        f"  Eye masking rate: {processing_stats['eye_masking_stats']['masking_rate']:.1%}",
        "",
        "-" * 80,
        "",
        sep="\n",
    )

# ============================================================================
# FINAL SUMMARY
# ============================================================================

# Closing banner.
print(
    "=" * 80,
    "FACE ALIGNMENT + EYE MASKING PREPROCESSING COMPLETE",
    "=" * 80,
    "",
    sep="\n",
)

# One report per dataset variant collected in all_results.
for variant, stats in all_results.items():
    print(
        f"{variant} Dataset:",
        f"  Total processed: {stats['total_processed']} images",
        f"  Face alignment rate: {stats['alignment_stats']['alignment_rate']:.1%}",
        f"  Eye masking rate: {stats['eye_masking_stats']['masking_rate']:.1%}",
        f"  Errors: {stats['total_errors']}",
        sep="\n",
    )

    # Only splits that were actually present in the source are listed.
    for split in ('train', 'val', 'test'):
        if split not in stats['splits']:
            continue
        split_data = stats['splits'][split]
        print(f"  {split.upper()}: {split_data['processed']} images")

    print()

print("All datasets with alignment and eye masking saved to:")
for config in DATASET_MAPPING.values():
    print("  -", config['target'])

print(
    "",
    "Next step: Validation Cell 6 to compare v7/v8/v9 vs v10/v11/v12",
    "",
    "=" * 80,
    sep="\n",
)

Mounted at /content/drive
Initializing Dlib face detector and landmark predictor...
Downloading shape predictor model...
Face detector and landmark predictor loaded

CASME2 FACE ALIGNMENT + EYE MASKING PREPROCESSING

[AF] AF - Apex Frame (aligned + eye masked, grayscale 224x224)
Source: /content/drive/MyDrive/RESEARCH-WORKSPACE/ACTIVE-PROJECTS/Thesis_MER_Project/datasets/processed_casme2/data_split_v1
Target: /content/drive/MyDrive/RESEARCH-WORKSPACE/ACTIVE-PROJECTS/Thesis_MER_Project/datasets/processed_casme2/preprocessed_v10

  Processing train split: 201 images
    Progress: 200/201
    Completed: 201 processed
    Face aligned: 0 (0.0%)
    Eyes masked: 201 (100.0%)
  Processing val split: 26 images
    Completed: 26 processed
    Face aligned: 0 (0.0%)
    Eyes masked: 26 (100.0%)
  Processing test split: 28 images
    Completed: 28 processed
    Face aligned: 0 (0.0%)
    Eyes masked: 28 (100.0%)
  Summary saved to: preprocessing_summary.json
  Total processed: 255
  Alignment ra

KeyboardInterrupt: 