# In [1]:
"""
RLT Dataset Processor v8.1 FINAL

Scientific Data-ready version with ALL fixes:
- 94 audio features (corrected from 108)
- Explicit librosa parameters for reproducibility
- Translation cache with complete decision logging
- Environment capture (including deep-translator, textblob, pydub)
- Run manifest + data dictionary
- Schema validation (only MultimodalDataset_Full.csv)
- Landmark count validation with detailed logging (478 + 33)
- NLTK: Fail fast (no silent download)
- Deep copy config (not shallow)
- Fully consistent with I3D v8 pipeline

Author: Research Team
Date: 2025-01-30
Version: 8.1 FINAL
"""

import os
import re
import csv
import json
import yaml
import copy
import hashlib
import platform
import subprocess
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Tuple, Optional

# Data processing
import numpy as np
import pandas as pd

# Audio processing
import wave
import librosa
from pydub import AudioSegment

# Computer Vision
import cv2
import mediapipe as mp

# NLP
import nltk
from textblob import TextBlob

# Translation
from deep_translator import GoogleTranslator

# Visualization
import matplotlib
matplotlib.use('Agg')  # Non-interactive backend for reproducibility
import matplotlib.pyplot as plt
import seaborn as sns

# Progress bar
from tqdm import tqdm

# Logging
import logging
import warnings

# Selective warning suppression: only FutureWarning-category warnings and
# warnings whose message matches "PySoundFile" are ignored; everything else
# stays visible.
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', message='.*PySoundFile.*')

# FIX 1: NLTK resources - fail fast (no silent download).
# At import time, look up the punkt tokenizer and the stopwords corpus in the
# local NLTK data path. If either lookup fails, abort with a RuntimeError that
# carries manual installation instructions instead of downloading anything.
try:
    nltk.data.find('tokenizers/punkt')
    nltk.data.find('corpora/stopwords')
except LookupError as e:
    raise RuntimeError(
        "‚ùå Required NLTK resources not found!\n"
        "Please install them manually:\n"
        "  python -m nltk.downloader punkt stopwords\n"
        "Or run in Python:\n"
        "  import nltk\n"
        "  nltk.download('punkt')\n"
        "  nltk.download('stopwords')\n"
        f"Missing resource: {e}"
    )

# Module-level aliases for the MediaPipe face-mesh and pose solution modules.
mp_face_mesh = mp.solutions.face_mesh
mp_pose = mp.solutions.pose

# Pipeline version string; it is written into the environment capture and
# into the translation cache metadata/entries below.
PIPELINE_VERSION = "8.1.0"


# ==================== ENVIRONMENT CAPTURE ====================
def _installed_distribution_version(dist_name: str) -> str:
    """Return the installed version of distribution *dist_name*, or 'not_found'.

    Prefers the standard-library ``importlib.metadata`` and only falls back to
    ``pkg_resources`` on interpreters that do not ship it.
    """
    try:
        from importlib import metadata as importlib_metadata
    except ImportError:
        importlib_metadata = None

    if importlib_metadata is not None:
        try:
            return importlib_metadata.version(dist_name)
        except Exception:
            # Best-effort diagnostics: a missing or broken distribution must
            # never abort the run, it is simply reported as not found.
            return 'not_found'

    try:
        import pkg_resources
        return pkg_resources.get_distribution(dist_name).version
    except Exception:
        return 'not_found'


def capture_environment() -> Dict:
    """
    Capture environment information for reproducibility.

    Returns:
        A dictionary with the pipeline version, a capture timestamp, platform
        details, the versions of the imported libraries, the installed
        versions of deep-translator / textblob / pydub (``'not_found'`` when
        they cannot be determined) and the first line of ``ffmpeg -version``
        (``'Not detected'`` when ffmpeg cannot be run or prints nothing).
    """

    env_info = {
        'pipeline_version': PIPELINE_VERSION,
        'capture_timestamp': datetime.now().isoformat(),
        'system': {
            'platform': platform.platform(),
            'python_version': platform.python_version(),
            'processor': platform.processor(),
            'machine': platform.machine()
        },
        'library_versions': {
            'numpy': np.__version__,
            'pandas': pd.__version__,
            'librosa': librosa.__version__,
            'opencv': cv2.__version__,
            'mediapipe': mp.__version__,
            'matplotlib': matplotlib.__version__,
            'seaborn': sns.__version__,
            'nltk': nltk.__version__,
        },
        'audio_backend': {
            'librosa_audioread': True
        }
    }

    # These packages are looked up through their installed distribution
    # metadata (report key -> distribution name) rather than a module
    # ``__version__`` attribute.
    for report_key, dist_name in (
        ('deep_translator', 'deep-translator'),
        ('textblob', 'textblob'),
        ('pydub', 'pydub'),
    ):
        env_info['library_versions'][report_key] = _installed_distribution_version(dist_name)

    # Check ffmpeg version: only failures to launch/run the binary or to
    # decode its output are treated as "not detected".
    try:
        result = subprocess.run(['ffmpeg', '-version'],
                                capture_output=True, text=True, timeout=5)
    except (OSError, ValueError, subprocess.SubprocessError):
        env_info['ffmpeg_version'] = 'Not detected'
    else:
        first_line = result.stdout.split('\n')[0]
        env_info['ffmpeg_version'] = first_line if first_line else 'Not detected'

    return env_info


# ==================== TRANSLATION CACHE MANAGER ====================
class TranslationCacheManager:
    """
    Manage a translation cache with metadata for reproducibility.

    The cache lives in ``translation_cache.json`` and its metadata in
    ``translation_metadata.json`` inside ``cache_dir``. Every instance also
    records its own cache hits and new translations and writes them to a
    per-run, timestamped CSV decision log when ``save()`` is called.

    NOTE(review): entries are keyed by the MD5 of the source text only, so the
    language pair is not part of the key - confirm one cache directory is
    never shared between different language pairs.
    """

    def __init__(self, cache_dir: Path):
        self.cache_dir = cache_dir
        self.cache_dir.mkdir(parents=True, exist_ok=True)

        self.cache_file = self.cache_dir / 'translation_cache.json'
        self.metadata_file = self.cache_dir / 'translation_metadata.json'

        # Per-run decision log: the timestamp in the file name keeps earlier
        # runs' logs from being overwritten.
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        self.decision_log = self.cache_dir / f'translation_decisions_{timestamp}.csv'

        self.cache = self._load_cache()
        self.metadata = self._load_metadata()
        self.decisions = []
        self.run_timestamp = timestamp

    @staticmethod
    def _read_json_dict(path: Path) -> Optional[Dict]:
        """Return the JSON object stored at *path*, or None if unusable.

        None is returned when the file is missing, unreadable, not valid JSON,
        or when its top-level value is not a JSON object.
        """
        if not path.exists():
            return None
        try:
            with open(path, 'r', encoding='utf-8') as f:
                payload = json.load(f)
        except (OSError, ValueError):
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            return None
        return payload if isinstance(payload, dict) else None

    @staticmethod
    def _write_json(path: Path, payload: Dict) -> None:
        """Write *payload* to *path* through a temporary sibling file.

        The live file is only replaced once the new content has been written
        completely, so an interrupted save cannot leave a truncated cache.
        """
        tmp_path = path.with_name(path.name + '.tmp')
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(payload, f, ensure_ascii=False, indent=2)
        tmp_path.replace(path)

    def _load_cache(self) -> Dict:
        """Load the existing cache, or an empty one if none is usable."""
        cache = self._read_json_dict(self.cache_file)
        return cache if cache is not None else {}

    def _load_metadata(self) -> Dict:
        """Load the cache metadata, or fresh defaults if none is usable."""
        metadata = self._read_json_dict(self.metadata_file)
        return metadata if metadata is not None else self._create_default_metadata()

    def _create_default_metadata(self) -> Dict:
        """Create the default metadata structure."""
        return {
            'version': '1.0',
            'pipeline_version': PIPELINE_VERSION,
            'created': datetime.now().isoformat(),
            'last_updated': datetime.now().isoformat(),
            'total_translations': 0,
            'source_lang': 'en',
            'target_lang': 'id',
            'translator': 'GoogleTranslator'
        }

    def _compute_key(self, text: str) -> str:
        """Compute the MD5 hex digest of the UTF-8 encoded text (cache key)."""
        return hashlib.md5(text.encode('utf-8')).hexdigest()

    def get_translation(self, text: str, log_decision: bool = True) -> Optional[str]:
        """
        Get the cached translation for *text* if one is available.

        Args:
            text: Source text to look up.
            log_decision: When True, a cache hit is appended to the in-memory
                decision list together with the run timestamp.

        Returns:
            ``""`` for empty or whitespace-only input, the cached translation
            on a hit, or ``None`` on a miss. A cached entry whose translation
            is missing, blank or not a string counts as a miss so that it can
            be translated again instead of being served forever.
        """
        if not text or len(text.strip()) == 0:
            return ""

        key = self._compute_key(text)
        cached = self.cache.get(key)

        # Support both formats: the old one stores the bare translation
        # string, the new one a dictionary with metadata.
        if isinstance(cached, str):
            translation = cached
        elif isinstance(cached, dict):
            translation = cached.get('translation')
        else:
            translation = None

        if not isinstance(translation, str) or not translation:
            return None

        if log_decision:
            self.decisions.append({
                'run_timestamp': self.run_timestamp,
                'key': key,
                'source_length': len(text),
                'target_length': len(translation),
                'timestamp': datetime.now().isoformat(),
                'cached': True
            })

        return translation

    def add_translation(self, text: str, translation: str,
                       source_lang: str = 'en', target_lang: str = 'id'):
        """
        Add a new translation to the in-memory cache.

        Nothing is stored when *text* or *translation* is empty. The entry is
        recorded in the decision list as a non-cached translation and the
        metadata counters are refreshed. Call ``save()`` to persist.
        """
        if not text or not translation:
            return

        key = self._compute_key(text)

        self.cache[key] = {
            'translation': translation,
            'source_lang': source_lang,
            'target_lang': target_lang,
            'timestamp': datetime.now().isoformat(),
            'pipeline_version': PIPELINE_VERSION,
            'translator': 'GoogleTranslator',
            'text_length': len(text),
            'translation_length': len(translation)
        }

        self.decisions.append({
            'run_timestamp': self.run_timestamp,
            'key': key,
            'source_length': len(text),
            'target_length': len(translation),
            'timestamp': datetime.now().isoformat(),
            'cached': False
        })

        # Update metadata
        self.metadata['total_translations'] = len(self.cache)
        self.metadata['last_updated'] = datetime.now().isoformat()

    def save(self):
        """Save cache, metadata and this run's decision log to disk."""
        self._write_json(self.cache_file, self.cache)
        self._write_json(self.metadata_file, self.metadata)

        # The per-run decision log is only written when there is something to
        # report; its timestamped name keeps previous runs' logs intact.
        if self.decisions:
            df = pd.DataFrame(self.decisions)
            df.to_csv(self.decision_log, index=False, encoding='utf-8')

    def get_stats(self) -> Dict:
        """Get cache statistics (entry count, on-disk size, last update, log name)."""
        return {
            'total_cached': len(self.cache),
            'cache_size_mb': self.cache_file.stat().st_size / (1024*1024) if self.cache_file.exists() else 0,
            'last_updated': self.metadata.get('last_updated', 'Never'),
            'decision_log_file': self.decision_log.name
        }


# ==================== ENGLISH NUMBER CONVERTER ====================
class EnglishNumberConverter:
    """Convert integers and decimal strings to English words."""

    # Scale words from largest to smallest; the largest scale is applied
    # recursively so values of 1000 trillion and above are still spelled out.
    _SCALES = (
        (1000000000000, 'trillion'),
        (1000000000, 'billion'),
        (1000000, 'million'),
        (1000, 'thousand'),
    )

    def __init__(self):
        self.ones = ['', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine']
        self.teens = ['ten', 'eleven', 'twelve', 'thirteen', 'fourteen',
                      'fifteen', 'sixteen', 'seventeen', 'eighteen', 'nineteen']
        self.tens = ['', '', 'twenty', 'thirty', 'forty', 'fifty',
                     'sixty', 'seventy', 'eighty', 'ninety']

    def convert_below_hundred(self, num: int) -> str:
        """Spell out an integer in the range 0-99 (tens and ones are hyphenated)."""
        if num == 0:
            return 'zero'
        if num < 10:
            return self.ones[num]
        if num < 20:
            return self.teens[num - 10]

        tens_digit, ones_digit = divmod(num, 10)
        if ones_digit == 0:
            return self.tens[tens_digit]
        return f"{self.tens[tens_digit]}-{self.ones[ones_digit]}"

    def convert_below_thousand(self, num: int) -> str:
        """Spell out an integer in the range 0-999."""
        if num < 100:
            return self.convert_below_hundred(num)

        hundreds_digit, remainder = divmod(num, 100)
        result = f"{self.ones[hundreds_digit]} hundred"
        if remainder > 0:
            result += f" {self.convert_below_hundred(remainder)}"
        return result

    def convert_number(self, num: int) -> str:
        """Spell out any integer; negative values are prefixed with 'minus'."""
        if num == 0:
            return 'zero'

        if num < 0:
            return f"minus {self.convert_number(abs(num))}"

        for scale_value, scale_name in self._SCALES:
            if num >= scale_value:
                count, remainder = divmod(num, scale_value)
                # Recursing on the count (instead of assuming it is below
                # 1000) keeps counts of a thousand or more trillions from
                # indexing past the end of ``self.ones``.
                result = f"{self.convert_number(count)} {scale_name}"
                if remainder > 0:
                    result += f" {self.convert_number(remainder)}"
                return result

        return self.convert_below_thousand(num)

    def convert_decimal(self, num_str: str) -> str:
        """
        Spell out a decimal string such as ``"3.14"``.

        The part before the first dot is read as an integer (an empty integer
        part counts as zero); the part after it is read digit by digit after
        the word 'point'. Without a dot the string is converted as an integer.
        Only the first two dot-separated parts are used.
        """
        parts = num_str.split('.')

        if len(parts) == 1:
            return self.convert_number(int(parts[0]))

        integer_part = int(parts[0]) if parts[0] else 0
        decimal_part = parts[1] if parts[1] else ''

        result = self.convert_number(integer_part)

        if decimal_part:
            result += ' point'
            for digit in decimal_part:
                if digit.isdigit():
                    digit_value = int(digit)
                    # ``self.ones[0]`` is the empty string (needed for
                    # "twenty", "one hundred", ...), so zero is named
                    # explicitly; otherwise "1.05" would read like 1.5.
                    digit_word = 'zero' if digit_value == 0 else self.ones[digit_value]
                    result += f" {digit_word}"

        return result


def normalize_numbers_in_text(text):
    """
    Normalize all digit-written numbers in *text* to English words.

    Commas inside a matched number are removed before conversion; a number
    containing a dot is read as a decimal. A number that cannot be converted
    is left exactly as written. Runs of whitespace in the result are collapsed
    to single spaces and the result is stripped.

    Empty, whitespace-only or ``None`` input is returned unchanged.
    """
    if not text or len(text.strip()) == 0:
        return text

    converter = EnglishNumberConverter()
    number_pattern = r'\b\d+(?:[.,]\d+)*\b'

    def replace_number(match):
        num_str = match.group(0)
        num_str_clean = num_str.replace(',', '')

        try:
            if '.' in num_str_clean:
                return converter.convert_decimal(num_str_clean)
            return converter.convert_number(int(num_str_clean))
        except (ValueError, IndexError):
            # Best effort: keep the original digits when conversion fails,
            # without also swallowing KeyboardInterrupt / SystemExit.
            return num_str

    normalized_text = re.sub(number_pattern, replace_number, text)
    normalized_text = re.sub(r'\s+', ' ', normalized_text).strip()

    return normalized_text


def extract_number_features(text):
    """
    Extract deception-specific number features for English text.

    Returns a dictionary with:
        has_numbers: any digit-written number or number word is present.
        number_count: digit-written numbers plus number words.
        has_vague_quantifiers: a word/phrase such as 'about' or 'more than' occurs.
        has_exact_quantifiers: a word such as 'exactly' or 'precisely' occurs.
        number_word_ratio: number words divided by whitespace-separated words.
        has_large_numbers: a digit-written number is greater than 1000.
        has_decimal_numbers: a digit-written number contains a dot.

    Empty, whitespace-only or ``None`` input yields all-False / zero features.
    """
    if not text or len(text.strip()) == 0:
        return {
            'has_numbers': False, 'number_count': 0, 'has_vague_quantifiers': False,
            'has_exact_quantifiers': False, 'number_word_ratio': 0.0,
            'has_large_numbers': False, 'has_decimal_numbers': False
        }

    # Word-based patterns are matched case-insensitively via one lowered copy.
    lowered = text.lower()

    digit_pattern = r'\b\d+(?:[.,]\d+)*\b'
    numbers = re.findall(digit_pattern, text)

    number_words = r'\b(zero|one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve|thirteen|fourteen|fifteen|sixteen|seventeen|eighteen|nineteen|twenty|thirty|forty|fifty|sixty|seventy|eighty|ninety|hundred|thousand|million|billion|trillion)\b'
    number_word_matches = re.findall(number_words, lowered)

    vague_pattern = r'\b(about|around|approximately|roughly|nearly|almost|more than|less than|over|under|some|several|many|few)\b'
    has_vague = bool(re.search(vague_pattern, lowered))

    exact_pattern = r'\b(exactly|precisely|specifically|definitely|certainly)\b'
    has_exact = bool(re.search(exact_pattern, lowered))

    large_numbers = []
    for n in numbers:
        try:
            val = float(n.replace(',', ''))
        except ValueError:
            # Matches such as "1.2.3" are not parseable as one number; they
            # simply do not count as large numbers.
            continue
        if val > 1000:
            large_numbers.append(n)

    decimal_numbers = [n for n in numbers if '.' in n]

    total_words = len(text.split())
    number_word_ratio = len(number_word_matches) / total_words if total_words > 0 else 0.0

    return {
        'has_numbers': len(numbers) > 0 or len(number_word_matches) > 0,
        'number_count': len(numbers) + len(number_word_matches),
        'has_vague_quantifiers': has_vague,
        'has_exact_quantifiers': has_exact,
        'number_word_ratio': number_word_ratio,
        'has_large_numbers': len(large_numbers) > 0,
        'has_decimal_numbers': len(decimal_numbers) > 0
    }


# ==================== TRANSLATION ====================
def _split_text_for_translation(text: str, max_chars: int = 4500) -> List[str]:
    """Split *text* into chunks of at most *max_chars*, preferring whitespace cuts.

    A chunk is only cut in the middle of a word when it contains no
    whitespace at all to cut at.
    """
    chunks = []
    remaining = text
    while len(remaining) > max_chars:
        window = remaining[:max_chars + 1]
        cut = max(window.rfind(separator) for separator in (' ', '\n', '\t'))
        if cut <= 0:
            cut = max_chars  # no usable whitespace: fall back to a hard split
        chunks.append(remaining[:cut])
        remaining = remaining[cut:].lstrip()
    if remaining:
        chunks.append(remaining)
    return chunks


def translate_text_with_cache(text: str, cache_manager: TranslationCacheManager,
                              source: str = 'en', target: str = 'id',
                              max_retries: int = 3, logger = None) -> Tuple[Optional[str], bool]:
    """
    Translate text with cache support.

    The cache is consulted first; on a miss the text is sent to
    GoogleTranslator, retried up to *max_retries* times with a growing delay
    between attempts, and the result is added to the cache. Text longer than
    4500 characters is translated in chunks that are split at whitespace and
    re-joined with single spaces.

    Args:
        text: Source text.
        cache_manager: Cache used for lookup and for storing new translations.
        source: Source language code.
        target: Target language code.
        max_retries: Maximum number of API attempts.
        logger: Optional object providing ``log``, ``log_warning`` and
            ``log_error``.

    Returns:
        (translation, was_cached). Empty input yields ``("", True)``; when
        every attempt fails the result is ``(None, False)``.
    """
    import time  # function-scope import: the module does not import it at file level

    if not text or len(text.strip()) == 0:
        return "", True

    # Check cache first
    cached_translation = cache_manager.get_translation(text, log_decision=True)
    if cached_translation is not None:
        if logger:
            logger.log(f"  ‚îî‚îÄ ‚úì Translation retrieved from cache")
        return cached_translation, True

    # Not in cache, translate via API
    translator = GoogleTranslator(source=source, target=target)

    for attempt in range(max_retries):
        try:
            if len(text) > 4500:
                translated_chunks = []
                for chunk in _split_text_for_translation(text, 4500):
                    translated_chunks.append(translator.translate(chunk))
                    time.sleep(0.5)  # Throttle API calls

                translation = ' '.join(translated_chunks)
            else:
                translation = translator.translate(text)

            # Add to cache
            cache_manager.add_translation(text, translation, source, target)

            if logger:
                logger.log(f"  ‚îî‚îÄ ‚úì New translation via API (cached for future)")

            return translation, False

        except Exception as e:
            if attempt < max_retries - 1:
                if logger:
                    logger.log_warning(f"Translation attempt {attempt + 1} failed: {str(e)}. Retrying...")
                time.sleep(1 * (attempt + 1))
            else:
                if logger:
                    logger.log_error(f"Translation failed after {max_retries} attempts: {str(e)}")
                return None, False

    # Only reached when max_retries is zero or negative.
    return None, False


# ==================== CONFIGURATION ====================
class RLTConfig:
    """
    Configuration for the RLT dataset pipeline.

    Every instance starts from a deep copy of ``DEFAULT_CONFIG`` (so instances
    never share nested dictionaries with the class or with each other) and
    can be overridden from a YAML file. Values are read with dotted key paths
    through ``get()``.
    """

    DEFAULT_CONFIG = {
        'dataset': {
            'name': 'RLT',
            'language': 'English',
            'description': 'Real-Life Truth/Lie Dataset'
        },
        'audio': {
            'sampling_rate': 16000,
            'librosa_params': {
                'n_fft': 2048,
                'hop_length': 512,
                'win_length': 2048,
                'window': 'hann',
                'center': True,
                'pad_mode': 'constant'
            },
            'enhancement': {
                'extreme_boost_threshold': -55,
                'max_boost': 40,
                'target_dbfs': -18,
                'silence_threshold': -50,
                'min_pause_duration': 200,
                'compression_ratio': 6.0,
                'short_speech_threshold': 2000
            }
        },
        'video': {
            'sample_rate': 1,
            'zoom_levels': [1.5, 2.0, 2.5, 3.0, 4.0, 5.0, 6.0, 8.0, 10.0],
            'preprocessing': {
                'clahe_clip_limit': 4.0,
                'clahe_tile_size': (4, 4),
                'gamma_correction': 1.4,
                'unsharp_strength': 1.8
            }
        },
        'mediapipe': {
            'face_mesh': {
                'static_image_mode': False,
                'max_num_faces': 1,
                'refine_landmarks': True,
                'min_detection_confidence': 0.2,
                'min_tracking_confidence': 0.2
            },
            'pose': {
                'static_image_mode': False,
                'model_complexity': 1,
                'min_detection_confidence': 0.3,
                'min_tracking_confidence': 0.3
            }
        },
        'text': {
            'normalize_numbers': True,
            'extract_number_features': True,
            'use_pretranscribed': True,
            'translate_to_indonesian': True
        },
        'output': {
            'generate_figures': True,
            'generate_markdown_report': True,
            'generate_data_dictionary': True,
            'figure_dpi': 300,
            'csv_encoding': 'utf-8'
        },
        'reproducibility': {
            'capture_environment': True,
            'save_run_manifest': True,
            'validate_landmark_counts': True
        }
    }

    def __init__(self, config_path: Optional[str] = None):
        """Start from the defaults and, if *config_path* exists, merge it in."""
        # Deep copy (not shallow) so nested dictionaries are not shared.
        self.config = copy.deepcopy(self.DEFAULT_CONFIG)

        if config_path:
            if os.path.exists(config_path):
                self.load_from_file(config_path)
            else:
                # Make an ignored path visible instead of silently running
                # with defaults.
                logging.warning(f"Config file not found: {config_path}. Using defaults.")

    def load_from_file(self, config_path: str):
        """
        Merge the YAML mapping stored at *config_path* into the configuration.

        An empty file is treated as "no overrides". A file that cannot be
        read or parsed, or whose top level is not a mapping, is reported with
        a warning and leaves the current configuration untouched.
        """
        try:
            with open(config_path, 'r', encoding='utf-8') as f:
                user_config = yaml.safe_load(f)
            if user_config is None:
                # safe_load returns None for an empty document.
                user_config = {}
            if not isinstance(user_config, dict):
                raise ValueError("top-level YAML value must be a mapping")
            self._merge_config(self.config, user_config)
            logging.info(f"‚úì Configuration loaded from: {config_path}")
        except (OSError, ValueError, yaml.YAMLError) as e:
            logging.warning(f"Failed to load config: {str(e)}. Using defaults.")

    def _merge_config(self, base: dict, update: dict):
        """Recursively merge *update* into *base* in place.

        Nested dictionaries are merged key by key; any other value replaces
        the value in *base*.
        """
        for key, value in update.items():
            if key in base and isinstance(base[key], dict) and isinstance(value, dict):
                self._merge_config(base[key], value)
            else:
                base[key] = value

    def save_to_file(self, config_path: str):
        """Write the current configuration to *config_path* as YAML.

        ``safe_dump`` is used so that the output contains only plain YAML
        (tuples are written as sequences) and can be read back by the
        ``safe_load`` call in ``load_from_file``.
        """
        with open(config_path, 'w', encoding='utf-8') as f:
            yaml.safe_dump(self.config, f, default_flow_style=False)
        logging.info(f"‚úì Configuration saved to: {config_path}")

    def get(self, key_path: str, default=None):
        """Return the value at a dotted *key_path* (e.g. ``'audio.sampling_rate'``).

        *default* is returned when any component of the path is missing or
        when the path descends into a non-dictionary value.
        """
        keys = key_path.split('.')
        value = self.config
        for key in keys:
            if isinstance(value, dict) and key in value:
                value = value[key]
            else:
                return default
        return value


# ==================== PATH MANAGER ====================
class RLTPathManager:
    """Central registry of input and output directories for the RLT dataset."""

    def __init__(self, base_dir: str):
        """Build the directory map below ``<base_dir>/dataset``; creates nothing on disk."""
        self.base_dir = Path(base_dir)
        self.dataset_name = 'RLT'

        # Shared prefixes, named once instead of being spelled out per entry.
        root = self.base_dir / "dataset"
        raw = root / "raw" / "RLT"
        clips = raw / "clips"
        transcripts = raw / "Transcription"
        processed = root / "processed" / "RLT"
        audio = processed / "audio"
        wav = audio / "wav"
        enhanced = audio / "enhanced"
        validation = root / "validation" / "RLT"
        figures = root / "figures" / "RLT"

        self.paths = {
            # Source (INPUT)
            'source_raw': raw,
            'source_clips': clips,
            'source_clips_lie': clips / "lie",
            'source_clips_truth': clips / "truth",
            'source_transcription': transcripts,
            'source_transcription_lie': transcripts / "lie",
            'source_transcription_truth': transcripts / "truth",

            # Processed (OUTPUT)
            'processed': processed,
            'audio': audio,
            'audio_wav': wav,
            'audio_wav_lie': wav / "lie",
            'audio_wav_truth': wav / "truth",
            'audio_enhanced': enhanced,
            'audio_enhanced_lie': enhanced / "lie",
            'audio_enhanced_truth': enhanced / "truth",
            'text': processed / "text",
            'visual': processed / "visual",
            'multimodal': processed / "multimodal",
            'metadata': root / "metadata" / "RLT",
            'cache': root / "cache" / "RLT",
            'validation': validation,
            'quality_reports': validation / "quality_reports",
            'statistical': validation / "statistical_analyses",
            'figures': figures,
            'exploratory_figs': figures / "exploratory",
            'logs': root / "_logs" / "RLT",
            'reextraction': processed / "reextraction"
        }

    def create_directories(self):
        """Create every registered directory (including parents) if it is missing."""
        for directory in self.paths.values():
            directory.mkdir(parents=True, exist_ok=True)
        print("‚úì Directory structure created for RLT")

    def get_log_path(self) -> Tuple[Path, str]:
        """Return ``(log file path, timestamp)`` for a new extraction log.

        The timestamp has the form ``YYYYmmdd_HHMMSS`` and is embedded in the
        file name ``extraction_<timestamp>.log`` under the logs directory.
        """
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        return self.paths['logs'] / f"extraction_{stamp}.log", stamp


# ==================== LOGGER ====================
class RLTLogger:
    """
    Logger for the RLT dataset pipeline.

    Creating an instance reconfigures the root logger: all handlers that are
    currently attached are detached and closed, then a UTF-8 file handler for
    ``log_path`` and a stream handler are installed at INFO level.
    """

    def __init__(self, log_path: Path):
        self.log_path = log_path
        self.start_time = datetime.now()

        root_logger = logging.getLogger()
        for handler in root_logger.handlers[:]:
            root_logger.removeHandler(handler)
            # Close as well as detach, so a previous run's log file handle is
            # released instead of staying open for the life of the process.
            handler.close()

        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - [RLT v8.1] - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler(log_path, encoding='utf-8'),
                logging.StreamHandler()
            ]
        )

    def log(self, message: str):
        """Log *message* at INFO level."""
        logging.info(message)

    def log_error(self, message: str):
        """Log *message* at ERROR level."""
        logging.error(message)

    def log_warning(self, message: str):
        """Log *message* at WARNING level."""
        logging.warning(message)

    def finalize(self):
        """Log a separator line, the finish time and the total run duration."""
        end_time = datetime.now()
        duration = end_time - self.start_time
        logging.info("\n" + "="*70)
        logging.info(f"Finished at: {end_time.strftime('%Y-%m-%d %H:%M:%S')}")
        logging.info(f"Total duration: {duration}")


# ==================== VIDEO PREPROCESSOR ====================
class VideoPreprocessor:
    """
    Video frame preprocessing for RLT.

    Provides a centre zoom, a localized eye-region enhancement based on Haar
    cascade detections, and a multi-step enhancement pipeline. Frames are
    converted with BGR colour codes, so BGR input is expected.
    """

    def __init__(self, config: RLTConfig):
        self.config = config
        # Haar cascades bundled with OpenCV for frontal faces and eyes.
        self.face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
        self.eye_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_eye.xml')

    def zoom_frame(self, frame: np.ndarray, zoom_factor: float = 1.5) -> np.ndarray:
        """Crop the centre of *frame* by *zoom_factor* and resize it back.

        The crop window is ``1 / zoom_factor`` of the frame in each dimension,
        clamped to the frame, and is scaled back to the original width and
        height with bicubic interpolation.
        """
        height, width = frame.shape[:2]
        center_x, center_y = width // 2, height // 2

        crop_width = int(width / zoom_factor)
        crop_height = int(height / zoom_factor)

        x1 = max(0, center_x - crop_width // 2)
        y1 = max(0, center_y - crop_height // 2)
        x2 = min(width, x1 + crop_width)
        y2 = min(height, y1 + crop_height)

        cropped = frame[y1:y2, x1:x2]
        zoomed = cv2.resize(cropped, (width, height), interpolation=cv2.INTER_CUBIC)

        return zoomed

    def enhance_eye_region_for_iris(self, frame: np.ndarray) -> Tuple[np.ndarray, Optional[List]]:
        """Enhance contrast and sharpness around every detected eye.

        Faces are detected on the grayscale frame; eyes are searched in the
        upper 60% of each face box. Each eye box is expanded by 30% per side
        (clamped to the search region), CLAHE is applied to its L channel in
        LAB space, and the patch is sharpened, brightened and written back.

        Returns:
            ``(frame, None)`` with the untouched input when no face is found,
            otherwise ``(enhanced copy, eye_regions)`` where each region is an
            ``(x, y, w, h)`` tuple in full-frame coordinates.
        """
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        faces = self.face_cascade.detectMultiScale(gray, 1.3, 5)

        if len(faces) == 0:
            return frame, None

        result = frame.copy()
        eye_regions = []

        # Loop-invariant settings are read once. The tile size is normalized
        # to a tuple because a value loaded from YAML arrives as a list.
        # NOTE(review): some OpenCV builds appear to reject a list for
        # tileGridSize - confirm; the tuple form is accepted either way.
        clip_limit = self.config.get('video.preprocessing.clahe_clip_limit', 4.0)
        tile_size = tuple(self.config.get('video.preprocessing.clahe_tile_size', (4, 4)))
        clahe = cv2.createCLAHE(clipLimit=clip_limit, tileGridSize=tile_size)

        # 3x3 sharpening kernel; its weights sum to 2, so it also brightens.
        sharpen_kernel = np.array([[-1, -1, -1], [-1, 10, -1], [-1, -1, -1]])

        for (x, y, w, h) in faces:
            roi_y_start = y
            roi_y_end = y + int(h * 0.6)
            roi_gray = gray[roi_y_start:roi_y_end, x:x+w]
            # View into ``result``: writes below modify the returned frame.
            roi_color = result[roi_y_start:roi_y_end, x:x+w]

            eyes = self.eye_cascade.detectMultiScale(roi_gray, 1.1, 5)

            for (ex, ey, ew, eh) in eyes:
                expansion = 0.3
                ex_exp = max(0, int(ex - ew * expansion))
                ey_exp = max(0, int(ey - eh * expansion))
                ew_exp = min(roi_color.shape[1] - ex_exp, int(ew * (1 + 2 * expansion)))
                eh_exp = min(roi_color.shape[0] - ey_exp, int(eh * (1 + 2 * expansion)))

                eye_roi = roi_color[ey_exp:ey_exp+eh_exp, ex_exp:ex_exp+ew_exp]

                if eye_roi.size == 0:
                    continue

                lab = cv2.cvtColor(eye_roi, cv2.COLOR_BGR2LAB)
                l, a, b = cv2.split(lab)

                l_enhanced = clahe.apply(l)
                lab_enhanced = cv2.merge([l_enhanced, a, b])
                eye_enhanced = cv2.cvtColor(lab_enhanced, cv2.COLOR_LAB2BGR)

                eye_enhanced = cv2.filter2D(eye_enhanced, -1, sharpen_kernel)
                eye_enhanced = cv2.convertScaleAbs(eye_enhanced, alpha=1.2, beta=15)

                roi_color[ey_exp:ey_exp+eh_exp, ex_exp:ex_exp+ew_exp] = eye_enhanced
                eye_regions.append((x + ex_exp, roi_y_start + ey_exp, ew_exp, eh_exp))

        return result, eye_regions

    def preprocess_pipeline_ultra(self, frame: np.ndarray) -> np.ndarray:
        """Run the full enhancement pipeline on one frame and return the result.

        Steps, in order: bilateral denoise, brightness-dependent gain/offset,
        eye-region enhancement, gamma correction, 3x3 sharpening, unsharp
        mask, and a final CLAHE pass on the L channel.
        """
        # Step 1: Denoise
        frame = cv2.bilateralFilter(frame, d=9, sigmaColor=75, sigmaSpace=75)

        # Step 2: Adaptive brightness - gain (alpha) and offset (beta) are
        # chosen from the mean grayscale brightness of the frame.
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        mean_brightness = np.mean(gray)

        if mean_brightness < 50:
            alpha, beta = 2.5, 80
        elif mean_brightness < 100:
            alpha, beta = 1.8, 50
        elif mean_brightness < 120:
            alpha, beta = 1.3, 20
        elif mean_brightness > 180:
            alpha, beta = 0.8, -20
        else:
            alpha, beta = 1.0, 0

        frame = cv2.convertScaleAbs(frame, alpha=alpha, beta=beta)

        # Step 3: Enhance eye region for iris (the region list is not needed here)
        frame, _ = self.enhance_eye_region_for_iris(frame)

        # Step 4: Gamma correction through a 256-entry lookup table
        gamma = self.config.get('video.preprocessing.gamma_correction', 1.4)
        inv_gamma = 1.0 / gamma
        table = np.array([((i / 255.0) ** inv_gamma) * 255 for i in np.arange(0, 256)]).astype("uint8")
        frame = cv2.LUT(frame, table)

        # Step 5: Sharpen
        kernel = np.array([[-1, -1, -1], [-1, 9, -1], [-1, -1, -1]])
        frame = cv2.filter2D(frame, -1, kernel)

        # Step 6: Unsharp mask: frame * (1 + strength) - blurred * strength
        strength = self.config.get('video.preprocessing.unsharp_strength', 1.8)
        blurred = cv2.GaussianBlur(frame, (0, 0), 1.5)
        frame = cv2.addWeighted(frame, 1.0 + strength, blurred, -strength, 0)

        # Step 7: Final CLAHE on the L channel (fixed parameters, not from config)
        lab = cv2.cvtColor(frame, cv2.COLOR_BGR2LAB)
        l, a, b = cv2.split(lab)
        clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
        l_enhanced = clahe.apply(l)
        lab_enhanced = cv2.merge([l_enhanced, a, b])
        frame = cv2.cvtColor(lab_enhanced, cv2.COLOR_LAB2BGR)

        return frame


# ==================== LANDMARK EXTRACTOR ====================
class LandmarkExtractor:
    """
    Face-landmark extraction for RLT with landmark-count validation.

    Each detection attempt can be checked against the expected landmark
    totals (478 face landmarks with refine_landmarks=True, 33 pose
    landmarks). Mismatches are counted, logged immediately, and stored in
    ``mismatch_details`` together with the MediaPipe version so they can be
    exported for review.
    """

    # Face-mesh indices that are counted as iris landmarks.
    _IRIS_INDICES = range(468, 478)
    # Zoom factors retried with extreme brightening when iris coverage is poor.
    _EXTREME_ZOOM_LEVELS = (3.0, 5.0, 8.0)

    def __init__(self, config: RLTConfig, preprocessor: VideoPreprocessor, logger: RLTLogger):
        """
        Store collaborators and initialise the validation bookkeeping.

        Args:
            config: Configuration object queried through ``config.get(key, default)``.
            preprocessor: Provides ``zoom_frame`` and ``preprocess_pipeline_ultra``.
            logger: Receives ``log`` and ``log_warning`` messages.
        """
        self.config = config
        self.preprocessor = preprocessor
        self.logger = logger

        # Landmark count validation
        self.expected_face_landmarks = 478  # with refine_landmarks=True
        self.expected_pose_landmarks = 33
        self.landmark_warnings = {
            'face_count_mismatch': 0,
            'pose_count_mismatch': 0
        }

        # One dict per detected mismatch (see validate_landmark_count)
        self.mismatch_details = []

        # Recorded so every mismatch can be tied to the library version
        self.mediapipe_version = mp.__version__

    def validate_landmark_count(self, landmarks_dict: Dict, expected_count: int,
                                landmark_type: str, filename: str = '', frame: int = 0) -> bool:
        """
        Check that ``landmarks_dict`` holds exactly ``expected_count`` entries.

        On a mismatch the per-type counter is incremented, a detail record is
        appended to ``mismatch_details`` and a warning is logged.

        Args:
            landmarks_dict: Mapping of landmark index to coordinates.
            expected_count: Number of landmarks that should be present.
            landmark_type: Counter prefix, e.g. ``'face'`` or ``'pose'``.
            filename: Source file name, used only for logging.
            frame: Frame number, used only for logging.

        Returns:
            True when the count matches, False otherwise.
        """
        actual_count = len(landmarks_dict)
        if actual_count == expected_count:
            return True

        counter_key = f'{landmark_type}_count_mismatch'
        # .get() keeps an unexpected landmark_type from raising KeyError
        self.landmark_warnings[counter_key] = self.landmark_warnings.get(counter_key, 0) + 1

        self.mismatch_details.append({
            'filename': filename,
            'frame': frame,
            'landmark_type': landmark_type,
            'expected': expected_count,
            'actual': actual_count,
            'mediapipe_version': self.mediapipe_version,
            'timestamp': datetime.now().isoformat()
        })

        self.logger.log_warning(
            f"Landmark count mismatch - {filename} frame {frame}: "
            f"{landmark_type} expected {expected_count}, got {actual_count} "
            f"(MediaPipe v{self.mediapipe_version})"
        )
        return False

    def _prepare_candidate_frame(self, frame: np.ndarray, zoom: float,
                                 extreme_brightness: bool) -> np.ndarray:
        """Return the zoomed (and optionally strongly brightened) frame for one attempt."""
        if extreme_brightness:
            zoomed = self.preprocessor.zoom_frame(frame.copy(), zoom)
            return cv2.convertScaleAbs(zoomed, alpha=2.5, beta=80)
        if zoom == 1.0:
            return frame
        return self.preprocessor.zoom_frame(frame.copy(), zoom)

    def _detect_face_landmarks(self, candidate: np.ndarray, face_mesh,
                               filename: str, frame_num: int) -> Optional[Dict]:
        """Preprocess one candidate frame, run the face mesh and return its landmarks (or None)."""
        processed = self.preprocessor.preprocess_pipeline_ultra(candidate)
        results = face_mesh.process(cv2.cvtColor(processed, cv2.COLOR_BGR2RGB))
        if not results.multi_face_landmarks:
            return None

        landmarks = {}
        # When several faces are returned, later faces overwrite earlier ones
        # because the dict is keyed by landmark index only.
        for face_landmarks in results.multi_face_landmarks:
            for idx, landmark in enumerate(face_landmarks.landmark):
                landmarks[idx] = [landmark.x, landmark.y, landmark.z]

        if self.config.get('reproducibility.validate_landmark_counts', True):
            self.validate_landmark_count(
                landmarks,
                self.expected_face_landmarks,
                'face',
                filename,
                frame_num
            )
        return landmarks

    def _search_zoom_levels(self, frame: np.ndarray, face_mesh, zoom_levels,
                            strategy_suffix: str, extreme_brightness: bool,
                            best: Tuple[Optional[Dict], int, Optional[str]],
                            filename: str, frame_num: int
                            ) -> Tuple[Optional[Dict], int, Optional[str], bool]:
        """Try each zoom level; return the updated best triple plus a 'good enough' flag."""
        best_result, best_iris_count, best_strategy = best

        for zoom in zoom_levels:
            strategy = f"zoom_{zoom}x_{strategy_suffix}"
            try:
                candidate = self._prepare_candidate_frame(frame, zoom, extreme_brightness)
                landmarks = self._detect_face_landmarks(candidate, face_mesh, filename, frame_num)
            except Exception as exc:
                # Best effort: one failed attempt must not abort the search,
                # but it is reported instead of being dropped silently.
                self.logger.log_warning(
                    f"Landmark extraction attempt failed - {filename} frame {frame_num} "
                    f"({strategy}): {exc}"
                )
                continue

            if landmarks is None:
                continue

            iris_count = sum(1 for i in self._IRIS_INDICES if i in landmarks)
            if iris_count > best_iris_count:
                best_result = landmarks
                best_iris_count = iris_count
                best_strategy = strategy

            # 9 of the 10 iris indices is treated as good enough to stop early
            if iris_count >= 9:
                return landmarks, iris_count, best_strategy, True

        return best_result, best_iris_count, best_strategy, False

    def extract_with_multi_strategy(self, frame: np.ndarray, face_mesh,
                                    iris_focus: bool = False,
                                    filename: str = '', frame_num: int = 0) -> Tuple[Optional[Dict], int, Optional[str]]:
        """
        Extract face landmarks, retrying with several zoom/brightness strategies.

        Strategy 1 runs the ultra preprocessing pipeline at every configured
        zoom level. Strategy 2 (only when ``iris_focus`` is set and fewer than
        8 iris landmarks were found) retries a fixed set of zoom levels on a
        strongly brightened frame. The search stops as soon as an attempt
        yields at least 9 iris landmarks.

        Args:
            frame: Input frame; converted with BGR colour codes.
            face_mesh: Object exposing ``process(rgb_frame)`` whose result has
                a ``multi_face_landmarks`` attribute.
            iris_focus: Enables the second (extreme brightness) strategy.
            filename: Source file name, used for logging only.
            frame_num: Frame number, used for logging only.

        Returns:
            ``(landmarks, iris_count, strategy_name)`` for the best attempt;
            ``(None, 0, None)`` when no attempt detected a face.
        """
        zoom_levels = self.config.get('video.zoom_levels', [1.5, 2.0, 2.5, 3.0, 4.0, 5.0, 6.0, 8.0, 10.0])

        # Strategy 1: Ultra preprocessing + multi-level zoom
        result, iris_count, strategy, done = self._search_zoom_levels(
            frame, face_mesh, zoom_levels, 'ultra', False,
            (None, 0, None), filename, frame_num
        )
        if done:
            return result, iris_count, strategy

        # Strategy 2: Extreme brightness + zoom
        if iris_focus and iris_count < 8:
            result, iris_count, strategy, _ = self._search_zoom_levels(
                frame, face_mesh, self._EXTREME_ZOOM_LEVELS, 'extreme', True,
                (result, iris_count, strategy), filename, frame_num
            )

        return result, iris_count, strategy

    def get_warning_summary(self) -> Dict:
        """Return mismatch counters, their total, the detail count and the MediaPipe version."""
        return {
            'warnings': self.landmark_warnings.copy(),
            'mediapipe_version': self.mediapipe_version,
            'total_mismatches': sum(self.landmark_warnings.values()),
            'mismatch_details_count': len(self.mismatch_details)
        }

    def save_mismatch_details(self, output_path: Path) -> Optional[Path]:
        """
        Write the collected mismatch records to ``landmark_mismatch_details.csv``.

        Args:
            output_path: Directory in which the CSV file is created.

        Returns:
            Path of the written CSV, or None when no mismatch was recorded.
        """
        if not self.mismatch_details:
            return None

        mismatch_path = output_path / 'landmark_mismatch_details.csv'
        pd.DataFrame(self.mismatch_details).to_csv(mismatch_path, index=False, encoding='utf-8')
        self.logger.log(f"‚úì Landmark mismatch details saved: {mismatch_path}")
        return mismatch_path


# ==================== PAUSE ANALYZER ====================
def _detect_pauses(audio, silence_threshold: float, min_pause_duration: float,
                   chunk_length_ms: int = 50) -> List[Dict]:
    """
    Scan ``audio`` in fixed-size chunks and return the silent runs found.

    A chunk is silent when its ``dBFS`` is below ``silence_threshold``.
    Consecutive silent chunks form a run; runs shorter than
    ``min_pause_duration`` (milliseconds) are discarded.

    Args:
        audio: Object supporting ``len()``, slicing and a ``dBFS`` attribute on
            slices (a pydub ``AudioSegment`` in this module).
        silence_threshold: Loudness limit in dBFS below which a chunk is silent.
        min_pause_duration: Minimum run length, in milliseconds, to keep.
        chunk_length_ms: Scan step in milliseconds.

    Returns:
        List of dicts with ``start``, ``duration`` and ``end`` in milliseconds.
    """
    pauses: List[Dict] = []
    run_start = None
    run_length = 0

    def close_run():
        # Keep the current run only when it is long enough to count as a pause
        if run_start is not None and run_length >= min_pause_duration:
            pauses.append({
                'start': run_start,
                'duration': run_length,
                'end': run_start + run_length
            })

    for offset in range(0, len(audio), chunk_length_ms):
        chunk = audio[offset:offset + chunk_length_ms]
        if len(chunk) == 0:
            continue

        if chunk.dBFS < silence_threshold:
            if run_start is None:
                run_start = offset
            # Use the real chunk length: the last chunk may be shorter than
            # chunk_length_ms, and counting it in full would let the total
            # pause time exceed the audio duration.
            run_length += len(chunk)
        else:
            close_run()
            run_start = None
            run_length = 0

    close_run()  # a pause that lasts until the end of the audio
    return pauses


def extract_pause_silence_features(audio_path: Path, config: RLTConfig, logger: RLTLogger) -> Dict:
    """
    Extract pause and silence features from a WAV file.

    The audio is scanned in 50 ms chunks; silent runs of at least
    ``audio.enhancement.min_pause_duration`` milliseconds (default 200) below
    ``audio.enhancement.silence_threshold`` dBFS (default -50) are pauses.

    Args:
        audio_path: Path of the WAV file.
        config: Configuration object queried through ``config.get(key, default)``.
        logger: Receives progress (``log``) and failure (``log_error``) messages.

    Returns:
        Dict with 17 entries: pause counts, durations in seconds, ratios, the
        per-third pause distribution, the hesitation score and three boolean
        flags. On any failure the error is logged and the same keys are
        returned with zero/False values.
    """
    try:
        y, sr = librosa.load(str(audio_path), sr=16000)
        duration = len(y) / sr

        audio = AudioSegment.from_wav(str(audio_path))

        silence_threshold = config.get('audio.enhancement.silence_threshold', -50)
        min_pause_duration = config.get('audio.enhancement.min_pause_duration', 200)

        pauses = _detect_pauses(audio, silence_threshold, min_pause_duration, chunk_length_ms=50)

        num_pauses = len(pauses)
        total_pause_duration = sum(p['duration'] for p in pauses) / 1000

        if num_pauses > 0:
            avg_pause_duration = total_pause_duration / num_pauses
            longest_pause = max(p['duration'] for p in pauses) / 1000
            pause_frequency = num_pauses / duration if duration > 0 else 0

            # Distribution of pause starts over the first/middle/last third
            first_cut = len(audio) * 0.33
            second_cut = len(audio) * 0.67
            early_pauses = sum(1 for p in pauses if p['start'] < first_cut)
            middle_pauses = sum(1 for p in pauses if first_cut <= p['start'] < second_cut)
            late_pauses = sum(1 for p in pauses if p['start'] >= second_cut)
        else:
            avg_pause_duration = 0
            longest_pause = 0
            pause_frequency = 0
            early_pauses = 0
            middle_pauses = 0
            late_pauses = 0

        # The two durations come from different decoders (librosa vs pydub) and
        # may differ by a rounding step, so keep the results inside their ranges.
        speech_duration = max(duration - total_pause_duration, 0.0)
        speech_rate = speech_duration / duration if duration > 0 else 0
        pause_ratio = min(total_pause_duration / duration, 1.0) if duration > 0 else 0

        # Weighted sum of four terms, each limited to [0, 1], so the score
        # itself stays within [0, 1].
        hesitation_score = (
            0.3 * pause_ratio +
            0.3 * min(pause_frequency / 2, 1.0) +
            0.2 * min(longest_pause / 2, 1.0) +
            0.2 * min(num_pauses / 10, 1.0)
        )

        if num_pauses > 1:
            pause_durations = [p['duration'] / 1000 for p in pauses]
            pause_std = np.std(pause_durations)
            pause_variability = pause_std / avg_pause_duration if avg_pause_duration > 0 else 0
        else:
            pause_std = 0
            pause_variability = 0

        features = {
            'num_pauses': num_pauses,
            'total_pause_duration': float(total_pause_duration),
            'avg_pause_duration': float(avg_pause_duration),
            'longest_pause_duration': float(longest_pause),
            'pause_frequency': float(pause_frequency),
            'speech_duration': float(speech_duration),
            'speech_rate': float(speech_rate),
            'pause_ratio': float(pause_ratio),
            'early_pauses': int(early_pauses),
            'middle_pauses': int(middle_pauses),
            'late_pauses': int(late_pauses),
            'pause_std': float(pause_std),
            'pause_variability': float(pause_variability),
            'hesitation_score': float(hesitation_score),
            'has_long_pauses': longest_pause > 1.5,
            'has_frequent_pauses': pause_frequency > 1.0,
            'has_high_pause_ratio': pause_ratio > 0.3
        }

        logger.log(f"‚úì Pause features extracted")
        logger.log(f"  ‚îî‚îÄ Pauses: {num_pauses}, Hesitation: {hesitation_score:.3f}")

        return features

    except Exception as e:
        logger.log_error(f"Failed to extract pause features: {str(e)}")
        return {
            'num_pauses': 0, 'total_pause_duration': 0, 'avg_pause_duration': 0,
            'longest_pause_duration': 0, 'pause_frequency': 0, 'speech_duration': 0,
            'speech_rate': 0, 'pause_ratio': 0, 'early_pauses': 0, 'middle_pauses': 0,
            'late_pauses': 0, 'pause_std': 0, 'pause_variability': 0, 'hesitation_score': 0,
            'has_long_pauses': False, 'has_frequent_pauses': False, 'has_high_pause_ratio': False
        }


# ==================== AUDIO FEATURES ====================
def extract_audio_features(audio_path: Path, config: RLTConfig, logger: RLTLogger) -> Optional[np.ndarray]:
    """
    Extract the 94-value audio feature vector of one file.

    The audio is loaded at 16 kHz and every frame-based feature is computed
    with the framing parameters from ``audio.librosa_params`` so that results
    do not depend on library defaults.

    Vector layout (in order):
        13 MFCC means, 13 MFCC stds,
        13 delta-MFCC means, 13 delta-MFCC stds,
        13 delta2-MFCC means, 13 delta2-MFCC stds,
        mean/std of the per-frame mel energy, of its delta and of its delta2,
        mean/std of spectral centroid, bandwidth and rolloff,
        mean/std of the zero crossing rate,
        mean/std of the per-pitch-class chroma averages.

    Args:
        audio_path: Path of the audio file.
        config: Configuration object queried through ``config.get(key, default)``.
        logger: Receives progress (``log``) and failure (``log_error``) messages.

    Returns:
        1-D array with 94 values, or None when extraction fails (the error is
        logged).
    """
    try:
        # Get librosa parameters from config
        librosa_params = config.get('audio.librosa_params', {})
        n_fft = librosa_params.get('n_fft', 2048)
        hop_length = librosa_params.get('hop_length', 512)
        win_length = librosa_params.get('win_length', 2048)
        window = librosa_params.get('window', 'hann')
        center = librosa_params.get('center', True)

        # Shared framing arguments of all STFT-based features
        stft_kwargs = dict(n_fft=n_fft, hop_length=hop_length,
                           win_length=win_length, window=window, center=center)

        y, sr = librosa.load(str(audio_path), sr=16000)

        # MFCC (13 coefficients) with first and second order deltas
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13, **stft_kwargs)
        delta_mfcc = librosa.feature.delta(mfcc)
        delta2_mfcc = librosa.feature.delta(mfcc, order=2)

        # Mel spectrogram for bark energy
        mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=24, **stft_kwargs)

        # Bark energy: total energy per time frame (sum over the mel bands)
        bark_energy = np.sum(mel_spec, axis=0)

        # Deltas are taken on the full mel spectrogram
        delta_energy = librosa.feature.delta(mel_spec)
        delta2_energy = librosa.feature.delta(mel_spec, order=2)

        # Spectral features
        spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr, **stft_kwargs)[0]
        spectral_bandwidth = librosa.feature.spectral_bandwidth(y=y, sr=sr, **stft_kwargs)[0]
        spectral_rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr, **stft_kwargs)[0]

        # Zero crossing rate, framed like the STFT features instead of relying
        # on the library defaults (identical at n_fft=2048 / hop_length=512).
        zcr = librosa.feature.zero_crossing_rate(
            y, frame_length=n_fft, hop_length=hop_length, center=center
        )[0]

        # Chroma, averaged over time for each pitch class
        chroma = librosa.feature.chroma_stft(y=y, sr=sr, **stft_kwargs)
        chroma_profile = np.mean(chroma, axis=1)

        # Concatenate all features
        features = np.concatenate([
            np.mean(mfcc, axis=1), np.std(mfcc, axis=1),                # 26 features
            np.mean(delta_mfcc, axis=1), np.std(delta_mfcc, axis=1),    # 26 features
            np.mean(delta2_mfcc, axis=1), np.std(delta2_mfcc, axis=1),  # 26 features
            [np.mean(bark_energy), np.std(bark_energy)],                # 2 features
            [np.mean(delta_energy), np.std(delta_energy)],              # 2 features
            [np.mean(delta2_energy), np.std(delta2_energy)],            # 2 features
            [np.mean(spectral_centroid), np.std(spectral_centroid)],    # 2 features
            [np.mean(spectral_bandwidth), np.std(spectral_bandwidth)],  # 2 features
            [np.mean(spectral_rolloff), np.std(spectral_rolloff)],      # 2 features
            [np.mean(zcr), np.std(zcr)],                                # 2 features
            [np.mean(chroma_profile), np.std(chroma_profile)]           # 2 features
        ])

        # Explicit check (not assert) so it also runs under `python -O`
        if len(features) != 94:
            raise ValueError(f"Expected 94 features, got {len(features)}")

        logger.log(f"‚úì Audio features extracted: {len(features)} features")
        return features

    except Exception as e:
        logger.log_error(f"Failed to extract audio features: {str(e)}")
        return None


def create_audio_feature_names() -> List[str]:
    """
    Create the column names of the 94 audio features.

    The order mirrors the vector built by ``extract_audio_features``: for
    each MFCC group (static, delta, delta2) all 13 means come first and all
    13 standard deviations follow, because the extractor concatenates the
    whole mean vector and then the whole std vector. Interleaving mean/std
    per coefficient would attach the wrong label to every MFCC column.

    Returns:
        List of 94 unique column names.
    """
    feature_names: List[str] = []

    # MFCC, delta MFCC, delta2 MFCC (each 13 means + 13 stds = 26)
    for prefix in ('mfcc', 'delta_mfcc', 'delta2_mfcc'):
        feature_names.extend(f'{prefix}{i}_mean' for i in range(1, 14))
        feature_names.extend(f'{prefix}{i}_std' for i in range(1, 14))

    # Bark energy (6)
    feature_names.extend([
        'bark_energy_mean', 'bark_energy_std',
        'delta_energy_mean', 'delta_energy_std',
        'delta2_energy_mean', 'delta2_energy_std'
    ])

    # Spectral features (6)
    feature_names.extend([
        'spectral_centroid_mean', 'spectral_centroid_std',
        'spectral_bandwidth_mean', 'spectral_bandwidth_std',
        'spectral_rolloff_mean', 'spectral_rolloff_std'
    ])

    # ZCR (2)
    feature_names.extend(['zcr_mean', 'zcr_std'])

    # Chroma (2)
    feature_names.extend(['chroma_mean', 'chroma_std'])

    # Internal invariant: the name list must match the feature vector length
    assert len(feature_names) == 94, f"Expected 94 feature names, got {len(feature_names)}"

    return feature_names


# ==================== AUDIO QUALITY ====================
def check_audio_quality(audio_path: Path, config: RLTConfig, logger: RLTLogger) -> Dict:
    """
    Measure basic quality indicators of a WAV file and combine them into a score.

    All frame-based measurements (RMS, zero crossing rate, spectral centroid,
    STFT) use ``n_fft`` and ``hop_length`` from ``audio.librosa_params`` so
    that they share one framing and do not depend on library defaults.

    Args:
        audio_path: Path of the WAV file.
        config: Configuration object queried through ``config.get(key, default)``.
        logger: Receives failure messages through ``log_error``.

    Returns:
        Dict with the file's channel count, sample width and frame rate, the
        measured indicators and ``quality_score`` (weighted sum of six
        sub-scores, each limited to at most 1.0). On any failure the error is
        logged and the same keys are returned with zero values.
    """
    try:
        y, sr = librosa.load(str(audio_path), sr=16000)

        # Get librosa parameters
        librosa_params = config.get('audio.librosa_params', {})
        n_fft = librosa_params.get('n_fft', 2048)
        hop_length = librosa_params.get('hop_length', 512)

        # Explicit framing; identical to the previous implicit defaults when
        # n_fft=2048 and hop_length=512.
        rms = librosa.feature.rms(y=y, frame_length=n_fft, hop_length=hop_length)[0]
        rms_mean = np.mean(rms)
        rms_std = np.std(rms)

        zcr = librosa.feature.zero_crossing_rate(
            y, frame_length=n_fft, hop_length=hop_length
        )[0]
        zcr_mean = np.mean(zcr)

        spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr,
                                                              n_fft=n_fft, hop_length=hop_length)[0]
        sc_mean = np.mean(spectral_centroid)

        # SNR estimate: mean spectral power against the 10th-percentile magnitude
        S = np.abs(librosa.stft(y, n_fft=n_fft, hop_length=hop_length))
        noise_floor = np.percentile(S, 10)
        signal_power = np.mean(S ** 2)
        noise_power = noise_floor ** 2

        if noise_power > 0:
            snr = 10 * np.log10(signal_power / noise_power)
        else:
            # No measurable noise floor: report a fixed 60 dB
            snr = 60

        duration = len(y) / sr
        dynamic_range = np.max(np.abs(y)) - np.min(np.abs(y))

        # Sub-scores, each capped at 1.0
        rms_score = min(rms_mean / 0.1, 1.0)
        snr_score = min(max(snr - 10, 0) / 40, 1.0)
        zcr_score = 1 - min(zcr_mean / 0.2, 1.0)
        sc_score = min(sc_mean / 2000, 1.0)
        duration_score = min(duration / 5.0, 1.0)
        dr_score = min(dynamic_range / 0.5, 1.0)

        quality_score = (
            0.25 * rms_score + 0.25 * snr_score + 0.15 * zcr_score +
            0.15 * sc_score + 0.10 * duration_score + 0.10 * dr_score
        )

        # Container metadata is read from the file itself, not from the
        # resampled signal.
        with wave.open(str(audio_path), 'rb') as wf:
            n_channels = wf.getnchannels()
            sampwidth = wf.getsampwidth()
            framerate = wf.getframerate()

        return {
            'channels': n_channels,
            'sample_width': sampwidth,
            'frame_rate': framerate,
            'duration': duration,
            'rms': float(rms_mean),
            'rms_std': float(rms_std),
            'max_amplitude': float(np.max(np.abs(y))),
            'snr': float(snr),
            'zcr': float(zcr_mean),
            'spectral_centroid': float(sc_mean),
            'dynamic_range': float(dynamic_range),
            'quality_score': float(quality_score)
        }

    except Exception as e:
        logger.log_error(f"Failed to check audio quality: {str(e)}")
        return {
            'channels': 0, 'sample_width': 0, 'frame_rate': 0, 'duration': 0,
            'rms': 0, 'rms_std': 0, 'max_amplitude': 0, 'snr': 0,
            'zcr': 0, 'spectral_centroid': 0, 'dynamic_range': 0, 'quality_score': 0
        }


# ==================== TEXT ANALYSIS ====================
def analyze_text_content(text: str, language: str = 'en') -> Dict:
    """
    Analyze text for simple linguistic features.

    Word statistics are computed on the lower-cased, whitespace-split text.
    Sentiment, subjectivity and the complexity score come from TextBlob; if
    that analysis fails, those three values fall back to 0 while the word
    statistics are still returned.

    Args:
        text: Text to analyze; None, empty or whitespace-only text yields
            all-zero features.
        language: Accepted for interface compatibility; not used by the
            current implementation.

    Returns:
        Dict with ``sentiment``, ``subjectivity``, ``complexity``,
        ``word_count``, ``char_count``, ``avg_word_length``, ``unique_words``
        and ``lexical_diversity``.
    """
    if not text or not text.strip():
        return {
            'sentiment': 0, 'subjectivity': 0, 'complexity': 0, 'word_count': 0,
            'char_count': 0, 'avg_word_length': 0, 'unique_words': 0, 'lexical_diversity': 0
        }

    words = text.lower().split()
    unique_words = set(words)
    word_count = len(words)
    char_count = len(text)

    lexical_diversity = len(unique_words) / word_count if word_count > 0 else 0
    avg_word_length = sum(len(word) for word in words) / word_count if word_count > 0 else 0

    try:
        blob = TextBlob(text)
        sentiment = blob.sentiment.polarity
        subjectivity = blob.sentiment.subjectivity

        sentences = blob.sentences
        avg_sentence_length = sum(len(sentence.words) for sentence in sentences) / len(sentences) if sentences else 0
        # Equal-weight blend of word length and sentence length, scaled by 10
        complexity = (avg_word_length * 0.5 + avg_sentence_length * 0.5) / 10
    except Exception:
        # Best effort: keep the word statistics when the TextBlob analysis
        # fails, but let KeyboardInterrupt/SystemExit propagate.
        sentiment = 0
        subjectivity = 0
        complexity = 0

    return {
        'sentiment': sentiment,
        'subjectivity': subjectivity,
        'complexity': complexity,
        'word_count': word_count,
        'char_count': char_count,
        'avg_word_length': avg_word_length,
        'unique_words': len(unique_words),
        'lexical_diversity': lexical_diversity
    }


# ==================== UTILITY FUNCTIONS ====================
def get_valid_video_files(directory: Path) -> List[str]:
    """
    List the video files found directly inside ``directory``.

    Only regular files with a known video extension (compared
    case-insensitively) are kept; names starting with a dot and common
    operating-system metadata files are ignored.

    Args:
        directory: Folder to scan (not recursive).

    Returns:
        Sorted list of file names; empty when the folder does not exist.
    """
    if not directory.exists():
        return []

    video_suffixes = {'.mp4', '.mov', '.avi', '.mkv', '.flv', '.wmv', '.m4v'}
    system_files = {'.DS_Store', 'Thumbs.db', 'desktop.ini'}

    return sorted(
        entry.name
        for entry in directory.iterdir()
        if entry.is_file()
        and not entry.name.startswith('.')  # hidden files, incl. '._' resource forks
        and entry.name not in system_files
        and entry.suffix.lower() in video_suffixes
    )


def load_transcription(transcription_path: Path, logger: RLTLogger) -> Optional[str]:
    """
    Read a UTF-8 transcription file and return its stripped content.

    Args:
        transcription_path: Path of the text file.
        logger: Receives a ``log`` message on success, ``log_warning`` for a
            missing or empty file and ``log_error`` for any other failure.

    Returns:
        The stripped text, or None when the file is missing, empty after
        stripping, or cannot be read.
    """
    try:
        if not transcription_path.exists():
            logger.log_warning(f"Transcription file not found: {transcription_path}")
            return None

        with open(transcription_path, 'r', encoding='utf-8') as handle:
            content = handle.read().strip()

        # Guard clause: nothing left after stripping whitespace
        if len(content) == 0:
            logger.log_warning(f"Empty transcription file: {transcription_path}")
            return None

        logger.log(f"‚úì Transcription loaded: {len(content)} characters")
        return content

    except Exception as e:
        logger.log_error(f"Failed to load transcription: {str(e)}")
        return None


def convert_numpy_types(obj):
    """
    Recursively convert NumPy values to Python native types.

    NumPy booleans, integers and floats become ``bool``, ``int`` and
    ``float``; arrays become (nested) lists. Dicts, lists and tuples are
    rebuilt with their values converted (dict keys are left unchanged).
    Anything else is returned as is.

    Args:
        obj: Value or container to convert.

    Returns:
        The converted value, safe to pass to ``json.dumps`` as long as the
        remaining objects are JSON serializable.
    """
    if isinstance(obj, np.bool_):
        # np.bool_ is neither np.integer nor a Python bool and is rejected
        # by the json module.
        return bool(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, dict):
        return {key: convert_numpy_types(value) for key, value in obj.items()}
    if isinstance(obj, list):
        return [convert_numpy_types(item) for item in obj]
    if isinstance(obj, tuple):
        return tuple(convert_numpy_types(item) for item in obj)
    return obj


# ==================== DATA DICTIONARY GENERATOR ====================
def generate_data_dictionary(output_dir: Path, logger: RLTLogger):
    """
    Write ``data_dictionary.csv`` describing the documented dataset columns.

    Each row has the fields ``column_name``, ``data_type``, ``description``,
    ``unit`` and ``range``. Rows are emitted in this order: text columns,
    number-related flags, per-language linguistic features, audio quality
    values, the 94 audio features and the pause features.

    Args:
        output_dir: Directory in which the CSV file is created.
        logger: Receives two ``log`` messages after the file is written.

    Returns:
        Path of the written CSV file.
    """

    def make_row(column_name, data_type, description, unit, value_range):
        # Key order defines the column order of the CSV
        return {
            'column_name': column_name,
            'data_type': data_type,
            'description': description,
            'unit': unit,
            'range': value_range
        }

    # Text columns followed by the number-related flags
    leading_columns = [
        ('filename', 'string', 'Video filename', 'N/A', 'N/A'),
        ('text_english_original', 'string', 'Original English transcription', 'N/A', 'N/A'),
        ('text_english_normalized', 'string', 'English transcription with numbers normalized to words', 'N/A', 'N/A'),
        ('text_indonesian', 'string', 'Indonesian translation', 'N/A', 'N/A'),
        ('label', 'integer', 'Class label (0=truth, 1=lie)', 'N/A', '0-1'),
        ('dataset', 'string', 'Dataset identifier', 'N/A', 'RLT'),
        ('has_numbers', 'boolean', 'Whether text contains numbers', 'N/A', 'True/False'),
        ('number_count', 'integer', 'Total count of numbers in text', 'count', '0-‚àû'),
        ('has_vague_quantifiers', 'boolean', 'Contains vague quantifiers (about, around, etc.)', 'N/A', 'True/False'),
        ('has_exact_quantifiers', 'boolean', 'Contains exact quantifiers (exactly, precisely, etc.)', 'N/A', 'True/False'),
        ('number_word_ratio', 'float', 'Ratio of number words to total words', 'ratio', '0.0-1.0'),
        ('has_large_numbers', 'boolean', 'Contains numbers > 1000', 'N/A', 'True/False'),
        ('has_decimal_numbers', 'boolean', 'Contains decimal numbers', 'N/A', 'True/False'),
    ]
    dictionary_data = [make_row(*spec) for spec in leading_columns]

    # Linguistic features, repeated for every text variant
    text_variants = [
        ('_en_original', 'English (Original)'),
        ('_en', 'English (Normalized)'),
        ('_id', 'Indonesian'),
    ]
    linguistic_templates = [
        ('char_count', 'integer', 'Character count', 'characters', '0-‚àû'),
        ('word_count', 'integer', 'Word count', 'words', '0-‚àû'),
        ('sentiment', 'float', 'Sentiment polarity', 'polarity', '-1.0 to 1.0'),
        ('subjectivity', 'float', 'Subjectivity score', 'score', '0.0-1.0'),
        ('complexity', 'float', 'Text complexity', 'score', '0.0-‚àû'),
        ('lexical_diversity', 'float', 'Lexical diversity', 'ratio', '0.0-1.0'),
    ]
    for lang_suffix, lang_name in text_variants:
        for base_name, dtype, label, unit, range_val in linguistic_templates:
            dictionary_data.append(make_row(
                f'{base_name}{lang_suffix}', dtype, f'{label} - {lang_name}', unit, range_val
            ))

    # Audio quality features
    audio_quality_columns = [
        ('audio_quality_score', 'float', 'Overall audio quality score', 'score', '0.0-1.0'),
        ('audio_duration', 'float', 'Audio duration', 'seconds', '0.0-‚àû'),
        ('audio_snr', 'float', 'Signal-to-noise ratio', 'dB', '-‚àû to ‚àû'),
        ('audio_rms', 'float', 'Root mean square energy', 'amplitude', '0.0-1.0'),
        ('audio_spectral_centroid', 'float', 'Spectral centroid', 'Hz', '0.0-‚àû'),
        ('audio_dynamic_range', 'float', 'Dynamic range', 'amplitude', '0.0-2.0'),
    ]
    dictionary_data.extend(make_row(*spec) for spec in audio_quality_columns)

    # Audio features (94 features), one generic row per feature name
    dictionary_data.extend(
        make_row(feat_name, 'float', f'Audio feature: {feat_name}', 'feature value', '-‚àû to ‚àû')
        for feat_name in create_audio_feature_names()
    )

    # Pause features
    pause_features = [
        ('pause_num_pauses', 'integer', 'Number of pauses detected', 'count', '0-‚àû'),
        ('pause_total_pause_duration', 'float', 'Total pause duration', 'seconds', '0.0-‚àû'),
        ('pause_avg_pause_duration', 'float', 'Average pause duration', 'seconds', '0.0-‚àû'),
        ('pause_longest_pause_duration', 'float', 'Longest pause duration', 'seconds', '0.0-‚àû'),
        ('pause_pause_frequency', 'float', 'Pause frequency (pauses per second)', 'Hz', '0.0-‚àû'),
        ('pause_speech_duration', 'float', 'Total speech duration', 'seconds', '0.0-‚àû'),
        ('pause_speech_rate', 'float', 'Speech rate ratio', 'ratio', '0.0-1.0'),
        ('pause_pause_ratio', 'float', 'Pause ratio', 'ratio', '0.0-1.0'),
        ('pause_early_pauses', 'integer', 'Pauses in first third', 'count', '0-‚àû'),
        ('pause_middle_pauses', 'integer', 'Pauses in middle third', 'count', '0-‚àû'),
        ('pause_late_pauses', 'integer', 'Pauses in last third', 'count', '0-‚àû'),
        ('pause_pause_std', 'float', 'Standard deviation of pause durations', 'seconds', '0.0-‚àû'),
        ('pause_pause_variability', 'float', 'Pause variability coefficient', 'ratio', '0.0-‚àû'),
        ('pause_hesitation_score', 'float', 'Hesitation score', 'score', '0.0-1.0'),
        ('pause_has_long_pauses', 'boolean', 'Has pauses > 1.5s', 'N/A', 'True/False'),
        ('pause_has_frequent_pauses', 'boolean', 'Has frequent pauses (>1 per second)', 'N/A', 'True/False'),
        ('pause_has_high_pause_ratio', 'boolean', 'Has high pause ratio (>30%)', 'N/A', 'True/False'),
    ]
    dictionary_data.extend(make_row(*spec) for spec in pause_features)

    # Save data dictionary
    dict_path = output_dir / 'data_dictionary.csv'
    pd.DataFrame(dictionary_data).to_csv(dict_path, index=False, encoding='utf-8')

    logger.log(f"‚úì Data dictionary generated: {dict_path}")
    logger.log(f"  ‚îî‚îÄ Total columns documented: {len(dictionary_data)}")

    return dict_path


def validate_schema(csv_path: Path, data_dictionary_path: Path, logger: RLTLogger) -> Dict:
    """
    Validate a CSV file's header against the data dictionary.

    The set of column names in ``csv_path`` is compared with the values of
    the ``column_name`` column in ``data_dictionary_path``; names present on
    only one side are reported.

    Args:
        csv_path: CSV file whose header is checked.
        data_dictionary_path: Data dictionary CSV; it must contain a
            ``column_name`` column.
        logger: Object providing ``log``, ``log_warning`` and ``log_error``.

    Returns:
        On success, a dict with the keys ``csv_file``, ``total_csv_columns``,
        ``total_dict_columns``, ``missing_in_dictionary``, ``missing_in_csv``
        (both sorted lists) and ``validation_passed``.
        If reading or comparing fails, the exception is logged (not raised)
        and ``{'validation_passed': False, 'error': <message>}`` is returned.
    """
    try:
        # nrows=0 parses the header only; no data row is needed to obtain
        # the column names.
        csv_columns = set(pd.read_csv(csv_path, nrows=0).columns)
        dict_columns = set(pd.read_csv(data_dictionary_path)['column_name'])

        # Sort the differences so the returned report and the log output are
        # identical from run to run (iteration order of a set of strings is
        # not stable across interpreter sessions). key=str keeps the sort
        # from failing if a blank dictionary cell was read as NaN.
        missing_in_dict = sorted(csv_columns - dict_columns, key=str)
        missing_in_csv = sorted(dict_columns - csv_columns, key=str)

        validation_result = {
            'csv_file': csv_path.name,
            'total_csv_columns': len(csv_columns),
            'total_dict_columns': len(dict_columns),
            'missing_in_dictionary': missing_in_dict,
            'missing_in_csv': missing_in_csv,
            'validation_passed': not missing_in_dict and not missing_in_csv
        }

        if validation_result['validation_passed']:
            logger.log(f"‚úì Schema validation passed: {csv_path.name}")
        else:
            logger.log_warning(f"‚ö† Schema validation issues in {csv_path.name}:")
            if missing_in_dict:
                logger.log_warning(f"  Missing in dictionary: {missing_in_dict}")
            if missing_in_csv:
                logger.log_warning(f"  Missing in CSV: {missing_in_csv}")

        return validation_result

    except Exception as e:
        # Deliberate best-effort: a validation failure is reported to the
        # caller through the result dict instead of aborting the run.
        logger.log_error(f"Schema validation failed: {str(e)}")
        return {'validation_passed': False, 'error': str(e)}


# ==================== RUN MANIFEST GENERATOR ====================
def generate_run_manifest(config: RLTConfig, paths: RLTPathManager, 
                         stats: Dict, env_info: Dict, 
                         cache_stats: Dict, landmark_summary: Dict,
                         output_dir: Path, logger: RLTLogger):
    """
    Write ``run_manifest.json`` describing this pipeline run.

    The manifest combines pipeline identification, the captured environment,
    the active configuration (``config.config``), input and processing
    counters taken from ``stats``, translation-cache statistics, the landmark
    validation summary, reproducibility notes and the list of output file
    names.

    Args:
        config: Configuration object; its ``config`` attribute is embedded.
        paths: Path manager; ``paths.paths['source_raw']`` is recorded as the
            source directory.
        stats: Run counters; the keys read below must be present.
        env_info: Environment description, embedded as given.
        cache_stats: Translation-cache statistics, embedded as given.
        landmark_summary: Landmark validation summary, embedded as given; its
            optional ``mediapipe_version`` entry is quoted in the notes.
        output_dir: Directory that receives ``run_manifest.json``.
        logger: Object providing ``log``.

    Returns:
        Path of the written manifest file.
    """
    # Sections are built in the order they appear in the JSON document.
    pipeline_section = {
        'version': PIPELINE_VERSION,
        'name': 'RLT Dataset Processor',
        'description': 'Scientific Data-ready multimodal deception detection dataset processor',
        'timestamp': datetime.now().isoformat(),
    }

    configuration_section = config.config

    input_section = {
        'source_directory': str(paths.paths['source_raw']),
        'total_files': stats['total_files'],
        'file_types': ['mp4', 'mov', 'avi'],
    }

    # Counters copied from stats under their own key names.
    copied_counters = (
        'successful_text_loads',
        'successful_translations',
        'successful_audio_extractions',
        'successful_landmark_extractions',
        'total_frames_processed',
        'face_detected',
        'iris_detected',
        'pose_detected',
    )
    processing_section = {'total_files_processed': stats['total_files']}
    for counter_name in copied_counters:
        processing_section[counter_name] = stats[counter_name]

    mp_version = landmark_summary.get('mediapipe_version', 'unknown')
    reproducibility_section = {
        'determinism_statement': 'First-run translations are non-deterministic (API-based). Subsequent runs use cached translations for deterministic results.',
        'audio_features': '94 features with explicit librosa parameters',
        'landmark_validation': f"Face: 478 landmarks (with iris), Pose: 33 landmarks (MediaPipe v{mp_version})",
        'number_normalization': 'Digits converted to English words (0-trillion)',
        'bark_energy_definition': 'axis=0 (consistent with I3D v8)',
        'nltk_requirement': 'Fail-fast (manual installation required)',
    }

    output_files_section = {
        'text_english_original': 'TextDataset_English_Original.csv',
        'text_english_normalized': 'TextDataset_English_Normalized.csv',
        'text_indonesian': 'TextDataset_Indonesian.csv',
        'number_features': 'NumberFeatures.csv',
        'audio_features': 'AudioDataset_Features.csv',
        'pause_features': 'PauseFeatures.csv',
        'landmarks': 'LandmarkDataset.csv',
        'multimodal_full': 'MultimodalDataset_Full.csv',
        'publication': 'PublicationDataset.csv',
        'data_dictionary': 'data_dictionary.csv',
        'run_manifest': 'run_manifest.json',
    }

    manifest = {
        'pipeline': pipeline_section,
        'environment': env_info,
        'configuration': configuration_section,
        'input': input_section,
        'processing_statistics': processing_section,
        'translation_cache': cache_stats,
        'landmark_validation': landmark_summary,
        'reproducibility': reproducibility_section,
        'output_files': output_files_section,
    }

    # Persist the manifest as UTF-8 JSON without escaping non-ASCII text.
    manifest_path = output_dir / 'run_manifest.json'
    with open(manifest_path, 'w', encoding='utf-8') as manifest_file:
        json.dump(manifest, manifest_file, indent=2, ensure_ascii=False)

    logger.log(f"‚úì Run manifest generated: {manifest_path}")

    return manifest_path


# ==================== MARKDOWN REPORT ====================
def generate_markdown_report(stats: Dict, output_paths: Dict, paths: RLTPathManager, 
                            start_time: datetime, end_time: datetime, 
                            cache_stats: Dict, landmark_summary: Dict) -> Path:
    """
    Render the extraction report as Markdown and write it to disk.

    The report is written to ``RLT_EXTRACTION_REPORT.md`` inside
    ``paths.paths['processed']``. The landmark section mentions the mismatch
    log file only when ``landmark_summary['total_mismatches']`` is above zero.

    Args:
        stats: Run counters interpolated into the summary tables.
        output_paths: Accepted for interface compatibility; not read here.
        paths: Path manager providing the ``processed`` directory.
        start_time: When processing started.
        end_time: When processing finished.
        cache_stats: Translation-cache statistics (``total_cached``,
            ``cache_size_mb`` and ``last_updated`` are required).
        landmark_summary: Landmark validation summary; missing entries fall
            back to defaults.

    Returns:
        Path of the written report file.
    """

    def _pct(count, total):
        # Percentage with one decimal; max(total, 1) guards a zero total.
        return f"{count / max(total, 1) * 100:.1f}"

    # Break the elapsed time into hours / minutes / seconds
    elapsed_seconds = (end_time - start_time).total_seconds()
    hours, leftover = divmod(elapsed_seconds, 3600)
    minutes, seconds = divmod(leftover, 60)

    # Landmark validation figures, with defaults for absent entries
    warning_counts = landmark_summary.get('warnings', {})
    mp_version = landmark_summary.get('mediapipe_version', 'unknown')
    mismatch_total = landmark_summary.get('total_mismatches', 0)
    detail_rows = landmark_summary.get('mismatch_details_count', 0)

    # The mismatch log is referenced only when mismatches were recorded
    mismatch_note = (
        "\n\n‚úÖ **Detailed mismatch log saved:** `landmark_mismatch_details.csv`"
        if mismatch_total > 0
        else "\n\n‚úÖ **No landmark mismatches detected** (file not generated)"
    )

    report_text = f"""# üìä RLT Dataset Extraction Report v8.1 FINAL

**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}  
**Pipeline Version:** {PIPELINE_VERSION}  
**Language:** English + Indonesian  
**Status:** ‚úÖ Scientific Data-ready

---

## ‚úÖ Extraction Summary

| Metric | Value |
|--------|-------|
| **Total Files** | {stats['total_files']} |
| **Transcription Loaded** | {stats['successful_text_loads']}/{stats['total_files']} ({_pct(stats['successful_text_loads'], stats['total_files'])}%) |
| **Numbers Normalized** | {stats['numbers_normalized']}/{stats['texts_with_numbers']} |
| **Translation (EN‚ÜíID)** | {stats['successful_translations']}/{stats['successful_text_loads']} ({_pct(stats['successful_translations'], stats['successful_text_loads'])}%) |
| **Audio Extraction** | {stats['successful_audio_extractions']}/{stats['total_files']} ({_pct(stats['successful_audio_extractions'], stats['total_files'])}%) |
| **Landmark Extraction** | {stats['successful_landmark_extractions']}/{stats['total_files']} ({_pct(stats['successful_landmark_extractions'], stats['total_files'])}%) |

---

## üéØ Landmark Detection Performance

| Type | Detected | Rate |
|------|----------|------|
| **Face (478 landmarks)** | {stats['face_detected']:,} / {stats['total_frames_processed']:,} | {_pct(stats['face_detected'], stats['total_frames_processed'])}% |
| **Iris (10 landmarks)** | {stats['iris_detected']:,} / {stats['total_frames_processed']:,} | {_pct(stats['iris_detected'], stats['total_frames_processed'])}% |
| **Pose (33 landmarks)** | {stats['pose_detected']:,} / {stats['total_frames_processed']:,} | {_pct(stats['pose_detected'], stats['total_frames_processed'])}% |

### Landmark Validation (MediaPipe v{mp_version})

- **Face count mismatches:** {warning_counts.get('face_count_mismatch', 0)} frames
- **Pose count mismatches:** {warning_counts.get('pose_count_mismatch', 0)} frames
- **Total mismatches:** {mismatch_total} frames
- **Detailed log entries:** {detail_rows}
{mismatch_note}

---

## üî¢ Number Features

| Metric | Value |
|--------|-------|
| **Texts with Numbers** | {stats.get('texts_with_numbers', 0)} |
| **Numbers Normalized** | {stats.get('numbers_normalized', 0)} |
| **Vague Quantifiers** | {stats.get('texts_with_vague', 0)} |
| **Exact Quantifiers** | {stats.get('texts_with_exact', 0)} |

---

## üåê Translation Cache Statistics

| Metric | Value |
|--------|-------|
| **Total Cached** | {cache_stats['total_cached']} |
| **Cache Size** | {cache_stats['cache_size_mb']:.2f} MB |
| **Last Updated** | {cache_stats['last_updated']} |
| **Cached Translations Used** | {stats.get('cached_translations', 0)} |
| **New Translations** | {stats.get('new_translations', 0)} |
| **Decision Log** | `{cache_stats.get('decision_log_file', 'translation_decisions_TIMESTAMP.csv')}` |

**Determinism Statement:**  
First-run translations are non-deterministic (Google Translate API). Subsequent runs use cached translations for fully deterministic results. Complete decision log saved per-run with timestamp.

---

## üéµ Audio Features (CORRECTED)

- **Total Features:** 94 (not 108) ‚úÖ
- **MFCC:** 13 coefficients √ó 3 (original + delta + delta2) √ó 2 (mean + std) = 78
- **Bark Energy:** 6 features (bark + delta + delta2, each with mean + std)
- **Spectral:** 6 features (centroid, bandwidth, rolloff, each with mean + std)
- **ZCR:** 2 features (mean + std)
- **Chroma:** 2 features (mean + std)

**Bark Energy Definition:** `axis=0` (consistent with I3D v8) ‚úÖ

**Librosa Parameters (Explicit for Reproducibility):**
- n_fft: 2048
- hop_length: 512
- win_length: 2048
- window: hann
- center: True

---

## ‚è±Ô∏è Processing Time

- **Start:** {start_time.strftime('%Y-%m-%d %H:%M:%S')}
- **End:** {end_time.strftime('%Y-%m-%d %H:%M:%S')}
- **Duration:** {int(hours)}h {int(minutes)}m {int(seconds)}s

---

## üìÇ Output Datasets

1. **Text (English - Original):** `TextDataset_English_Original.csv`
2. **Text (English - Normalized):** `TextDataset_English_Normalized.csv` ‚ú®
3. **Text (Indonesian):** `TextDataset_Indonesian.csv` ‚ú®
4. **Number Features:** `NumberFeatures.csv`
5. **Audio Features (94):** `AudioDataset_Features.csv` ‚úÖ
6. **Pause Features:** `PauseFeatures.csv`
7. **Landmarks:** `LandmarkDataset.csv`
8. **Multimodal (Full):** `MultimodalDataset_Full.csv`
9. **Publication-Ready:** `PublicationDataset.csv`
10. **Data Dictionary:** `data_dictionary.csv` ‚ú®
11. **Run Manifest:** `run_manifest.json` ‚ú®
12. **Landmark Mismatch Log:** `landmark_mismatch_details.csv` ‚ú® (if mismatches exist)
13. **Translation Decisions:** `translation_decisions_TIMESTAMP.csv` ‚ú® (per-run)

---

## üî¨ Scientific Data Compliance (v8.1 FINAL)

‚úÖ **Environment Capture:** System, Python, libraries (clean version detection)  
‚úÖ **Explicit Parameters:** All librosa parameters documented  
‚úÖ **Translation Cache:** MD5-keyed cache with per-run decision logging  
‚úÖ **Landmark Validation:** 478 face + 33 pose landmarks validated with detailed mismatch logging  
‚úÖ **Data Dictionary:** Complete column documentation  
‚úÖ **Schema Validation:** Only MultimodalDataset_Full.csv validated (has all columns)  
‚úÖ **Run Manifest:** Full processing metadata including landmark warnings  
‚úÖ **Reproducibility:** Deterministic for cached translations  
‚úÖ **NLTK:** Fail-fast (no silent download)  
‚úÖ **Config:** Deep copy (not shallow)  

---

## üéØ Consistency with I3D v8

‚úÖ Audio features: 94 (same as I3D v8)  
‚úÖ Bark energy: axis=0 (same definition)  
‚úÖ Librosa parameters: Explicit and documented  
‚úÖ Translation cache: Same structure with per-run logging  
‚úÖ Data dictionary: Same format  
‚úÖ Run manifest: Same metadata structure  
‚úÖ Landmark validation: Enhanced with detailed logging  
‚úÖ NLTK: Fail-fast approach  
‚úÖ Config: Deep copy approach  
‚úÖ Missing landmarks: float 0.0 (consistent with I3D)  

---

## üîß Fixes Applied in v8.1 FINAL

1. ‚úÖ **NLTK: Fail Fast** - No silent download, raises RuntimeError with instructions
2. ‚úÖ **Config: Deep Copy** - Prevents nested dict mutation
3. ‚úÖ **Landmark Validation: Detailed Logging** - Per file/frame with MediaPipe version
4. ‚úÖ **Schema Validation: Fixed** - Only validates MultimodalDataset_Full.csv
5. ‚úÖ **Environment: Complete & Clean** - pkg_resources with 'not_found' fallback
6. ‚úÖ **Translation Cache: Per-run Logging** - Timestamped decision log per run
7. ‚úÖ **Markdown Report: Conditional** - Mismatch log only mentioned if exists
8. ‚úÖ **Missing Landmarks: float 0.0** - Consistent with I3D v8

---

*Generated by RLT Dataset Processor v{PIPELINE_VERSION}*  
*Specialized for Real-Life Truth/Lie Dataset*  
*Scientific Data-ready & Fully Reproducible! üî¨*  
*ALL FIXES APPLIED - READY FOR SUBMISSION! üéâ*
"""

    # Write the rendered report next to the processed datasets
    report_path = paths.paths['processed'] / 'RLT_EXTRACTION_REPORT.md'
    with open(report_path, 'w', encoding='utf-8') as report_file:
        report_file.write(report_text)

    return report_path


# ==================== MAIN PROCESSOR ====================
class RLTProcessor:
    """Main processor for RLT Dataset v8.1 FINAL"""
    
    def __init__(self, config: RLTConfig, paths: RLTPathManager):
        """Store the run configuration and the path manager.

        ``cache_manager`` starts as ``None``; ``process`` assigns the
        translation cache manager when a run begins.
        """
        self.config, self.paths, self.cache_manager = config, paths, None
    
    def process(self):
        """Process RLT dataset with full Scientific Data compliance"""
        
        # Initialize logger
        log_path, timestamp = self.paths.get_log_path()
        logger = RLTLogger(log_path)
        
        logger.log("="*70)
        logger.log(f"üéØ RLT DATASET PROCESSOR v{PIPELINE_VERSION}")
        logger.log("="*70)
        logger.log("‚ú® Scientific Data-ready Features (v8.1 FINAL):")
        logger.log("   ‚Ä¢ 94 audio features (corrected from 108)")
        logger.log("   ‚Ä¢ Explicit librosa parameters")
        logger.log("   ‚Ä¢ Translation cache with COMPLETE decision logging")
        logger.log("   ‚Ä¢ Environment capture (deep-translator, textblob, pydub)")
        logger.log("   ‚Ä¢ Run manifest + data dictionary")
        logger.log("   ‚Ä¢ Schema validation (MultimodalDataset_Full.csv only)")
        logger.log("   ‚Ä¢ Landmark validation with DETAILED logging")
        logger.log("   ‚Ä¢ NLTK: Fail fast (no silent download)")
        logger.log("   ‚Ä¢ Config: Deep copy (not shallow)")
        logger.log("   ‚Ä¢ Consistent with I3D v8 pipeline")
        
        # Capture environment
        if self.config.get('reproducibility.capture_environment', True):
            env_info = capture_environment()
            logger.log("\n‚úì Environment captured:")
            logger.log(f"  ‚îî‚îÄ Python: {env_info['system']['python_version']}")
            logger.log(f"  ‚îî‚îÄ Platform: {env_info['system']['platform']}")
            logger.log(f"  ‚îî‚îÄ Librosa: {env_info['library_versions']['librosa']}")
            logger.log(f"  ‚îî‚îÄ MediaPipe: {env_info['library_versions']['mediapipe']}")
            logger.log(f"  ‚îî‚îÄ deep-translator: {env_info['library_versions'].get('deep_translator', 'unknown')}")
            logger.log(f"  ‚îî‚îÄ textblob: {env_info['library_versions'].get('textblob', 'unknown')}")
            logger.log(f"  ‚îî‚îÄ pydub: {env_info['library_versions'].get('pydub', 'unknown')}")
        else:
            env_info = {}
        
        # Initialize translation cache
        self.cache_manager = TranslationCacheManager(self.paths.paths['cache'])
        cache_stats_initial = self.cache_manager.get_stats()
        logger.log(f"\n‚úì Translation cache initialized:")
        logger.log(f"  ‚îî‚îÄ Cached translations: {cache_stats_initial['total_cached']}")
        
        # Check source folders
        if not self.paths.paths['source_clips_lie'].exists():
            raise FileNotFoundError(f"LIE clips folder not found: {self.paths.paths['source_clips_lie']}")
        if not self.paths.paths['source_clips_truth'].exists():
            raise FileNotFoundError(f"TRUTH clips folder not found: {self.paths.paths['source_clips_truth']}")
        if not self.paths.paths['source_transcription_lie'].exists():
            raise FileNotFoundError(f"LIE transcription folder not found: {self.paths.paths['source_transcription_lie']}")
        if not self.paths.paths['source_transcription_truth'].exists():
            raise FileNotFoundError(f"TRUTH transcription folder not found: {self.paths.paths['source_transcription_truth']}")
        
        # Get video files
        lie_files = get_valid_video_files(self.paths.paths['source_clips_lie'])
        truth_files = get_valid_video_files(self.paths.paths['source_clips_truth'])
        total_files = len(lie_files) + len(truth_files)
        
        logger.log(f"\nüìÅ Files found:")
        logger.log(f"   - LIE   : {len(lie_files)} clips")
        logger.log(f"   - TRUTH : {len(truth_files)} clips")
        logger.log(f"   - TOTAL : {total_files} clips")
        
        if total_files == 0:
            raise ValueError("No video files found!")
        
        # Initialize statistics
        stats = {
            'total_files': total_files,
            'successful_text_loads': 0,
            'failed_text_loads': 0,
            'numbers_normalized': 0,
            'texts_with_numbers': 0,
            'texts_with_vague': 0,
            'texts_with_exact': 0,
            'successful_translations': 0,
            'failed_translations': 0,
            'cached_translations': 0,
            'new_translations': 0,
            'successful_audio_extractions': 0,
            'failed_audio_extractions': 0,
            'successful_landmark_extractions': 0,
            'failed_landmark_extractions': 0,
            'total_frames_processed': 0,
            'face_detected': 0,
            'iris_detected': 0,
            'pose_detected': 0,
            'pause_features_extracted': 0,
            'errors': []
        }
        
        # Data storage
        text_audio_data = []
        failed_samples = []
        
        # Initialize processors
        preprocessor = VideoPreprocessor(self.config)
        # ‚úÖ FIX 3: Pass logger to LandmarkExtractor
        landmark_extractor = LandmarkExtractor(self.config, preprocessor, logger)
        
        # MediaPipe configuration
        face_mesh_config = self.config.get('mediapipe.face_mesh', {})
        pose_config = self.config.get('mediapipe.pose', {})
        
        # CSV for landmarks
        landmark_csv_path = self.paths.paths['visual'] / 'LandmarkDataset.csv'
        landmark_csv = open(landmark_csv_path, 'w', newline='', encoding='utf-8')
        landmark_writer = csv.writer(landmark_csv)
        
        # Landmark header
        header = ['Video_Name', 'Frame']
        for i in range(478):
            header.extend([f'Landmark_{i}_X', f'Landmark_{i}_Y', f'Landmark_{i}_Z'])
        for i in range(33):
            header.extend([f'Pose_{i}_X', f'Pose_{i}_Y', f'Pose_{i}_Z'])
        header.append('Class')
        landmark_writer.writerow(header)
        
        # Process each label folder
        start_time = datetime.now()
        
        try:
            for label_name, clips_folder, transcription_folder in [
                ('lie', self.paths.paths['source_clips_lie'], self.paths.paths['source_transcription_lie']), 
                ('truth', self.paths.paths['source_clips_truth'], self.paths.paths['source_transcription_truth'])
            ]:
                files = get_valid_video_files(clips_folder)
                
                logger.log(f"\nüìÅ Processing {len(files)} files from {label_name}")
                
                # Open MediaPipe context
                with mp_face_mesh.FaceMesh(**face_mesh_config) as face_mesh, \
                     mp_pose.Pose(**pose_config) as pose:
                    
                    # Process each file
                    for filename in tqdm(files, desc=f"Processing {label_name}", unit="file"):
                        try:
                            video_path = clips_folder / filename
                            video_name = video_path.stem
                            
                            logger.log(f"\nüé¨ Processing: {filename}")
                            
                            # Get label
                            label = 1 if label_name == 'lie' else 0
                            
                            # STEP 1: Load transcription
                            transcription_filename = video_name + '.txt'
                            transcription_path = transcription_folder / transcription_filename
                            
                            text_en_original = load_transcription(transcription_path, logger)
                            
                            if text_en_original:
                                stats['successful_text_loads'] += 1
                                logger.log(f"  ‚îî‚îÄ ‚úì Transcription (EN): '{text_en_original[:100]}...'")
                            else:
                                stats['failed_text_loads'] += 1
                                failed_samples.append({'filename': filename, 'reason': 'Transcription not found', 'label': label})
                                text_en_original = ""
                            
                            # STEP 2: Extract number features
                            number_features = extract_number_features(text_en_original)
                            if number_features['has_numbers']:
                                stats['texts_with_numbers'] += 1
                                logger.log(f"  ‚îî‚îÄ üî¢ Numbers detected: {number_features['number_count']}")
                            if number_features['has_vague_quantifiers']:
                                stats['texts_with_vague'] += 1
                            if number_features['has_exact_quantifiers']:
                                stats['texts_with_exact'] += 1
                            
                            # STEP 3: Normalize numbers
                            text_en_normalized = ""
                            if self.config.get('text.normalize_numbers', True) and text_en_original:
                                text_en_normalized = normalize_numbers_in_text(text_en_original)
                                if text_en_original != text_en_normalized:
                                    stats['numbers_normalized'] += 1
                                    logger.log(f"  ‚îî‚îÄ ‚úì Numbers normalized (EN)")
                            else:
                                text_en_normalized = text_en_original
                            
                            # STEP 4: Translate to Indonesian with cache
                            text_id = ""
                            was_cached = False
                            if text_en_normalized and self.config.get('text.translate_to_indonesian', True):
                                logger.log(f"  ‚îî‚îÄ üåê Translating to Indonesian...")
                                text_id, was_cached = translate_text_with_cache(
                                    text_en_normalized, 
                                    self.cache_manager, 
                                    source='en', 
                                    target='id', 
                                    logger=logger
                                )
                                if text_id:
                                    stats['successful_translations'] += 1
                                    if was_cached:
                                        stats['cached_translations'] += 1
                                    else:
                                        stats['new_translations'] += 1
                                    logger.log(f"  ‚îî‚îÄ ‚úì Translation (ID): '{text_id[:100]}...'")
                                else:
                                    stats['failed_translations'] += 1
                                    text_id = ""
                            
                            # STEP 5: Analyze texts
                            features_en_original = analyze_text_content(text_en_original, 'en')
                            features_en_normalized = analyze_text_content(text_en_normalized, 'en')
                            features_id = analyze_text_content(text_id, 'id')
                            
                            # STEP 6: Convert to WAV
                            wav_path = self._convert_to_wav(video_path, label_name, logger)
                            
                            if not wav_path or not wav_path.exists():
                                logger.log_error(f"Failed to convert to WAV: {filename}")
                                stats['failed_audio_extractions'] += 1
                                failed_samples.append({'filename': filename, 'reason': 'WAV conversion failed'})
                                continue
                            
                            # STEP 7: Extract audio features (94)
                            audio_features = extract_audio_features(wav_path, self.config, logger)
                            
                            if audio_features is not None:
                                stats['successful_audio_extractions'] += 1
                            else:
                                stats['failed_audio_extractions'] += 1
                                logger.log_warning(f"Failed to extract audio features: {filename}")
                                audio_features = np.zeros(94)
                            
                            # STEP 8: Extract pause features
                            pause_features = extract_pause_silence_features(wav_path, self.config, logger)
                            if pause_features['num_pauses'] > 0:
                                stats['pause_features_extracted'] += 1
                            
                            # STEP 9: Check audio quality
                            quality_info = check_audio_quality(wav_path, self.config, logger)
                            
                            # STEP 10: Store TEXT + AUDIO data
                            data_entry = {
                                'filename': filename,
                                'text_english_original': text_en_original,
                                'text_english_normalized': text_en_normalized,
                                'text_indonesian': text_id,
                                'label': label,
                                'dataset': 'RLT',
                                
                                # Number features
                                'has_numbers': number_features['has_numbers'],
                                'number_count': number_features['number_count'],
                                'has_vague_quantifiers': number_features['has_vague_quantifiers'],
                                'has_exact_quantifiers': number_features['has_exact_quantifiers'],
                                'number_word_ratio': number_features['number_word_ratio'],
                                'has_large_numbers': number_features['has_large_numbers'],
                                'has_decimal_numbers': number_features['has_decimal_numbers'],
                                
                                # English text features (original)
                                'char_count_en_original': features_en_original['char_count'],
                                'word_count_en_original': features_en_original['word_count'],
                                'sentiment_en_original': features_en_original['sentiment'],
                                'subjectivity_en_original': features_en_original['subjectivity'],
                                'complexity_en_original': features_en_original['complexity'],
                                'lexical_diversity_en_original': features_en_original['lexical_diversity'],
                                
                                # English text features (normalized)
                                'char_count_en': features_en_normalized['char_count'],
                                'word_count_en': features_en_normalized['word_count'],
                                'sentiment_en': features_en_normalized['sentiment'],
                                'subjectivity_en': features_en_normalized['subjectivity'],
                                'complexity_en': features_en_normalized['complexity'],
                                'lexical_diversity_en': features_en_normalized['lexical_diversity'],
                                
                                # Indonesian text features
                                'char_count_id': features_id['char_count'],
                                'word_count_id': features_id['word_count'],
                                'sentiment_id': features_id['sentiment'],
                                'subjectivity_id': features_id['subjectivity'],
                                'complexity_id': features_id['complexity'],
                                'lexical_diversity_id': features_id['lexical_diversity'],
                                
                                # Audio quality
                                'audio_quality_score': quality_info.get('quality_score', 0),
                                'audio_duration': quality_info.get('duration', 0),
                                'audio_snr': quality_info.get('snr', 0),
                                'audio_rms': quality_info.get('rms', 0),
                                'audio_spectral_centroid': quality_info.get('spectral_centroid', 0),
                                'audio_dynamic_range': quality_info.get('dynamic_range', 0)
                            }
                            
                            # Add audio features (94)
                            audio_feature_names = create_audio_feature_names()
                            for i, feature_name in enumerate(audio_feature_names):
                                data_entry[feature_name] = audio_features[i]
                            
                            # Add pause features
                            for feature_name, feature_value in pause_features.items():
                                data_entry[f'pause_{feature_name}'] = feature_value
                            
                            text_audio_data.append(data_entry)
                            
                            # STEP 11: Extract LANDMARKS with validation
                            logger.log(f"  ‚îî‚îÄ üîç Extracting landmarks...")
                            
                            cap = cv2.VideoCapture(str(video_path))
                            
                            if not cap.isOpened():
                                logger.log_error(f"‚ùå Failed to open video: {filename}")
                                stats['failed_landmark_extractions'] += 1
                                continue
                            
                            frame_count = 0
                            landmark_detected_count = 0
                            sample_rate = self.config.get('video.sample_rate', 1)
                            
                            detection_summary = {
                                'total_frames': 0,
                                'face_detected': 0,
                                'iris_detected': 0,
                                'pose_detected': 0,
                                'strategies_used': {}
                            }
                            
                            while cap.isOpened():
                                ret, frame = cap.read()
                                if not ret:
                                    break
                                
                                if frame_count % sample_rate != 0:
                                    frame_count += 1
                                    continue
                                
                                stats['total_frames_processed'] += 1
                                detection_summary['total_frames'] += 1
                                
                                # ‚úÖ FIX 3: Pass filename and frame_num for detailed logging
                                face_landmarks_dict, iris_count, strategy = landmark_extractor.extract_with_multi_strategy(
                                    frame, face_mesh, iris_focus=True,
                                    filename=filename, frame_num=frame_count
                                )
                                
                                # Extract pose landmarks
                                pose_landmarks_dict = None
                                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                                pose_results = pose.process(frame_rgb)
                                
                                if pose_results.pose_landmarks:
                                    pose_landmarks_dict = {}
                                    for idx, landmark in enumerate(pose_results.pose_landmarks.landmark):
                                        pose_landmarks_dict[idx] = [landmark.x, landmark.y, landmark.z]
                                    
                                    # ‚úÖ FIX 3: Validate pose with detailed logging
                                    if self.config.get('reproducibility.validate_landmark_counts', True):
                                        landmark_extractor.validate_landmark_count(
                                            pose_landmarks_dict, 
                                            landmark_extractor.expected_pose_landmarks, 
                                            'pose',
                                            filename,
                                            frame_count
                                        )
                                
                                # Update detection statistics
                                if face_landmarks_dict:
                                    detection_summary['face_detected'] += 1
                                    stats['face_detected'] += 1
                                    
                                    if strategy:
                                        detection_summary['strategies_used'][strategy] = \
                                            detection_summary['strategies_used'].get(strategy, 0) + 1
                                
                                if iris_count >= 5:
                                    detection_summary['iris_detected'] += 1
                                    stats['iris_detected'] += 1
                                    landmark_detected_count += 1
                                
                                if pose_landmarks_dict:
                                    detection_summary['pose_detected'] += 1
                                    stats['pose_detected'] += 1
                                
                                # Write to CSV
                                if face_landmarks_dict or pose_landmarks_dict:
                                    row = [video_name, frame_count]
                                    
                                    # Face + Iris landmarks (478)
                                    for i in range(478):
                                        if face_landmarks_dict and i in face_landmarks_dict:
                                            row.extend(face_landmarks_dict[i])
                                        else:
                                            row.extend([0.0, 0.0, 0.0])
                                    
                                    # Pose landmarks (33)
                                    for i in range(33):
                                        if pose_landmarks_dict and i in pose_landmarks_dict:
                                            row.extend(pose_landmarks_dict[i])
                                        else:
                                            row.extend([0.0, 0.0, 0.0])
                                    
                                    row.append(label)
                                    landmark_writer.writerow(row)
                                
                                frame_count += 1
                            
                            cap.release()
                            
                            # Log detection summary
                            if detection_summary['total_frames'] > 0:
                                face_rate = (detection_summary['face_detected'] / detection_summary['total_frames']) * 100
                                iris_rate = (detection_summary['iris_detected'] / detection_summary['total_frames']) * 100
                                pose_rate = (detection_summary['pose_detected'] / detection_summary['total_frames']) * 100
                                
                                logger.log(f"  ‚îî‚îÄ ‚úì Landmarks extracted: {landmark_detected_count} frames")
                                logger.log(f"     Face: {face_rate:.1f}% ({detection_summary['face_detected']}/{detection_summary['total_frames']})")
                                logger.log(f"     Iris: {iris_rate:.1f}% ({detection_summary['iris_detected']}/{detection_summary['total_frames']})")
                                logger.log(f"     Pose: {pose_rate:.1f}% ({detection_summary['pose_detected']}/{detection_summary['total_frames']})")
                                
                                if landmark_detected_count > 0:
                                    stats['successful_landmark_extractions'] += 1
                                else:
                                    stats['failed_landmark_extractions'] += 1
                            else:
                                stats['failed_landmark_extractions'] += 1
                        
                        except Exception as e:
                            logger.log_error(f"‚ùå Error processing {filename}: {str(e)}")
                            import traceback
                            logger.log_error(traceback.format_exc())
                            stats['errors'].append(filename)
                            stats['failed_text_loads'] += 1
                            stats['failed_audio_extractions'] += 1
                            stats['failed_landmark_extractions'] += 1
                            failed_samples.append({'filename': filename, 'reason': str(e)})
                            continue
        
        finally:
            # Close landmark CSV
            landmark_csv.close()
            
            # Save translation cache
            self.cache_manager.save()
            cache_stats_final = self.cache_manager.get_stats()
            logger.log(f"\n‚úì Translation cache saved:")
            logger.log(f"  ‚îî‚îÄ Total cached: {cache_stats_final['total_cached']}")
            logger.log(f"  ‚îî‚îÄ New translations: {stats['new_translations']}")
            logger.log(f"  ‚îî‚îÄ Cached translations used: {stats['cached_translations']}")
        
        # ‚úÖ FIX 3: Get landmark warnings and save mismatch details
        landmark_summary = landmark_extractor.get_warning_summary()
        if landmark_summary['total_mismatches'] > 0:
            logger.log(f"\n‚ö† Landmark validation warnings:")
            logger.log(f"  ‚îî‚îÄ Total mismatches: {landmark_summary['total_mismatches']}")
            logger.log(f"  ‚îî‚îÄ Face mismatches: {landmark_summary['warnings'].get('face_count_mismatch', 0)}")
            logger.log(f"  ‚îî‚îÄ Pose mismatches: {landmark_summary['warnings'].get('pose_count_mismatch', 0)}")
            logger.log(f"  ‚îî‚îÄ MediaPipe version: {landmark_summary['mediapipe_version']}")
            
            # Save detailed mismatch log
            mismatch_path = landmark_extractor.save_mismatch_details(self.paths.paths['validation'])
            if mismatch_path:
                logger.log(f"  ‚îî‚îÄ Detailed log: {mismatch_path}")
        
        # Save datasets
        output_paths = self._save_datasets(text_audio_data, logger)
        
        # Save failed samples
        if failed_samples:
            failed_df = pd.DataFrame(failed_samples)
            failed_path = self.paths.paths['reextraction'] / 'failed_samples.csv'
            failed_df.to_csv(failed_path, index=False, encoding='utf-8')
            logger.log(f"\n‚ö†Ô∏è Failed samples saved: {failed_path}")
            logger.log(f"  ‚îî‚îÄ {len(failed_samples)} samples need manual review")
        
        # Generate data dictionary
        if self.config.get('output.generate_data_dictionary', True):
            dict_path = generate_data_dictionary(self.paths.paths['metadata'], logger)
            
            # ‚úÖ FIX 4: Validate schema ONLY for MultimodalDataset_Full.csv
            logger.log("\nüîç Validating schema (MultimodalDataset_Full.csv only)...")
            multimodal_path = Path(output_paths.get('multimodal_full'))
            if multimodal_path.exists():
                validate_schema(multimodal_path, dict_path, logger)
            else:
                logger.log_warning("MultimodalDataset_Full.csv not found for validation")
        
        # Generate statistics
        self._generate_statistics(text_audio_data, stats, logger)
        
        # Generate run manifest
        end_time = datetime.now()
        if self.config.get('reproducibility.save_run_manifest', True):
            manifest_path = generate_run_manifest(
                self.config, self.paths, stats, env_info, 
                cache_stats_final, landmark_summary,
                self.paths.paths['metadata'], logger
            )
        
        # Generate markdown report
        if self.config.get('output.generate_markdown_report', True):
            markdown_path = generate_markdown_report(
                stats, output_paths, self.paths, start_time, end_time,
                cache_stats_final, landmark_summary
            )
            logger.log(f"\n‚úì Markdown report generated: {markdown_path}")
        
        logger.finalize()
        
        return {
            'status': 'success',
            'stats': stats,
            'failed_samples': len(failed_samples),
            'output_paths': output_paths,
            'cache_stats': cache_stats_final,
            'landmark_summary': landmark_summary
        }
    
    def _convert_to_wav(self, video_path: Path, label_name: str, logger: RLTLogger) -> Optional[Path]:
        """Extract the audio track of a video into a WAV file using ffmpeg.

        The output is written as ``<video stem>.wav`` into the directory
        registered under ``audio_wav_lie`` (when ``label_name == 'lie'``) or
        ``audio_wav_truth`` (any other label). ffmpeg is asked for 16-bit PCM,
        16 kHz, mono audio with the filter chain given via ``-af``.

        An existing non-empty WAV is reused without re-running ffmpeg. A
        zero-byte leftover is treated as missing and regenerated, and any file
        ffmpeg leaves behind after a failed run is removed so that a later run
        cannot mistake it for a finished conversion.

        Args:
            video_path: Source video file.
            label_name: Class label; only ``'lie'`` selects the lie directory.
            logger: Pipeline logger used for progress and error messages.

        Returns:
            Path of the WAV file, or ``None`` if ffmpeg is missing or fails.
        """
        dir_key = 'audio_wav_lie' if label_name == 'lie' else 'audio_wav_truth'
        output_dir = self.paths.paths[dir_key]
        wav_path = output_dir / f"{video_path.stem}.wav"

        if wav_path.exists():
            if wav_path.stat().st_size > 0:
                logger.log(f"WAV already exists: {wav_path.name}")
                return wav_path
            # An empty file is the residue of an aborted conversion; redo it.
            logger.log_warning(f"Empty WAV found, re-converting: {wav_path.name}")

        try:
            subprocess.run([
                "ffmpeg", "-i", str(video_path),
                "-vn",
                "-acodec", "pcm_s16le",
                "-ar", "16000",
                "-ac", "1",
                "-af", "highpass=f=80,lowpass=f=3000,volume=3.0,afftdn=nf=-25,equalizer=f=1000:width_type=h:width=200:g=3",
                "-y", str(wav_path)
            ], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

            logger.log(f"‚úì Converted to WAV: {video_path.name}")
            return wav_path

        except subprocess.CalledProcessError as e:
            message = f"Error converting {video_path}: {e}"
            # The exception text only carries the exit status; the reason is
            # in ffmpeg's stderr, so append its last few lines.
            stderr_text = (e.stderr or b"").decode("utf-8", errors="replace").strip()
            if stderr_text:
                message += "\n" + "\n".join(stderr_text.splitlines()[-5:])
            logger.log_error(message)

            # Remove partial output so the exists() shortcut above never
            # returns a truncated file on the next run.
            try:
                wav_path.unlink()
            except FileNotFoundError:
                pass  # ffmpeg failed before creating the file
            except OSError as cleanup_error:
                logger.log_warning(f"Could not remove partial WAV {wav_path}: {cleanup_error}")
            return None
        except FileNotFoundError:
            logger.log_error("ffmpeg not found. Please install ffmpeg.")
            return None
    
    def _save_datasets(self, data: List[Dict], logger: RLTLogger) -> Dict:
        """Write the per-sample records out as a family of CSV files.

        Builds one DataFrame from ``data`` and exports column subsets of it
        (three text views, number features, audio features, pause features,
        a publication subset) plus the full frame. Every CSV is written with
        UTF-8 encoding and without the index.

        Args:
            data: One dict per processed sample.
            logger: Pipeline logger for progress messages.

        Returns:
            Mapping of dataset key to the written file path (as ``str``).
            Empty when ``data`` is empty. The ``landmark`` entry only records
            the expected path; this method does not write that file.
        """
        if not data:
            logger.log_warning("‚ö†Ô∏è No data to save")
            return {}

        frame = pd.DataFrame(data)
        saved = {}
        id_cols = ['filename', 'label', 'dataset']

        def export(columns, dir_key, csv_name, result_key, log_prefix):
            # Select first so a missing column fails before anything is written.
            subset = frame if columns is None else frame[columns].copy()
            target = self.paths.paths[dir_key] / csv_name
            subset.to_csv(target, index=False, encoding='utf-8')
            saved[result_key] = str(target)
            logger.log(f"{log_prefix}{target}")

        # DATASETS 1-3: the text views share one column layout and differ only
        # in the text column and the suffix of the derived metrics.
        text_metrics = ['char_count', 'word_count', 'sentiment',
                        'subjectivity', 'complexity', 'lexical_diversity']
        text_views = [
            ('text_english_original', '_en_original', 'TextDataset_English_Original.csv',
             'text_english_original', "‚úì Text (English - Original) saved: "),
            ('text_english_normalized', '_en', 'TextDataset_English_Normalized.csv',
             'text_english_normalized', "‚úì Text (English - Normalized) saved: "),
            ('text_indonesian', '_id', 'TextDataset_Indonesian.csv',
             'text_indonesian', "‚úì Text (Indonesian) saved: "),
        ]
        for text_col, suffix, csv_name, result_key, log_prefix in text_views:
            view_cols = ['filename', text_col, 'label', 'dataset']
            view_cols += [f"{metric}{suffix}" for metric in text_metrics]
            export(view_cols, 'text', csv_name, result_key, log_prefix)

        # DATASET 4: number features
        export(
            id_cols + ['has_numbers', 'number_count',
                       'has_vague_quantifiers', 'has_exact_quantifiers', 'number_word_ratio',
                       'has_large_numbers', 'has_decimal_numbers'],
            'text', 'NumberFeatures.csv', 'number_features',
            "‚úì Number features saved: ",
        )

        # DATASET 5: audio features (94) followed by the audio quality columns
        quality_cols = ['audio_quality_score', 'audio_duration', 'audio_snr',
                        'audio_rms', 'audio_spectral_centroid', 'audio_dynamic_range']
        export(
            id_cols + create_audio_feature_names() + quality_cols,
            'audio', 'AudioDataset_Features.csv', 'audio_features',
            "‚úì Audio features (94) saved: ",
        )

        # DATASET 6: every column carrying the pause_ prefix
        pause_only = [name for name in frame.columns if name.startswith('pause_')]
        export(
            id_cols + pause_only,
            'audio', 'PauseFeatures.csv', 'pause_features',
            "‚úì Pause features saved: ",
        )

        # DATASET 7: the full frame, no column selection
        export(
            None, 'multimodal', 'MultimodalDataset_Full.csv', 'multimodal_full',
            "‚úì Multimodal (Full) saved: ",
        )

        # DATASET 8: publication subset; columns absent from the frame are dropped
        wanted = [
            'filename', 'label', 'dataset',
            'text_english_normalized', 'text_indonesian',
            'word_count_en', 'sentiment_en', 'lexical_diversity_en',
            'word_count_id', 'sentiment_id', 'lexical_diversity_id',
            'has_vague_quantifiers', 'has_exact_quantifiers', 'number_count',
            'mfcc1_mean', 'mfcc2_mean', 'mfcc3_mean', 'spectral_centroid_mean',
            'audio_quality_score', 'audio_snr',
            'pause_hesitation_score', 'pause_pause_frequency', 'pause_pause_ratio'
        ]
        export(
            [name for name in wanted if name in frame.columns],
            'multimodal', 'PublicationDataset.csv', 'publication',
            "‚úì Publication dataset saved: ",
        )

        # DATASET 9: landmark CSV; only its path is recorded here
        landmark_target = self.paths.paths['visual'] / 'LandmarkDataset.csv'
        saved['landmark'] = str(landmark_target)
        logger.log(f"‚úì Landmarks saved: {landmark_target}")

        return saved
    
    def _generate_statistics(self, data: List[Dict], stats: Dict, logger: RLTLogger):
        """Persist the processing counters as JSON and optionally draw figures.

        Args:
            data: Per-sample records, forwarded to the visualization step.
            stats: Processing counters; passed through ``convert_numpy_types``
                before being serialized.
            logger: Pipeline logger for progress messages.
        """
        logger.log("\nüìä Generating statistics...")

        # Dump the counters to processing_statistics.json
        destination = self.paths.paths['statistical'] / 'processing_statistics.json'
        with open(destination, 'w', encoding='utf-8') as handle:
            serializable = convert_numpy_types(stats)
            json.dump(serializable, handle, indent=2)
        logger.log(f"‚úì Statistics saved: {destination}")

        # Figures are produced unless the config switches them off
        if not self.config.get('output.generate_figures', True):
            return
        self._generate_visualizations(data, stats, logger)
    
    def _generate_visualizations(self, data: List[Dict], stats: Dict, logger: RLTLogger):
        """Render the class-distribution bar chart for the processed samples.

        Does nothing when ``data`` is empty. Otherwise counts the values of
        the ``label`` column (0 and 1) and saves ``class_distribution.png``
        into the exploratory-figures directory at the configured DPI.

        Args:
            data: Per-sample records; must provide a ``label`` field.
            stats: Processing counters (not used by this figure).
            logger: Pipeline logger for the completion message.
        """
        if not data:
            return

        frame = pd.DataFrame(data)
        out_dir = self.paths.paths['exploratory_figs']
        resolution = self.config.get('output.figure_dpi', 300)

        sns.set_style("whitegrid")

        # Figure 1: class distribution
        fig, ax = plt.subplots(figsize=(10, 6))
        per_class = frame['label'].value_counts()
        heights = [per_class.get(0, 0), per_class.get(1, 0)]

        ax.bar(['TRUTH (0)', 'LIE (1)'], heights,
               color=['#3498db', '#e74c3c'], alpha=0.8, edgecolor='black')
        ax.set_title('Class Distribution - RLT Dataset', fontsize=16, fontweight='bold')
        ax.set_ylabel('Count', fontsize=12)
        ax.grid(axis='y', alpha=0.3)

        # Print each count just above its bar
        for position, height in enumerate(heights):
            ax.text(position, height + 0.5, str(height),
                    ha='center', fontweight='bold', fontsize=12)

        fig.tight_layout()
        fig.savefig(out_dir / 'class_distribution.png', dpi=resolution, bbox_inches='tight')
        plt.close(fig)

        logger.log(f"‚úì Visualizations saved: {out_dir}")


# ==================== MAIN EXECUTION ====================
def main():
    """Run the RLT dataset pipeline interactively from the command line.

    Steps, in order:
      1. Print the feature banner.
      2. Load ``config_RLT.yaml`` from the current working directory, or fall
         back to the default configuration and write it to that path.
      3. Create the directory layout through ``RLTPathManager``.
      4. Verify that ``ffmpeg`` can be executed; return early if it cannot.
      5. Ask for confirmation; return unless the (whitespace-trimmed,
         case-insensitive) answer is ``y``.
      6. Run ``RLTProcessor.process()`` and print a summary of the result.

    A ``RuntimeError`` whose message mentions NLTK is reported with
    installation instructions; any other ``RuntimeError`` is re-raised. All
    other exceptions are printed together with their traceback.
    """
    
    print("="*70)
    print(f"üéØ RLT DATASET PROCESSOR v{PIPELINE_VERSION} FINAL")
    print("="*70)
    print("‚ú® Scientific Data-ready Features (ALL FIXES APPLIED):")
    print("   ‚Ä¢ 94 audio features (corrected from 108) ‚úÖ")
    print("   ‚Ä¢ Explicit librosa parameters for reproducibility ‚úÖ")
    print("   ‚Ä¢ Translation cache with COMPLETE decision logging ‚úÖ")
    print("   ‚Ä¢ Environment capture (deep-translator, textblob, pydub) ‚úÖ")
    print("   ‚Ä¢ Run manifest with complete processing metadata ‚úÖ")
    print("   ‚Ä¢ Data dictionary for all columns ‚úÖ")
    print("   ‚Ä¢ Schema validation (MultimodalDataset_Full.csv ONLY) ‚úÖ")
    print("   ‚Ä¢ Landmark validation with DETAILED logging (478+33) ‚úÖ")
    print("   ‚Ä¢ Bark energy: axis=0 (consistent with I3D v8) ‚úÖ")
    print("   ‚Ä¢ Number normalization: Digits ‚Üí English words ‚úÖ")
    print("   ‚Ä¢ Translation: English ‚Üí Indonesian (cached) ‚úÖ")
    print("   ‚Ä¢ NLTK: Fail-fast (no silent download) ‚úÖ")
    print("   ‚Ä¢ Config: Deep copy (not shallow) ‚úÖ")
    print("   ‚Ä¢ üî• FULLY CONSISTENT WITH I3D v8 + ALL FIXES!")
    print("="*70)
    
    # Initialize configuration
    base_dir = os.getcwd()
    config_path = os.path.join(base_dir, "config_RLT.yaml")
    
    config = RLTConfig(config_path if os.path.exists(config_path) else None)
    
    # Save default config if not exists
    if not os.path.exists(config_path):
        config.save_to_file(config_path)
        print(f"\n‚úì Default configuration saved to: {config_path}")
        print("  You can edit this file to customize settings.")
    
    # Initialize path manager
    path_manager = RLTPathManager(base_dir)
    path_manager.create_directories()
    
    # Check ffmpeg
    try:
        subprocess.run(["ffmpeg", "-version"], stdout=subprocess.PIPE, 
                      stderr=subprocess.PIPE, check=True)
        print("‚úì ffmpeg detected")
    except (subprocess.CalledProcessError, FileNotFoundError):
        print("‚ùå ffmpeg not found!")
        print("üí° Please install ffmpeg: https://ffmpeg.org/download.html")
        return
    
    # Confirm processing
    print("\n" + "="*70)
    print("‚ö†Ô∏è  IMPORTANT:")
    print("   ‚Ä¢ Internet connection required for NEW translations (EN ‚Üí ID)")
    print("   ‚Ä¢ Cached translations will be used when available (deterministic)")
    print("   ‚Ä¢ Estimated time: ~20-25 minutes per 100 files")
    print("   ‚Ä¢ Processing: TRANSCRIPTION + NORMALIZATION + TRANSLATION + AUDIO + LANDMARKS")
    print("   ‚Ä¢ Transcriptions loaded from txt files")
    print("   ‚Ä¢ Number normalization: Digits ‚Üí English words (0-trillion)")
    print("   ‚Ä¢ Translation: English ‚Üí Indonesian (Google Translate, cached)")
    print("   ‚Ä¢ Audio features: 94 (not 108) with explicit librosa params")
    print("   ‚Ä¢ Landmark validation: 478 face + 33 pose (detailed logging)")
    print("   ‚Ä¢ NLTK resources must be pre-installed (fail-fast)")
    print("="*70)
    
    print("\nüìÅ Expected directory structure:")
    print("   dataset/raw/RLT/clips/lie/              <- Place LIE videos here")
    print("   dataset/raw/RLT/clips/truth/            <- Place TRUTH videos here")
    print("   dataset/raw/RLT/Transcription/lie/      <- Place LIE txt files here")
    print("   dataset/raw/RLT/Transcription/truth/    <- Place TRUTH txt files here")
    
    print("\nüì¶ NLTK Requirements:")
    print("   If not installed, run:")
    print("   python -m nltk.downloader punkt stopwords")
    
    response = input("\nüöÄ Start processing? (y/n): ")
    # Trim whitespace so answers such as "y " or " Y" are still accepted.
    if response.strip().lower() != 'y':
        print("‚ùå Processing cancelled.")
        return
    
    # Initialize processor
    processor = RLTProcessor(config, path_manager)
    
    # Process dataset
    try:
        result = processor.process()
        
        if result['status'] == 'success':
            print("\n" + "="*70)
            print("‚úÖ RLT DATASET PROCESSING COMPLETE!")
            print("="*70)
            print(f"üìä Total files: {result['stats']['total_files']}")
            print(f"‚úì  Transcription loaded: {result['stats']['successful_text_loads']}/{result['stats']['total_files']}")
            print(f"‚úì  Numbers normalized: {result['stats']['numbers_normalized']}/{result['stats']['texts_with_numbers']}")
            print(f"‚úì  Translation (EN‚ÜíID): {result['stats']['successful_translations']}/{result['stats']['successful_text_loads']}")
            print(f"   ‚îî‚îÄ Cached: {result['stats']['cached_translations']}, New: {result['stats']['new_translations']}")
            print(f"‚úì  Audio extraction (94 features): {result['stats']['successful_audio_extractions']}/{result['stats']['total_files']}")
            print(f"‚úì  Landmark extraction: {result['stats']['successful_landmark_extractions']}/{result['stats']['total_files']}")
            print(f"üëÅÔ∏è  Face detection: {result['stats']['face_detected']:,} frames")
            print(f"üëÅÔ∏è  Iris detection: {result['stats']['iris_detected']:,} frames")
            print(f"üßç Pose detection: {result['stats']['pose_detected']:,} frames")
            print(f"üî¢ Texts with numbers: {result['stats']['texts_with_numbers']}")
            print(f"‚ö†Ô∏è  Failed samples: {result['failed_samples']}")
            
            # Landmark warnings
            landmark_summary = result['landmark_summary']
            if landmark_summary.get('total_mismatches', 0) > 0:
                print("\n‚ö†Ô∏è  Landmark Validation Warnings:")
                print(f"   ‚îî‚îÄ Total mismatches: {landmark_summary['total_mismatches']}")
                print(f"   ‚îî‚îÄ Face mismatches: {landmark_summary['warnings'].get('face_count_mismatch', 0)}")
                print(f"   ‚îî‚îÄ Pose mismatches: {landmark_summary['warnings'].get('pose_count_mismatch', 0)}")
                print(f"   ‚îî‚îÄ MediaPipe version: {landmark_summary.get('mediapipe_version', 'unknown')}")
                print(f"   ‚îî‚îÄ Detailed log: landmark_mismatch_details.csv")
            
            print("\nüåê Translation Cache:")
            print(f"   ‚îî‚îÄ Total cached: {result['cache_stats']['total_cached']}")
            print(f"   ‚îî‚îÄ Cache size: {result['cache_stats']['cache_size_mb']:.2f} MB")
            print(f"   ‚îî‚îÄ Decision log: translation_decisions.csv")
            
            # Join file names with pathlib so the printed paths use the
            # platform's separator instead of a hard-coded "/" (which yields
            # mixed separators on Windows).
            metadata_dir = Path(path_manager.paths['metadata'])
            validation_dir = Path(path_manager.paths['validation'])
            
            print("="*70)
            print(f"\nüì¶ Output directory: {path_manager.paths['processed']}")
            print(f"üìã Detailed logs: {path_manager.paths['logs']}")
            print(f"üíæ Translation cache: {path_manager.paths['cache']}")
            print(f"üìö Data dictionary: {metadata_dir / 'data_dictionary.csv'}")
            print(f"üìÑ Run manifest: {metadata_dir / 'run_manifest.json'}")
            print(f"‚ö†Ô∏è  Landmark mismatches: {validation_dir / 'landmark_mismatch_details.csv'}")
            
            print("\nüéâ SCIENTIFIC DATA-READY (v8.1 FINAL)!")
            print("   ‚úÖ 94 audio features (corrected)")
            print("   ‚úÖ Explicit librosa parameters")
            print("   ‚úÖ Translation cache with complete decision logging")
            print("   ‚úÖ Environment captured (all dependencies)")
            print("   ‚úÖ Run manifest generated")
            print("   ‚úÖ Data dictionary created")
            print("   ‚úÖ Schema validated (MultimodalDataset_Full.csv)")
            print("   ‚úÖ Landmark counts validated with detailed logging")
            print("   ‚úÖ Bark energy: axis=0 (I3D v8 consistent)")
            print("   ‚úÖ NLTK: Fail-fast approach")
            print("   ‚úÖ Config: Deep copy approach")
            print("   ‚úÖ Fully reproducible (with cache)")
            
            print("\nüìä Output Datasets:")
            print("   1. TextDataset_English_Original.csv")
            print("   2. TextDataset_English_Normalized.csv ‚ú®")
            print("   3. TextDataset_Indonesian.csv ‚ú®")
            print("   4. NumberFeatures.csv")
            print("   5. AudioDataset_Features.csv (94 features) ‚úÖ")
            print("   6. PauseFeatures.csv")
            print("   7. LandmarkDataset.csv (478+33)")
            print("   8. MultimodalDataset_Full.csv")
            print("   9. PublicationDataset.csv")
            print("   10. data_dictionary.csv ‚ú®")
            print("   11. run_manifest.json ‚ú®")
            print("   12. landmark_mismatch_details.csv ‚ú®")
            print("   13. translation_decisions.csv ‚ú®")
            
            print("\nüîß Fixes Applied in v8.1:")
            print("   1. ‚úÖ NLTK: Fail Fast (RuntimeError with instructions)")
            print("   2. ‚úÖ Config: Deep Copy (prevents nested dict mutation)")
            print("   3. ‚úÖ Landmark: Detailed Logging (file/frame + MediaPipe version)")
            print("   4. ‚úÖ Schema: Fixed (only MultimodalDataset_Full.csv)")
            print("   5. ‚úÖ Environment: Complete (deep-translator, textblob, pydub)")
            print("   6. ‚úÖ Translation: Complete Logging (cached=True & False)")
            
            print("="*70 + "\n")
            print("üöÄ READY FOR SCIENTIFIC DATA SUBMISSION!")
            print("="*70 + "\n")
        else:
            print("\n‚ùå Processing failed!")
            
    except RuntimeError as e:
        # ‚úÖ FIX 1: Catch NLTK fail-fast error
        if "NLTK" in str(e):
            print("\n" + "="*70)
            print("‚ùå NLTK RESOURCES NOT FOUND!")
            print("="*70)
            print(str(e))
            print("\nTo fix this issue:")
            print("1. Run: python -m nltk.downloader punkt stopwords")
            print("2. Or in Python:")
            print("   import nltk")
            print("   nltk.download('punkt')")
            print("   nltk.download('stopwords')")
            print("="*70)
        else:
            raise
    except Exception as e:
        print(f"\n‚ùå Error: {str(e)}")
        import traceback
        traceback.print_exc()


# Start the interactive pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


🎯 RLT DATASET PROCESSOR v8.1.0 FINAL
✨ Scientific Data-ready Features (ALL FIXES APPLIED):
   • 94 audio features (corrected from 108) ✅
   • Explicit librosa parameters for reproducibility ✅
   • Translation cache with COMPLETE decision logging ✅
   • Environment capture (deep-translator, textblob, pydub) ✅
   • Run manifest with complete processing metadata ✅
   • Data dictionary for all columns ✅
   • Schema validation (MultimodalDataset_Full.csv ONLY) ✅
   • Landmark validation with DETAILED logging (478+33) ✅
   • Bark energy: axis=0 (consistent with I3D v8) ✅
   • Number normalization: Digits → English words ✅
   • Translation: English → Indonesian (cached) ✅
   • NLTK: Fail-fast (no silent download) ✅
   • Config: Deep copy (not shallow) ✅
   • 🔥 FULLY CONSISTENT WITH I3D v8 + ALL FIXES!
✓ Directory structure created for RLT
✓ ffmpeg detected

‚ö†Ô∏è  IMPORTANT:
   ‚Ä¢ Internet connection required for NEW tran

2026-01-30 14:05:41,850 - [RLT v8.1] - INFO - üéØ RLT DATASET PROCESSOR v8.1.0
2026-01-30 14:05:41,850 - [RLT v8.1] - INFO - ‚ú® Scientific Data-ready Features (v8.1 FINAL):
2026-01-30 14:05:41,852 - [RLT v8.1] - INFO -    ‚Ä¢ 94 audio features (corrected from 108)
2026-01-30 14:05:41,853 - [RLT v8.1] - INFO -    ‚Ä¢ Explicit librosa parameters
2026-01-30 14:05:41,853 - [RLT v8.1] - INFO -    ‚Ä¢ Translation cache with COMPLETE decision logging
2026-01-30 14:05:41,854 - [RLT v8.1] - INFO -    ‚Ä¢ Environment capture (deep-translator, textblob, pydub)
2026-01-30 14:05:41,854 - [RLT v8.1] - INFO -    ‚Ä¢ Run manifest + data dictionary
2026-01-30 14:05:41,855 - [RLT v8.1] - INFO -    ‚Ä¢ Schema validation (MultimodalDataset_Full.csv only)
2026-01-30 14:05:41,855 - [RLT v8.1] - INFO -    ‚Ä¢ Landmark validation with DETAILED logging
2026-01-30 14:05:41,856 - [RLT v8.1] - INFO -    ‚Ä¢ NLTK: Fail fast (no silent download)
2026-01-30 14:05:41,857 - [RLT v8.1] - INFO -    ‚Ä¢ Config: Deep co


✅ RLT DATASET PROCESSING COMPLETE!
📊 Total files: 121
✓  Transcription loaded: 121/121
✓  Numbers normalized: 34/43
✓  Translation (EN→ID): 121/121
   └─ Cached: 4, New: 117
✓  Audio extraction (94 features): 121/121
✓  Landmark extraction: 121/121
👁️  Face detection: 86,248 frames
👁️  Iris detection: 86,248 frames
🧍 Pose detection: 89,583 frames
🔢 Texts with numbers: 43
⚠️  Failed samples: 0

🌐 Translation Cache:
   └─ Total cached: 121
   └─ Cache size: 0.08 MB
   └─ Decision log: translation_decisions.csv

📦 Output directory: f:\MULAI LAGI\eKSTRAKSI - FULL DATA\dataset\processed\RLT
📋 Detailed logs: f:\MULAI LAGI\eKSTRAKSI - FULL DATA\dataset\_logs\RLT
💾 Translation cache: f:\MULAI LAGI\eKSTRAKSI - FULL DATA\dataset\cache\RLT
📚 Data dictionary: f:\MULAI LAGI\eKSTRAKSI - FULL DATA\dataset\metadata\RLT/data_dictionary.csv
📄 Run manifest: f:\MULAI LAGI\eKSTRAKSI - FULL DATA\dataset\metadata\RLT/run_manifest.json
‚ö