# In [1]:  (Jupyter notebook cell marker left over from export)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
=============================================================================
DATASET CONSISTENCY CHECKER & FIXER - DUAL PROCESSOR
=============================================================================
Verify and fix consistency for BOTH I3D and RLT datasets automatically
No user interaction required - fully automated
=============================================================================
Version: 2.1
Author: Yeni Dwi Rahayu
Date: 2025-01-14
=============================================================================
"""

import pandas as pd
import numpy as np
import os
import json
from pathlib import Path
from datetime import datetime
from typing import Dict, List, Tuple, Optional
import warnings
warnings.filterwarnings('ignore')

# ==================== CONFIGURATION ====================
class ConsistencyConfig:
    """Filesystem layout of one processed dataset, rooted at the working directory.

    Attributes:
        dataset_name: Name of the dataset sub-folder (default ``'I3D'``).
        base_dir: Current working directory at construction time.
        dataset_dir: ``<base_dir>/dataset/processed/<dataset_name>``.
        paths: Mapping from a logical name to the CSV file (or, for
            ``'validation'`` and ``'reextraction'``, the directory) it lives in.
    """

    def __init__(self, dataset_name: str = 'I3D'):
        """Build all paths and create the two output directories on disk."""
        self.dataset_name = dataset_name
        self.base_dir = Path(os.getcwd())
        self.dataset_dir = self.base_dir / "dataset" / "processed" / dataset_name

        # Per-modality sub-folders shared by several entries below.
        text_dir = self.dataset_dir / "text"
        audio_dir = self.dataset_dir / "audio"
        multimodal_dir = self.dataset_dir / "multimodal"

        # Key order matters: loaders iterate this mapping in insertion order.
        self.paths = {
            'text_indonesian': text_dir / "TextDataset_Indonesian.csv",
            'text_english': text_dir / "TextDataset_English.csv",
            'number_features': text_dir / "NumberFeatures.csv",
            'audio_features': audio_dir / "AudioDataset_Features.csv",
            'pause_features': audio_dir / "PauseFeatures.csv",
            'landmarks': self.dataset_dir / "visual" / "LandmarkDataset.csv",
            'multimodal_full': multimodal_dir / "MultimodalDataset_Full.csv",
            'publication': multimodal_dir / "PublicationDataset.csv",
            'validation': self.dataset_dir.parent.parent / "validation" / dataset_name,
            'reextraction': self.dataset_dir / "reextraction",
        }

        # The two directory entries are created eagerly (parents included).
        for directory_key in ('validation', 'reextraction'):
            self.paths[directory_key].mkdir(parents=True, exist_ok=True)


# ==================== DATASET LOADER ====================
class DatasetLoader:
    """Read every CSV listed in the configuration, recording per-file outcome.

    Attributes:
        datasets: Successfully loaded frames, keyed by logical dataset name.
        load_status: ``'success'``, ``'not_found'`` or ``'error: <message>'``
            for every dataset that was attempted.
    """

    def __init__(self, config: ConsistencyConfig):
        self.config = config
        self.load_status = {}
        self.datasets = {}

    def load_all(self) -> Dict[str, pd.DataFrame]:
        """Load all CSV datasets from ``config.paths`` and return the loaded ones.

        The ``'validation'`` and ``'reextraction'`` entries are skipped. A
        missing file or a read failure is reported on stdout and stored in
        ``load_status``; it never raises.
        """
        print(f"\nüìÇ Loading {self.config.dataset_name} datasets...")

        directory_keys = ['validation', 'reextraction']

        for key, csv_path in self.config.paths.items():
            if key in directory_keys:
                continue

            try:
                if not csv_path.exists():
                    self.load_status[key] = 'not_found'
                    print(f"  ‚ö†Ô∏è {key}: File not found")
                    continue

                frame = pd.read_csv(csv_path, encoding='utf-8')
                self.datasets[key] = frame
                self.load_status[key] = 'success'
                print(f"  ‚úì {key}: {len(frame):,} rows")
            except Exception as exc:
                # Best effort: one unreadable file must not stop the others.
                self.load_status[key] = f'error: {str(exc)}'
                print(f"  ‚ùå {key}: Error loading - {str(exc)}")

        return self.datasets

    def get_dataset(self, name: str) -> Optional[pd.DataFrame]:
        """Return the loaded frame called *name*, or ``None`` if it was not loaded."""
        return self.datasets.get(name)


# ==================== CONSISTENCY CHECKER ====================
class ConsistencyChecker:
    """Check consistency across datasets"""
    
    def __init__(self, datasets: Dict[str, pd.DataFrame], config: ConsistencyConfig):
        self.datasets = datasets
        self.config = config
        self.issues = []
        self.stats = {}
    
    def check_all(self) -> Dict:
        """Run all consistency checks"""
        print(f"\n{'='*70}")
        print(f"üîç CHECKING {self.config.dataset_name} CONSISTENCY")
        print(f"{'='*70}")
        
        results = {
            'unique_videos': self.check_unique_videos(),
            'samples_per_video': self.check_samples_per_video(),
            'class_distribution': self.check_class_distribution(),
            'missing_data': self.check_missing_data(),
            'column_consistency': self.check_column_consistency(),
            'data_types': self.check_data_types(),
            'value_ranges': self.check_value_ranges(),
            'landmark_quality': self.check_landmark_quality()
        }
        
        return results
    
    def check_unique_videos(self) -> Dict:
        """Check 1: Unique videos across datasets"""
        print(f"\nüìä CHECK 1: UNIQUE VIDEOS")
        print("-" * 70)
        
        result = {'status': 'pass', 'details': {}}
        
        # Get unique identifiers from each dataset
        video_sets = {}
        
        if 'landmarks' in self.datasets:
            video_sets['landmarks'] = set(self.datasets['landmarks']['Video_Name'].unique())
        
        for name in ['text_indonesian', 'text_english', 'audio_features', 'multimodal_full']:
            if name in self.datasets:
                # Remove file extensions
                filenames = self.datasets[name]['filename'].unique()
                video_sets[name] = set([f.rsplit('.', 1)[0] if '.' in f else f for f in filenames])
        
        # Print counts
        for name, videos in video_sets.items():
            print(f"  ‚Ä¢ {name}: {len(videos)} unique videos")
            result['details'][name] = len(videos)
        
        # Check consistency
        if len(video_sets) > 1:
            reference_set = list(video_sets.values())[0]
            all_match = all(vset == reference_set for vset in video_sets.values())
            
            if all_match:
                print(f"  ‚úÖ All datasets have consistent video names")
                result['status'] = 'pass'
            else:
                print(f"  ‚ùå Video name mismatch detected")
                result['status'] = 'fail'
                
                # Find differences
                for name1, set1 in video_sets.items():
                    for name2, set2 in video_sets.items():
                        if name1 < name2:  # Avoid duplicate comparisons
                            missing_in_2 = set1 - set2
                            missing_in_1 = set2 - set1
                            
                            if missing_in_2:
                                print(f"  ‚ö†Ô∏è In {name1} but NOT in {name2}: {len(missing_in_2)}")
                                self.issues.append({
                                    'type': 'missing_videos',
                                    'from': name1,
                                    'to': name2,
                                    'count': len(missing_in_2),
                                    'videos': list(missing_in_2)[:10]
                                })
                            
                            if missing_in_1:
                                print(f"  ‚ö†Ô∏è In {name2} but NOT in {name1}: {len(missing_in_1)}")
                                self.issues.append({
                                    'type': 'missing_videos',
                                    'from': name2,
                                    'to': name1,
                                    'count': len(missing_in_1),
                                    'videos': list(missing_in_1)[:10]
                                })
        
        return result
    
    def check_samples_per_video(self) -> Dict:
        """Check 2: Samples/frames per video"""
        print(f"\nüìä CHECK 2: SAMPLES PER VIDEO")
        print("-" * 70)
        
        result = {'status': 'pass', 'details': {}}
        
        # Text/Audio datasets (should have 1 sample per video)
        for name in ['text_indonesian', 'text_english', 'audio_features']:
            if name in self.datasets:
                counts = self.datasets[name]['filename'].value_counts()
                
                print(f"\n{name}:")
                print(f"  ‚îú‚îÄ Min: {counts.min()}")
                print(f"  ‚îú‚îÄ Max: {counts.max()}")
                print(f"  ‚îú‚îÄ Mean: {counts.mean():.2f}")
                print(f"  ‚îî‚îÄ Median: {counts.median():.0f}")
                
                result['details'][name] = {
                    'min': int(counts.min()),
                    'max': int(counts.max()),
                    'mean': float(counts.mean()),
                    'median': float(counts.median())
                }
                
                # Check for duplicates
                if counts.max() > 1:
                    duplicates = counts[counts > 1]
                    print(f"  ‚ö†Ô∏è {len(duplicates)} videos have multiple entries!")
                    result['status'] = 'warning'
                    self.issues.append({
                        'type': 'duplicate_samples',
                        'dataset': name,
                        'count': len(duplicates),
                        'examples': list(duplicates.index[:5])
                    })
        
        # Landmarks (multiple frames per video is normal)
        if 'landmarks' in self.datasets:
            counts = self.datasets['landmarks']['Video_Name'].value_counts()
            
            print(f"\nlandmarks:")
            print(f"  ‚îú‚îÄ Min frames: {counts.min()}")
            print(f"  ‚îú‚îÄ Max frames: {counts.max()}")
            print(f"  ‚îú‚îÄ Mean frames: {counts.mean():.2f}")
            print(f"  ‚îî‚îÄ Median frames: {counts.median():.0f}")
            
            result['details']['landmarks'] = {
                'min': int(counts.min()),
                'max': int(counts.max()),
                'mean': float(counts.mean()),
                'median': float(counts.median())
            }
            
            # Check for videos with very few frames
            low_frame_videos = counts[counts < 10]
            if len(low_frame_videos) > 0:
                print(f"  ‚ö†Ô∏è {len(low_frame_videos)} videos have < 10 frames")
                result['status'] = 'warning'
                self.issues.append({
                    'type': 'low_frame_count',
                    'dataset': 'landmarks',
                    'count': len(low_frame_videos),
                    'examples': list(low_frame_videos.index[:5])
                })
        
        return result
    
    def check_class_distribution(self) -> Dict:
        """Check 3: Class distribution (TRUTH vs LIE)"""
        print(f"\nüìä CHECK 3: CLASS DISTRIBUTION")
        print("-" * 70)
        
        result = {'status': 'pass', 'details': {}}
        
        for name in ['text_indonesian', 'audio_features', 'landmarks', 'multimodal_full']:
            if name in self.datasets:
                df = self.datasets[name]
                
                # Determine label column
                label_col = 'Class' if name == 'landmarks' else 'label'
                
                if label_col in df.columns:
                    class_dist = df[label_col].value_counts()
                    total = len(df)
                    
                    truth_count = class_dist.get(0, 0)
                    lie_count = class_dist.get(1, 0)
                    truth_pct = (truth_count / total * 100) if total > 0 else 0
                    lie_pct = (lie_count / total * 100) if total > 0 else 0
                    
                    print(f"\n{name}:")
                    print(f"  ‚îú‚îÄ TRUTH (0): {truth_count:,} ({truth_pct:.1f}%)")
                    print(f"  ‚îî‚îÄ LIE (1): {lie_count:,} ({lie_pct:.1f}%)")
                    
                    result['details'][name] = {
                        'truth': int(truth_count),
                        'lie': int(lie_count),
                        'truth_pct': float(truth_pct),
                        'lie_pct': float(lie_pct)
                    }
                    
                    # Check balance (40-60% is acceptable)
                    if not (40 <= lie_pct <= 60):
                        print(f"  ‚ö†Ô∏è Imbalanced dataset!")
                        result['status'] = 'warning'
                        self.issues.append({
                            'type': 'class_imbalance',
                            'dataset': name,
                            'lie_percentage': float(lie_pct)
                        })
        
        return result
    
    def check_missing_data(self) -> Dict:
        """Check 4: Missing values"""
        print(f"\nüìä CHECK 4: MISSING DATA")
        print("-" * 70)
        
        result = {'status': 'pass', 'details': {}}
        
        for name, df in self.datasets.items():
            missing_counts = df.isnull().sum()
            missing_cols = missing_counts[missing_counts > 0]
            
            if len(missing_cols) > 0:
                total_missing = missing_cols.sum()
                print(f"\n{name}:")
                print(f"  ‚îú‚îÄ Columns with missing: {len(missing_cols)}")
                print(f"  ‚îî‚îÄ Total missing values: {total_missing:,}")
                
                # Show top 5 columns with most missing
                top_missing = missing_cols.nlargest(5)
                for col, count in top_missing.items():
                    pct = (count / len(df) * 100)
                    print(f"     ‚Ä¢ {col}: {count:,} ({pct:.1f}%)")
                
                result['details'][name] = {
                    'columns_with_missing': len(missing_cols),
                    'total_missing': int(total_missing),
                    'top_missing': {col: int(count) for col, count in top_missing.items()}
                }
                
                # Critical if > 10% missing in important columns
                critical_cols = ['filename', 'Video_Name', 'label', 'Class', 'text_indonesian_normalized', 'text_english']
                critical_missing = missing_cols[missing_cols.index.isin(critical_cols)]
                
                if len(critical_missing) > 0:
                    print(f"  ‚ùå Critical columns have missing data!")
                    result['status'] = 'fail'
                    self.issues.append({
                        'type': 'critical_missing',
                        'dataset': name,
                        'columns': list(critical_missing.index)
                    })
            else:
                print(f"\n{name}: ‚úÖ No missing values")
                result['details'][name] = {'status': 'clean'}
        
        return result
    
    def check_column_consistency(self) -> Dict:
        """Check 5: Column name consistency"""
        print(f"\nüìä CHECK 5: COLUMN CONSISTENCY")
        print("-" * 70)
        
        result = {'status': 'pass', 'details': {}}
        
        # Expected columns for each dataset
        expected_cols = {
            'text_indonesian': ['filename', 'text_indonesian_original', 'text_indonesian_normalized', 
                               'label', 'dataset', 'char_count_id', 'word_count_id'],
            'text_english': ['filename', 'text_english', 'label', 'dataset', 
                            'char_count_en', 'word_count_en'],
            'audio_features': ['filename', 'label', 'dataset', 'mfcc1_mean', 'mfcc1_std'],
            'pause_features': ['filename', 'label', 'dataset', 'pause_num_pauses', 
                              'pause_hesitation_score'],
            'landmarks': ['Video_Name', 'Frame', 'Landmark_0_X', 'Landmark_0_Y', 
                         'Landmark_0_Z', 'Class']
        }
        
        for name, expected in expected_cols.items():
            if name in self.datasets:
                df = self.datasets[name]
                actual_cols = set(df.columns)
                expected_set = set(expected)
                
                missing = expected_set - actual_cols
                extra = actual_cols - expected_set
                
                print(f"\n{name}:")
                print(f"  ‚îú‚îÄ Total columns: {len(actual_cols)}")
                
                if missing:
                    print(f"  ‚îú‚îÄ Missing expected: {len(missing)}")
                    for col in list(missing)[:5]:
                        print(f"     ‚Ä¢ {col}")
                    result['status'] = 'warning'
                    self.issues.append({
                        'type': 'missing_columns',
                        'dataset': name,
                        'columns': list(missing)
                    })
                
                if len(missing) == 0:
                    print(f"  ‚îî‚îÄ ‚úÖ All expected columns present")
                
                result['details'][name] = {
                    'total': len(actual_cols),
                    'missing': list(missing) if missing else [],
                    'extra_count': len(extra)
                }
        
        return result
    
    def check_data_types(self) -> Dict:
        """Check 6: Data types"""
        print(f"\nüìä CHECK 6: DATA TYPES")
        print("-" * 70)
        
        result = {'status': 'pass', 'details': {}}
        
        for name, df in self.datasets.items():
            # Check for unexpected data types
            numeric_cols = df.select_dtypes(include=[np.number]).columns
            object_cols = df.select_dtypes(include=['object']).columns
            
            print(f"\n{name}:")
            print(f"  ‚îú‚îÄ Numeric columns: {len(numeric_cols)}")
            print(f"  ‚îî‚îÄ Text columns: {len(object_cols)}")
            
            result['details'][name] = {
                'numeric': len(numeric_cols),
                'text': len(object_cols)
            }
            
            # Check if label/Class is numeric
            label_col = 'Class' if name == 'landmarks' else 'label'
            if label_col in df.columns:
                if df[label_col].dtype not in [np.int64, np.int32, np.float64]:
                    print(f"  ‚ö†Ô∏è {label_col} is not numeric!")
                    result['status'] = 'warning'
                    self.issues.append({
                        'type': 'wrong_dtype',
                        'dataset': name,
                        'column': label_col,
                        'actual': str(df[label_col].dtype)
                    })
        
        return result
    
    def check_value_ranges(self) -> Dict:
        """Check 7: Value ranges (sanity check)"""
        print(f"\nüìä CHECK 7: VALUE RANGES")
        print("-" * 70)
        
        result = {'status': 'pass', 'details': {}}
        
        # Check landmarks (should be between 0 and 1)
        if 'landmarks' in self.datasets:
            df = self.datasets['landmarks']
            landmark_cols = [col for col in df.columns if col.startswith('Landmark_') or col.startswith('Pose_')]
            
            if landmark_cols:
                landmark_data = df[landmark_cols]
                
                # Exclude zeros (missing landmarks)
                landmark_data_nonzero = landmark_data.replace(0, np.nan)
                
                min_val = landmark_data_nonzero.min().min()
                max_val = landmark_data_nonzero.max().max()
                
                print(f"\nlandmarks:")
                print(f"  ‚îú‚îÄ Min value: {min_val:.4f}")
                print(f"  ‚îî‚îÄ Max value: {max_val:.4f}")
                
                result['details']['landmarks'] = {
                    'min': float(min_val) if not np.isnan(min_val) else None,
                    'max': float(max_val) if not np.isnan(max_val) else None
                }
                
                # Landmarks should be normalized (0-1 range)
                if min_val < -0.5 or max_val > 1.5:
                    print(f"  ‚ö†Ô∏è Values outside expected range [0, 1]!")
                    result['status'] = 'warning'
                    self.issues.append({
                        'type': 'value_range',
                        'dataset': 'landmarks',
                        'min': float(min_val),
                        'max': float(max_val)
                    })
        
        # Check audio features (MFCC typically -50 to 50)
        if 'audio_features' in self.datasets:
            df = self.datasets['audio_features']
            mfcc_cols = [col for col in df.columns if col.startswith('mfcc')]
            
            if mfcc_cols:
                mfcc_data = df[mfcc_cols]
                
                min_val = mfcc_data.min().min()
                max_val = mfcc_data.max().max()
                
                print(f"\naudio_features (MFCC):")
                print(f"  ‚îú‚îÄ Min value: {min_val:.2f}")
                print(f"  ‚îî‚îÄ Max value: {max_val:.2f}")
                
                result['details']['audio_mfcc'] = {
                    'min': float(min_val),
                    'max': float(max_val)
                }
                
                # MFCC typically in range [-100, 100]
                if min_val < -200 or max_val > 200:
                    print(f"  ‚ö†Ô∏è Unusual MFCC values detected!")
                    result['status'] = 'warning'
        
        return result
    
    def check_landmark_quality(self) -> Dict:
        """Check 8: Landmark detection quality"""
        print(f"\nüìä CHECK 8: LANDMARK QUALITY")
        print("-" * 70)
        
        result = {'status': 'pass', 'details': {}}
        
        if 'landmarks' not in self.datasets:
            print("  ‚ö†Ô∏è Landmark dataset not found")
            return result
        
        df = self.datasets['landmarks']
        
        # Face landmarks (0-467)
        face_cols = [col for col in df.columns if col.startswith('Landmark_') and 
                     int(col.split('_')[1]) < 468]
        
        # Iris landmarks (468-477)
        iris_cols = [col for col in df.columns if col.startswith('Landmark_') and 
                     468 <= int(col.split('_')[1]) < 478]
        
        # Pose landmarks
        pose_cols = [col for col in df.columns if col.startswith('Pose_')]
        
        # Calculate detection rates
        total_frames = len(df)
        
        face_detected = (df[face_cols].sum(axis=1) != 0).sum()
        iris_detected = (df[iris_cols].sum(axis=1) != 0).sum() if iris_cols else 0
        pose_detected = (df[pose_cols].sum(axis=1) != 0).sum() if pose_cols else 0
        
        face_rate = (face_detected / total_frames * 100) if total_frames > 0 else 0
        iris_rate = (iris_detected / total_frames * 100) if total_frames > 0 else 0
        pose_rate = (pose_detected / total_frames * 100) if total_frames > 0 else 0
        
        print(f"\nDetection rates:")
        print(f"  ‚îú‚îÄ Face: {face_rate:.1f}% ({face_detected:,}/{total_frames:,})")
        print(f"  ‚îú‚îÄ Iris: {iris_rate:.1f}% ({iris_detected:,}/{total_frames:,})")
        print(f"  ‚îî‚îÄ Pose: {pose_rate:.1f}% ({pose_detected:,}/{total_frames:,})")
        
        result['details'] = {
            'face_rate': float(face_rate),
            'iris_rate': float(iris_rate),
            'pose_rate': float(pose_rate),
            'total_frames': int(total_frames)
        }
        
        # Warning if detection rate is too low
        if face_rate < 50:
            print(f"  ‚ö†Ô∏è Low face detection rate!")
            result['status'] = 'warning'
            self.issues.append({
                'type': 'low_detection',
                'feature': 'face',
                'rate': float(face_rate)
            })
        
        if iris_rate < 30:
            print(f"  ‚ö†Ô∏è Low iris detection rate!")
            result['status'] = 'warning'
            self.issues.append({
                'type': 'low_detection',
                'feature': 'iris',
                'rate': float(iris_rate)
            })
        
        return result


# ==================== FIXER ====================
class DatasetFixer:
    """Fix common dataset issues"""
    
    def __init__(self, datasets: Dict[str, pd.DataFrame], issues: List[Dict], config: ConsistencyConfig):
        self.datasets = datasets
        self.issues = issues
        self.config = config
        self.fixes_applied = []
    
    def fix_all(self) -> Dict:
        """Apply all fixes"""
        print(f"\n{'='*70}")
        print(f"üîß APPLYING FIXES TO {self.config.dataset_name}")
        print(f"{'='*70}")
        
        # Fix 1: Remove duplicate samples
        self.fix_duplicates()
        
        # Fix 2: Fill missing critical values
        self.fix_missing_values()
        
        # Fix 3: Standardize video names
        self.fix_video_names()
        
        # Fix 4: Fix data types
        self.fix_data_types()
        
        # Fix 5: Remove videos with insufficient data
        self.fix_insufficient_data()
        
        print(f"\n‚úÖ Total fixes applied: {len(self.fixes_applied)}")
        
        return {
            'fixes_applied': len(self.fixes_applied),
            'details': self.fixes_applied
        }
    
    def fix_duplicates(self):
        """Remove duplicate samples"""
        print("\nüîß Fix 1: Removing duplicates...")
        
        for name in ['text_indonesian', 'text_english', 'audio_features', 'multimodal_full']:
            if name in self.datasets:
                df = self.datasets[name]
                original_len = len(df)
                
                # Keep first occurrence
                df_clean = df.drop_duplicates(subset=['filename'], keep='first')
                
                if len(df_clean) < original_len:
                    removed = original_len - len(df_clean)
                    print(f"  ‚Ä¢ {name}: Removed {removed} duplicates")
                    self.datasets[name] = df_clean
                    self.fixes_applied.append({
                        'type': 'remove_duplicates',
                        'dataset': name,
                        'removed': removed
                    })
                else:
                    print(f"  ‚Ä¢ {name}: No duplicates found")
    
    def fix_missing_values(self):
        """Fill missing critical values"""
        print("\nüîß Fix 2: Filling missing values...")
        
        for name, df in self.datasets.items():
            fixed_count = 0
            
            # Fill missing text with empty string
            text_cols = [col for col in df.columns if 'text' in col.lower()]
            for col in text_cols:
                if df[col].isnull().any():
                    before = df[col].isnull().sum()
                    df[col].fillna('', inplace=True)
                    fixed_count += before
                    self.fixes_applied.append({
                        'type': 'fill_missing',
                        'dataset': name,
                        'column': col,
                        'count': int(before)
                    })
            
            # Fill missing numeric with 0
            numeric_cols = df.select_dtypes(include=[np.number]).columns
            for col in numeric_cols:
                if df[col].isnull().any():
                    before = df[col].isnull().sum()
                    df[col].fillna(0, inplace=True)
                    fixed_count += before
                    self.fixes_applied.append({
                        'type': 'fill_missing',
                        'dataset': name,
                        'column': col,
                        'count': int(before)
                    })
            
            if fixed_count > 0:
                print(f"  ‚Ä¢ {name}: Filled {fixed_count} missing values")
            else:
                print(f"  ‚Ä¢ {name}: No missing values")
    
    def fix_video_names(self):
        """Standardize video names"""
        print("\nüîß Fix 3: Standardizing video names...")
        
        for name in ['text_indonesian', 'text_english', 'audio_features', 'multimodal_full']:
            if name in self.datasets:
                df = self.datasets[name]
                
                # Remove file extensions
                if 'filename' in df.columns:
                    original = df['filename'].copy()
                    df['filename'] = df['filename'].apply(lambda x: x.rsplit('.', 1)[0] if '.' in str(x) else str(x))
                    
                    changed = (original != df['filename']).sum()
                    if changed > 0:
                        print(f"  ‚Ä¢ {name}: Standardized {changed} filenames")
                        self.fixes_applied.append({
                            'type': 'standardize_names',
                            'dataset': name,
                            'count': int(changed)
                        })
                    else:
                        print(f"  ‚Ä¢ {name}: All filenames already standardized")
    
    def fix_data_types(self):
        """Fix data types"""
        print("\nüîß Fix 4: Fixing data types...")
        
        for name, df in self.datasets.items():
            # Ensure label/Class is integer
            label_col = 'Class' if name == 'landmarks' else 'label'
            
            if label_col in df.columns:
                if df[label_col].dtype not in [np.int64, np.int32]:
                    try:
                        df[label_col] = df[label_col].astype(int)
                        print(f"  ‚Ä¢ {name}.{label_col}: Converted to integer")
                        self.fixes_applied.append({
                            'type': 'fix_dtype',
                            'dataset': name,
                            'column': label_col
                        })
                    except:
                        print(f"  ‚ö†Ô∏è {name}.{label_col}: Could not convert to integer")
                else:
                    print(f"  ‚Ä¢ {name}.{label_col}: Already correct type")
    
    def fix_insufficient_data(self):
        """Remove videos with insufficient data"""
        print("\nüîß Fix 5: Removing videos with insufficient data...")
        
        if 'landmarks' in self.datasets:
            df = self.datasets['landmarks']
            
            # Count frames per video
            frame_counts = df['Video_Name'].value_counts()
            
            # Remove videos with < 5 frames
            low_frame_videos = frame_counts[frame_counts < 5].index
            
            if len(low_frame_videos) > 0:
                df_clean = df[~df['Video_Name'].isin(low_frame_videos)]
                removed_frames = len(df) - len(df_clean)
                
                print(f"  ‚Ä¢ landmarks: Removed {len(low_frame_videos)} videos ({removed_frames:,} frames)")
                self.datasets['landmarks'] = df_clean
                self.fixes_applied.append({
                    'type': 'remove_insufficient',
                    'dataset': 'landmarks',
                    'videos_removed': len(low_frame_videos),
                    'frames_removed': removed_frames
                })
            else:
                print(f"  ‚Ä¢ landmarks: All videos have sufficient frames")
        else:
            print(f"  ‚Ä¢ landmarks: Dataset not found")
    
    def save_fixed_datasets(self):
        """Save fixed datasets"""
        print(f"\nüíæ Saving fixed {self.config.dataset_name} datasets...")
        
        saved_count = 0
        for name, df in self.datasets.items():
            if name in self.config.paths:
                output_path = self.config.paths[name]
                
                try:
                    df.to_csv(output_path, index=False, encoding='utf-8')
                    print(f"  ‚úì {name}: Saved ({len(df):,} rows)")
                    saved_count += 1
                except Exception as e:
                    print(f"  ‚ùå {name}: Error saving - {str(e)}")
        
        print(f"\n‚úÖ Saved {saved_count} datasets")


# ==================== REPORT GENERATOR ====================
class ReportGenerator:
    """Write Markdown and JSON reports describing one consistency run.

    The generator only reads the data handed to it:

    * ``config.dataset_name`` is shown in the report header and
      ``config.paths['validation']`` is the directory the reports are
      written to (it is created on demand).
    * ``check_results`` maps a check name to a dict holding a ``'status'``
      string (``'pass'``, ``'warning'``; anything else is rendered as a
      failure) and, optionally, a ``'details'`` entry.
    * ``fix_results`` may hold ``'fixes_applied'`` (a count) and ``'details'``
      (a list of fix records, each with at least a ``'type'``).
    * ``issues`` is a list of dicts, each with at least a ``'type'``.

    Values produced with numpy/pandas (numpy scalars, arrays, tuples, sets)
    are converted to plain Python objects before being serialised, in both
    report formats.
    """

    # Icon shown next to a check status; unknown statuses use _FAIL_ICON.
    _STATUS_ICONS = {'pass': "‚úÖ", 'warning': "‚ö†Ô∏è"}
    _FAIL_ICON = "‚ùå"

    # Optional per-fix counters rendered in the "Fixes Applied" section,
    # as (record key, label) pairs in display order.
    _FIX_DETAIL_LABELS = (
        ('count', 'Count'),
        ('removed', 'Removed'),
        ('videos_removed', 'Videos Removed'),
        ('frames_removed', 'Frames Removed'),
    )

    # The config annotation is quoted so it is not evaluated when the class
    # body is executed.
    def __init__(self, config: "ConsistencyConfig", check_results: Dict,
                 fix_results: Dict, issues: List[Dict]):
        """Store the run configuration and the results to be reported."""
        self.config = config
        self.check_results = check_results
        self.fix_results = fix_results
        self.issues = issues

    def generate_markdown(self) -> Path:
        """Write the Markdown report and return its path.

        The report contains a summary table with one row per check, the
        JSON-formatted details of every check, every detected issue and a
        numbered list of the applied fixes.

        Raises:
            KeyError: if a check result has no ``'status'`` or an issue/fix
                record has no ``'type'``.
            OSError: if the report directory or file cannot be written.
        """
        timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

        # json.dumps rejects numpy scalars (e.g. np.int64, np.bool_), so
        # normalise everything once before rendering.
        check_results = self._convert_numpy(self.check_results)
        fix_results = self._convert_numpy(self.fix_results)
        issues = self._convert_numpy(self.issues)

        # The two trailing spaces before "\n" are Markdown hard line breaks.
        parts = [
            "# üìä Dataset Consistency Report\n\n",
            f"**Dataset:** {self.config.dataset_name}  \n",
            f"**Generated:** {timestamp}  \n",
            "**Version:** 2.1\n\n",
            "---\n\n",
            "## ‚úÖ Summary\n\n",
            "| Check | Status |\n",
            "|-------|--------|\n",
        ]

        for check_name, result in check_results.items():
            status = result['status']
            icon = self._STATUS_ICONS.get(status, self._FAIL_ICON)
            parts.append(
                f"| {self._title(check_name)} | {icon} {status.upper()} |\n"
            )

        parts.append(f"\n**Total Issues Found:** {len(issues)}\n")
        parts.append(f"**Fixes Applied:** {fix_results.get('fixes_applied', 0)}\n")

        # Detailed results
        parts.append("\n---\n\n## üìã Detailed Results\n\n")
        for check_name, result in check_results.items():
            parts.append(f"### {self._title(check_name)}\n\n")
            parts.append(f"**Status:** {result['status'].upper()}\n\n")
            if result.get('details'):
                parts.append(self._json_block(result['details']))

        # Issues
        if issues:
            parts.append("---\n\n## ‚ö†Ô∏è Issues Detected\n\n")
            for number, issue in enumerate(issues, 1):
                parts.append(f"### Issue {number}: {self._title(issue['type'])}\n\n")
                parts.append(self._json_block(issue))

        # Fixes
        if fix_results.get('details'):
            parts.append("---\n\n## üîß Fixes Applied\n\n")
            for number, fix in enumerate(fix_results['details'], 1):
                parts.append(f"{number}. **{self._title(fix['type'])}**\n")
                parts.append(f"   - Dataset: `{fix.get('dataset', 'N/A')}`\n")
                for key, label in self._FIX_DETAIL_LABELS:
                    if key in fix:
                        parts.append(f"   - {label}: {fix[key]}\n")
                parts.append("\n")

        parts.append("---\n\n*Generated by Dataset Consistency Checker v2.1*\n")

        report_path = self._report_path('md')
        with open(report_path, 'w', encoding='utf-8') as f:
            f.write("".join(parts))

        return report_path

    def generate_json(self) -> Path:
        """Write the JSON report and return its path.

        The document holds the dataset name, an ISO timestamp, the check
        results, the fix results and the issue list.

        Raises:
            OSError: if the report directory or file cannot be written.
        """
        report_data = self._convert_numpy({
            'dataset': self.config.dataset_name,
            'timestamp': datetime.now().isoformat(),
            'check_results': self.check_results,
            'fix_results': self.fix_results,
            'issues': self.issues
        })

        report_path = self._report_path('json')
        with open(report_path, 'w', encoding='utf-8') as f:
            json.dump(report_data, f, indent=2)

        return report_path

    @staticmethod
    def _title(name: str) -> str:
        """Turn an identifier such as ``unique_videos`` into ``Unique Videos``."""
        return name.replace('_', ' ').title()

    @staticmethod
    def _json_block(data) -> str:
        """Return *data* (already numpy-free) as a fenced Markdown JSON block."""
        return "```json\n" + json.dumps(data, indent=2) + "\n```\n\n"

    def _report_path(self, suffix: str) -> Path:
        """Return a timestamped report path, creating the directory if needed."""
        report_dir = Path(self.config.paths['validation'])
        report_dir.mkdir(parents=True, exist_ok=True)
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        return report_dir / f'consistency_report_{stamp}.{suffix}'

    def _convert_numpy(self, obj):
        """Recursively convert numpy values to JSON-friendly Python objects.

        numpy booleans, integers and floats become ``bool``/``int``/``float``,
        arrays become lists, tuples and sets become lists (sets sorted by
        their string form for a stable order), and numpy scalars used as
        dictionary keys are unwrapped. Anything else is returned unchanged.
        """
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, dict):
            # json only accepts str/int/float/bool/None keys, so numpy scalar
            # keys must be unwrapped as well.
            return {
                (key.item() if isinstance(key, np.generic) else key):
                    self._convert_numpy(value)
                for key, value in obj.items()
            }
        if isinstance(obj, (list, tuple)):
            return [self._convert_numpy(item) for item in obj]
        if isinstance(obj, (set, frozenset)):
            return [self._convert_numpy(item) for item in sorted(obj, key=str)]
        return obj


# ==================== DUAL PROCESSOR ====================
class DualDatasetProcessor:
    """Run the load / check / fix / save / report pipeline for I3D and RLT.

    ``datasets`` lists the dataset names to process, in order. ``results``
    maps each processed name to an outcome dictionary whose ``'status'`` is
    ``'success'``, ``'no_data'`` or ``'error'``.
    """

    def __init__(self):
        self.results = {}
        self.datasets = ['I3D', 'RLT']

    def process_all(self):
        """Process every configured dataset, then print the final summary.

        An exception raised while handling one dataset is reported (message
        on stdout, traceback via ``traceback.print_exc``), recorded as an
        ``'error'`` outcome, and does not stop the remaining datasets.
        """
        rule = "=" * 70
        banner = (
            rule,
            "üîç DUAL DATASET CONSISTENCY CHECKER & FIXER",
            rule,
            "Processing both I3D and RLT datasets automatically",
            rule,
        )
        for line in banner:
            print(line)

        hash_rule = '#' * 70
        for current in self.datasets:
            print(f"\n\n{hash_rule}")
            print(f"# PROCESSING: {current}")
            print(hash_rule)

            try:
                outcome = self.process_single_dataset(current)
            except Exception as exc:
                print(f"\n‚ùå Error processing {current}: {exc}")
                import traceback
                traceback.print_exc()
                outcome = {'status': 'error', 'message': str(exc)}

            self.results[current] = outcome

        # Final summary
        self.print_final_summary()

    def process_single_dataset(self, dataset_name: str) -> Dict:
        """Run the whole pipeline for one dataset and describe the outcome.

        Returns ``{'status': 'no_data'}`` when the loader yields nothing.
        Otherwise the datasets are checked, fixed, saved and reported on, and
        the returned dictionary carries ``'status': 'success'`` together with
        the issue count, the number of applied fixes and both report paths
        (as strings).
        """
        config = ConsistencyConfig(dataset_name)

        # Load datasets
        datasets = DatasetLoader(config).load_all()
        if not datasets:
            print(f"\n‚ö†Ô∏è No datasets found for {dataset_name}")
            return {'status': 'no_data'}

        # Check consistency
        checker = ConsistencyChecker(datasets, config)
        check_results = checker.check_all()

        # Apply fixes, then persist the fixed datasets
        fixer = DatasetFixer(datasets, checker.issues, config)
        fix_results = fixer.fix_all()
        fixer.save_fixed_datasets()

        # Generate reports
        print(f"\nüìù Generating {dataset_name} reports...")
        reporter = ReportGenerator(config, check_results, fix_results, checker.issues)
        md_path = reporter.generate_markdown()
        json_path = reporter.generate_json()

        print(f"  ‚úì Markdown: {md_path.name}")
        print(f"  ‚úì JSON: {json_path.name}")

        summary = {'status': 'success'}
        summary['issues_found'] = len(checker.issues)
        summary['fixes_applied'] = fix_results.get('fixes_applied', 0)
        summary['md_report'] = str(md_path)
        summary['json_report'] = str(json_path)
        return summary

    def print_final_summary(self):
        """Print one status section per processed dataset, framed by rules.

        Outcomes with a status other than ``'success'``, ``'no_data'`` or
        ``'error'`` only get their name line.
        """
        rule = "=" * 70

        print(f"\n\n{rule}")
        print("üìä FINAL SUMMARY - BOTH DATASETS")
        print(rule)

        for name, outcome in self.results.items():
            print(f"\n{name}:")
            status = outcome.get('status')

            if status == 'success':
                md_name = Path(outcome.get('md_report', '')).name
                print("  ‚úÖ Status: SUCCESS")
                print(f"  ‚îú‚îÄ Issues found: {outcome.get('issues_found', 0)}")
                print(f"  ‚îú‚îÄ Fixes applied: {outcome.get('fixes_applied', 0)}")
                print(f"  ‚îú‚îÄ MD report: {md_name}")
                json_name = Path(outcome.get('json_report', '')).name
                print(f"  ‚îî‚îÄ JSON report: {json_name}")
            elif status == 'no_data':
                print("  ‚ö†Ô∏è Status: NO DATA FOUND")
            elif status == 'error':
                print("  ‚ùå Status: ERROR")
                print(f"  ‚îî‚îÄ Message: {outcome.get('message', 'Unknown error')}")

        print(f"\n{rule}")
        print("‚úÖ DUAL DATASET PROCESSING COMPLETE!")
        print(rule)


# ==================== MAIN ====================
def main():
    """Entry point: build a DualDatasetProcessor and process all datasets."""
    DualDatasetProcessor().process_all()


# Run the pipeline only when this file is executed as the main module,
# not when it is imported.
if __name__ == "__main__":
    main()


🔍 DUAL DATASET CONSISTENCY CHECKER & FIXER
Processing both I3D and RLT datasets automatically


######################################################################
# PROCESSING: I3D
######################################################################

📂 Loading I3D datasets...
  ✓ text_indonesian: 1,568 rows
  ✓ text_english: 1,568 rows
  ✓ number_features: 1,568 rows
  ✓ audio_features: 1,568 rows
  ✓ pause_features: 1,568 rows
  ✓ landmarks: 647,871 rows
  ✓ multimodal_full: 1,568 rows
  ✓ publication: 1,568 rows

🔍 CHECKING I3D CONSISTENCY

📊 CHECK 1: UNIQUE VIDEOS
----------------------------------------------------------------------
  • landmarks: 1568 unique videos
  • text_indonesian: 1568 unique videos
  • text_english: 1568 unique videos
  • audio_features: 1568 unique videos
  • multimodal_full: 1568 unique videos
  ✅ All datasets have consistent video names

📊 CHECK 2: SAMPLES PER VIDEO
-----------------------------------------