In [1]:
"""
Merge Results from Parallel Processing
"""

import pandas as pd
from pathlib import Path

def merge_datasets(base_path: Path) -> int:
    """Merge the per-class LIE and TRUTH CSV files into combined datasets.

    For every dataset name in the table below, the files
    ``<name>_lie.csv`` and ``<name>_truth.csv`` are read from
    ``base_path / 'processed' / 'I3D' / <folder>``, concatenated (LIE rows
    first, then TRUTH rows, with a fresh 0..n-1 index) and written to
    ``<name>.csv`` in the same folder. Progress is reported on stdout.

    A dataset is skipped when either input file is missing, and a failure
    while reading, merging or writing one dataset is reported without
    stopping the remaining ones.

    After merging, the label distribution of ``MultimodalDataset_Full.csv``
    is printed if that file exists and has a ``label`` column.

    Args:
        base_path: Root directory that contains the ``processed/I3D`` tree.

    Returns:
        The number of datasets that were merged and written successfully.
    """
    banner = "=" * 70

    print(banner)
    print("üîÑ MERGING PARALLEL PROCESSING RESULTS")
    print(banner)

    # Sub-folder of processed/I3D -> dataset base names stored in it.
    datasets = {
        'text': [
            'TextDataset_Indonesian',
            'TextDataset_English',
            'NumberFeatures',
        ],
        'audio': [
            'AudioDataset_Features',
            'PauseFeatures',
        ],
        'visual': [
            'LandmarkDataset',
        ],
        'multimodal': [
            'MultimodalDataset_Full',
            'PublicationDataset',
        ],
    }

    i3d_root = base_path / 'processed' / 'I3D'
    merged_count = 0

    for folder_name, dataset_list in datasets.items():
        folder = i3d_root / folder_name

        for dataset_name in dataset_list:
            lie_file = folder / f"{dataset_name}_lie.csv"
            truth_file = folder / f"{dataset_name}_truth.csv"

            # Report every missing input, not just the first one found, so
            # a single run shows everything that has to be regenerated.
            missing = [f for f in (lie_file, truth_file) if not f.exists()]
            if missing:
                for missing_file in missing:
                    print(f"‚ö†Ô∏è  Missing: {missing_file.name}")
                continue

            # Deliberately broad: one unreadable or unwritable dataset must
            # not abort the merge of the remaining ones.
            try:
                print(f"\nüìÇ Processing: {dataset_name}")
                df_lie = pd.read_csv(lie_file)
                df_truth = pd.read_csv(truth_file)

                print(f"   LIE   : {len(df_lie):,} rows")
                print(f"   TRUTH : {len(df_truth):,} rows")

                # concat aligns columns by name and fills the gaps with NaN,
                # so a schema mismatch would otherwise go unnoticed.
                column_diff = set(df_lie.columns) ^ set(df_truth.columns)
                if column_diff:
                    print(
                        "   WARNING: columns differ between LIE and TRUTH "
                        f"(filled with NaN): {sorted(map(str, column_diff))}"
                    )

                df_merged = pd.concat([df_lie, df_truth], ignore_index=True)

                output_file = folder / f"{dataset_name}.csv"
                df_merged.to_csv(output_file, index=False, encoding='utf-8')

                print(f"   ‚úÖ MERGED: {len(df_merged):,} rows ‚Üí {output_file.name}")
                merged_count += 1

            except Exception as e:
                print(f"   ‚ùå ERROR: {str(e)}")

    print("\n" + banner)
    print(f"‚úÖ MERGE COMPLETE: {merged_count} datasets merged")
    print(banner)

    # Verify label distribution on the combined multimodal dataset.
    print("\nüìä Verification: Label Distribution")
    multimodal_file = i3d_root / 'multimodal' / 'MultimodalDataset_Full.csv'
    if multimodal_file.exists():
        df = pd.read_csv(multimodal_file)
        if 'label' in df.columns:
            label_counts = df['label'].value_counts().sort_index()
            print(f"   TRUTH (0): {label_counts.get(0, 0):,} samples")
            print(f"   LIE   (1): {label_counts.get(1, 0):,} samples")
            print(f"   TOTAL    : {len(df):,} samples")
        else:
            # Without this guard a missing column raised KeyError after all
            # the merging work had already succeeded.
            print(f"   WARNING: no 'label' column in {multimodal_file.name}")

    print("\n‚úÖ All datasets ready for analysis!")

    return merged_count

if __name__ == "__main__":
    # The data tree is looked up in ./dataset, relative to the directory the
    # script (or notebook kernel) was started from.
    base_path = Path.cwd().joinpath('dataset')
    merge_datasets(base_path)


üîÑ MERGING PARALLEL PROCESSING RESULTS

üìÇ Processing: TextDataset_Indonesian
   LIE   : 784 rows
   TRUTH : 784 rows
   ‚úÖ MERGED: 1,568 rows ‚Üí TextDataset_Indonesian.csv

üìÇ Processing: TextDataset_English
   LIE   : 784 rows
   TRUTH : 784 rows
   ‚úÖ MERGED: 1,568 rows ‚Üí TextDataset_English.csv

üìÇ Processing: NumberFeatures
   LIE   : 784 rows
   TRUTH : 784 rows
   ‚úÖ MERGED: 1,568 rows ‚Üí NumberFeatures.csv

üìÇ Processing: AudioDataset_Features
   LIE   : 784 rows
   TRUTH : 784 rows
   ‚úÖ MERGED: 1,568 rows ‚Üí AudioDataset_Features.csv

üìÇ Processing: PauseFeatures
   LIE   : 784 rows
   TRUTH : 784 rows
   ‚úÖ MERGED: 1,568 rows ‚Üí PauseFeatures.csv

üìÇ Processing: LandmarkDataset
   LIE   : 293,010 rows
   TRUTH : 354,861 rows
   ‚úÖ MERGED: 647,871 rows ‚Üí LandmarkDataset.csv

üìÇ Processing: MultimodalDataset_Full
   LIE   : 784 rows
   TRUTH : 784 rows
   ‚úÖ MERGED: 1,568 rows ‚Üí MultimodalDataset_Full.csv

üìÇ Processing: PublicationDataset
  