# Data Validation and Valid Files Generation

This notebook:
1. Scans through scene images, keyword masks, and attention heatmaps
2. Validates that all required files exist for each scene
3. Writes valid entries to a CSV file for efficient dataloader usage

**Directory structure:**
- Scene images: `data/video_scene_cuts/[video_id]/[video_id]-Scene-0xx-01.jpg`
- Keyword masks: `keyword_masks/[video_id]/[video_id]-Scene-0xx-01.png`
- Attention heatmaps: `attention_heatmap/[video_id]/[video_id]-Scene-0xx-01.jpg`

In [None]:
import os
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm.auto import tqdm
import re

print("Data validation notebook initialized")

## Configuration

In [None]:
# Input locations: one sub-directory per video under each root, plus the
# keywords table that maps a video ID to its keyword.
SCENE_IMAGES_DIR = 'data/video_scene_cuts'
KEYWORD_MASKS_DIR = 'keyword_masks'
ATTENTION_HEATMAPS_DIR = 'attention_heatmap'
KEYWORDS_FILE = 'data/keywords.csv'

# Destination of the generated list of valid scenes.
OUTPUT_CSV = 'data/valid_scenes.csv'

# Echo the configuration, one "label: value" line per setting.
print("\n".join(
    f"{label}: {value}"
    for label, value in (
        ("Scene images directory", SCENE_IMAGES_DIR),
        ("Keyword masks directory", KEYWORD_MASKS_DIR),
        ("Attention heatmaps directory", ATTENTION_HEATMAPS_DIR),
        ("Keywords file", KEYWORDS_FILE),
        ("Output CSV", OUTPUT_CSV),
    )
))

## Load Keywords

In [None]:
# Load keywords.
# Every column is read as text so that video IDs keep their exact spelling:
# with dtype inference a numeric-looking ID column loses leading zeros, and
# becomes float ("123.0") as soon as one cell is blank, after which the IDs
# no longer match the directory names they are compared against.
keywords_df = pd.read_csv(KEYWORDS_FILE, dtype=str)
keywords_df.columns = keywords_df.columns.str.strip()

# Get video ID column (first matching candidate wins)
video_id_col = next(
    (col for col in ('_id', 'video_id') if col in keywords_df.columns), None
)
if video_id_col is None:
    raise ValueError("Could not find video ID column in keywords.csv")

# Get keyword column (first matching candidate wins)
keyword_col = next(
    (col for col in ('keyword_list[0]', 'keyword') if col in keywords_df.columns), None
)
if keyword_col is None:
    raise ValueError("Could not find keyword column in keywords.csv")

# Create keyword mapping: video ID (str) -> keyword.
# IDs are stripped because text columns keep surrounding whitespace that
# numeric parsing used to discard; rows without an ID are dropped. When an ID
# appears more than once, the last row wins.
video_ids = keywords_df[video_id_col].str.strip()
keywords_map = {
    vid: kw
    for vid, kw in zip(video_ids, keywords_df[keyword_col])
    if pd.notna(vid)
}

# Filter out empty keywords
keywords_map = {k: v for k, v in keywords_map.items() if pd.notna(v) and str(v).strip() != ''}

print(f"Loaded {len(keywords_map)} videos with valid keywords")
print(f"Sample keywords: {list(keywords_map.items())[:3]}")

## Parse Scene Filename

Scene image filenames follow the pattern `[video_id]-Scene-0xx-01.jpg`; keyword masks use the same name with a `.png` extension.

In [None]:
# Compiled once at definition time. Groups: (1) video ID, lazily matched up to
# the first usable '-Scene-' separator, (2) the scene number digits.
_SCENE_FILENAME_RE = re.compile(r'(.+?)-Scene-(\d+)-01\.(?:jpg|png)')


def parse_scene_filename(filename):
    """
    Parse a scene filename to extract video_id and scene_number.

    Expected format: [video_id]-Scene-0xx-01.jpg (or .png)

    The whole filename must match: ``fullmatch`` is used instead of
    ``match`` with a trailing ``$`` because ``$`` also matches just before a
    final newline, which would accept a name such as ``"x-Scene-001-01.jpg\\n"``.

    Args:
        filename: Bare filename (no directory component).

    Returns:
        tuple: (video_id, scene_number) with video_id as str and scene_number
        as int, or (None, None) if the filename does not follow the pattern.
    """
    match = _SCENE_FILENAME_RE.fullmatch(filename)
    if match is None:
        return None, None
    return match.group(1), int(match.group(2))

# Test parsing: run the parser on one well-formed scene filename and print
# the extracted video ID and scene number for a quick visual check.
test_filename = "123456-Scene-001-01.jpg"
vid, scene = parse_scene_filename(test_filename)
print(f"Test parsing: '{test_filename}' -> video_id='{vid}', scene={scene}")

## Scan and Validate Files

In [None]:
def check_file_exists(base_dir, video_id, filename):
    """
    Look for ``filename`` inside the ``video_id`` folder of ``base_dir``.

    Returns:
        tuple: (exists, path) where ``path`` is the joined location that was
        probed and ``exists`` is the result of ``os.path.exists`` on it.
    """
    candidate = os.path.join(base_dir, video_id, filename)
    found = os.path.exists(candidate)
    return found, candidate

# One dict per scene that passed every check; read later when writing the CSV.
valid_scenes = []
# Counters for the summary printed at the end of this cell.
stats = {
    'total_scene_files': 0,
    'unrecognized_filename': 0,
    'missing_keyword': 0,
    'missing_keyword_mask': 0,
    'missing_attention_heatmap': 0,
    'valid_scenes': 0,
}

# Check if scene images directory exists
if not os.path.exists(SCENE_IMAGES_DIR):
    print(f"ERROR: Scene images directory not found: {SCENE_IMAGES_DIR}")
else:
    # Each sub-directory of the scene images root is treated as one video.
    print("\nScanning scene images directory...")
    video_dirs = [d for d in os.listdir(SCENE_IMAGES_DIR)
                  if os.path.isdir(os.path.join(SCENE_IMAGES_DIR, d))]

    print(f"Found {len(video_dirs)} video directories")

    for video_id in tqdm(video_dirs, desc="Validating scenes"):
        # Videos without a keyword are skipped entirely (counted per video,
        # so their scene files do not appear in 'total_scene_files').
        if video_id not in keywords_map:
            stats['missing_keyword'] += 1
            continue

        keyword = keywords_map[video_id]

        # Get all image files for this video
        video_scene_dir = os.path.join(SCENE_IMAGES_DIR, video_id)
        scene_files = [f for f in os.listdir(video_scene_dir)
                       if f.endswith(('.jpg', '.png'))]

        for scene_file in scene_files:
            stats['total_scene_files'] += 1

            # Parse filename
            parsed_vid, scene_number = parse_scene_filename(scene_file)
            if parsed_vid is None:
                # Count (rather than silently drop) files that do not follow
                # the scene naming pattern so the statistics add up.
                stats['unrecognized_filename'] += 1
                continue

            # The recorded video_id comes from the filename; the paths below
            # are still built from the directory name.
            video_id_parsed = parsed_vid

            # Keyword mask: same stem with a .png extension. Only the real
            # extension is swapped; str.replace would also rewrite a '.jpg'
            # occurring earlier in the name.
            mask_filename = os.path.splitext(scene_file)[0] + '.png'
            mask_exists, mask_path = check_file_exists(KEYWORD_MASKS_DIR, video_id, mask_filename)

            if not mask_exists:
                stats['missing_keyword_mask'] += 1
                continue

            # Check for attention heatmap (same filename as scene)
            heatmap_exists, heatmap_path = check_file_exists(ATTENTION_HEATMAPS_DIR, video_id, scene_file)

            if not heatmap_exists:
                stats['missing_attention_heatmap'] += 1
                continue

            # All checks passed - add to valid scenes
            valid_scenes.append({
                'video_id': video_id_parsed,
                'scene_number': scene_number,
                'keyword': keyword,
                'scene_image_path': os.path.join(SCENE_IMAGES_DIR, video_id, scene_file),
                'keyword_mask_path': mask_path,
                'attention_heatmap_path': heatmap_path,
                'filename': scene_file,
            })
            stats['valid_scenes'] += 1

print("\n" + "="*60)
print("Validation Statistics:")
print("="*60)
print(f"Total scene files found: {stats['total_scene_files']}")
print(f"Unrecognized filename: {stats['unrecognized_filename']} files")
print(f"Missing keyword: {stats['missing_keyword']} videos")
print(f"Missing keyword mask: {stats['missing_keyword_mask']} scenes")
print(f"Missing attention heatmap: {stats['missing_attention_heatmap']} scenes")
print(f"\nVALID SCENES: {stats['valid_scenes']}")
print("="*60)

## Save Valid Scenes to CSV

In [None]:
# Write the validated scenes to OUTPUT_CSV and print a short summary, or
# print troubleshooting hints when nothing passed validation.
if valid_scenes:
    # Create DataFrame
    valid_df = pd.DataFrame(valid_scenes)

    # Sort by video_id and scene_number
    valid_df = valid_df.sort_values(['video_id', 'scene_number']).reset_index(drop=True)

    # Create the parent directory only when the output path has one:
    # os.makedirs('') raises FileNotFoundError for a bare filename.
    output_dir = os.path.dirname(OUTPUT_CSV)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    valid_df.to_csv(OUTPUT_CSV, index=False)

    print(f"\n✓ Saved {len(valid_df)} valid scenes to: {OUTPUT_CSV}")
    print(f"\nColumns: {list(valid_df.columns)}")
    print(f"\nUnique videos: {valid_df['video_id'].nunique()}")
    print(f"Scenes per video: {valid_df.groupby('video_id').size().describe()}")

    # Display first few rows
    print("\nFirst 5 rows:")
    print(valid_df.head())
else:
    print("\n⚠ No valid scenes found. Please check:")
    print("  1. Scene images directory exists and contains video folders")
    print("  2. Keyword masks directory exists with matching files")
    print("  3. Attention heatmaps directory exists with matching files")
    print("  4. Keywords.csv contains valid keywords for the videos")

## Summary

This notebook has created a CSV file (`data/valid_scenes.csv`) containing all scenes that have:
1. Scene image in `data/video_scene_cuts/`
2. Keyword mask in `keyword_masks/`
3. Attention heatmap in `attention_heatmap/`
4. Valid keyword in `data/keywords.csv`

The dataloader will now use this CSV file to load only valid scenes, avoiding repeated file existence checks.