# Data Validation and Valid Files Generation

This notebook:
1. Starts from alignment_score.csv to get video IDs and scene numbers
2. Validates that each scene's video has a keyword and that all required files exist for the scene
3. Writes valid entries to a CSV file for efficient dataloader usage

**Directory structure:**
- Scene images: `data/video_scene_cuts/{video_id}/{video_id}-Scene-{number:03d}-01.jpg` (scene number zero-padded to three digits, e.g. `Scene-001`)
- Keyword masks: `keyword_masks/{video_id}/scene_{number}.png` (scene number without padding)
- Attention heatmaps: `attention_heatmap/{video_id}/{video_id}-Scene-{number:03d}-01.jpg`

In [None]:
import os
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm.auto import tqdm
import re

# Marker output confirming that the import cell above has executed
print("Data validation notebook initialized")

## Configuration

In [None]:
# Input locations (relative to the notebook's working directory)
ALIGNMENT_SCORE_FILE = 'data/alignment_score.csv'
KEYWORDS_FILE = 'data/keywords.csv'
SCENE_IMAGES_DIR = 'data/video_scene_cuts'
KEYWORD_MASKS_DIR = 'keyword_masks'
ATTENTION_HEATMAPS_DIR = 'attention_heatmap'

# Where the list of validated scenes is written
OUTPUT_CSV = 'data/valid_scenes.csv'

# Echo the configuration so the run log records which paths were used
print(
    f"Alignment scores: {ALIGNMENT_SCORE_FILE}",
    f"Scene images directory: {SCENE_IMAGES_DIR}",
    f"Keyword masks directory: {KEYWORD_MASKS_DIR}",
    f"Attention heatmaps directory: {ATTENTION_HEATMAPS_DIR}",
    f"Keywords file: {KEYWORDS_FILE}",
    f"Output CSV: {OUTPUT_CSV}",
    sep="\n",
)

## Load Alignment Scores and Keywords

In [None]:
# Read the per-scene alignment table, trimming stray whitespace from the header names
print("Loading alignment_score.csv...")
alignment_df = pd.read_csv(ALIGNMENT_SCORE_FILE).rename(columns=str.strip)

# Quick overview of what was loaded
print(f"Total entries in alignment_score.csv: {len(alignment_df)}")
print(f"Unique videos: {alignment_df['video id'].nunique()}")
print(
    f"\nColumns: {list(alignment_df.columns)}",
    f"\nFirst 3 rows:",
    alignment_df.head(3),
    sep="\n",
)

In [None]:
# Read the keyword table and trim stray whitespace from the header names
print("\nLoading keywords.csv...")
keywords_df = pd.read_csv(KEYWORDS_FILE)
keywords_df.columns = keywords_df.columns.str.strip()

# Resolve the video-id column; '_id' takes precedence over 'video_id'
video_id_col = next(
    (name for name in ('_id', 'video_id') if name in keywords_df.columns),
    None,
)
if video_id_col is None:
    raise ValueError("Could not find video ID column in keywords.csv")

# Resolve the keyword column; 'keyword_list[0]' takes precedence over 'keyword'
keyword_col = next(
    (name for name in ('keyword_list[0]', 'keyword') if name in keywords_df.columns),
    None,
)
if keyword_col is None:
    raise ValueError("Could not find keyword column in keywords.csv")

# Map video id (as string) -> keyword. The intermediate dict is built first so that,
# for a repeated video id, the last row wins before missing/blank keywords are dropped.
keywords_map = {
    vid: kw
    for vid, kw in dict(
        zip(keywords_df[video_id_col].astype(str), keywords_df[keyword_col])
    ).items()
    if not (pd.isna(kw) or str(kw).strip() == '')
}

print(
    f"Loaded {len(keywords_map)} videos with valid keywords",
    f"Sample keywords: {list(keywords_map.items())[:3]}",
    sep="\n",
)

## Define Path Construction Functions

In [None]:
def construct_scene_image_path(video_id, scene_number):
    """
    Build the location of a scene image under SCENE_IMAGES_DIR.

    Layout: {SCENE_IMAGES_DIR}/{video_id}/{video_id}-Scene-{scene_number:03d}-01.jpg

    Returns a (path, filename) tuple, where filename is the last path component.
    """
    image_name = f"{video_id}-Scene-{scene_number:03d}-01.jpg"
    video_dir = os.path.join(SCENE_IMAGES_DIR, str(video_id))
    return os.path.join(video_dir, image_name), image_name

def construct_keyword_mask_path(video_id, scene_number):
    """
    Build the location of a scene's keyword mask under KEYWORD_MASKS_DIR.

    Layout: {KEYWORD_MASKS_DIR}/{video_id}/scene_{scene_number}.png
    (the scene number is not zero-padded here).
    """
    video_dir = os.path.join(KEYWORD_MASKS_DIR, str(video_id))
    return os.path.join(video_dir, f"scene_{scene_number}.png")

def construct_attention_heatmap_path(video_id, scene_number):
    """
    Build the location of a scene's attention heatmap under ATTENTION_HEATMAPS_DIR.

    Layout: {ATTENTION_HEATMAPS_DIR}/{video_id}/{video_id}-Scene-{scene_number:03d}-01.jpg
    (same file name as the scene image, different root directory).
    """
    video_dir = os.path.join(ATTENTION_HEATMAPS_DIR, str(video_id))
    return os.path.join(video_dir, f"{video_id}-Scene-{scene_number:03d}-01.jpg")

# Smoke-test the three path builders with a made-up video id and scene number
test_video_id, test_scene_num = "123456", 1

scene_path, scene_file = construct_scene_image_path(test_video_id, test_scene_num)
mask_path = construct_keyword_mask_path(test_video_id, test_scene_num)
heatmap_path = construct_attention_heatmap_path(test_video_id, test_scene_num)

# Show the resulting paths so the formats can be checked by eye
print(
    "Test path construction:",
    f"  Scene image: {scene_path}",
    f"  Keyword mask: {mask_path}",
    f"  Attention heatmap: {heatmap_path}",
    sep="\n",
)

## Validate Files for Each Scene

Starting from alignment_score.csv entries, we check if all required files exist.

In [None]:
# Accumulators: one record per fully valid scene, plus rejection counters.
# A rejected scene is counted only under the FIRST check it fails.
valid_scenes = []
stats = {
    'total_entries': len(alignment_df),
    'missing_keyword': 0,
    'missing_scene_image': 0,
    'missing_keyword_mask': 0,
    'missing_attention_heatmap': 0,
    'valid_scenes': 0,
}

print(f"\nValidating {len(alignment_df)} scenes from alignment_score.csv...\n")

# Iterate over the two needed columns directly instead of DataFrame.iterrows().
# iterrows() builds one Series per row and does not preserve column dtypes: if every
# column is numeric and at least one is float, an integer video id such as 123456 is
# upcast and would be stringified as "123456.0", which never matches keywords_map.
scene_keys = zip(alignment_df['video id'], alignment_df['Scene Number'])
for raw_video_id, raw_scene_number in tqdm(scene_keys, total=len(alignment_df), desc="Validating scenes"):
    video_id = str(raw_video_id)
    scene_number = int(raw_scene_number)

    # Check if video has keyword
    if video_id not in keywords_map:
        stats['missing_keyword'] += 1
        continue

    keyword = keywords_map[video_id]

    # Construct paths
    scene_image_path, filename = construct_scene_image_path(video_id, scene_number)
    keyword_mask_path = construct_keyword_mask_path(video_id, scene_number)
    attention_heatmap_path = construct_attention_heatmap_path(video_id, scene_number)

    # Check if scene image exists
    if not os.path.exists(scene_image_path):
        stats['missing_scene_image'] += 1
        continue

    # Check if keyword mask exists
    if not os.path.exists(keyword_mask_path):
        stats['missing_keyword_mask'] += 1
        continue

    # Check if attention heatmap exists
    if not os.path.exists(attention_heatmap_path):
        stats['missing_attention_heatmap'] += 1
        continue

    # All checks passed - add to valid scenes
    valid_scenes.append({
        'video_id': video_id,
        'scene_number': scene_number,
        'keyword': keyword,
        'scene_image_path': scene_image_path,
        'keyword_mask_path': keyword_mask_path,
        'attention_heatmap_path': attention_heatmap_path,
        'filename': filename,
    })
    stats['valid_scenes'] += 1

# Guard the percentage against an empty alignment table (would otherwise divide by zero)
total_entries = stats['total_entries']
valid_pct = stats['valid_scenes'] / total_entries * 100 if total_entries else 0.0

print("\n" + "="*60)
print("Validation Statistics:")
print("="*60)
print(f"Total entries in alignment_score.csv: {stats['total_entries']}")
print(f"Missing keyword: {stats['missing_keyword']} scenes")
print(f"Missing scene image: {stats['missing_scene_image']} scenes")
print(f"Missing keyword mask: {stats['missing_keyword_mask']} scenes")
print(f"Missing attention heatmap: {stats['missing_attention_heatmap']} scenes")
print(f"\n✅ VALID SCENES: {stats['valid_scenes']}")
print(f"   ({valid_pct:.1f}% of total)")
print("="*60)

## Save Valid Scenes to CSV

In [None]:
if len(valid_scenes) > 0:
    # Create DataFrame
    valid_df = pd.DataFrame(valid_scenes)

    # Sort by video_id and scene_number (video_id is a string, so its order is lexicographic)
    valid_df = valid_df.sort_values(['video_id', 'scene_number']).reset_index(drop=True)

    # Save to CSV. Only create the parent directory when OUTPUT_CSV actually has one:
    # os.path.dirname() returns '' for a bare file name and os.makedirs('') raises.
    output_dir = os.path.dirname(OUTPUT_CSV)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    valid_df.to_csv(OUTPUT_CSV, index=False)

    print(f"\n✅ Saved {len(valid_df)} valid scenes to: {OUTPUT_CSV}")
    print(f"\nColumns: {list(valid_df.columns)}")
    print(f"\nUnique videos: {valid_df['video_id'].nunique()}")

    # Scenes per video statistics
    scenes_per_video = valid_df.groupby('video_id').size()
    print(f"\nScenes per video:")
    print(f"  Min: {scenes_per_video.min()}")
    print(f"  Max: {scenes_per_video.max()}")
    print(f"  Mean: {scenes_per_video.mean():.1f}")
    print(f"  Median: {scenes_per_video.median():.0f}")

    # Display first few rows
    print(f"\nFirst 5 rows:")
    print(valid_df.head())

    # Display sample paths
    print(f"\nSample paths for first scene:")
    first_scene = valid_df.iloc[0]
    print(f"  Video ID: {first_scene['video_id']}")
    print(f"  Scene number: {first_scene['scene_number']}")
    print(f"  Keyword: {first_scene['keyword']}")
    print(f"  Scene image: {first_scene['scene_image_path']}")
    print(f"  Keyword mask: {first_scene['keyword_mask_path']}")
    print(f"  Attention heatmap: {first_scene['attention_heatmap_path']}")

else:
    # Nothing passed validation: no file is written, so print a troubleshooting checklist
    print("\n" + "="*60)
    print("⚠️  WARNING: No valid scenes found!")
    print("="*60)
    print("\nPlease check:")
    print("  1. Scene images exist in:", SCENE_IMAGES_DIR)
    print("     Format: {video_id}/{video_id}-Scene-{number:03d}-01.jpg")
    print("\n  2. Keyword masks exist in:", KEYWORD_MASKS_DIR)
    print("     Format: {video_id}/scene_{number}.png")
    print("\n  3. Attention heatmaps exist in:", ATTENTION_HEATMAPS_DIR)
    print("     Format: {video_id}/{video_id}-Scene-{number:03d}-01.jpg")
    print("\n  4. Keywords.csv contains valid keywords for the videos")
    print("\n  5. Video IDs and scene numbers in alignment_score.csv are correct")
    # A CSV left over from an earlier run is not touched in this branch; say so, because
    # anything that reads OUTPUT_CSV afterwards would be using stale results.
    if os.path.exists(OUTPUT_CSV):
        print(f"\n  Note: {OUTPUT_CSV} already exists from an earlier run and was NOT updated.")
    print("="*60)

## Summary

When at least one scene passes validation, this notebook writes a CSV file (`data/valid_scenes.csv`) containing all scenes that have:
1. Entry in `alignment_score.csv`
2. Scene image in `data/video_scene_cuts/`
3. Keyword mask in `keyword_masks/`
4. Attention heatmap in `attention_heatmap/`
5. Valid keyword in `data/keywords.csv`

The dataloader will now use this CSV file to load only valid scenes, avoiding repeated file existence checks.

### Path Formats Verified

- **Scene images**: `data/video_scene_cuts/{video_id}/{video_id}-Scene-{number:03d}-01.jpg`
- **Keyword masks**: `keyword_masks/{video_id}/scene_{number}.png`
- **Attention heatmaps**: `attention_heatmap/{video_id}/{video_id}-Scene-{number:03d}-01.jpg`