diff --git a/docs.json b/docs.json index 78e32db..83e1f2b 100644 --- a/docs.json +++ b/docs.json @@ -127,7 +127,9 @@ "docs/Creating-Datasets/importing-annotations/bounding-box", "docs/Creating-Datasets/importing-annotations/converting-yolo", "docs/Creating-Datasets/importing-annotations/converting-segmentation-masks", - "docs/Creating-Datasets/importing-annotations/creating-annotation-file-with-class-labels" + "docs/Creating-Datasets/importing-annotations/creating-annotation-file-with-class-labels", + "docs/Creating-Datasets/importing-annotations/reusing-caption-data", + "docs/code-blocks/caption-extraction-script" ] } ] diff --git a/docs/Creating-Datasets/importing-annotations/reusing-caption-data.mdx b/docs/Creating-Datasets/importing-annotations/reusing-caption-data.mdx new file mode 100644 index 0000000..d4ec70e --- /dev/null +++ b/docs/Creating-Datasets/importing-annotations/reusing-caption-data.mdx @@ -0,0 +1,331 @@ +--- +title: "Reusing Caption Data from Previous Datasets" +sidebarTitle: "Reuse Captions" +description: "Save time by extracting and reusing caption data from previous Visual Layer pipeline runs to avoid redundant captioning" +--- + +## Introduction + +Caption generation is one of the most time-consuming operations in Visual Layer's dataset pipeline. When creating multiple datasets with the same images, re-running caption generation for each dataset wastes valuable time and computational resources. + +This guide shows you how to **extract and reuse caption data** from previous pipeline runs on the same data, allowing you to: + +- ✓ Skip caption generation on subsequent dataset creations +- ✓ Maintain consistent captions across multiple datasets + + +**Use Case**: This approach is ideal when you need to create multiple datasets or dataset versions using the same images but with different configurations. 
+ + +## Overview + +After running a dataset pipeline, Visual Layer stores processed data in: +``` +/.vl/tmp/[dataset-id]/input/metadata/image_annotations.parquet +``` + +This parquet file contains all the caption data you need to reuse. + +## Using the Caption Extraction Script + +### Prerequisites + +- Python 3.x +- pandas library (`pip install pandas pyarrow`) + +### What the Script Does + +The extraction script processes Visual Layer's internal parquet files to create a clean annotation file ready for reuse: + +1. **Extracts relevant columns**: Keeps only `filename` and `caption` +2. **Removes system paths**: Strips prefixes like `/hostfs`, `/mnt`, etc. +3. **Creates relative paths**: Converts absolute paths to relative filenames +4. **Outputs clean parquet**: Generates a properly formatted `image_annotations.parquet` + +### Script Location + + +The complete Python script is available on a separate page. Click here to view and copy the code. + + +### Installation + +```bash +# Ensure pandas and pyarrow are installed +pip install pandas pyarrow + +# Make script executable (optional) +chmod +x process_annotations.py +``` + +## Step-by-Step Workflow + +### Step 1: Create Initial Dataset (with Captioning) +Create your first dataset with captioning enabled as usual. This will generate the initial captions and store them in the internal parquet file. 
### Step 2: Locate the Internal Parquet File

After the pipeline completes, find the dataset ID and locate the parquet file:

```bash
# List recent datasets
ls -lt /.vl/tmp/

# Navigate to your dataset's metadata
cd /.vl/tmp/[your-dataset-id]/input/metadata/

# Verify the file exists
ls image_annotations.parquet
```

### Step 3: Run the Extraction Script

Process the parquet file to extract captions:

```bash
# Basic usage - creates image_annotations_processed.parquet in same directory
python3 process_annotations.py /.vl/tmp/[dataset-id]/input/metadata/image_annotations.parquet

# Specify custom output location
python3 process_annotations.py /.vl/tmp/[dataset-id]/input/metadata/image_annotations.parquet \
    -o /path/to/new-dataset/image_annotations.parquet

# Custom prefix removal (if needed)
python3 process_annotations.py input.parquet --prefix /custom/prefix/to/remove
```

**Script Output:**
```
Reading parquet file: /.vl/tmp/abc123.../input/metadata/image_annotations.parquet
Original shape: (12, 9)
Columns: ['filename', 'file_size_bytes', 'video', 'frame_timestamp', 'caption', ...]

Removing prefix '/hostfs' from filenames...

Processed shape: (12, 2)

Sample filenames after processing:
['dog_1.jpg', 'dog_2.jpg', 'dog_3.jpg']

✓ Successfully processed 12 rows
✓ Output saved to: /path/to/output.parquet
```

### Step 4: Copy to New Dataset Directory

Place the extracted parquet file in your new dataset directory alongside the images:

```bash
# Copy to new dataset location
cp image_annotations_processed.parquet /path/to/new-dataset/image_annotations.parquet

# Directory structure should look like:
# /path/to/new-dataset/
#   ├── image_annotations.parquet  (your extracted file)
#   ├── dog_1.jpg
#   ├── dog_2.jpg
#   └── dog_3.jpg
```

The parquet file must be named exactly `image_annotations.parquet` for Visual Layer to recognize it.

### Step 5: Create New Dataset (Fast!)

Now create your new dataset. 
Visual Layer will: + +✓ Detect the existing `image_annotations.parquet` file +✓ Use the provided captions +✓ Complete much faster! + + +Note you will need to remove the captioning step from your dataset configuration to avoid conflicts. + + +## Understanding Relative Paths + +**Critical Concept:** Filenames in the parquet file must be **relative** to the dataset directory location. + +### Why Relative Paths? + +Visual Layer looks for images relative to where the `image_annotations.parquet` file is located. Absolute paths won't work because they reference specific system locations that may not exist or may differ across environments. + +### Examples + +#### ✗ Wrong - Absolute Paths + +``` +filename: /home/ubuntu/images/dog_1.jpg +filename: /mnt/data/dogs/dog_2.jpg +filename: /hostfs/workspace/dog_3.jpg +``` + +**Problem:** These paths are tied to specific locations. If the parquet is in `/new/location/`, Visual Layer can't find `/home/ubuntu/images/dog_1.jpg`. + +#### ✓ Correct - Relative Paths + +**Scenario 1: Parquet in same directory as images** +``` +Dataset directory: /any/path/dataset/ + ├── image_annotations.parquet + ├── dog_1.jpg + ├── dog_2.jpg + └── dog_3.jpg + +Filenames in parquet: + - dog_1.jpg + - dog_2.jpg + - dog_3.jpg +``` + +**Scenario 2: Images in subdirectory** +``` +Dataset directory: /any/path/dataset/ + ├── image_annotations.parquet + └── images/ + ├── dog_1.jpg + ├── dog_2.jpg + └── dog_3.jpg + +Filenames in parquet: + - images/dog_1.jpg + - images/dog_2.jpg + - images/dog_3.jpg +``` + +### How the Script Handles Paths + +The script automatically removes common system prefixes: + +| Original Path (from VL) | After Processing | Notes | +|-------------------------|------------------|-------| +| `/hostfs/home/ubuntu/images/dog.jpg` | `dog.jpg` | Removed `/hostfs/home/ubuntu/images/` | +| `/mnt/data/dogs/dog.jpg` | `dog.jpg` | Custom prefix with `--prefix /mnt/data/dogs/` | +| `/workspace/project/images/subdir/dog.jpg` | `dog.jpg` or 
`subdir/dog.jpg` | Depends on prefix specified | + + +**Key Insight:** The dataset directory can be anywhere on your system. The important thing is that filenames are relative to wherever you place the `image_annotations.parquet` file. + + +## Complete Example + +Let's walk through a real example using dog images. + +### Initial State: Dataset 1 (with captioning) + +After running the first dataset pipeline: + +```bash +# Pipeline completed, captions generated +Dataset ID: b451f7c6-f911-4ceb-8b8a-dd6c1ebb50fd +``` + +**Internal parquet file:** +``` +Location: /.vl/tmp/b451f7c6-f911-4ceb-8b8a-dd6c1ebb50fd/input/metadata/image_annotations.parquet +Columns: [filename, file_size_bytes, video, frame_timestamp, caption, + captions_source_id, default_embedding_index, _vl_stats, stats] +Rows: 12 +``` + +**Sample data:** +| filename | caption | +|----------|---------| +| `/hostfs/home/ubuntu/images/dog_1.jpg` | "A Golden Retriever sitting on grass. The dog has a friendly expression..." | +| `/hostfs/home/ubuntu/images/dog_2.jpg` | "A playful puppy with a red collar running through a park..." | + +### Processing with Script + +```bash +# Run extraction script +python3 process_annotations.py \ + /.vl/tmp/b451f7c6-.../input/metadata/image_annotations.parquet \ + -o /home/ubuntu/new-dataset/image_annotations.parquet +``` + +**Output parquet file:** +``` +Location: /home/ubuntu/new-dataset/image_annotations.parquet +Columns: [filename, caption] +Rows: 12 +``` + +**Processed data:** +| filename | caption | +|----------|---------| +| `dog_1.jpg` | "A Golden Retriever sitting on grass. The dog has a friendly expression..." | +| `dog_2.jpg` | "A playful puppy with a red collar running through a park..." | + +### Dataset 2 (without captioning - fast!) + +```bash +# Directory structure +/home/ubuntu/new-dataset/ + ├── image_annotations.parquet (extracted file) + ├── dog_1.jpg + ├── dog_2.jpg + └── ... 
(all 12 images) + +# Create new dataset +# Visual Layer detects image_annotations.parquet +``` + +## Troubleshooting + +### Images Not Found + +**Error:** "Could not find image at path: dog_1.jpg" + +**Cause:** Filenames in parquet don't match actual file locations. + +**Solutions:** +1. Verify parquet is in the same directory as images +2. Check that filenames match exactly (case-sensitive) +3. Inspect parquet contents to verify paths are relative + +### Checking Parquet Contents + +```bash +# View parquet file contents +python3 -c " +import pandas as pd +df = pd.read_parquet('image_annotations.parquet') +print('Columns:', df.columns.tolist()) +print('\nFirst 5 filenames:') +print(df['filename'].head().tolist()) +" +``` + +**Expected output:** +``` +Columns: ['filename', 'caption'] + +First 5 filenames: +['dog_1.jpg', 'dog_2.jpg', 'dog_3.jpg', 'dog_4.jpg', 'dog_5.jpg'] +``` + +### Captions Not Being Used + +**Issue:** Visual Layer is still generating captions even though `image_annotations.parquet` exists. + +**Solutions:** +1. Verify filename is exactly `image_annotations.parquet` (not `image_annotations_processed.parquet`) +2. Ensure file is in the correct location relative to images +3. Check that parquet file has both `filename` and `caption` columns + +### Script Errors + +**Error:** "Missing required columns: ['caption']" + +**Cause:** Source parquet file doesn't contain caption data. + +**Solution:** The source dataset must have had captions generated. Check if captioning was enabled in the original pipeline. 
+ +## Related Documentation + +- [Preparing Annotation Files](/docs/Creating-Datasets/importing-annotations/preparing-annotation-data-file) - Format requirements for annotation files +- [Annotations Overview](/docs/Creating-Datasets/importing-annotations/annotations-overview-new) - Complete guide to importing annotations +- [Creating Datasets](/docs/Creating-Datasets/createdelete-datasets) - Dataset creation fundamentals + +--- + +By following this workflow, you can significantly reduce dataset creation time when working with the same images across multiple datasets or configurations. The initial investment of generating captions once pays off through faster subsequent dataset creations. diff --git a/docs/code-blocks/caption-extraction-script.mdx b/docs/code-blocks/caption-extraction-script.mdx new file mode 100644 index 0000000..6ac3bc8 --- /dev/null +++ b/docs/code-blocks/caption-extraction-script.mdx @@ -0,0 +1,158 @@ +--- +title: "Caption Extraction Script" +description: "Python script for extracting caption data from Visual Layer pipeline runs" +sidebarTitle: "Script Code" +--- + + +Complete Python script for extracting filename and caption columns from Visual Layer's internal parquet files. + + + +See the main guide for detailed usage instructions, workflow, and examples. + + +## Installation + +```bash +pip install pandas pyarrow +``` + +## Quick Usage + +```bash +# Basic usage +python3 process_annotations.py /.vl/tmp/[dataset-id]/input/metadata/image_annotations.parquet + +# Specify output location +python3 process_annotations.py input.parquet -o /path/to/output.parquet + +# Custom prefix removal +python3 process_annotations.py input.parquet --prefix /custom/prefix +``` + +## Script Code + +```python +#!/usr/bin/env python3 +""" +Process parquet annotation files to extract filename and caption columns. +Removes path prefixes from filenames. 
+""" + +import argparse +import sys +from pathlib import Path +import pandas as pd + + +def process_parquet(input_path, output_path=None, prefix_to_remove='/hostfs'): + """ + Process a parquet file to extract filename and caption columns. + + Args: + input_path: Path to input parquet file + output_path: Path to output parquet file (optional) + prefix_to_remove: Prefix to remove from filenames (default: '/hostfs') + + Returns: + Path to output file + """ + # Validate input file + input_file = Path(input_path) + if not input_file.exists(): + raise FileNotFoundError(f"Input file not found: {input_path}") + + if not input_file.suffix == '.parquet': + raise ValueError(f"Input file must be a parquet file, got: {input_file.suffix}") + + # Determine output path + if output_path is None: + output_file = input_file.parent / f"{input_file.stem}_processed.parquet" + else: + output_file = Path(output_path) + + print(f"Reading parquet file: {input_file}") + + # Read parquet file + try: + df = pd.read_parquet(input_file) + except Exception as e: + raise RuntimeError(f"Failed to read parquet file: {e}") + + print(f"Original shape: {df.shape}") + print(f"Columns: {df.columns.tolist()}") + + # Validate required columns exist + required_columns = ['filename', 'caption'] + missing_columns = [col for col in required_columns if col not in df.columns] + if missing_columns: + raise ValueError(f"Missing required columns: {missing_columns}") + + # Select only filename and caption columns + df = df[['filename', 'caption']] + + # Remove prefix from filename + print(f"Removing prefix '{prefix_to_remove}' from filenames...") + df['filename'] = df['filename'].apply( + lambda x: x.replace(prefix_to_remove, '', 1) + if isinstance(x, str) and x.startswith(prefix_to_remove) + else x + ) + + # Show sample of processed data + print(f"\nProcessed shape: {df.shape}") + print(f"\nSample filenames after processing:") + print(df['filename'].head(3).tolist()) + + # Save to output file + print(f"\nSaving to: 
{output_file}") + df.to_parquet(output_file, index=False) + + print(f"✓ Successfully processed {len(df)} rows") + print(f"✓ Output saved to: {output_file}") + + return output_file + + +def main(): + parser = argparse.ArgumentParser( + description='Process parquet annotation files to extract filename and caption columns.', + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + %(prog)s input.parquet + %(prog)s input.parquet -o output.parquet + %(prog)s input.parquet --prefix //hostfs/mnt + """ + ) + + parser.add_argument( + 'input', + help='Path to input parquet file' + ) + + parser.add_argument( + '-o', '--output', + help='Path to output parquet file (default: _processed.parquet)' + ) + + parser.add_argument( + '--prefix', + default='/hostfs', + help='Prefix to remove from filenames (default: /hostfs)' + ) + + args = parser.parse_args() + + try: + process_parquet(args.input, args.output, args.prefix) + return 0 + except Exception as e: + print(f"Error: {e}", file=sys.stderr) + return 1 + + +if __name__ == '__main__': + sys.exit(main()) +```