# Generate All Embeddings

This notebook coordinates the generation of embeddings for all Flickr30K images and captions.

**Note**: For large-scale generation, consider using the standalone scripts:
- `scripts/generate_image_embeddings.py`
- `scripts/generate_text_embeddings.py`

## Option 1: Run Scripts from Notebook

In [None]:
# Generate image embeddings
# The leading `!` hands the line to the system shell, so the script runs in a
# separate `python` process rather than inside this kernel. The relative path
# is resolved against the notebook's current working directory.
!python ../scripts/generate_image_embeddings.py

In [None]:
# Generate text embeddings
# The leading `!` hands the line to the system shell, so the script runs in a
# separate `python` process rather than inside this kernel. The relative path
# is resolved against the notebook's current working directory.
!python ../scripts/generate_text_embeddings.py

## Option 2: Generate in Notebook (for smaller datasets)

In [None]:
import sys
from pathlib import Path

# Add src to path
# project_root is the parent of the current working directory, and its `src`
# subdirectory is put at the front of sys.path so it is searched first.
# NOTE(review): this presumably expects the kernel to be started from a direct
# subdirectory of the project (e.g. notebooks/) — confirm.
project_root = Path.cwd().parent
sys.path.insert(0, str(project_root / 'src'))

# Project-local imports, placed after the sys.path change above so that
# `src/` is on the search path when they are resolved.
from retrieval import BiEncoder
from flickr30k import Flickr30KDataset
from flickr30k.utils import load_config

In [None]:
# Read the configuration file; only its 'model' section is used in this cell.
config = load_config('../configs/clip_config.yaml')
model_settings = config['model']

# Build the dataset from the image directory and the captions CSV.
dataset_kwargs = {
    'images_dir': '../data/images',
    'captions_file': '../data/results.csv',
}
dataset = Flickr30KDataset(**dataset_kwargs)

# Build the encoder from the model name / pretrained values in the config.
encoder = BiEncoder(
    model_name=model_settings['name'],
    pretrained=model_settings['pretrained'],
)

# Show both objects as a quick sanity check.
print(f"Dataset: {dataset}")
print(f"Encoder: {encoder}")

In [None]:
# Generate image embeddings inside the notebook.
#
# This step is opt-in: set GENERATE_IMAGE_EMBEDDINGS to True to run it.
# With the default (False) the cell does nothing, which matches the previous
# behavior where the whole body was commented out. Keeping the code live
# (instead of commented) means it is syntax-checked and can be enabled by
# changing a single value.
#
# Requires `dataset`, `encoder`, `config` and `Path` from the earlier cells.
GENERATE_IMAGE_EMBEDDINGS = False

if GENERATE_IMAGE_EMBEDDINGS:
    image_names = dataset.get_unique_images()
    image_paths = [Path('../data/images') / name for name in image_names]

    image_embeddings = encoder.encode_images(
        image_paths,
        batch_size=32,
        show_progress=True
    )

    # Saved alongside the array so the file names can be recovered later.
    # NOTE(review): presumably row i of image_embeddings corresponds to
    # image_names[i] — confirm that encode_images preserves input order.
    image_metadata = {
        'num_images': len(image_names),
        'image_names': image_names,
        'model': config['model']['name']
    }

    # Make sure the output directory exists before saving; this is a no-op
    # if it is already there (or if save_embeddings creates it itself).
    Path('../data/embeddings').mkdir(parents=True, exist_ok=True)

    encoder.save_embeddings(
        image_embeddings,
        '../data/embeddings/image_embeddings.npy',
        metadata=image_metadata
    )

In [None]:
# Generate text embeddings inside the notebook.
#
# This step is opt-in: set GENERATE_TEXT_EMBEDDINGS to True to run it.
# With the default (False) the cell does nothing, which matches the previous
# behavior where the whole body was commented out. Keeping the code live
# (instead of commented) means it is syntax-checked and can be enabled by
# changing a single value.
#
# Requires `dataset`, `encoder`, `config` and `Path` from the earlier cells.
GENERATE_TEXT_EMBEDDINGS = False

if GENERATE_TEXT_EMBEDDINGS:
    # Every entry of the 'caption' column, in the order it appears in dataset.df.
    all_captions = dataset.df['caption'].tolist()

    text_embeddings = encoder.encode_texts(
        all_captions,
        batch_size=64,
        show_progress=True
    )

    text_metadata = {
        'num_captions': len(all_captions),
        'model': config['model']['name']
    }

    # Make sure the output directory exists before saving; this is a no-op
    # if it is already there (or if save_embeddings creates it itself).
    Path('../data/embeddings').mkdir(parents=True, exist_ok=True)

    encoder.save_embeddings(
        text_embeddings,
        '../data/embeddings/text_embeddings.npy',
        metadata=text_metadata
    )

## Verify Generated Embeddings

In [None]:
# Read the saved image embeddings back from disk and report what was loaded.
image_emb, image_meta = encoder.load_embeddings('../data/embeddings/image_embeddings.npy')

# Show the metadata keys when metadata is present, otherwise the text 'None'.
if image_meta:
    image_meta_summary = image_meta.keys()
else:
    image_meta_summary = 'None'

print(f"\nImage embeddings:")
print(f"  Shape: {image_emb.shape}")
print(f"  Metadata: {image_meta_summary}")

In [None]:
# Read the saved text embeddings back from disk and report what was loaded.
text_emb, text_meta = encoder.load_embeddings('../data/embeddings/text_embeddings.npy')

# Show the metadata keys when metadata is present, otherwise the text 'None'.
if text_meta:
    text_meta_summary = text_meta.keys()
else:
    text_meta_summary = 'None'

print(f"\nText embeddings:")
print(f"  Shape: {text_emb.shape}")
print(f"  Metadata: {text_meta_summary}")