In [None]:
# KAGGLE SETUP - Cell 1: Clone repository and install dependencies
!rm -rf hybrid_multimodal_retrieval
!git clone https://github.com/vinhhna/hybrid_multimodal_retrieval.git
%cd hybrid_multimodal_retrieval
!pip install -q transformers accelerate open-clip-torch pyyaml tqdm pillow faiss-cpu
!pip install -e .

In [None]:
# KAGGLE SETUP - Cell 2: Setup data paths
from pathlib import Path

# Where the Flickr30K files are read from on Kaggle
CAPTIONS_FILE = Path('/kaggle/input/flickr30k/data/results.csv')
IMAGES_DIR = Path('/kaggle/input/flickr30k/data/images')

# Report whether each location is present
print(f"Images dir exists: {IMAGES_DIR.exists()} - {IMAGES_DIR}")
print(f"Captions file exists: {CAPTIONS_FILE.exists()} - {CAPTIONS_FILE}")

# Count the .jpg files, but only when the directory is actually there
if IMAGES_DIR.exists():
    num_images = sum(1 for _ in IMAGES_DIR.glob('*.jpg'))
    print(f"Found {num_images} images")

## 🚀 Kaggle Setup - RUN THESE FIRST!

**Important:** Execute the two cells above before proceeding with the rest of the notebook.

These cells will:
1. Clone the repository and install all dependencies
2. Set up the correct data paths for Kaggle environment

# Flickr30K Dataset Exploration

This notebook provides exploration and analysis of the Flickr30K dataset for hybrid multimodal retrieval.

**For Kaggle:** Run the setup cells at the top first!

## Dataset Information
- **Images**: 31,783 images from Flickr
- **Captions**: 5 captions per image (158,915 total)
- **Format**: CSV file with pipe-delimited captions, JPEG images

## 1. Import Libraries and Package

In [None]:
# Standard imports
import sys
from pathlib import Path
import warnings
# Blanket 'ignore' filter: suppresses all warnings raised after this point
warnings.filterwarnings('ignore')

# Import our flickr30k package (installed via setup cells)
from flickr30k import Flickr30KDataset
from flickr30k.visualization import (
    display_image_with_captions,
    display_random_samples,
    plot_caption_statistics,
    print_dataset_statistics,
    display_search_results
)
from flickr30k.utils import (
    load_config,
    print_data_status,
    check_data_availability
)

# Reached only if every import above succeeded
print("✓ All imports successful!")

## 2. Check Dataset Availability

In [None]:
# Check if dataset is available
# NOTE(review): print_data_status is imported from flickr30k.utils and is called
# with no arguments, so which paths it inspects is not visible in this notebook —
# presumably the package's own default locations rather than IMAGES_DIR /
# CAPTIONS_FILE; confirm against the package.
print_data_status()

## 3. Verify Data Paths

In [None]:
# Echo the Kaggle data locations defined in the setup cells at the top
print(
    "Using Kaggle data paths:",
    f"  Images directory: {IMAGES_DIR}",
    f"  Captions file: {CAPTIONS_FILE}",
    sep="\n",
)

## 4. Initialize and Load Dataset

In [None]:
# Initialize dataset with Kaggle paths
# The Path objects from the setup cell are converted to plain strings before
# being passed to the constructor.
dataset = Flickr30KDataset(
    images_dir=str(IMAGES_DIR),
    captions_file=str(CAPTIONS_FILE),
    auto_load=True  # presumably loads the captions during construction — confirm in flickr30k
)

# Print the dataset's string representation, preceded by a blank line
print(f"\n{dataset}")

## 5. Verify Caption Loading

In [None]:
# Look up the first entry of the unique-image list and list its captions
unique_images = dataset.get_unique_images()
first_image = unique_images[0]
captions = dataset.get_captions(first_image)

print(f"Sample captions for '{first_image}':")
# Number the captions starting from 1
for position, text in enumerate(captions, start=1):
    print(f"{position}. {text}")

## 6. Dataset Statistics

In [None]:
# Print comprehensive statistics
# print_dataset_statistics is imported from flickr30k.visualization; the exact
# figures it reports are defined there, not in this notebook.
print_dataset_statistics(dataset)

## 7. Visualize Caption Statistics

In [None]:
# Plot caption length distributions
# figsize is forwarded to the plotting helper — presumably a matplotlib-style
# (width, height) in inches; confirm in flickr30k.visualization.
plot_caption_statistics(dataset, figsize=(15, 5))

## 8. Display Random Sample Images with Captions

In [None]:
# Show three randomly chosen samples; a fixed seed is passed to the helper
display_random_samples(dataset=dataset, n_samples=3, seed=42, figsize=(10, 8))

## 9. Display Specific Image with Captions

In [None]:
# Pick one image by its position in the unique-image list and render it
# together with its captions. Edit the index to view a different image.
image_name = unique_images[10]

display_image_with_captions(image_name=image_name, dataset=dataset, figsize=(10, 8))

## 10. Analyze Caption Diversity

In [None]:
# Check caption uniqueness
# Note: nunique() skips missing values by default, whereas len(dataset.df)
# counts every row, so rows without a caption count toward the total only.
unique_captions = dataset.df['caption'].nunique()
total_captions = len(dataset.df)
# Guard the division: an empty caption table would otherwise raise
# ZeroDivisionError. With no captions there are no duplicates, so report 0.
duplicate_ratio = (1 - (unique_captions / total_captions)) if total_captions else 0.0

print(f"Total captions: {total_captions:,}")
print(f"Unique captions: {unique_captions:,}")
print(f"Duplicate ratio: {duplicate_ratio:.4f} ({duplicate_ratio*100:.2f}%)")

# Find most common captions
print("\nTop 10 Most Common Captions:")
caption_counts = dataset.df['caption'].value_counts().head(10)
for caption, count in caption_counts.items():
    # Truncate long captions to 80 characters and mark the cut with an ellipsis
    print(f"  [{count}x] {caption[:80]}{'...' if len(caption) > 80 else ''}")

## 11. Search Captions by Keyword

In [None]:
# Keyword search over the captions; edit `keyword` to look for something else
keyword = "dog"

# Fetch up to 10 matches, then hand them to the display helper
results = dataset.search_captions(keyword, max_results=10)

# show_images=False keeps this fast; set it to True to show images (slower)
display_search_results(results_df=results, keyword=keyword, max_display=5, show_images=False, dataset=dataset)

## 12. Interactive Exploration

Use the cells below to explore the dataset interactively.

In [None]:
# Draw one image and its captions (seed=None is passed, so no fixed seed is set here)
image_name, captions = dataset.get_random_sample(seed=None)

print(f"Random image: {image_name}")
print("\nCaptions:")
# Number the captions starting from 1
for position, text in enumerate(captions, start=1):
    print(f"{position}. {text}")

# Render the drawn image
display_image_with_captions(image_name, dataset=dataset)

## 13. Export Sample Data

Export a subset of data for further analysis or experimentation.

In [None]:
# Create a sample subset (e.g., 1000 images)
import numpy as np

# Never request more images than exist: with replace=False, np.random.choice
# raises ValueError when `size` exceeds the number of available items.
sample_size = min(1000, len(unique_images))

# Fixed seed so the draw is repeatable for the same list of images
np.random.seed(42)
sample_images = np.random.choice(unique_images, size=sample_size, replace=False)
# Keep every caption row that belongs to one of the sampled images
sample_df = dataset.df[dataset.df['image_name'].isin(sample_images)]

print("Sample subset created:")
print(f"  Images: {len(sample_images)}")
print(f"  Captions: {len(sample_df)}")

# Optionally save to CSV
# sample_df.to_csv('data/flickr30k_sample_1k.csv', index=False)
# print("✓ Sample saved to data/flickr30k_sample_1k.csv")

## Next Steps

Now that you've explored the dataset, here are some next steps:

1. **Feature Extraction**: Extract visual features using pre-trained models (ResNet, CLIP)
2. **Text Embedding**: Create embeddings for captions using transformers
3. **Hybrid Retrieval**: Implement text-to-image and image-to-text search
4. **Evaluation**: Implement retrieval metrics (Recall@K, MRR, MAP)
5. **Model Training**: Fine-tune models for better cross-modal alignment

Check the project README for more information on the roadmap!