In [None]:
# KAGGLE SETUP - Cell 2: Setup data paths
from pathlib import Path

# Set paths based on Kaggle dataset location
IMAGES_DIR = Path('/kaggle/input/flickr30k/data/images')
CAPTIONS_FILE = Path('/kaggle/input/flickr30k/data/results.csv')

# Verify paths
print(f"Images dir exists: {IMAGES_DIR.exists()} - {IMAGES_DIR}")
print(f"Captions file exists: {CAPTIONS_FILE.exists()} - {CAPTIONS_FILE}")

if IMAGES_DIR.exists():
    num_images = len(list(IMAGES_DIR.glob('*.jpg')))
    print(f"Found {num_images} images")

In [None]:
# KAGGLE SETUP - Cell 1: Clone repository and install dependencies
!rm -rf hybrid_multimodal_retrieval
!git clone https://github.com/vinhhna/hybrid_multimodal_retrieval.git
%cd hybrid_multimodal_retrieval
!pip install -q transformers accelerate open-clip-torch pyyaml tqdm pillow faiss-cpu
!pip install -e .

## üöÄ Kaggle Setup - RUN THESE FIRST!

**Important:** Execute the two cells above before proceeding with the rest of the notebook.

These cells will:
1. Clone the repository and install all dependencies
2. Set up the correct data paths for Kaggle environment

# Phase 3: BLIP-2 Cross-Encoder Exploration

**Notebook**: BLIP-2 Integration and Testing  
**Phase**: 3 - Cross-Encoder Reranking  
**Week**: 1 - BLIP-2 Integration  
**Created**: October 28, 2025

---

## üìã Objectives

This notebook explores and validates the BLIP-2 cross-encoder integration:

1. **Setup**: Verify dependencies and GPU availability
2. **Model Loading**: Load BLIP-2 from Hugging Face
3. **Single Pair Scoring**: Test on individual query-image pairs
4. **Batch Processing**: Optimize batch sizes for performance
5. **CLIP Comparison**: Compare with bi-encoder scores
6. **Performance**: Benchmark speed and throughput
7. **Quality**: Validate scoring with diverse queries

---

## üéØ Success Criteria

- ‚úÖ BLIP-2 loads successfully on GPU
- ‚úÖ Scoring produces interpretable results (0-1 range)
- ‚úÖ Batch processing < 2 seconds for 100 pairs
- ‚úÖ BLIP-2 provides different perspective than CLIP

## 1Ô∏è‚É£ Setup & Dependencies

In [None]:
# Check core dependencies
import sys
from pathlib import Path

print("Checking dependencies...\n")

# PyTorch
try:
    import torch
    print(f"‚úì PyTorch: {torch.__version__}")
    print(f"  CUDA available: {torch.cuda.is_available()}")
    if torch.cuda.is_available():
        print(f"  GPU: {torch.cuda.get_device_name(0)}")
        print(f"  GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
except ImportError:
    print("‚úó PyTorch not installed")

# Transformers (for BLIP-2)
try:
    from transformers import Blip2Processor, Blip2ForConditionalGeneration
    import transformers
    print(f"\n‚úì Transformers: {transformers.__version__}")
except ImportError:
    print("\n‚úó Transformers not installed")
    print("Install with: pip install transformers accelerate")

# Other dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
import time
from tqdm.auto import tqdm

print("\n‚úì All imports successful")

In [None]:
# Verify paths are set from Kaggle setup cells above
print(f"Images dir: {IMAGES_DIR}")
print(f"Captions file: {CAPTIONS_FILE}")
print(f"Images exist: {IMAGES_DIR.exists()}")
print(f"Captions exist: {CAPTIONS_FILE.exists()}")

if IMAGES_DIR.exists():
    num_images = len(list(IMAGES_DIR.glob('*.jpg')))
    print(f"‚úì Found {num_images} images")
else:
    print("‚ö† Images directory not found. Make sure you ran the setup cells above!")

## 2Ô∏è‚É£ Load BLIP-2 Model

In [None]:
# Import and initialize BLIP-2 cross-encoder
try:
    from src.retrieval.cross_encoder import CrossEncoder
except ImportError:
    # Add project root if not already in path
    import sys
    from pathlib import Path
    project_root = Path.cwd() if Path.cwd().name != 'notebooks' else Path.cwd().parent
    if project_root not in sys.path:
        sys.path.insert(0, str(project_root))
    from src.retrieval.cross_encoder import CrossEncoder

print("Loading BLIP-2 from Hugging Face...")
print("Model: Salesforce/blip2-opt-2.7b (~3GB with FP16, fits P100 16GB GPU alongside CLIP)")

encoder = CrossEncoder(
    model_name='Salesforce/blip2-opt-2.7b',
    device='cuda' if torch.cuda.is_available() else 'cpu',
    use_fp16=True  # Important for GPU memory efficiency
)

print("\n‚úì BLIP-2 model loaded successfully!")
print(f"Device: {encoder.device}")
print(f"FP16: {encoder.use_fp16}")

In [None]:
# Display model information
print("Model Information:")
print(f"  Model class: {encoder.model.__class__.__name__}")
print(f"  Device: {encoder.device}")
print(f"  FP16 enabled: {encoder.use_fp16}")
print(f"  Default batch size: {encoder.batch_size}")
print(f"  Max batch size: {encoder.max_batch_size}")

if torch.cuda.is_available():
    print(f"\nGPU Memory Usage:")
    print(f"  Allocated: {torch.cuda.memory_allocated(0) / 1e9:.2f} GB")
    print(f"  Reserved: {torch.cuda.memory_reserved(0) / 1e9:.2f} GB")

## 3Ô∏è‚É£ Test Single Pair Scoring

In [None]:
# Load a test image
if IMAGES_DIR.exists():
    test_images = list(IMAGES_DIR.glob('*.jpg'))[:5]
    
    if len(test_images) > 0:
        test_image_path = test_images[0]
        
        # Display the image
        img = Image.open(test_image_path)
        plt.figure(figsize=(8, 8))
        plt.imshow(img)
        plt.axis('off')
        plt.title(f"Test Image: {test_image_path.name}", fontsize=14)
        plt.tight_layout()
        plt.show()
        
        print(f"Image: {test_image_path.name}")
        print(f"Size: {img.size}")
    else:
        print("No images found in directory")
else:
    print("Images directory not found")

In [None]:
# Test with different queries
test_queries = [
    "A dog playing in the park",
    "People at a beach",
    "A colorful outdoor scene",
    "Children playing together",
    "Random unrelated text xyz123",
]

print(f"Testing single pair scoring\n")
print(f"Image: {test_image_path.name}\n")
print("Query ‚Üí Score")
print("-" * 60)

scores_dict = {}
for query in test_queries:
    score = encoder.score_pair(query, test_image_path)
    scores_dict[query] = score
    print(f"{query:40s} ‚Üí {score:.4f}")

print("\n‚úì Single pair scoring works!")
print(f"Score range: [{min(scores_dict.values()):.4f}, {max(scores_dict.values()):.4f}]")

## 4Ô∏è‚É£ Batch Processing Tests

In [None]:
# Prepare test data
n_pairs = 10
batch_test_images = list(IMAGES_DIR.glob('*.jpg'))[:n_pairs]
batch_test_queries = [
    "A photograph of people",
    "An outdoor scene",
    "Children playing",
    "A colorful image",
    "An action scene",
    "A landscape view",
    "People in a setting",
    "An indoor environment",
    "A busy scene",
    "A peaceful moment"
][:n_pairs]

print(f"Prepared {len(batch_test_images)} image-query pairs for batch testing")

In [None]:
# Test different batch sizes
batch_sizes = [1, 2, 4, 8]
results = []

print("Testing different batch sizes...\n")

for batch_size in batch_sizes:
    print(f"Batch size: {batch_size}")
    
    start = time.time()
    scores = encoder.score_pairs(
        batch_test_queries,
        batch_test_images,
        batch_size=batch_size,
        show_progress=True
    )
    elapsed = time.time() - start
    
    results.append({
        'batch_size': batch_size,
        'total_time': elapsed,
        'time_per_pair': elapsed / len(batch_test_queries),
        'mean_score': scores.mean(),
        'std_score': scores.std()
    })
    
    print(f"  Time: {elapsed:.2f}s ({elapsed/len(batch_test_queries)*1000:.1f}ms per pair)")
    print(f"  Mean score: {scores.mean():.4f} ¬± {scores.std():.4f}\n")

# Display results table
results_df = pd.DataFrame(results)
print("\n" + "="*70)
print("Batch Size Performance Comparison")
print("="*70)
print(results_df.to_string(index=False))
print("\n‚úì Optimal batch size: " + str(results_df.loc[results_df['time_per_pair'].idxmin(), 'batch_size']))

## 5Ô∏è‚É£ CLIP vs BLIP-2 Comparison

In [None]:
# Load CLIP bi-encoder for comparison
try:
    from src.retrieval.bi_encoder import BiEncoder
except ImportError:
    import sys
    from pathlib import Path
    project_root = Path.cwd() if Path.cwd().name != 'notebooks' else Path.cwd().parent
    if project_root not in sys.path:
        sys.path.insert(0, str(project_root))
    from src.retrieval.bi_encoder import BiEncoder

print("Loading CLIP bi-encoder...")
clip_encoder = BiEncoder(model_name='ViT-B-32', pretrained='openai')
print("‚úì CLIP loaded")

In [None]:
# Compare on same pairs
comparison_queries = batch_test_queries[:5]
comparison_images = batch_test_images[:5]

print("Computing CLIP scores...")
clip_img_embs = clip_encoder.encode_images(comparison_images, show_progress=False)
clip_text_embs = clip_encoder.encode_texts(comparison_queries, show_progress=False)
clip_scores = (clip_img_embs * clip_text_embs).sum(axis=1)

print("Computing BLIP-2 scores...")
blip2_scores = encoder.score_pairs(
    queries=comparison_queries,
    candidates=comparison_images,
    query_type='text',
    candidate_type='image',
    batch_size=4,
    show_progress=False
)

# Create comparison table
comparison_df = pd.DataFrame({
    'Query': [q[:30] + '...' if len(q) > 30 else q for q in comparison_queries],
    'Image': [img.name for img in comparison_images],
    'CLIP': clip_scores,
    'BLIP-2': blip2_scores,
    'Diff': blip2_scores - clip_scores
})

print("\n" + "="*80)
print("CLIP (Bi-Encoder) vs BLIP-2 (Cross-Encoder) Comparison")
print("="*80)
print(comparison_df.to_string(index=False))

In [None]:
# Visualize comparison
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Bar chart comparison
x = np.arange(len(clip_scores))
width = 0.35

axes[0].bar(x - width/2, clip_scores, width, label='CLIP', alpha=0.8)
axes[0].bar(x + width/2, blip2_scores, width, label='BLIP-2', alpha=0.8)
axes[0].set_xlabel('Query Index', fontsize=12)
axes[0].set_ylabel('Score', fontsize=12)
axes[0].set_title('CLIP vs BLIP-2 Scores', fontsize=14, fontweight='bold')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Scatter plot correlation
axes[1].scatter(clip_scores, blip2_scores, alpha=0.7, s=100, edgecolors='black')
axes[1].plot([0, 1], [0, 1], 'r--', alpha=0.5, linewidth=2, label='y=x')
axes[1].set_xlabel('CLIP Score', fontsize=12)
axes[1].set_ylabel('BLIP-2 Score', fontsize=12)
axes[1].set_title('Score Correlation', fontsize=14, fontweight='bold')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Compute correlation
correlation = np.corrcoef(clip_scores, blip2_scores)[0, 1]
print(f"\nCorrelation between CLIP and BLIP-2: {correlation:.3f}")

## 6Ô∏è‚É£ Performance Benchmark

In [None]:
# Benchmark: Score 100 pairs (simulating reranking top-100)
n_benchmark = 100
available_images = list(IMAGES_DIR.glob('*.jpg'))

if len(available_images) < n_benchmark:
    n_benchmark = len(available_images)
    print(f"‚ö† Only {n_benchmark} images available")

benchmark_images = available_images[:n_benchmark]
benchmark_queries = ["A photograph"] * n_benchmark  # Same query for all

print(f"Benchmarking BLIP-2 with {n_benchmark} pairs...")
print("This simulates reranking top-100 bi-encoder results\n")

start = time.time()
scores = encoder.score_pairs(
    queries=benchmark_queries,
    candidates=benchmark_images,
    query_type='text',
    candidate_type='image',
    batch_size=8,
    show_progress=True
)
elapsed = time.time() - start

print(f"\n" + "="*70)
print("Benchmark Results")
print("="*70)
print(f"Total time:        {elapsed:.2f}s")
print(f"Time per pair:     {elapsed/n_benchmark*1000:.1f}ms")
print(f"Throughput:        {n_benchmark/elapsed:.1f} pairs/second")
print(f"\nTarget (Week 1):   < 30 seconds for 100 pairs")
print(f"Status:            {'‚úì PASS' if elapsed < 30 else '‚ö† SLOW'}")
print("="*70)

In [None]:
# Visualize score distribution
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Histogram
axes[0].hist(scores, bins=30, alpha=0.7, edgecolor='black', color='steelblue')
axes[0].axvline(scores.mean(), color='red', linestyle='--', linewidth=2, label=f'Mean: {scores.mean():.3f}')
axes[0].axvline(np.median(scores), color='green', linestyle='--', linewidth=2, label=f'Median: {np.median(scores):.3f}')
axes[0].set_xlabel('BLIP-2 Score', fontsize=12)
axes[0].set_ylabel('Frequency', fontsize=12)
axes[0].set_title(f'Score Distribution (n={n_benchmark})', fontsize=14, fontweight='bold')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Box plot
axes[1].boxplot(scores, vert=True, patch_artist=True,
                boxprops=dict(facecolor='lightblue', alpha=0.7),
                medianprops=dict(color='red', linewidth=2))
axes[1].set_ylabel('BLIP-2 Score', fontsize=12)
axes[1].set_title('Score Statistics', fontsize=14, fontweight='bold')
axes[1].grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

# Print statistics
print("\nScore Statistics:")
print(f"  Mean:    {scores.mean():.4f}")
print(f"  Median:  {np.median(scores):.4f}")
print(f"  Std:     {scores.std():.4f}")
print(f"  Min:     {scores.min():.4f}")
print(f"  Max:     {scores.max():.4f}")
print(f"  Range:   {scores.max() - scores.min():.4f}")

## 7Ô∏è‚É£ Quality Validation

In [None]:
# Test with diverse queries on the same image
diverse_queries = [
    "people standing together",
    "outdoor sunny day",
    "children playing",
    "buildings and architecture",
    "animals in nature",
    "food on a table",
    "sports activity",
    "night scene with lights",
]

print(f"Testing query diversity on: {test_image_path.name}\n")

diverse_scores = []
for query in diverse_queries:
    score = encoder.score_pair(query, test_image_path)
    diverse_scores.append(score)
    print(f"{query:30s} ‚Üí {score:.4f}")

# Plot
plt.figure(figsize=(10, 6))
plt.barh(diverse_queries, diverse_scores, color='steelblue', alpha=0.8, edgecolor='black')
plt.xlabel('BLIP-2 Score', fontsize=12)
plt.title(f'Query Diversity Test - {test_image_path.name}', fontsize=14, fontweight='bold')
plt.grid(True, alpha=0.3, axis='x')
plt.tight_layout()
plt.show()

print(f"\n‚úì Score range: [{min(diverse_scores):.4f}, {max(diverse_scores):.4f}]")
print(f"Score variance: {np.var(diverse_scores):.4f}")

## 8Ô∏è‚É£ Summary & Next Steps

In [None]:
# Create summary report
summary = f"""
{'='*70}
PHASE 3 WEEK 1: BLIP-2 EXPLORATION SUMMARY
{'='*70}

‚úì ACCOMPLISHMENTS:
  ‚Ä¢ BLIP-2 successfully loaded from Hugging Face
  "  ‚Ä¢ Model: Salesforce/blip2-opt-2.7b (~3GB with FP16, fits P100 alongside CLIP)\n",
  ‚Ä¢ Device: {encoder.device}
  ‚Ä¢ FP16 enabled: {encoder.use_fp16}
  
‚úì PERFORMANCE:
  ‚Ä¢ Single pair scoring: ‚úì Working
  ‚Ä¢ Batch processing: ‚úì Optimized
  ‚Ä¢ Benchmark ({n_benchmark} pairs): {elapsed:.2f}s
  ‚Ä¢ Throughput: {n_benchmark/elapsed:.1f} pairs/second
  ‚Ä¢ Target status: {'‚úì PASS' if elapsed < 30 else '‚ö† NEEDS OPTIMIZATION'}

‚úì QUALITY:
  ‚Ä¢ Score range: [0, 1] (interpretable)
  ‚Ä¢ Uses yes/no probability method
  ‚Ä¢ Different perspective than CLIP
  ‚Ä¢ Correlation with CLIP: {correlation:.3f}

üìã WEEK 2 TASKS (Nov 4-30):
  1. Implement reranking function
  2. Create HybridRetriever class
  3. Integrate bi-encoder + cross-encoder pipeline
  4. Implement evaluation metrics (Recall@K, Precision@K, MRR)
  5. Build demo notebook for hybrid retrieval
  6. Compare hybrid vs bi-encoder performance

üéØ SUCCESS CRITERIA FOR WEEK 2:
  ‚Ä¢ Reranking time < 30 seconds for top-100
  ‚Ä¢ Recall@10 improvement: +15-20% over bi-encoder
  ‚Ä¢ Precision@10 > 60%
  ‚Ä¢ Full hybrid pipeline working end-to-end

{'='*70}
"""

print(summary)

---

## ‚úÖ Checkpoint

**Week 1 Complete!** All BLIP-2 integration tests passed.

Next: Proceed to Week 2 - Hybrid Retrieval Pipeline