# Minimal Late Fusion Example with Paired Multimodal Data (Flickr8k)
  
[View on Google Colab](https://colab.research.google.com/drive/1F9Tek26MLHys1uE5s9YFTzDqcjgwVA63?usp=sharing)

### Import the necessary libraries

In [25]:
# !pip install torch datasets transformers
# !pip install "numpy<2.0.0"

import numpy as np
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel, AutoImageProcessor

# For transformers specific warnings
import transformers
transformers.logging.set_verbosity_error()

# For datasets warnings
import datasets
datasets.logging.set_verbosity_error()

---

### Load the Dataset from HuggingFace

In [26]:
def load_flickr8k_samples(num_samples=5):
    """
    Fetch the first few image-caption pairs of the Flickr8k training split.

    Only the "caption_0" field of each record is used as that image's caption.

    Args:
        num_samples (int): How many leading records of the split to keep.

    Returns:
        images (list): The "image" field of each selected record.
        captions (list): The "caption_0" field of each selected record,
            in the same order as ``images``.
    """
    subset = load_dataset("jxie/flickr8k", split="train").select(range(num_samples))

    # Walk the subset once so every record is read a single time.
    pairs = [(record["image"], record["caption_0"]) for record in subset]

    images = [picture for picture, _ in pairs]
    captions = [text for _, text in pairs]
    return images, captions


In [None]:
# Load the default number of image-caption pairs and show what was loaded.
images, captions = load_flickr8k_samples()
print(f"Loaded {len(images)} image-caption pairs")
print(f"Captions: {captions}")

---

### Generate Text and Image Embeddings

In [16]:
def get_text_embeddings(texts, model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """
    Get one embedding per text using a Hugging Face model.

    Each text is tokenized, run through the model, and its token embeddings
    are mean-pooled into a single vector. The mean is weighted by the
    tokenizer's attention mask so that padding tokens, added to bring the
    batch to a common length, do not dilute the embeddings of shorter texts.

    Args:
        texts (list of str): Input texts.
        model_name (str): Hugging Face model name.

    Returns:
        np.ndarray: Embedding matrix with one row per input text.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
        token_embeddings = outputs.last_hidden_state

        if "attention_mask" in inputs:
            # Zero out padded positions, then divide by the number of real
            # tokens instead of the padded sequence length.
            mask = inputs["attention_mask"].unsqueeze(-1).to(token_embeddings.dtype)
            summed = (token_embeddings * mask).sum(dim=1)
            # Clamp guards against division by zero for an all-padding row.
            token_counts = mask.sum(dim=1).clamp(min=1e-9)
            pooled = summed / token_counts
        else:
            # Without a mask there is nothing to exclude: plain mean.
            pooled = token_embeddings.mean(dim=1)

        embeddings = pooled.cpu().numpy()
    return embeddings

In [None]:
# Embed the loaded captions with the default text model and report the matrix shape.
text_embeds = get_text_embeddings(captions)
print("Text embeddings shape:", text_embeds.shape)

---

In [18]:
def get_image_embeddings(images, model_name="google/vit-base-patch16-224"):
    """
    Embed a batch of images with a Hugging Face vision model.

    The images are preprocessed by the model's image processor, passed
    through the model without gradient tracking, and the hidden state at the
    first sequence position is taken as each image's embedding.

    Args:
        images (list of PIL.Image): Input images.
        model_name (str): Hugging Face model name.

    Returns:
        np.ndarray: Embedding matrix with one row per input image.
    """
    image_processor = AutoImageProcessor.from_pretrained(model_name)
    vision_model = AutoModel.from_pretrained(model_name)
    pixel_inputs = image_processor(images, return_tensors="pt")

    with torch.no_grad():
        hidden_states = vision_model(**pixel_inputs).last_hidden_state
        # Position 0 along the sequence axis is used as the image summary.
        first_token = hidden_states[:, 0, :]
        result = first_token.cpu().numpy()
    return result

In [None]:
# Embed the loaded images with the default vision model and report the matrix shape.
image_embeds = get_image_embeddings(images)
print("Image embeddings shape:", image_embeds.shape)

---

### Create Dummy Classifiers

In [20]:
def text_dummy_classifier(embeddings):
    """
    Stand-in text classifier that turns each embedding into a pseudo-score.

    The per-row mean of the embedding matrix is scaled, perturbed with
    Gaussian noise from NumPy's global random state, and squashed through a
    sigmoid. Results therefore vary between calls unless that state is seeded.

    Args:
        embeddings (np.ndarray): Text embedding matrix, one row per sample.

    Returns:
        np.ndarray: One score per row in [0, 1], rounded to 2 decimals.
    """
    row_means = embeddings.mean(axis=1)
    # Random jitter so the demo scores are not a pure function of the mean.
    jitter = np.random.normal(0, 0.5, len(row_means))
    logits = row_means * 10 + jitter
    probabilities = 1 / (1 + np.exp(-logits))  # logistic sigmoid
    return np.round(probabilities, decimals=2)

In [21]:
def image_dummy_classifier(embeddings):
    """
    Stand-in image classifier that turns each embedding into a pseudo-score.

    The per-row mean of the embedding matrix is scaled, perturbed with
    Gaussian noise from NumPy's global random state, and squashed through a
    sigmoid. Results therefore vary between calls unless that state is seeded.

    Args:
        embeddings (np.ndarray): Image embedding matrix, one row per sample.

    Returns:
        np.ndarray: One score per row in [0, 1], rounded to 2 decimals.
    """
    row_means = embeddings.mean(axis=1)
    # Random jitter so the demo scores are not a pure function of the mean.
    jitter = np.random.normal(0, 0.8, len(row_means))
    logits = row_means * 15 + jitter
    probabilities = 1 / (1 + np.exp(-logits))  # logistic sigmoid
    return np.round(probabilities, decimals=2)

In [None]:
# Score each modality independently; these per-modality scores are what late fusion combines.
text_scores = text_dummy_classifier(text_embeds)
image_scores = image_dummy_classifier(image_embeds)
print("Text scores:", text_scores)
print("Image scores:", image_scores)

---

### Implement Late Fusion

In [23]:
def late_fusion(scores, weights, threshold=0.4):
    """
    Weighted-sum late fusion of text and image scores with a printed breakdown.

    For every sample the text and image scores are multiplied by their
    weights and added. The fused score is then compared with ``threshold`` to
    assign a binary label. A table with each intermediate value is printed.

    Args:
        scores (list of np.ndarray): Two equally long score arrays, ordered
            ``[text_scores, image_scores]``.
        weights (list of float): Two weights, ordered
            ``[text_weight, image_weight]``.
        threshold (float): Fused scores at or above this value are labelled
            "A", all others "B" (default 0.4).

    Returns:
        tuple: ``(fused_scores, binary_labels)`` where ``fused_scores`` is an
        np.ndarray rounded to 2 decimals and ``binary_labels`` is a list of
        "A"/"B" strings, one per sample.

    Raises:
        ValueError: If the text and image score arrays differ in length.
    """
    text_scores, image_scores = scores
    text_weight, image_weight = weights

    # Paired data: refuse to silently drop or over-index unmatched samples.
    if len(text_scores) != len(image_scores):
        raise ValueError(
            "text and image score arrays must have the same length, got "
            f"{len(text_scores)} and {len(image_scores)}"
        )

    # Floating-point sums such as 0.05 + 0.35 land a hair below 0.4; a tiny
    # tolerance keeps the label consistent with the displayed/returned score.
    tolerance = 1e-9

    print("\nDetailed fusion calculation:")
    print("Sample | Text Score | Image Score | Text*Weight | Image*Weight | Fused Score | Label")
    print("-" * 85)

    fused_scores = []
    binary_labels = []

    for i, (text_score, image_score) in enumerate(zip(text_scores, image_scores)):
        text_weighted = text_score * text_weight
        image_weighted = image_score * image_weight
        fused = text_weighted + image_weighted
        fused_scores.append(fused)

        # Binary classification based on threshold
        label = "A" if fused >= threshold - tolerance else "B"
        binary_labels.append(label)

        print(f"   {i}   |    {text_score:.2f}    |     {image_score:.2f}     |    {text_weighted:.2f}     |     {image_weighted:.2f}     |    {fused:.2f}     |   {label}")

    return np.round(np.array(fused_scores), 2), binary_labels

In [None]:
# Fuse the two modalities with equal weights, using late_fusion's default threshold.
fused_scores, labels = late_fusion([text_scores, image_scores], [0.5, 0.5])
print(f"\nFused scores: {fused_scores}")
print(f"Binary labels: {labels}")

In [None]:
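# Notes on device placement: data, model

# Hardware: CPU, GPU

# Data/model: created on the CPU, then moved to the GPU (data/model -> CPU -> GPU)
# Outputs: produced on the GPU (outputs -> GPU)

# Outputs must be moved back before CPU-side use (outputs -> GPU -> CPU)

# Example: array (on GPU) -> array.cpu() (on CPU)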

---