# CLIP Tutorial Using HuggingFace

[View on Google Colab](https://colab.research.google.com/drive/1tpVJFdg5_7_k-Bsthcap3MNADgFiqHtx?usp=sharing)

Contents covered:
1. Loading CLIP Models from HuggingFace
2. Loading a Sample Image from a URL
3. Zero-Shot Image Classification
4. Computing Image-Text Similarity
5. Image Feature Extraction
6. Text Feature Extraction
7. Similarity Matrix Computation

### Import the necessary libraries

In [1]:
# !pip install torch transformers pillow matplotlib
# !pip install "numpy<2.0.0"

from io import BytesIO
from typing import List, Union, Tuple

import matplotlib.pyplot as plt
import numpy as np
import requests
import torch
from PIL import Image
from transformers import (
    CLIPProcessor, 
    CLIPModel, 
    CLIPTokenizer, 
    CLIPImageProcessor,
    pipeline
)

---

### Load CLIP Model from HuggingFace

In [None]:
def load_clip_model(model_name: str = "openai/clip-vit-base-patch32") -> Tuple[CLIPModel, CLIPProcessor]:
    """
    Fetch a pretrained CLIP checkpoint together with its processor.

    Args:
        model_name (str): Identifier handed to ``from_pretrained`` for both
            the model and the processor

    Returns:
        Tuple[CLIPModel, CLIPProcessor]: The model (already switched to
            evaluation mode) and the processor
    """
    print(f"Loading CLIP model: {model_name}")
    clip_model = CLIPModel.from_pretrained(model_name)
    clip_processor = CLIPProcessor.from_pretrained(model_name)

    # The tutorial only runs inference, so take the model out of training mode
    clip_model.eval()

    config = clip_model.config
    for message in (
        "Model loaded successfully!",
        f"Vision config: {config.vision_config}",
        f"Text config: {config.text_config}",
    ):
        print(message)

    return clip_model, clip_processor

# Execute: load the default CLIP checkpoint; `model` and `processor` are
# reused by the cells below
banner = "=" * 60
print(banner)
print("LOADING CLIP MODEL")
print(banner)
model, processor = load_clip_model()

---

### Load Sample Image from URL

In [None]:
def load_image_from_url(url: str, timeout: float = 30.0) -> Image.Image:
    """
    Download an image from a URL and return it fully decoded.

    The response body is read into memory and the HTTP response is closed
    before this function returns, so the returned image does not depend on
    a live network stream.

    Args:
        url (str): URL of the image
        timeout (float): Seconds passed to ``requests.get`` as its timeout

    Returns:
        Image.Image: PIL Image object

    Raises:
        Exception: Any error raised while downloading or decoding is printed
            and then re-raised unchanged
    """
    try:
        # Without a timeout, requests can wait indefinitely on a stalled server
        with requests.get(url, timeout=timeout) as response:
            response.raise_for_status()
            # Use .content rather than .raw: .content has any transfer
            # encoding (e.g. gzip) already decoded
            payload = response.content

        image = Image.open(BytesIO(payload))
        # Image.open is lazy; force decoding now so corrupt data fails here
        # instead of at first use
        image.load()
        return image
    except Exception as e:
        print(f"Error loading image from URL: {e}")
        raise


# Execute: download the sample image used by every later cell
banner = "=" * 60
print("\n" + banner)
print("LOADING SAMPLE IMAGE")
print(banner)
image_url = "http://images.cocodataset.org/val2017/000000039769.jpg"
sample_image = load_image_from_url(image_url)
print(f"Image loaded from: {image_url}")
print(f"Image size: {sample_image.size}")

---

### Zero-Shot Image Classification

In [None]:
def zero_shot_image_classification(
    image: Union[str, Image.Image], 
    candidate_labels: List[str],
    model_name: str = "openai/clip-vit-base-patch32"
) -> List[dict]:
    """
    Perform zero-shot image classification using CLIP pipeline.

    A new pipeline is built on every call, so the model is loaded each time
    this function runs.

    Args:
        image (Union[str, Image.Image]): Image URL or PIL Image
        candidate_labels (List[str]): List of possible labels
        model_name (str): CLIP model name

    Returns:
        List[dict]: One entry per result, each with a ``'label'`` and a
            ``'score'`` key
    """
    print("Performing zero-shot image classification...")

    # Use half precision on the first GPU when CUDA is available,
    # otherwise full precision on CPU
    use_cuda = torch.cuda.is_available()

    # Create pipeline
    clip_pipeline = pipeline(
        task="zero-shot-image-classification",
        model=model_name,
        torch_dtype=torch.float16 if use_cuda else torch.float32,
        device=0 if use_cuda else -1
    )

    # Perform classification
    results = clip_pipeline(image, candidate_labels=candidate_labels)

    print("Classification Results:")
    for result in results:
        print(f"  {result['label']}: {result['score']:.4f}")

    return results


# Execute: classify the sample image against four prompt-style labels
banner = "=" * 60
print("\n" + banner)
print("ZERO-SHOT IMAGE CLASSIFICATION")
print(banner)
labels = [
    "a photo of a cat",
    "a photo of a dog",
    "a photo of a car",
    "a photo of a bird",
]
classification_results = zero_shot_image_classification(sample_image, labels)


---

### Compute Image-Text Similarity

In [None]:
def compute_image_text_similarity(
    image: Union[str, Image.Image],
    texts: List[str],
    model: CLIPModel,
    processor: CLIPProcessor
) -> np.ndarray:
    """
    Score an image against several text descriptions with CLIP.

    The model's image-to-text logits are passed through a softmax over the
    texts, so the returned values are probabilities that sum to 1 across
    ``texts`` rather than raw cosine similarities.

    Args:
        image (Union[str, Image.Image]): Image URL or PIL Image
        texts (List[str]): List of text descriptions
        model (CLIPModel): CLIP model
        processor (CLIPProcessor): CLIP processor

    Returns:
        np.ndarray: Softmax probabilities; row 0 holds one value per entry
            of ``texts``, in the same order
    """
    print("Computing image-text similarities...")

    # Load image if URL provided
    if isinstance(image, str):
        image = load_image_from_url(image)

    # Build the joint text/image batch and place it on the same device as
    # the model, so this also works when the model lives on a GPU
    inputs = processor(text=texts, images=image, return_tensors="pt", padding=True)
    inputs = inputs.to(model.device)

    # Inference only, so skip gradient tracking
    with torch.no_grad():
        outputs = model(**inputs)

    # Softmax along dim=1 normalizes across the texts
    probs = outputs.logits_per_image.softmax(dim=1)

    print("Similarity Scores:")
    for text, prob in zip(texts, probs[0]):
        print(f"  '{text}': {prob.item():.4f}")

    # .numpy() only works on CPU tensors; .cpu() is a no-op if already there
    return probs.cpu().numpy()


# Execute: score the sample image against five free-form descriptions;
# `text_descriptions` and `similarities` are reused by later cells
banner = "=" * 60
print("\n" + banner)
print("IMAGE-TEXT SIMILARITY")
print(banner)
text_descriptions = [
    "two cats lying on a couch",
    "dogs playing in a park",
    "a car driving on a road",
    "cats sleeping together",
    "animals resting indoors",
]
similarities = compute_image_text_similarity(
    sample_image, text_descriptions, model, processor
)

In [None]:
def visualize_results(
    image: Image.Image,
    texts: List[str],
    similarities: np.ndarray,
    title: str = "CLIP Image-Text Similarities"
):
    """
    Plot the input image beside a horizontal bar chart of its text scores.

    Args:
        image (Image.Image): Input image, drawn in the left panel
        texts (List[str]): Text descriptions, used as the bar labels
        similarities (np.ndarray): Similarity scores; only ``similarities[0]``
            is plotted
        title (str): Plot title
    """
    _, (image_ax, score_ax) = plt.subplots(1, 2, figsize=(12, 5))

    # Left panel: the image, without axis ticks
    image_ax.imshow(image)
    image_ax.axis('off')
    image_ax.set_title('Input Image')

    # Right panel: one horizontal bar per text
    positions = np.arange(len(texts))
    bars = score_ax.barh(positions, similarities[0])
    score_ax.set_yticks(positions)
    score_ax.set_yticklabels(texts)
    score_ax.set_xlabel('Similarity Score')
    score_ax.set_title('Text Similarities')
    score_ax.set_xlim(0, 1)

    # Write each score just past the end of its bar, vertically centered
    for bar in bars:
        score = bar.get_width()
        mid_y = bar.get_y() + bar.get_height() / 2
        score_ax.text(score + 0.01, mid_y, f'{score:.3f}', ha='left', va='center')

    plt.suptitle(title)
    plt.tight_layout()
    plt.show()


# Execute: plot the sample image next to the scores computed above
banner = "=" * 60
print("\n" + banner)
print("VISUALIZATION")
print(banner)
visualize_results(
    sample_image,
    text_descriptions,
    similarities,
    "CLIP Tutorial: Image-Text Similarities",
)

---

### Image Feature Extraction

In [None]:
def extract_image_features(
    images: Union[List[Image.Image], List[str]],
    model: CLIPModel,
    processor: CLIPProcessor
) -> torch.Tensor:
    """
    Extract image features using CLIP vision encoder.

    Args:
        images (Union[List[Image.Image], List[str]]): List of images or URLs;
            string entries are downloaded, other entries are used as given
        model (CLIPModel): CLIP model
        processor (CLIPProcessor): CLIP processor

    Returns:
        torch.Tensor: Image features, one row per input image, each row
            scaled to unit L2 norm. The tensor is on the model's device.
    """
    print(f"Extracting features for {len(images)} images...")

    # Load images if URLs provided
    loaded_images = [
        load_image_from_url(img) if isinstance(img, str) else img
        for img in images
    ]

    # Preprocess and place the batch on the same device as the model, so
    # this also works when the model lives on a GPU
    inputs = processor(images=loaded_images, return_tensors="pt")
    inputs = inputs.to(model.device)

    # Inference only, so skip gradient tracking
    with torch.no_grad():
        image_features = model.get_image_features(**inputs)

    # Divide each row by its L2 norm so dot products between rows become
    # cosine similarities
    image_features = image_features / image_features.norm(dim=1, keepdim=True)

    print(f"Extracted features shape: {image_features.shape}")
    return image_features


# Execute: embed the sample image (passed as a one-element batch)
banner = "=" * 60
print("\n" + banner)
print("IMAGE FEATURE EXTRACTION")
print(banner)
image_features = extract_image_features([sample_image], model, processor)

---

### Extract Text Features

In [None]:
def extract_text_features(
    texts: List[str],
    model: CLIPModel,
    processor: CLIPProcessor
) -> torch.Tensor:
    """
    Extract text features using CLIP text encoder.

    Args:
        texts (List[str]): List of text descriptions
        model (CLIPModel): CLIP model
        processor (CLIPProcessor): CLIP processor

    Returns:
        torch.Tensor: Text features, one row per input text, each row scaled
            to unit L2 norm. The tensor is on the model's device.
    """
    print(f"Extracting features for {len(texts)} texts...")

    # Tokenize with padding so texts of different lengths form one batch,
    # then place the batch on the same device as the model so this also
    # works when the model lives on a GPU
    inputs = processor(text=texts, return_tensors="pt", padding=True)
    inputs = inputs.to(model.device)

    # Inference only, so skip gradient tracking
    with torch.no_grad():
        text_features = model.get_text_features(**inputs)

    # Divide each row by its L2 norm so dot products between rows become
    # cosine similarities
    text_features = text_features / text_features.norm(dim=1, keepdim=True)

    print(f"Extracted features shape: {text_features.shape}")
    return text_features


# Execute: embed the text descriptions defined earlier
banner = "=" * 60
print("\n" + banner)
print("TEXT FEATURE EXTRACTION")
print(banner)
text_features = extract_text_features(text_descriptions, model, processor)


---

### Compute Similarity Matrix

In [None]:
def compute_similarity_matrix(
    image_features: torch.Tensor,
    text_features: torch.Tensor
) -> torch.Tensor:
    """
    Multiply image features by transposed text features.

    No normalization happens here: the result is a cosine similarity only
    when the rows of both inputs already have unit length.

    Args:
        image_features (torch.Tensor): Image features, one row per image
        text_features (torch.Tensor): Text features, one row per text

    Returns:
        torch.Tensor: Similarity matrix with a row per image and a column
            per text
    """
    # Row-by-row dot products: entry [i, j] pairs image i with text j
    scores = image_features @ text_features.T

    print(f"Similarity matrix shape: {scores.shape}")
    return scores


# Execute: compare the image embedding with every text embedding
banner = "=" * 60
print("\n" + banner)
print("SIMILARITY MATRIX COMPUTATION")
print(banner)
similarity_matrix = compute_similarity_matrix(image_features, text_features)
print(f"Similarity scores: {similarity_matrix[0].tolist()}")


---