# Smart Product Cataloger - Assignment

[View on Google Colab](https://colab.research.google.com/drive/1fplzzeYAi-bo1oRrwC2hGnVN_OlbQysR?usp=sharing)

Week 8: Multimodal AI for E-commerce Product Analysis

OBJECTIVE: Build an AI system that can automatically analyze product images
and generate metadata for e-commerce listings using CLIP and BLIP models.

LEARNING GOALS:
- Use CLIP for zero-shot image classification
- Use BLIP for image captioning and visual question answering
- Combine multiple AI models for practical applications
- Build a complete product analysis pipeline

---

### Import the necessary libraries

In [1]:
import torch
from transformers import (
    CLIPProcessor, CLIPModel,
    BlipProcessor, BlipForConditionalGeneration, BlipForQuestionAnswering,
    pipeline
)
from PIL import Image
import requests
import numpy as np
from typing import Dict, List, Union

# Suppress warnings for cleaner output
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Shared model handles. They start out empty and are declared `global` by
# load_models(), so the models only have to be loaded once per session.
clip_model = clip_processor = None
blip_caption_model = blip_caption_processor = None
blip_vqa_model = blip_vqa_processor = None

---

### Load the Models from HuggingFace

In [None]:
def load_models():
    """
    Load all required models for product analysis

    Populates the module-level handles clip_model, clip_processor,
    blip_caption_model, blip_caption_processor, blip_vqa_model and
    blip_vqa_processor from the following checkpoints:

    - CLIP: "openai/clip-vit-base-patch32"
    - BLIP Caption: "Salesforce/blip-image-captioning-base"
    - BLIP VQA: "Salesforce/blip-vqa-base"

    A model/processor pair that is already populated is left untouched, so
    calling this function a second time does not reload it.

    Exceptions raised by from_pretrained() are not caught here; they propagate
    to the caller, and the success message is only printed when every load
    finished.
    """
    global clip_model, clip_processor, blip_caption_model, blip_caption_processor, blip_vqa_model, blip_vqa_processor

    clip_checkpoint = "openai/clip-vit-base-patch32"
    caption_checkpoint = "Salesforce/blip-image-captioning-base"
    vqa_checkpoint = "Salesforce/blip-vqa-base"

    print("🚀 Loading models for Smart Product Cataloger...")

    # CLIP: zero-shot image classification
    if clip_model is None or clip_processor is None:
        clip_model = CLIPModel.from_pretrained(clip_checkpoint)
        clip_processor = CLIPProcessor.from_pretrained(clip_checkpoint)
        clip_model.eval()  # inference only

    # BLIP: image captioning
    if blip_caption_model is None or blip_caption_processor is None:
        blip_caption_model = BlipForConditionalGeneration.from_pretrained(caption_checkpoint)
        blip_caption_processor = BlipProcessor.from_pretrained(caption_checkpoint)
        blip_caption_model.eval()  # inference only

    # BLIP: visual question answering
    if blip_vqa_model is None or blip_vqa_processor is None:
        blip_vqa_model = BlipForQuestionAnswering.from_pretrained(vqa_checkpoint)
        blip_vqa_processor = BlipProcessor.from_pretrained(vqa_checkpoint)
        blip_vqa_model.eval()  # inference only

    print("✅ All models loaded successfully!")

# TEST: Load models
# Runs the loader once when this cell executes. load_models() declares the six
# model handles as globals, so this call is what is expected to populate them.
print("🔧 TESTING: Loading models...")
load_models()
print()  # blank line to separate this test's output from the next cell's

---

### Load Image from URL
  
You can extend this function to also load an image from a local file path and to apply any preprocessing the models need (for example, converting the image to RGB).

In [None]:
def load_image_from_url(url: str) -> Union[Image.Image, None]:
    """
    Load an image from a URL

    Args:
        url (str): URL of the image to load

    Returns:
        Image.Image: PIL Image object converted to RGB, or None if the
        download or the decoding failed (the reason is printed)

    EXAMPLE INPUT: "https://images.unsplash.com/photo-1542291026-7eec264c27ff"
    EXAMPLE OUTPUT: PIL Image object of Nike shoes
    """
    # Imported here so the function does not rely on a new file-level import.
    from io import BytesIO

    try:
        # A timeout keeps an unresponsive server from hanging the notebook.
        response = requests.get(url, timeout=10)
        # Turn 4xx/5xx responses into exceptions instead of trying to decode
        # an HTML error page as an image.
        response.raise_for_status()
        # convert() forces a full decode and normalises palette, greyscale
        # and alpha images to three RGB channels.
        return Image.open(BytesIO(response.content)).convert("RGB")
    except (requests.RequestException, OSError) as exc:
        # RequestException: network/HTTP/URL problems.
        # OSError: PIL could not identify or decode the payload.
        print(f"❌ Failed to load image from {url}: {exc}")
        return None

# TEST: Load image
# Fetch the sample product photo; `sample_url` and `image` stay at module level
# because the test cells further down reuse them.
print("📸 TESTING: Loading image from URL...")
sample_url = "https://images.unsplash.com/photo-1542291026-7eec264c27ff"  # Nike shoes
image = load_image_from_url(sample_url)

print(f"Image loaded successfully: {image is not None}")
if image is not None:
    # Only a successfully loaded image has a size to report.
    print(f"Image size: {image.size}")
print()

---

### Product Classification using CLIP

In [None]:
def classify_product_image(image: Image.Image, candidate_labels: List[str]) -> List[Dict]:
    """
    Classify image using CLIP zero-shot classification

    Each candidate label is turned into a short text prompt, CLIP scores the
    image against every prompt, and the scores are softmax-normalised so they
    sum to 1 across the candidate labels.

    Args:
        image (Image.Image): PIL Image to classify
        candidate_labels (List[str]): List of possible categories

    Returns:
        List[Dict]: One {"label", "score"} dict per candidate label, sorted by
        score in descending order. An empty list is returned (with a printed
        warning) when the image or the labels are missing, or when the CLIP
        model has not been loaded yet.

    EXAMPLE INPUT:
        image = <PIL Image of shoes>
        candidate_labels = ["clothing", "shoes", "electronics", "furniture"]

    EXAMPLE OUTPUT:
        [
            {"label": "shoes", "score": 0.8945},
            {"label": "clothing", "score": 0.0823},
            {"label": "electronics", "score": 0.0156},
            {"label": "furniture", "score": 0.0076}
        ]
    """
    print("🔍 Classifying product category...")

    if image is None or not candidate_labels:
        print("⚠️ Nothing to classify - image or candidate labels missing")
        return []
    if clip_model is None or clip_processor is None:
        print("⚠️ CLIP model not loaded - call load_models() first")
        return []

    # CLIP compares the image with text, so wrap each label in a prompt.
    prompts = [f"a photo of {label}" for label in candidate_labels]
    inputs = clip_processor(text=prompts, images=image, return_tensors="pt", padding=True)

    with torch.no_grad():
        logits_per_image = clip_model(**inputs).logits_per_image

    # One image was passed, so row 0 holds its score for every prompt.
    scores = logits_per_image.softmax(dim=-1)[0].tolist()

    results = [
        {"label": label, "score": score}
        for label, score in zip(candidate_labels, scores)
    ]
    results.sort(key=lambda item: item["score"], reverse=True)
    return results

# TEST: Classify image
# Classifies the module-level `image` produced by the image-loading test cell
# against six product categories and prints one line per returned result.
print("🔍 TESTING: Classifying product image...")
categories = ["clothing", "shoes", "electronics", "furniture", "books", "toys"]
classification_results = classify_product_image(image, categories)

print("Classification Results:")
for result in classification_results:
    # Scores are printed with four decimal places.
    print(f"  {result['label']}: {result['score']:.4f}")

---

### Generate Product Caption

In [None]:
def generate_product_caption(image: Image.Image) -> str:
    """
    Generate a descriptive caption for the image using BLIP

    Uses the global blip_caption_model and blip_caption_processor.

    Args:
        image (Image.Image): PIL Image to caption

    Returns:
        str: Generated caption describing the image. An empty string is
        returned (with a printed warning) when the image is missing or the
        BLIP caption model has not been loaded yet.

    EXAMPLE INPUT: <PIL Image of red Nike sneakers>
    EXAMPLE OUTPUT: "a pair of red nike sneakers on a white background"
    """
    print("📝 Generating image caption...")

    if image is None:
        print("⚠️ No image provided - cannot generate a caption")
        return ""
    if blip_caption_model is None or blip_caption_processor is None:
        print("⚠️ BLIP caption model not loaded - call load_models() first")
        return ""

    # 1. Turn the image into model inputs
    inputs = blip_caption_processor(images=image, return_tensors="pt")

    # 2. Generate caption tokens; no gradients are needed for inference
    with torch.no_grad():
        out = blip_caption_model.generate(**inputs, max_new_tokens=50)

    # 3. Decode the first (only) generated sequence back to text
    caption = blip_caption_processor.decode(out[0], skip_special_tokens=True)
    return caption.strip()

# TEST: Generate caption
# Captions the module-level `image` produced by the image-loading test cell and
# prints the returned string in single quotes.
print("📝 TESTING: Generating product caption...")
caption = generate_product_caption(image)
print(f"Generated Caption: '{caption}'")

---

### Product Visual Question Answering

In [None]:
def ask_about_product(image: Image.Image, question: str) -> str:
    """
    Answer questions about the image using BLIP VQA

    Uses the global blip_vqa_model and blip_vqa_processor.

    Args:
        image (Image.Image): PIL Image to analyze
        question (str): Question to ask about the image

    Returns:
        str: Answer to the question. "unknown" is returned (with a printed
        warning) when the image is missing or the BLIP VQA model has not been
        loaded yet.

    EXAMPLE INPUT:
        image = <PIL Image of red shoes>
        question = "What color are these shoes?"

    EXAMPLE OUTPUT: "red"
    """
    print(f"❓ Answering: '{question}'")

    if image is None:
        print("⚠️ No image provided - cannot answer the question")
        return "unknown"
    if blip_vqa_model is None or blip_vqa_processor is None:
        print("⚠️ BLIP VQA model not loaded - call load_models() first")
        return "unknown"

    # 1. Encode the image together with the question
    inputs = blip_vqa_processor(images=image, text=question, return_tensors="pt")

    # 2. Generate answer tokens; no gradients are needed for inference
    with torch.no_grad():
        out = blip_vqa_model.generate(**inputs)

    # 3. Decode the first (only) generated sequence back to text
    answer = blip_vqa_processor.decode(out[0], skip_special_tokens=True)
    return answer.strip()

# TEST: Visual Question Answering
# Ask a few shoe-related questions about the module-level `image` and print
# each question/answer pair followed by a blank line.
print("❓ TESTING: Visual Question Answering...")
test_questions = [
    "What color are these shoes?",
    "What brand are these shoes?",
    "Are these sneakers or dress shoes?",
]

print("VQA Results:")
for question in test_questions:
    answer = ask_about_product(image, question)
    # One print call: question line, answer line, then an empty line.
    print(f"  Q: {question}", f"  A: {answer}", "", sep="\n")

---

### Get Category-Specific Questions

In [None]:
def get_category_questions(category: str) -> List[str]:
    """
    Generate relevant questions based on product category

    Supported categories: clothing, shoes, electronics, furniture, books, toys.
    The lookup ignores letter case and surrounding whitespace.

    Args:
        category (str): Product category (e.g., "shoes", "clothing")

    Returns:
        List[str]: List of relevant questions for the category. Generic
        questions are returned when the category is unknown or not a string.

    EXAMPLE INPUT: "shoes"
    EXAMPLE OUTPUT: [
        "What color are these shoes?",
        "What type of shoes are these?",
        "What brand are these shoes?",
        "What material are these shoes made of?"
    ]
    """
    question_map = {
        "shoes": [
            "What color are these shoes?",
            "What type of shoes are these?",
            "What brand are these shoes?",
            "What material are these shoes made of?",
        ],
        "clothing": [
            "What color is this clothing?",
            "What type of clothing is this?",
            "What material is this clothing made of?",
            "What pattern does this clothing have?",
        ],
        "electronics": [
            "What type of electronic device is this?",
            "What color is this device?",
            "What brand is this device?",
            "Does this device have a screen?",
        ],
        "furniture": [
            "What type of furniture is this?",
            "What color is this furniture?",
            "What material is this furniture made of?",
            "What room is this furniture for?",
        ],
        "books": [
            "What is the title of this book?",
            "What color is the book cover?",
            "Is this a hardcover or a paperback?",
        ],
        "toys": [
            "What type of toy is this?",
            "What color is this toy?",
            "What material is this toy made of?",
        ],
    }

    default_questions = [
        "What color is this?",
        "What type of item is this?",
        "What is this made of?",
    ]

    # Normalise so "Shoes" or " shoes " still hit the "shoes" entry.
    key = category.strip().lower() if isinstance(category, str) else ""

    # `or` also falls back to the defaults if an entry is ever left empty,
    # so a caller never receives an empty question list.
    return question_map.get(key) or default_questions

# TEST: Category questions
# Print the question list of each test category as a titled, bulleted block
# followed by a blank line.
print("📋 TESTING: Category-specific questions...")
test_categories = ["shoes", "clothing", "electronics", "furniture"]

for category in test_categories:
    questions = get_category_questions(category)
    # Header, one bullet per question, then an empty line - in one print call.
    print(f"{category.title()} Questions:", *[f"  - {q}" for q in questions], "", sep="\n")


---

### Product Analyzer

In [None]:
def analyze_product(image_url_or_pil: Union[str, Image.Image],
                    product_categories: Union[List[str], None] = None) -> Dict:
    """
    Main function to analyze a product image and generate complete metadata

    PIPELINE STEPS:
    1. Load image (if URL provided)
    2. Classify product category using CLIP
    3. Generate product description using BLIP captioning
    4. Ask category-specific questions using BLIP VQA
    5. Compile and return results

    Args:
        image_url_or_pil (Union[str, Image.Image]): Image URL or PIL Image
        product_categories (List[str], optional): Candidate categories for the
            classification step. Defaults to clothing, shoes, electronics,
            furniture, books and toys.

    Returns:
        Dict: Complete product analysis including category, description, and
        attributes, with "status": "success". When the image cannot be loaded,
        no classification is produced, or any step raises, the dict instead
        has the form {"error": <message>, "status": "failed"}.

    EXAMPLE INPUT: "https://images.unsplash.com/photo-1542291026-7eec264c27ff"
    EXAMPLE OUTPUT: {
        "category": {"name": "shoes", "confidence": 0.8945},
        "description": "a pair of red nike sneakers on a white background",
        "attributes": {
            "What color are these shoes?": "red",
            "What type of shoes are these?": "sneakers",
            "What brand are these shoes?": "nike"
        },
        "status": "success"
    }
    """
    print("🚀 Starting product analysis...")
    print("=" * 50)

    if product_categories is None:
        product_categories = ["clothing", "shoes", "electronics", "furniture", "books", "toys"]

    # Top-level boundary of the pipeline: any failure in a step is reported
    # through the returned dict instead of crashing the caller.
    try:
        # Step 1 - Load image if URL provided
        if isinstance(image_url_or_pil, str):
            image = load_image_from_url(image_url_or_pil)
        else:
            image = image_url_or_pil
        if image is None:
            return {"error": "Failed to load image", "status": "failed"}

        # Step 2 - Classify product category
        classification_results = classify_product_image(image, product_categories)
        if not classification_results:
            return {"error": "Failed to classify image", "status": "failed"}
        # Pick the best score explicitly rather than relying on result order.
        top_category = max(classification_results, key=lambda item: item["score"])
        category = top_category["label"]

        # Step 3 - Generate product description
        description = generate_product_caption(image)

        # Step 4 - Get category-specific questions and ask them
        qa_results = {
            question: ask_about_product(image, question)
            for question in get_category_questions(category)
        }

        # Step 5 - Compile results
        result = {
            "category": {"name": category, "confidence": top_category["score"]},
            "description": description,
            "attributes": qa_results,
            "status": "success",
        }

        print("\n✅ Product analysis complete!")
        return result

    except Exception as e:
        print(f"❌ Error during processing: {e}")
        return {"error": str(e), "status": "failed"}

# TEST: Complete product analysis
# Runs analyze_product() on `sample_url` (defined in the image-loading test
# cell) and prints the returned dict as-is.
print("🚀 TESTING: Complete product analysis pipeline...")
analysis_result = analyze_product(sample_url)
print("Complete Analysis Result:")
print(analysis_result)
print()

---