In [15]:
import torch
from PIL import Image
import clip
from transformers import BlipProcessor, BlipForConditionalGeneration

# -----------------------------
# 1. Load models
# -----------------------------
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load BLIP for image captioning
blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

# Load CLIP
clip_model, clip_preprocess = clip.load("ViT-B/32", device=device)

# -----------------------------
# 2. Load and process the image
# -----------------------------
image_path = "download (1).jpg" 
raw_image = Image.open(image_path).convert('RGB')

# BLIP captioning
blip_inputs = blip_processor(raw_image, return_tensors="pt").to(device)
caption_ids = blip_model.generate(**blip_inputs)
caption = blip_processor.decode(caption_ids[0], skip_special_tokens=True)

print(" Generated Caption:", caption)

# -----------------------------
# 3. Extract candidate keywords
# -----------------------------
# Simple method: use unique words (you can replace with spaCy or LLM)
stopwords = set(["a", "an", "the", "with", "and", "on", "of", "in", "this", "that", "it", "is", "to"])
words = list(set(caption.lower().split()))
candidates = [word.strip('.,') for word in words if word not in stopwords and len(word) > 2]

if not candidates:
    raise ValueError(" No brand candidates found. Try a clearer image or use spaCy/NER for better results.")

print(" Candidate Words:", candidates)

# -----------------------------
# 4. Use CLIP to find best-matching word
# -----------------------------
# Create prompts
prompts = [f"a product from {word}" for word in candidates]
text_tokens = clip.tokenize(prompts).to(device)
image_input = clip_preprocess(raw_image).unsqueeze(0).to(device)

# Encode with CLIP
with torch.no_grad():
    image_features = clip_model.encode_image(image_input)
    text_features = clip_model.encode_text(text_tokens)
    image_features /= image_features.norm(dim=-1, keepdim=True)
    text_features /= text_features.norm(dim=-1, keepdim=True)

    similarity = (image_features @ text_features.T).squeeze(0)
    best_idx = similarity.argmax().item()

# -----------------------------
# 5. Output result
# -----------------------------
print(f"\n🏷️ Predicted Brand: {candidates[best_idx]}")


 Generated Caption: louis vu mono cherry mini
 Candidate Words: ['mono', 'louis', 'mini', 'cherry']

🏷️ Predicted Brand: cherry


In [12]:
import torch
from PIL import Image
import clip
from transformers import BlipProcessor, BlipForConditionalGeneration

# -----------------------------
# 1. Load models
# -----------------------------
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load BLIP for image captioning
blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

# Load CLIP for similarity checking
clip_model, clip_preprocess = clip.load("ViT-B/32", device=device)

# -----------------------------
# 2. Load and process the image
# -----------------------------
image_path = "download4.jpg"  
raw_image = Image.open(image_path).convert('RGB')

# Generate caption using BLIP
blip_inputs = blip_processor(raw_image, return_tensors="pt").to(device)
caption_ids = blip_model.generate(**blip_inputs)
caption = blip_processor.decode(caption_ids[0], skip_special_tokens=True)

print("📝 Generated Caption:", caption)

# -----------------------------
# 3. Extract candidate words
# -----------------------------
stopwords = set([
    "a", "an", "the", "with", "and", "on", "of", "in", "this", "that", "it", "is", "to", "for", "from", "by"
])
words = list(set(caption.lower().split()))
candidates = [word.strip('.,') for word in words if word not in stopwords and len(word) > 2]

if not candidates:
    raise ValueError("No brand candidates found. Try a clearer image or improve captioning.")

print("🔍 Candidate Keywords:", candidates)

# -----------------------------
# 4. Use CLIP to score candidates
# -----------------------------
# Build prompts for CLIP
prompts = [f"a product from {word}" for word in candidates]
text_tokens = clip.tokenize(prompts).to(device)
image_input = clip_preprocess(raw_image).unsqueeze(0).to(device)

# Encode image and text using CLIP
with torch.no_grad():
    image_features = clip_model.encode_image(image_input)
    text_features = clip_model.encode_text(text_tokens)

    # Normalize features
    image_features /= image_features.norm(dim=-1, keepdim=True)
    text_features /= text_features.norm(dim=-1, keepdim=True)

    # Compute cosine similarity
    similarity = (image_features @ text_features.T).squeeze(0)
    best_idx = similarity.argmax().item()

# -----------------------------
# 5. Output best match
# -----------------------------
print(f"\n🏷️ Predicted Brand or Keyword: **{candidates[best_idx]}**")
print(f"💡 CLIP Prompt Used: '{prompts[best_idx]}'")


📝 Generated Caption: puer puer puer puer puer puer puer puer puer puer
🔍 Candidate Keywords: ['puer']

🏷️ Predicted Brand or Keyword: **puer**
💡 CLIP Prompt Used: 'a product from puer'


In [None]:
import torch
from PIL import Image
import clip
import spacy
from transformers import BlipProcessor, BlipForConditionalGeneration

# -----------------------------
# 1. Load models
# -----------------------------
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load BLIP model
blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

# Load CLIP model
clip_model, clip_preprocess = clip.load("ViT-B/32", device=device)


# Load spaCy NER model
nlp = spacy.load("en_core_web_sm")

# -----------------------------
# 2. Load and caption image
# -----------------------------
image_path = "download5.jpg"  
raw_image = Image.open(image_path).convert('RGB')

# Generate caption with BLIP
inputs = blip_processor(raw_image, return_tensors="pt").to(device)
caption_ids = blip_model.generate(**inputs)
caption = blip_processor.decode(caption_ids[0], skip_special_tokens=True)

print("Generated Caption:", caption)

# -----------------------------
# 3. Extract named entities using spaCy
# -----------------------------
doc = nlp(caption)
entities = [ent.text for ent in doc.ents if ent.label_ in ["ORG", "PRODUCT", "PERSON", "GPE"]]

if not entities:
    print("No named entities found. Falling back to keywords.")
    entities = list(set(caption.lower().split()))  # fallback to generic words

print("Candidate Brand Entities:", entities)



# -----------------------------
# # 4. Rerank using CLIP
# # -----------------------------
# prompts = [f"a product from {ent}" for ent in entities]
# text_tokens = clip.tokenize(prompts).to(device)
# image_input = clip_preprocess(raw_image).unsqueeze(0).to(device)

# # CLIP similarity
# with torch.no_grad():
#     image_features = clip_model.encode_image(image_input)
#     text_features = clip_model.encode_text(text_tokens)
#     image_features /= image_features.norm(dim=-1, keepdim=True)
#     text_features /= text_features.norm(dim=-1, keepdim=True)

#     similarity = (image_features @ text_features.T).squeeze(0)
#     best_idx = similarity.argmax().item()

# -----------------------------
# 5. Output
# -----------------------------
# print(f" Predicted Brand (NER + CLIP): {entities[best_idx]}")


Generated Caption: nike ky ky ky ky ky ky ky ky ky ky ky ky ky ky ky ky ky ky ky
No named entities found. Falling back to keywords.
Candidate Brand Entities: ['ky', 'nike']


In [23]:
print(entities[0])

nike


In [26]:
import torch
from PIL import Image
import spacy
from transformers import BlipProcessor, BlipForConditionalGeneration

# -----------------------------
# 1. Load models
# -----------------------------
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load BLIP model
blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

# Load spaCy NER model
nlp = spacy.load("en_core_web_sm")

# -----------------------------
# 2. Load and caption image
# -----------------------------
image_path = "images.jpg"  
raw_image = Image.open(image_path).convert('RGB')

# Generate caption with BLIP
inputs = blip_processor(raw_image, return_tensors="pt").to(device)
caption_ids = blip_model.generate(**inputs)
caption = blip_processor.decode(caption_ids[0], skip_special_tokens=True)

print("Generated Caption:", caption)

# -----------------------------
# 3. Extract named entities using spaCy
# -----------------------------
doc = nlp(caption)
entities = [ent.text for ent in doc.ents if ent.label_ in ["ORG", "PRODUCT", "PERSON", "GPE"]]

if not entities:
    print("No named entities found. Falling back to keywords.")
    entities = list(set(caption.lower().split()))  # fallback to generic words

print("Candidate Brand Entities:", entities)

print(entities[0])


Generated Caption: nike zoom zoom low gs
No named entities found. Falling back to keywords.
Candidate Brand Entities: ['nike', 'zoom', 'gs', 'low']
nike


In [28]:
import torch
from PIL import Image
import spacy
from transformers import BlipProcessor, BlipForConditionalGeneration

# -----------------------------
# 1. Load models
# -----------------------------
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load BLIP model
blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

# Load spaCy NER model
nlp = spacy.load("en_core_web_sm")

# -----------------------------
# 2. Load and caption image
# -----------------------------
image_path = "images.jpg"  
raw_image = Image.open(image_path).convert('RGB')

# Generate caption with BLIP
inputs = blip_processor(raw_image, return_tensors="pt").to(device)
caption_ids = blip_model.generate(**inputs)
caption = blip_processor.decode(caption_ids[0], skip_special_tokens=True).strip()

print("Generated Caption:", caption)

# Add punctuation to help spaCy NER (optional but helpful)
if not caption.endswith('.'):
    caption += '.'

# -----------------------------
# 3. Extract named entities using spaCy
# -----------------------------
doc = nlp(caption)
entities = [ent.text for ent in doc.ents if ent.label_ in ["ORG", "PRODUCT", "PERSON", "GPE"]]

# -----------------------------
# 4. Fallback if no entities found
# -----------------------------
if not entities:
    print("No named entities found. Falling back to keyword-based heuristic.")

    # Tokenize caption
    tokens = caption.lower().replace('.', '').split()

    # Optional: remove common meaningless words
    ignored_words = {'the', 'a', 'an', 'is', 'low', 'gs', 'shoe', 'shoes', 'color', 'white', 'black'}
    filtered_tokens = [t for t in tokens if t not in ignored_words and len(t) > 1]

    # Frequency-based fallback
    if filtered_tokens:
        token_freq = {t: filtered_tokens.count(t) for t in set(filtered_tokens)}
        sorted_tokens = sorted(token_freq.items(), key=lambda x: (-x[1], caption.lower().find(x[0])))
        fallback_brand = sorted_tokens[0][0]
        entities = [fallback_brand]
    else:
        print("No valid tokens found for fallback.")
        entities = []

# -----------------------------
# 5. Print results
# -----------------------------
if entities:
    print("Candidate Brand Entities:", entities)
    print("Predicted Brand Name:", entities[0])
else:
    print("Brand name could not be identified.")


Generated Caption: nike zoom zoom low gs
No named entities found. Falling back to keyword-based heuristic.
Candidate Brand Entities: ['zoom']
Predicted Brand Name: zoom


In [30]:
import torch
from PIL import Image
import spacy
from transformers import BlipProcessor, BlipForConditionalGeneration
import re
from spacy.lang.en.stop_words import STOP_WORDS

# Optional: Add known brand names for matching (can be extended)
known_brands = [
    "nike", "adidas", "puma", "reebok", "new balance", "asics", "under armour",
    "vans", "converse", "fila", "skechers", "jordans", "balenciaga", "gucci"
]

# -----------------------------
# 1. Load models
# -----------------------------
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load BLIP model
blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

# Load spaCy NER
nlp = spacy.load("en_core_web_sm")

# -----------------------------
# 2. Load and caption image
# -----------------------------
image_path = "download (1).jpg"
raw_image = Image.open(image_path).convert("RGB")

# Generate caption
inputs = blip_processor(raw_image, return_tensors="pt").to(device)
caption_ids = blip_model.generate(**inputs)
caption = blip_processor.decode(caption_ids[0], skip_special_tokens=True)

print("Generated Caption:", caption)

# -----------------------------
# 3. Preprocess caption text
# -----------------------------
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
    return text

clean_caption = clean_text(caption)
caption_tokens = [word for word in clean_caption.split() if word not in STOP_WORDS]

# -----------------------------
# 4. Named Entity Recognition
# -----------------------------
doc = nlp(caption)
entities = [ent.text.lower() for ent in doc.ents if ent.label_ in ["ORG", "PRODUCT", "GPE", "PERSON"]]

# Combine NER and cleaned tokens
candidate_entities = list(set(entities + caption_tokens))

# -----------------------------
# 5. Match against known brand list
# -----------------------------
matched_brands = [word for word in candidate_entities if word in known_brands]

# -----------------------------
# 6. Output result
# -----------------------------
print("Cleaned Tokens:", caption_tokens)
print("NER Entities:", entities)
print("Candidate Brand Entities:", candidate_entities)

if matched_brands:
    print(f"Predicted Brand (matched): {matched_brands[0]}")
elif candidate_entities:
    print(f"Predicted Brand (top candidate): {candidate_entities[0]}")
else:
    print("Predicted Brand: Unknown")


Generated Caption: louis vu mono cherry mini
Cleaned Tokens: ['louis', 'vu', 'mono', 'cherry', 'mini']
NER Entities: ['louis vu mono']
Candidate Brand Entities: ['mono', 'louis', 'vu', 'louis vu mono', 'mini', 'cherry']
Predicted Brand (top candidate): mono


In [34]:
import torch
from PIL import Image
import spacy
from transformers import BlipProcessor, BlipForConditionalGeneration
import re
from spacy.lang.en.stop_words import STOP_WORDS

# -----------------------------
# 1. Load models
# -----------------------------
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load BLIP model for image captioning
blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

# Load spaCy NER model
nlp = spacy.load("en_core_web_sm")

# -----------------------------
# 2. Load and caption image
# -----------------------------
image_path = "download4.jpg"
raw_image = Image.open(image_path).convert("RGB")

inputs = blip_processor(raw_image, return_tensors="pt").to(device)
caption_ids = blip_model.generate(**inputs)
caption = blip_processor.decode(caption_ids[0], skip_special_tokens=True)

print("Generated Caption:", caption)

# -----------------------------
# 3. Preprocess caption
# -----------------------------
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
    return text

clean_caption = clean_text(caption)
tokens = [word for word in clean_caption.split() if word not in STOP_WORDS]

# -----------------------------
# 4. Named Entity Recognition
# -----------------------------
doc = nlp(caption)
ner_entities = [ent.text.lower() for ent in doc.ents if ent.label_ in ["ORG", "PRODUCT", "GPE", "PERSON"]]

# -----------------------------
# 5. Merge + Filter Duplicates
# -----------------------------
candidates = list(dict.fromkeys(ner_entities + tokens))  # preserves order, removes duplicates

# -----------------------------
# 6. Select Best Candidate
# -----------------------------
if ner_entities:
    predicted_brand = ner_entities[0]  # prioritize named entity
    source = "NER"
elif candidates:
    predicted_brand = candidates[0]  # fallback to clean keyword
    source = "Heuristic fallback"
else:
    predicted_brand = "Unknown"
    source = "None"

# -----------------------------
# 7. Output
# -----------------------------
print("NER Entities:", ner_entities)
print("Caption Tokens:", tokens)
print("Candidate Brand Entities:", candidates)
print(f"Predicted Brand ({source}): {predicted_brand}")


Generated Caption: puer puer puer puer puer puer puer puer puer puer
NER Entities: ['puer puer', 'puer puer', 'puer puer', 'puer puer', 'puer puer']
Caption Tokens: ['puer', 'puer', 'puer', 'puer', 'puer', 'puer', 'puer', 'puer', 'puer', 'puer']
Candidate Brand Entities: ['puer puer', 'puer']
Predicted Brand (NER): puer puer


In [2]:
import torch
from PIL import Image
import spacy
import re
from transformers import Blip2Processor, Blip2ForConditionalGeneration
from spacy.lang.en.stop_words import STOP_WORDS


In [2]:
# -----------------------------
# 1. Load Models
# -----------------------------
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load BLIP-2 (OPT version - not instruction-tuned)
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b").to(device)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [16]:

# Load spaCy English model for NER
nlp = spacy.load("en_core_web_sm")

# -----------------------------
# 2. Load and Caption Image
# -----------------------------
image_path = "download12.jpg"  # Replace with your image path
image = Image.open(image_path).convert("RGB")

# No prompt — BLIP-2 OPT works as a caption generator here
inputs = processor(images=image, return_tensors="pt").to(device)
out = model.generate(**inputs, max_new_tokens=30)
caption = processor.decode(out[0], skip_special_tokens=True)

print("Generated Caption:", caption)

# -----------------------------
# 3. Clean Caption & Tokenize
# -----------------------------
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text

clean_caption = clean_text(caption)
tokens = [word for word in clean_caption.split() if word not in STOP_WORDS]

# -----------------------------
# 4. Named Entity Recognition
# -----------------------------
doc = nlp(caption)
ner_entities = [ent.text.lower() for ent in doc.ents if ent.label_ in ["ORG", "PRODUCT", "GPE", "PERSON"]]

# -----------------------------
# 5. Combine & Select Brand
# -----------------------------
candidates = list(dict.fromkeys(ner_entities + tokens))  # Deduplicate, keep order

if ner_entities:
    predicted_brand = ner_entities[0]  # Prefer NER entity
    source = "NER"
elif candidates:
    predicted_brand = candidates[0]    # Fallback to token
    source = "Heuristic"
else:
    predicted_brand = "Unknown"
    source = "None"

# -----------------------------
# 6. Output Results
# -----------------------------
print("NER Entities:", ner_entities)
print("Tokens:", tokens)
print("Candidates:", candidates)
print(f"Predicted Brand ({source}): {predicted_brand}")


Generated Caption: the women's reebok crossfit running shoe is black and pink

NER Entities: []
Tokens: ['womens', 'reebok', 'crossfit', 'running', 'shoe', 'black', 'pink']
Candidates: ['womens', 'reebok', 'crossfit', 'running', 'shoe', 'black', 'pink']
Predicted Brand (Heuristic): womens


In [24]:
# -----------------------------
# 2. Load Image & Prompt
# -----------------------------
image_path = "download13.jpg"
image = Image.open(image_path).convert("RGB")
prompt = "Question: What brand is this product? Answer:"

# -----------------------------
# 3. Generate Caption (Brand Name)
# -----------------------------
inputs = processor(images=image, text=prompt, return_tensors="pt").to(device)
output = model.generate(**inputs, max_new_tokens=30)
caption = processor.decode(output[0], skip_special_tokens=True)

# -----------------------------
# 4. Display Result
# -----------------------------
print("Predicted Brand:", caption.strip())


Predicted Brand: Question: What brand is this product? Answer: louis vuitton


In [27]:
mage = Image.open("download13.jpg")
prompt = "Question: What brand is this product? Answer:"

inputs = processor(images=image, text=prompt, return_tensors="pt").to(device)
output = model.generate(**inputs, max_new_tokens=10)
caption = processor.decode(output[0], skip_special_tokens=True)

# Extract only brand name if prompt is repeated in output
brand = caption.split("Answer:")[-1].strip() if "Answer:" in caption else caption.strip()

print("Predicted Brand:", brand)


Predicted Brand: louis vuitton
