In [None]:
import os
from pymongo import MongoClient
from dotenv import load_dotenv

# Load settings such as MONGO_URI from a local .env file into the process environment.
load_dotenv()

mongo_uri = os.getenv("MONGO_URI")
if not mongo_uri:
    # MongoClient(None) would silently fall back to its default host instead of
    # failing, so a missing setting is reported explicitly here.
    raise RuntimeError("MONGO_URI is not set; define it in the environment or in .env")

client = MongoClient(mongo_uri)
db = client["product_db"]
collection = db["products"]

# Smoke test: upsert (rather than insert) the sample document so that re-running
# this cell does not pile up duplicate "red shoe" entries.
sample_product = {"name": "red shoe", "path": "data/red_shoe.jpg"}
collection.update_one(
    {"name": sample_product["name"]},
    {"$set": sample_product},
    upsert=True,
)

# Show only a handful of documents instead of dumping the whole collection.
print(list(collection.find().limit(5)))


In [None]:
from transformers import CLIPProcessor

# Load the pretrained CLIP processor for the "openai/clip-vit-base-patch32" checkpoint.
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Inspect the processor object: print the names of all its attributes and methods.
print(dir(processor))

In [None]:
from groundingdino.util.inference import load_model, load_image, predict, annotate
import cv2
from dotenv import load_dotenv
import os

load_dotenv()

# NOTE(review): the variable name is spelled "GROUDNINGDINO" (sic). It is kept as-is
# because it has to match whatever key the .env file uses -- confirm and rename both together.
config_rel_path = os.getenv("GROUDNINGDINO_CONFIG_PATH")
if not config_rel_path:
    # Without this guard os.path.join("../", None) fails with an unhelpful TypeError.
    raise RuntimeError("GROUDNINGDINO_CONFIG_PATH is not set; define it in the environment or in .env")

# The config path from the environment is resolved relative to the parent directory.
model = load_model(os.path.join("../", config_rel_path), "../weights/groundingdino_swint_ogc.pth")

IMAGE_PATH = "cat_dog.jpeg"
# Text prompt passed as the caption; the categories are written separated by " . ".
TEXT_PROMPT = "chair . person . dog ."
BOX_TRESHOLD = 0.35
TEXT_TRESHOLD = 0.25

image_source, image = load_image(IMAGE_PATH)

boxes, logits, phrases = predict(
    model=model,
    image=image,
    caption=TEXT_PROMPT,
    box_threshold=BOX_TRESHOLD,
    text_threshold=TEXT_TRESHOLD
)

annotated_frame = annotate(image_source=image_source, boxes=boxes, logits=logits, phrases=phrases)

# cv2.imwrite reports failure through its return value instead of raising,
# so check it to avoid silently losing the annotated image.
if not cv2.imwrite("annotated_image.jpg", annotated_frame):
    raise IOError("Failed to write annotated_image.jpg")

print(config_rel_path)

In [None]:
import numpy as np
import torch
import cv2
from segment_anything import sam_model_registry, SamPredictor

# Choose the SAM model variant.
model_type = "vit_b"

# Checkpoint file for each supported variant (approximate file size in the comment).
SAM_CHECKPOINTS = {
    "vit_h": "../weights/sam_vit_h_4b8939.pth",  # 2.4gb
    "vit_l": "../weights/sam_vit_l_0b3195.pth",  # 1.2gb
    "vit_b": "../weights/sam_vit_b_01ec64.pth",  # 358mb
}
if model_type not in SAM_CHECKPOINTS:
    # Fail with a clear message instead of a NameError on an undefined sam_checkpoint.
    raise ValueError(f"Unsupported SAM model_type {model_type!r}; expected one of {sorted(SAM_CHECKPOINTS)}")
sam_checkpoint = SAM_CHECKPOINTS[model_type]

device = "cuda" if torch.cuda.is_available() else "cpu"
sam = sam_model_registry[model_type](checkpoint=sam_checkpoint)
sam.to(device=device)

predictor = SamPredictor(sam)

# Load the test image. cv2.imread returns None (no exception) when the file
# cannot be read, so check before converting the colour space.
image_bgr = cv2.imread("cat_dog.jpeg")
if image_bgr is None:
    raise FileNotFoundError("Could not read image 'cat_dog.jpeg'")
image = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)
predictor.set_image(image)


# Box format: [x1, y1, x2, y2]
box = np.array([[50, 50, 300, 300]])   # shape (1,4)
masks, scores, logits = predictor.predict(
    box=box,
    multimask_output=True
)

print("Masks shape:", masks.shape)


In [None]:
from transformers import CLIPModel, CLIPProcessor
import torch
from PIL import Image
# Run on the GPU when torch can see one, otherwise stay on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model_name = "openai/clip-vit-base-patch32"

# Load the CLIP checkpoint from safetensors weights, requesting bfloat16 and
# SDPA attention, then move the model to the selected device.
model = CLIPModel.from_pretrained(
    model_name,
    dtype=torch.bfloat16,
    attn_implementation="sdpa",
    use_safetensors=True,
).to(device)

In [None]:
# Load the processor that matches the checkpoint named by `model_name`.
processor = CLIPProcessor.from_pretrained(model_name,  use_safetensors=True)

In [None]:
# Tokenize a text-only prompt. return_tensors="pt" asks the processor for PyTorch
# tensors; without it the result holds plain Python lists that the model cannot consume.
inputs = processor(text="a photo of cat", images=None, return_tensors="pt")

In [None]:
# Display the processor output for the text prompt.
inputs

In [None]:
# Reload the same CLIP checkpoint without a dtype override (SDPA attention,
# safetensors weights) and place it on `device`; this rebinds `model`.
model = CLIPModel.from_pretrained(
    "openai/clip-vit-base-patch32",
    attn_implementation="sdpa",
    use_safetensors=True,
).to(device)

In [None]:
# The joint CLIPModel forward pass needs pixel_values as well as input_ids, so a
# text-only call raises. Use the text tower via get_text_features instead.
# atleast_2d + as_tensor accept both a plain token-id list and a batched tensor.
text_input_ids = torch.atleast_2d(torch.as_tensor(inputs["input_ids"])).to(device)
with torch.no_grad():
    outputs = model.get_text_features(input_ids=text_input_ids)

In [None]:
# help(CLIPProcessor.__call__)
# help(CLIPModel.__call__)

In [None]:
import inspect

# Print the __call__ signature of the processor class, then of the model class.
for hf_class in (CLIPProcessor, CLIPModel):
    print(inspect.signature(hf_class.__call__))


In [1]:
import sys
import os

# Make the modules under ../src/models importable from this notebook.
# Guarded so that re-running the cell does not append duplicate entries to sys.path.
models_dir = os.path.abspath("../src/models")
if models_dir not in sys.path:
    sys.path.append(models_dir)

from clip_encoder import CLIPEncoder


In [2]:
import torch
# Prefer the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Build the project's CLIP encoder wrapper on the chosen device.
ce = CLIPEncoder(device=device)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

In [13]:
from PIL import Image

# Open the test image and force it to 3-channel RGB.
img = Image.open("cat_dog.jpeg").convert("RGB")
# Prompt string; it is not passed to encode() below but is reused by a later cell.
text = "a photo of a cat"

# Encode the image only: text=None is passed, so only the image is given to the encoder.
# NOTE(review): if a text embedding was intended here, pass text=text -- confirm.
text_embed, img_embed = ce.encode(text=None, image=img)

In [15]:
# Display the text embedding next to the shape of the image embedding.
text_embed, img_embed.shape

(None, (1, 512))

In [None]:
from transformers import CLIPModel, CLIPProcessor
import torch
from PIL import Image
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Request the fast image processor explicitly (use_fast=True).
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32", use_safetensors=True, use_fast=True)

# Use CLIPModel, which this cell imports; the original referenced AutoModel, which is
# not imported here and would raise NameError on a fresh kernel.
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32", attn_implementation="sdpa", use_safetensors=True).to(device)

In [None]:
# Pull the two components out of the combined processor so they can be used separately.
image_processor, tokenizer = processor.image_processor, processor.tokenizer

In [None]:
# `img_embeded` was never assigned anywhere in the notebook, so build it here:
# preprocess the PIL image into PyTorch tensors and move them to the model's device.
img_embeded = image_processor(images=img, return_tensors="pt").to(device)

# Confirm which device the pixel tensor lives on.
img_embeded.pixel_values.device

In [None]:
# Run the preprocessed image through the model's image-feature path and show the
# shape of the result. Expects `img_embeded` to already exist in the kernel.
model.get_image_features(**img_embeded).shape


In [None]:
# Tokenize `text` (assigned in an earlier cell) into padded PyTorch tensors on `device`.
text_tokenize = tokenizer(text, return_tensors='pt', padding=True).to(device)

In [None]:
# Compute text features, detach them, copy to CPU as a NumPy array, and show its shape.
model.get_text_features(**text_tokenize).detach().cpu().numpy().shape


In [None]:
import requests
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModel

# Load CLIP (bfloat16, SDPA attention) and its matching processor.
model = AutoModel.from_pretrained("openai/clip-vit-base-patch32", dtype=torch.bfloat16, attn_implementation="sdpa", use_safetensors=True)
processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32", use_safetensors=True)

# Download the sample image. The timeout keeps the cell from hanging forever, and
# raise_for_status surfaces HTTP errors instead of handing PIL an error page.
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw)
labels = ["a photo of a cat", "a photo of a dog", "a photo of a car"]

# Pass the image along with the candidate labels: logits_per_image below needs
# pixel values, and the original call passed images=None.
inputs = processor(text=labels, images=image, return_tensors="pt", padding=True)

# Inference only, so skip gradient tracking.
with torch.no_grad():
    outputs = model(**inputs)

# Softmax over the label dimension turns image-text similarity logits into probabilities.
logits_per_image = outputs.logits_per_image
probs = logits_per_image.softmax(dim=1)
most_likely_idx = probs.argmax(dim=1).item()
most_likely_label = labels[most_likely_idx]
print(f"Most likely label: {most_likely_label} with probability: {probs[0][most_likely_idx].item():.3f}")

In [17]:
from transformers import CLIPModel

# Load the CLIP checkpoint and print the `projection_dim` value from its config.
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32", use_safetensors=True)
print(model.config.projection_dim)


512
