# Clip Classifier
___
Sep 10, 2024

In [1]:
import os
import torch
import torch.nn.functional as F
from loguru import logger
from open_clip import create_model_from_pretrained, get_tokenizer
from PIL import Image

from dotenv import load_dotenv



In [2]:
# Configuration is hard-coded for this notebook run; the commented-out calls
# show the environment-driven alternative.
# load_dotenv()

device = torch.device("cuda:0") # os.getenv("device")
model = "hf-hub:timm/ViT-SO400M-14-SigLIP" # os.getenv("model")



# The same structure can be loaded from disk instead of being inlined:
# with open('data.json', 'r') as f:
#     data = json.load(f)

# Static configuration for the classifier and the surrounding pipeline.
#   "indoor" / "outdoor": scene phrases (not read by any code visible in this
#       notebook; presumably prompt fragments for a later step -- TODO confirm).
#   "heuristics": one two-number entry per furniture type; what the two
#       numbers mean is not visible in this notebook.
#   "clip_data": the label lists handed to ClipClassfier.
# Every furniture type appears exactly once: a repeated dict key is silently
# collapsed by Python, and a repeated label only adds a redundant prompt.
data = {
    "indoor": [
        "in a living room",
        "in a bathroom",
        "in the kitchen in the Art Nouveau style",
        "in front of a large bay window in a library with a stained-glass ceiling",
        "in a majestic library with high ceilings and ornate chandeliers",
        "in a cozy library with a fireplace",
        "in a cozy dining room, with a warm fireplace",
        "in a country-style dining room with a view of a garden"
    ],
    "outdoor": [
        "next to the house on the grass",
        "in the middle of the barbecue area",
        "on the outdoor terrace",
        "placed in a gazebo adorned with climbing vines",
        "in a charming gazebo, bathed in warm sunlight",
        "under a blooming cherry tree in a lush garden",
        "on a sun-drenched patio surrounded by colorful flowers"
    ],
    "heuristics": {
        "chair": [
            0.55,
            1.0
        ],
        "table": [
            0.55,
            1.0
        ],
        "sofa": [
            0.55,
            1.0
        ],
        "bed": [
            0.55,
            1.0
        ],
        "lamp": [
            0.55,
            1.0
        ],
        "simple chandelier": [
            0.45,
            0.0
        ],
        "modern chandelier": [
            0.45,
            0.0
        ],
        "vintage chandelier": [
            0.45,
            0.0
        ],
        "lounge chair": [
            0.55,
            1.0
        ],
        "beach umbrella": [
            0.55,
            0.5
        ],
        "swimming pool": [
            0.55,
            1.0
        ],
        "beach lounger": [
            0.55,
            1.0
        ],
        "swing": [
            0.55,
            1.0
        ],
        "hammock": [
            0.55,
            1.0
        ],
        "hanging chair": [
            0.55,
            1.0
        ],
        "gazebo": [
            0.55,
            1.0
        ],
        "grill": [
            0.55,
            1.0
        ],
        "heating radiator": [
            0.55,
            1.0
        ]
    },
    "clip_data": {
        "furniture_types": [
            "chair",
            "table",
            "sofa",
            "bed",
            "lamp",
            "simple chandelier",
            "modern chandelier",
            "vintage chandelier",
            "lounge chair",
            "beach umbrella",
            "swimming pool",
            "beach lounger",
            "swing",
            "hammock",
            "hanging chair",
            "gazebo",
            "grill",
            "heating radiator"
        ],
        # NOTE(review): "lightning" looks like a typo for "lighting" (lamps,
        # chandeliers). It is left unchanged because the string is returned to
        # callers as the category label -- confirm before renaming.
        "furniture_categories": [
            "garden furniture",
            "house furniture",
            "lightning"
        ]
    }
}

In [3]:
class ClipClassfier:
    """Zero-shot furniture classifier built on an open_clip image/text model.

    For one image it picks the best-matching furniture type, whether the item
    belongs indoors or outdoors, and a coarse furniture category. Each choice
    is made by scoring the image against a list of text prompts.
    """

    def __init__(self, device, model, data: dict[str, list[str]]):
        """Load the pretrained model and tokenizer and place the model on ``device``.

        Args:
            device: Target device (``torch.device`` or a string such as
                ``"cuda:0"``). ``None`` selects the CPU. If a CUDA device is
                requested but CUDA is unavailable, the CPU is used and a
                warning is logged.
            model: Model identifier passed to ``create_model_from_pretrained``
                and ``get_tokenizer``.
            data: Mapping that provides the ``"furniture_types"`` and
                ``"furniture_categories"`` label lists.
        """
        target = torch.device(device) if device is not None else torch.device("cpu")
        if target.type == "cuda" and not torch.cuda.is_available():
            # Keep working on CPU-only machines instead of failing in `.to()`.
            logger.warning(f"CUDA is not available; running {model} on CPU instead of {target}")
            target = torch.device("cpu")
        self.device = target

        self.furniture_types = data["furniture_types"]
        self.furniture_categories = data["furniture_categories"]
        self.cat_types = ["indoor", "outdoor"]

        self.model, self.preprocess = create_model_from_pretrained(model)
        # Inference only: move the weights to the target device and switch off
        # train-time behaviour.
        self.model = self.model.to(self.device).eval()
        self.tokenizer = get_tokenizer(model)

    def describe_image(self, img):
        """Classify ``img`` and return the three predicted labels.

        Args:
            img: Image accepted by the model's preprocessing transform
                (a PIL image in the notebook).

        Returns:
            dict with keys ``"furniture"``, ``"category"`` (``"indoor"`` or
            ``"outdoor"``) and ``"furniture_category"``.
        """
        image = self.preprocess(img).unsqueeze(0).to(self.device)
        # Encode the image once and reuse the features for all three prompt sets.
        image_features = self.__encode_image(image)

        descriptions = [f"photo of {item}" for item in self.furniture_types]
        furniture_index = self.__get_clip_prediction(image, descriptions, image_features)
        furniture_description = descriptions[furniture_index]
        furniture_guess = self.furniture_types[furniture_index]

        # The placement prompts reuse the winning furniture prompt,
        # e.g. "indoor photo of chair" vs "outdoor photo of chair".
        category_list = [f"{cat} {furniture_description}" for cat in self.cat_types]
        cat_index = self.__get_clip_prediction(image, category_list, image_features)
        category_guess = self.cat_types[cat_index]

        furniture_cat_prompts = [f"photo of {item}" for item in self.furniture_categories]
        furniture_cat_index = self.__get_clip_prediction(image, furniture_cat_prompts, image_features)
        furniture_cat_guess = self.furniture_categories[furniture_cat_index]

        logger.info(f"Provided item classified as {furniture_guess} which must placed {category_guess}")
        return {"furniture": furniture_guess, "category": category_guess, "furniture_category": furniture_cat_guess}

    def __autocast(self):
        """Return an autocast context that is active only when running on CUDA."""
        return torch.autocast(device_type="cuda", enabled=self.device.type == "cuda")

    def __encode_image(self, image):
        """Return L2-normalised image features for a preprocessed image batch."""
        with torch.no_grad(), self.__autocast():
            return F.normalize(self.model.encode_image(image.to(self.device)), dim=-1)

    def __get_clip_prediction(self, image, item_list, image_features=None):
        """Return the index of the prompt in ``item_list`` that best matches the image.

        Args:
            image: Preprocessed image batch; only the first item is scored.
            item_list: Non-empty list of text prompts.
            image_features: Optional precomputed, normalised image features.
                When given, ``image`` is not encoded again.

        Raises:
            ValueError: If ``item_list`` is empty.
        """
        if not item_list:
            raise ValueError("item_list must contain at least one prompt")
        if image_features is None:
            image_features = self.__encode_image(image)

        text = self.tokenizer(item_list, context_length=self.model.context_length).to(self.device)
        with torch.no_grad(), self.__autocast():
            text_features = F.normalize(self.model.encode_text(text), dim=-1)
            logits = image_features @ text_features.T * self.model.logit_scale.exp()
            # Some models expose no logit bias (None); skip the offset for them.
            logit_bias = getattr(self.model, "logit_bias", None)
            if logit_bias is not None:
                logits = logits + logit_bias
            text_probs = torch.sigmoid(logits)

        return text_probs[0].argmax().item()

In [5]:
# Build the classifier from the CLIP label lists only; the rest of `data`
# is not passed to it.
preprocessor = ClipClassfier(device, model, data["clip_data"])

open_clip_pytorch_model.bin:   0%|          | 0.00/3.51G [00:00<?, ?B/s]

open_clip_config.json:   0%|          | 0.00/917 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/20.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [6]:
# NOTE(review): `img_path` is not defined in the visible cells; set it to the
# image file to classify before running this cell.
with Image.open(img_path) as img:
    # copy() loads the pixel data, so raw_img stays usable once the file is closed.
    raw_img = img.copy()

# `self.clip_clf` only exists inside the class this line was copied from; in
# the notebook the classifier instance is `preprocessor`.
item_description = preprocessor.describe_image(raw_img)

<__main__.ClipClassfier at 0x7f7b8a26ff40>