In [1]:
import pandas as pd
import os, sys, importlib
sys.path.append(os.path.abspath(".."))
from utils import images_clf
import json

## TEST : autoclf

## labels2emb

In [None]:
## labels_text (prompt):
# Text prompts for zero-shot classification. Each attribute maps a label key
# to the natural-language description that gets embedded for that label.
# Insertion order is significant: the prediction cell turns an argmax row index
# back into a label via list(labels_text[category].keys()).

type_labels = {
    "life": "a photo of a person in a visible daily scene or some activities",
    "pro": "a portrait or headshot,focused mainly on the face, with little or no background information.",
    "UNK": "an image without any people or cannot determine whether it is lifestyle or portrait",
}
# Earlier, more verbose wording of the "type" prompts, kept for reference:
# type_labels = {
#     "life": "a person shown in a real-life scene or activity, with visible environment or lifestyle context.",
#     "pro": "a clean portrait or headshot focused mainly on the face, with little or no background information.",
#     "UNK": "no visible person, or not enough information to determine lifestyle vs portrait."
# }

quality_labels = {
    "high": "a clear, high-quality photo with good lighting and sharp details",
    "low": "a low-quality photo with blur, noise, poor lighting or distortion",
    "UNK": "quality cannot be determined",
}

is_smiling_labels = {
    "1": "a person smiling visibly",
    "0": "a person not smiling",
    "UNK": "no person or cannot see their face",
}

sex_labels = {
    "M": "a photo of a man",
    "F": "a photo of a woman",
    "MIX": "a photo with multiple people of mixed gender",
    "UNK": "the gender of the person cannot be determined or no person present",
}

has_person_labels = {
    "1": "a photo that contains one or more people",
    "0": "a photo without any people",
}

# NOTE(review): the original carried an open question mark on this prompt. The
# prediction cell does not use its text embedding for "is_default_pic"; it
# compares each image against a reference image embedding instead.
is_default_pic_labels = {
    "1": "the official Airbnb default profile picture, a gray geometric human silhouette",
    "0": "a normal user-uploaded profile picture",
}

# Attribute name -> {label key -> prompt}, in the order the attributes are embedded.
labels_text = dict(
    type=type_labels,
    quality=quality_labels,
    is_smiling=is_smiling_labels,
    sex=sex_labels,
    has_person=has_person_labels,
    is_default_pic=is_default_pic_labels,
)

# with open ("labels/labels_text.json","w", encoding='utf-8') as f :
#     json.dump(labels_text, f, indent=2)


In [2]:
# embed labels :
import torch
import numpy as np
from transformers import CLIPProcessor, CLIPModel
# Run on the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the pretrained CLIP checkpoint and move it to `device`, then load the
# matching processor (used further down to prepare the text prompts).
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
model = model.to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

  from .autonotebook import tqdm as notebook_tqdm
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [None]:

# 1) Load the label prompts from the JSON file.
# The encoding is pinned to UTF-8 (the same encoding the commented-out writer
# in the prompt cell uses) so the read does not depend on the platform's
# default locale encoding.
with open("labels/labels_text.json", "r", encoding="utf-8") as f:
    labels_text = json.load(f)

def embed_text_by_clip(text_list):
    """
    Embed text prompts with the CLIP model and scale each vector to unit length.

    text_list: list of strings. Only the descriptions are embedded; the label
        keys they belong to are not part of the input.
    return: float32 np.array of shape (len(text_list), embedding_dim), each row
        divided by its own L2 norm.

    Relies on the notebook-level `processor`, `model` and `device`.
    """
    # Tokenise the whole batch at once, padding to a common length.
    tokens = processor(text=text_list, return_tensors="pt", padding=True).to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        features = model.get_text_features(**tokens)  # (N, 512) for this checkpoint
        unit_features = features / features.norm(dim=-1, keepdim=True)

    return unit_features.cpu().numpy().astype("float32")

# 2) Build the embeddings: one (num_classes, 512) matrix per category, with
# rows in the same order as that category's prompts in labels_text,
# e.g. ["life prompt", "pro prompt", "UNK prompt"].
labels_emb = {
    category: embed_text_by_clip(list(prompts.values()))
    for category, prompts in labels_text.items()
}


# # 3) 保存到单个 npz 文件
# np.savez("labels/labels_emb.npz", **labels_emb)
# print("[SUCCES] Saved text embeddings → labels/labels_emb.npz")


[SUCCES] Saved text embeddings → labels/labels_emb.npz


In [3]:
# Sanity check: reload the saved label embeddings and inspect one of them.
# np.load on an .npz returns a lazily-read archive object that keeps the file
# open, so it is used as a context manager to close it once the arrays are read.
with np.load("labels/labels_emb.npz") as emb:
    type_emb = emb["type"]        # (3, 512)
    quality_emb = emb["quality"]  # (3, 512)
    sex_emb = emb["sex"]          # (4, 512)
print(type_emb.shape)
# Suppose there is an image embedding img_emb of shape (1, 512):

# import numpy as np
# pred_idx = np.argmax(np.dot(img_emb, type_emb.T))
# pred_label = list(labels_text["type"].keys())[pred_idx]
# print(pred_label)


(3, 512)


In [4]:
# Prediction:
def zero_shot_predict(image_emb, text_emb_dict):
    """
    Pick the label whose text embedding scores highest against an image embedding.

    image_emb: shape (512,)
    text_emb_dict: {"life": (512,), "pro": (512,), ...}
    return: (best_label, scores), where scores holds one value per label in the
        dict's iteration order.

    NOTE(review): a later cell defines another `zero_shot_predict` that takes a
    (num_classes, 512) array and returns an index rather than a label; once
    that cell runs it replaces this definition.
    """
    label_names = []
    label_vectors = []
    for name, vector in text_emb_dict.items():
        label_names.append(name)
        label_vectors.append(vector)
    text_matrix = np.stack(label_vectors)  # (K, 512)

    # Dot products; these equal cosine similarities when both sides are
    # L2-normalised.
    scores = np.matmul(image_emb, text_matrix.T)
    return label_names[np.argmax(scores)], scores

In [None]:
import numpy as np
import json
import os
from tqdm import tqdm
import time
start_time = time.time()  # wall-clock start; the elapsed time is reported at the end of the cell


# ------------------------
# File paths and settings
# ------------------------
image_emb_path = "embeddings_SAMPLE/emb_SAMPLE.npz"
text_emb_path = "labels/labels_emb.npz"
text_json_path = "labels/labels_text.json"
output_json_path = "annotations_SAMPLE/autoclf_predictions.json"
# Archive key of the image whose embedding serves as the reference for the
# default profile picture.
default_pic_key = "336591839.jpg"

# ------------------------
# Load embeddings
# ------------------------
image_embs = np.load(image_emb_path)  # keys: "host_id.jpg"
text_embs_np = np.load(text_emb_path)
# UTF-8 is pinned so the read does not depend on the platform default encoding.
with open(text_json_path, "r", encoding="utf-8") as f:
    labels_text = json.load(f)

# Fail early, with the file and key spelled out, if the reference image is not
# part of this embedding archive.
if default_pic_key not in image_embs.files:
    raise KeyError(
        f"reference default picture {default_pic_key!r} not found in {image_emb_path}"
    )
default_pic_emb = image_embs[default_pic_key]


# ------------------------
# Prediction functions
# ------------------------
def zero_shot_predict(img_emb, text_emb_dict):
    """
    Score one image embedding against every class embedding of a category.

    img_emb: np.array (512,)
    text_emb_dict: np.array (num_classes, 512). Despite the parameter name this
        is a matrix with one row per class, not a dict.
    return: (index of the highest-scoring row, array of all scores)

    NOTE(review): an earlier cell defines a function with this same name that
    takes a label->vector dict and returns a label; this definition replaces it.
    """
    # Dot products; these equal cosine similarities when both sides are
    # L2-normalised.
    similarities = np.matmul(img_emb, text_emb_dict.T)
    best_idx = np.argmax(similarities)
    return best_idx, similarities


def is_default_pic(image_emb, default_pic_emb, threshold=0.95):
    """
    Flag an image as the default profile picture by similarity to a reference.

    image_emb: np.array (512,)
    default_pic_emb: np.array (512,), embedding of the reference default picture
    threshold: minimum similarity (inclusive) for a match
    return: "1" when the similarity reaches the threshold, otherwise "0"
    """
    # Plain dot product; it is a cosine similarity only if both embeddings are
    # already L2-normalised, which this function assumes.
    similarity = image_emb @ default_pic_emb
    if similarity >= threshold:
        return "1"
    return "0"

# ------------------------
# Predict every image
# ------------------------
# Read each category's text-embedding matrix out of the npz archive once;
# indexing the archive object re-reads the array from disk on every access, so
# doing it inside the per-image loop repeats the same I/O for every image.
text_embs = {category: text_embs_np[category] for category in text_embs_np.files}

# Resolve the label keys of every category once as well. Row i of a category's
# matrix is taken to correspond to its i-th key in labels_text, so a row-count
# mismatch (e.g. the JSON was edited but the embeddings were not regenerated)
# would silently assign wrong labels; fail loudly instead.
label_keys_by_category = {}
for category, text_emb_np in text_embs.items():
    if category == "is_default_pic":
        continue  # decided by image-to-image similarity, no label lookup needed
    label_keys = list(labels_text[category].keys())
    if len(label_keys) != text_emb_np.shape[0]:
        raise ValueError(
            f"category {category!r}: {len(label_keys)} labels in {text_json_path} "
            f"but {text_emb_np.shape[0]} embedding rows in {text_emb_path}"
        )
    label_keys_by_category[category] = label_keys

predictions = {}  # fname -> {category: predicted label key}
for fname in tqdm(image_embs.files, desc='predict on images...'):
    img_emb = image_embs[fname]

    pred = {}
    for category, text_emb_np in text_embs.items():
        if category == "is_default_pic":
            # Compared against the reference image embedding rather than the
            # text prompts.
            pred[category] = is_default_pic(img_emb, default_pic_emb)
        else:
            idx, _ = zero_shot_predict(img_emb, text_emb_np)
            pred[category] = label_keys_by_category[category][idx]

    predictions[fname] = pred

# ------------------------
# Save JSON
# ------------------------
# Create the output directory first so the write cannot fail, after all
# predictions were computed, just because the folder does not exist yet.
output_dir = os.path.dirname(output_json_path)
if output_dir:  # empty when the path has no directory component
    os.makedirs(output_dir, exist_ok=True)

with open(output_json_path, "w", encoding="utf-8") as f:
    json.dump(predictions, f, indent=2)


end_time=time.time()
print(f"✅ Auto predictions on {len(image_embs)} images saved → {output_json_path}: {end_time-start_time:.2f} sec!")


predict on images...: 100%|██████████| 20/20 [00:00<00:00, 1632.41it/s]

Prediction on 102571900.jpg:
Prediction on 106294215.jpg:
Prediction on 106365215.jpg:
Prediction on 137154154.jpg:
Prediction on 212791574.jpg:
Prediction on 2379345.jpg:
Prediction on 24654560.jpg:
Prediction on 2798386.jpg:
Prediction on 28470251.jpg:
Prediction on 32741638.jpg:
Prediction on 336591839.jpg:
Prediction on 425502119.jpg:
Prediction on 517697918.jpg:
Prediction on 52438163.jpg:
Prediction on 52801103.jpg:
Prediction on 553099349.jpg:
Prediction on 57226046.jpg:
Prediction on 71320446.jpg:
Prediction on 873444.jpg:
Prediction on 88933385.jpg:
✅ Auto predictions on 20 images saved → annotations_SAMPLE/autoclf_predictions.json: 0.02 sec!



