In [5]:
import os
import zipfile
import pandas as pd
from PIL import Image
from tqdm import tqdm
import faiss, numpy as np
from transformers import CLIPModel, CLIPProcessor
import torch

In [6]:
# ==== 路径配置 ====
# "nlphuji/flickr30k" on huggingface
DATA_ROOT = "/workspace/datasets/flickr30k"
IMG_ZIP_PATH = os.path.join(DATA_ROOT, 
                            "flickr30k-images.zip")
IMG_DIR = os.path.join(DATA_ROOT, "images")
CAPTION_FILE = os.path.join(DATA_ROOT, 
                            "flickr_annotations_30k.csv")

In [7]:
# ==== 解压图像 ====
if not os.path.exists(IMG_DIR):
    with zipfile.ZipFile(IMG_ZIP_PATH, "r") as zip_ref:
        zip_ref.extractall(IMG_DIR)

In [10]:
# ==== 读取描述文件 ====
df = pd.read_csv(CAPTION_FILE)
print(df.columns)
print(df.head())
# 只取前 1000 张图像及对应一条描述
df = df.groupby("filename").first().reset_index().\
    iloc[:1000]

Index(['raw', 'sentids', 'split', 'filename', 'img_id'], dtype='object')
                                                 raw               sentids  \
0  ["Two young guys with shaggy hair look at thei...       [0, 1, 2, 3, 4]   
1  ["Several men in hard hats are operating a gia...       [5, 6, 7, 8, 9]   
2  ["A child in a pink dress is climbing up a set...  [10, 11, 12, 13, 14]   
3  ["Someone in a blue shirt and hat is standing ...  [15, 16, 17, 18, 19]   
4  ["Two men, one in a gray shirt, one in a black...  [20, 21, 22, 23, 24]   

   split        filename  img_id  
0  train  1000092795.jpg       0  
1  train    10002456.jpg       1  
2  train  1000268201.jpg       2  
3  train  1000344755.jpg       3  
4  train  1000366164.jpg       4  


In [14]:
# 加载预训练的 CLIP 模型与处理器
# ==== 加载 CLIP 模型 ====
clip_model = CLIPModel.from_pretrained(
    "openai/clip-vit-base-patch16").eval()
clip_processor = CLIPProcessor.from_pretrained(
    "openai/clip-vit-base-patch16")

In [22]:
# ==== 编码图像 ====
def encode_images(image_paths):
    vectors = []
    for img_path in tqdm(image_paths, desc="Encoding images"):
        img = Image.open(os.path.join(IMG_DIR + "/flickr30k-images", img_path)).convert("RGB")
        inputs = clip_processor(images=img, return_tensors="pt")
        with torch.no_grad():
            emb = clip_model.get_image_features(**inputs).cpu()
        vectors.append(emb)
    return torch.cat(vectors).numpy().astype("float32")

# ==== 编码文本 ====
def encode_texts(texts):
    inputs = clip_processor(text=texts, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        emb = clip_model.get_text_features(**inputs).cpu()
    return emb.numpy().astype("float32")

# ==== 提取向量 ====
image_vecs = encode_images(df["filename"].tolist())
text_vecs = encode_texts(df["raw"].tolist())

Encoding images: 100%|██████████| 1000/1000 [01:12<00:00, 13.85it/s]


## Test: HNSW数据集中只包含图片数据

In [None]:
# ==== 构建 FAISS HNSW 索引 ====
d = image_vecs.shape[1]
index = faiss.IndexHNSWFlat(d, 64)
index.hnsw.efConstruction = 256
index.add(image_vecs)
index.hnsw.efSearch = 256

In [42]:
NumTest = 1

# 执行文本→图像检索
# D: top-K 结果对应的距离distance
# I: top-K 结果对应的索引index
D, I = index.search(text_vecs[:NumTest], k=10)
print("I shape:", I.shape)
print("D shape:", D.shape)
print("Distance:", D)

# 输出前 NumTest 条文本对应的 Top-10 图像索引
for qid in range(NumTest):
    print(f"\n[Query {qid}] Text: {df['raw'].iloc[qid]}")
    print("Top-10 Retrieved Images:")
    for rank, idx in enumerate(I[qid]):
        print(idx, rank)
print(df['filename'])

I shape: (1, 10)
D shape: (1, 10)
Distance: [[100.57727  102.65476  102.883026 103.006096 104.107254 104.40054
  104.589485 105.162735 105.351585 105.717834]]

[Query 0] Text: ["Two young guys with shaggy hair look at their hands while hanging out in the yard.", "Two young, White males are outside near many bushes.", "Two men in green shirts are standing in a yard.", "A man in a blue shirt standing in a garden.", "Two friends enjoy time spent together."]
Top-10 Retrieved Images:
98 0
192 1
221 2
625 3
175 4
528 5
131 6
628 7
0 8
273 9
0      1000092795.jpg
1        10002456.jpg
2      1000268201.jpg
3      1000344755.jpg
4      1000366164.jpg
            ...      
995    1321651400.jpg
996    1321723162.jpg
997    1321949151.jpg
998    1322323208.jpg
999     132298659.jpg
Name: filename, Length: 1000, dtype: object


## Test-HNSW中包含图片 + 文本数据

In [None]:
# ==== 构建 FAISS HNSW 索引 ====
d = image_vecs.shape[1]
index = faiss.IndexHNSWFlat(d, 64)
index.hnsw.efConstruction = 256
index.add(image_vecs)  # 0-999 是图片vector
index.add(text_vecs)   # 1000-1999 是文本vector
index.hnsw.efSearch = 256

In [40]:
NumTest = 1

# 执行文本→图像检索
# D: top-K 结果对应的距离distance
# I: top-K 结果对应的索引index
D, I = index.search(text_vecs[:NumTest], k=10)
print("I shape:", I.shape)
print("D shape:", D.shape)
print("Distance:", D)

# 输出前 NumTest 条文本对应的 Top-10 图像索引
for qid in range(NumTest):
    print(f"\n[Query {qid}] Text: {df['raw'].iloc[qid]}")
    print("Top-10 Retrieved Images:")
    for rank, idx in enumerate(I[qid]):
        print(idx, rank)
print(df['filename'])
#         print(f"  {rank+1}. {df['filename'].iloc[idx]}")

I shape: (1, 10)
D shape: (1, 10)
Distance: [[ 0.       29.725046 30.609203 33.122566 33.70261  33.81827  33.835007
  33.90899  34.715187 35.466103]]

[Query 0] Text: ["Two young guys with shaggy hair look at their hands while hanging out in the yard.", "Two young, White males are outside near many bushes.", "Two men in green shirts are standing in a yard.", "A man in a blue shirt standing in a garden.", "Two friends enjoy time spent together."]
Top-10 Retrieved Images:
1000 0
1639 1
1647 2
1156 3
1152 4
1873 5
1331 6
1042 7
1466 8
1447 9
0      1000092795.jpg
1        10002456.jpg
2      1000268201.jpg
3      1000344755.jpg
4      1000366164.jpg
            ...      
995    1321651400.jpg
996    1321723162.jpg
997    1321949151.jpg
998    1322323208.jpg
999     132298659.jpg
Name: filename, Length: 1000, dtype: object


## 模态聚类问题
- 经过上面的两轮测试，不难发现当将文本数据纳入图index，检索结果主要包含文本数据