In [1]:
import pandas as pd
import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
import torch.nn.functional as F
import json
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the tab-separated item table into a DataFrame.
df = pd.read_csv(
    "../output/dataset.tsv",
    sep="\t",
)

In [3]:
# Preview the first five rows to check the columns that were loaded.
df.head(5)

Unnamed: 0,item_id,item_name_en_us,path
0,B074J5TWYL,"365 Everyday Value, Organic Black Tea (70 Tea ...",03/03fde183.jpg
1,B019OL9E02,Otterbox Symmetry Series iPhone 6/6s Case and ...,29/291959a2.jpg
2,B0832ZNTS4,"Fresh Brand – Mixed Squash Spirals, 14 oz",51/51c21968.jpg
3,B07PXFVNXR,"Ameriwood Home Classic 5 Drawer Dresser, White",19/19196356.jpg
4,B07KG3F83Z,"Whole Foods Market, Triple Milled Soap, Sandal...",d9/d9f07a0a.jpg


In [4]:
# Load the pretrained CLIP checkpoint and the processor published under the
# same name. The model is placed on the first CUDA device when one is
# available, otherwise it stays on the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda:0"
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)

In [5]:
# Directory that the relative `path` values of the item table are joined onto.
images_root = "../dataset/images/small"

In [9]:
def process_batch(batch):
    """Embed one slice of the item table with CLIP and build feed documents.

    Args:
        batch: DataFrame slice with ``item_id``, ``item_name_en_us`` and
            ``path`` columns; ``path`` is joined onto ``images_root``.

    Returns:
        A list with one dict per row, in row order. Each dict holds a ``put``
        document id and a ``fields`` mapping with the row values plus the
        text embedding, the image embedding, and their element-wise sum
        (``synthetic_embedding``), each rounded to 8 decimal places.
    """
    # Nothing to embed: return early instead of calling the processor with
    # empty text/image lists.
    if len(batch) == 0:
        return []

    # Image.open is lazy. Copy the decoded pixels inside a context manager so
    # every file handle is released before the next image is opened.
    images = []
    for path in batch["path"]:
        with Image.open(images_root + "/" + path) as image:
            images.append(image.copy())

    inputs = processor(
        text=batch["item_name_en_us"].tolist(),
        images=images,
        return_tensors="pt",
        padding=True,
        # CLIP's text encoder has a fixed maximum sequence length; clip
        # over-long item names instead of failing inside the model.
        truncation=True,
    ).to(device)
    with torch.no_grad():
        outputs = model(**inputs)

    # Convert each embedding matrix to nested Python lists once, rather than
    # slicing and converting three tensors for every row.
    text_embeds = outputs["text_embeds"]
    image_embeds = outputs["image_embeds"]
    text_rows = text_embeds.tolist()
    image_rows = image_embeds.tolist()
    synthetic_rows = (text_embeds + image_embeds).tolist()

    # make feed data
    feed_data_list = []
    columns = zip(
        batch["item_id"].tolist(),
        batch["item_name_en_us"].tolist(),
        batch["path"].tolist(),
        text_rows,
        image_rows,
        synthetic_rows,
    )
    for item_id, item_name, path, text_vec, image_vec, synthetic_vec in columns:
        feed_data_list.append({
            "put": "id:item:item::" + item_id,
            "fields": {
                "item_id": item_id,
                "item_name_en_us": item_name,
                "path": path,
                "text_embedding": [round(x, 8) for x in text_vec],
                "image_embedding": [round(x, 8) for x in image_vec],
                "synthetic_embedding": [round(x, 8) for x in synthetic_vec],
            },
        })

    return feed_data_list

In [10]:
# Number of DataFrame rows embedded per call to process_batch.
batch_size = 32

In [13]:
# Walk the item table in slices of `batch_size` rows, embed each slice, and
# write every resulting feed document as one JSON object per line.
num_rows = len(df)
with open("../output/feed.jsonl", "w") as fp:
    for start_idx in tqdm(range(0, num_rows, batch_size)):
        stop_idx = start_idx + batch_size
        batch = df.iloc[start_idx:stop_idx]
        feed_data_list = process_batch(batch)
        for feed_data in feed_data_list:
            fp.write(json.dumps(feed_data) + "\n")

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 638/638 [03:51<00:00,  2.76it/s]
