In [None]:
import os
from urllib.request import urlretrieve

import pandas as pd

# Download the parquet metadata table, unless a previous run already fetched it.
# Delete 'metadata.parquet' to force a fresh download.
table_url = 'https://huggingface.co/datasets/poloclub/diffusiondb/resolve/main/metadata.parquet'
metadata_path = 'metadata.parquet'
if not os.path.exists(metadata_path):
    # Fetch under a temporary name and rename on success, so an interrupted
    # download is never mistaken for a complete file on the next run.
    partial_path = metadata_path + '.part'
    urlretrieve(table_url, partial_path)
    os.replace(partial_path, metadata_path)

In [None]:
# Load the downloaded metadata table into a pandas DataFrame
metadata_path = 'metadata.parquet'
metadata_df = pd.read_parquet(metadata_path)

In [None]:
# Display the metadata table as the cell output
metadata_df

In [None]:
# extract embeddings, images, and correlations

In [None]:
import os

image_folder = "./diffusiondb-2m/"

# List the PNG files present in image_folder, then keep only the metadata rows
# whose image_name is one of those files.
image_name = list(filter(lambda entry: entry.endswith(".png"), os.listdir(image_folder)))
has_local_image = metadata_df["image_name"].isin(image_name)
metadata_df = metadata_df.loc[has_local_image]

# Both outputs are taken from the same filtered rows, so prompts[i] and
# image_paths[i] describe the same metadata row.
prompts = metadata_df["prompt"].values
image_paths = [os.path.join(image_folder, file_name) for file_name in metadata_df["image_name"].values]


In [None]:
# Show how many prompts were selected
len(prompts)

In [None]:
from sentence_transformers import SentenceTransformer, models
# Load the sentence-embedding model from a local directory. The location can be
# overridden with the ST_MODEL_PATH environment variable so the notebook is not
# tied to one machine; when the variable is unset the original path is used.
st_model_path = os.environ.get(
    'ST_MODEL_PATH',
    '/home/thanh/shared_disk/thanh/sditp/data/all-MiniLM-L6-v2',
)
st_model = SentenceTransformer(st_model_path)

In [None]:
from tqdm import tqdm

# Encode all prompts with a single batched call instead of one encode() call
# per prompt; the model then processes many prompts per forward pass.
# show_progress_bar=True keeps the progress feedback the loop used to give.
prompt_list = list(prompts)
if prompt_list:
    # encode() returns one row per prompt; list() splits that into one 1-D
    # vector per prompt, matching what the per-prompt loop produced.
    embeddings = list(st_model.encode(prompt_list, show_progress_bar=True))
else:
    # Nothing to encode (no prompt matched an image on disk).
    embeddings = []

In [None]:
# Build image_df and prompt_df (correlation_df is built in the next cell)
# image_df has 2 columns: id and path
# prompt_df has 3 columns: id, text, and emb
# correlation_df has 2 columns: prompt_id and image_id
import uuid


# Image table: one row per image. The id is the file name without its extension.
# os.path.splitext strips only the final extension, so a file name that itself
# contains dots keeps its whole stem (split(".")[0] would cut it at the first dot).
image_df = pd.DataFrame({
    "id": [os.path.splitext(os.path.basename(image_path))[0] for image_path in image_paths],
    "path": image_paths
})

# Prompt table: one row per entry of prompts, each given a fresh random UUID as
# its id, alongside the prompt text and its embedding.
prompt_df = pd.DataFrame({
    "id": [str(uuid.uuid4()) for _ in range(len(prompts))],
    "text": prompts,
    "emb": embeddings
})



In [None]:
# Link each prompt to its image by row position: row i of prompt_df is paired
# with row i of image_df.
prompt_ids = prompt_df["id"].values
image_ids = image_df["id"].values
correlation_df = pd.DataFrame(dict(prompt_id=prompt_ids, image_id=image_ids))

In [None]:
# Display the image table (columns: id, path)
image_df

In [None]:
# Display the prompt table (columns: id, text, emb)
prompt_df

In [None]:
# Display the prompt-to-image link table (columns: prompt_id, image_id)
correlation_df

In [None]:
# Assign each row of correlation_df to one of k folds,
# then build pairs_df of positive (prompt, image) pairs from correlation_df
from sklearn.model_selection import KFold

# Assign every row of correlation_df to one of 5 validation folds.
# shuffle with a fixed random_state keeps the assignment reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
correlation_df["fold"] = -1
# KFold.split yields positional indices, so write them with .iloc rather than
# the label-based .loc; the two only coincide while the index is 0..n-1.
fold_col = correlation_df.columns.get_loc("fold")
for fold, (train_idx, val_idx) in enumerate(kf.split(correlation_df)):
    correlation_df.iloc[val_idx, fold_col] = fold


# Every row of correlation_df becomes a pair labelled with target=1
pairs_df = pd.DataFrame({
    "prompt_id": correlation_df["prompt_id"].values,
    "image_id": correlation_df["image_id"].values,
    "target": 1
})

In [None]:
# Write each table to a CSV file in the working directory, omitting the index.
# NOTE(review): prompt_df["emb"] appears to hold one array per row, which CSV
# can only store as text - confirm that downstream readers parse it as intended.
csv_outputs = {
    "image.csv": image_df,
    "prompt.csv": prompt_df,
    "pairs.csv": pairs_df,
    "correlation.csv": correlation_df,
}
for csv_name, frame in csv_outputs.items():
    frame.to_csv(csv_name, index=False)


