In [1]:
from urllib.request import urlretrieve
import pandas as pd

# Download the parquet table
table_url = f'https://huggingface.co/datasets/poloclub/diffusiondb/resolve/main/metadata.parquet'
urlretrieve(table_url, 'metadata.parquet')

('metadata.parquet', <http.client.HTTPMessage at 0x7fe45292f340>)

In [9]:
# Read the table using Pandas
metadata_df = pd.read_parquet('metadata.parquet')

In [10]:
metadata_df

Unnamed: 0,image_name,prompt,part_id,seed,step,cfg,sampler,width,height,user_name,timestamp,image_nsfw,prompt_nsfw
0,2217ccbd-a1c6-47ac-9a2d-79649727c834.png,"a portrait of a female robot made from code, v...",1,2615452544,50,7.0,8,512,512,0167cc490c3dc6047be855b096a9b9d8f20e2b034532bd...,2022-08-20 11:56:00+00:00,0.554853,0.001621
1,c78807b7-d55a-4a2d-a6b6-9192b18941ad.png,a portrait of a female robot made from a cloud...,1,4231506957,50,7.0,8,512,512,0167cc490c3dc6047be855b096a9b9d8f20e2b034532bd...,2022-08-20 11:59:00+00:00,0.153645,0.000707
2,dc71658a-5e4b-4dca-861a-e1535510348b.png,"only memories remain, trending on artstation",1,544607824,50,7.0,8,512,512,0167cc490c3dc6047be855b096a9b9d8f20e2b034532bd...,2022-08-20 12:02:00+00:00,0.062496,0.000425
3,48eb7e17-a3cf-4eb8-96a9-d8e3e23fa1af.png,dream swimming pool with nobody,1,3500509234,50,7.0,8,512,512,019084defdd44d2dfcd200bf7aaae53034263168e10d6b...,2022-08-20 07:22:00+00:00,0.030799,0.003586
4,601d9792-eccd-4850-97a7-edbe91d3464c.png,a dog doing weights. epic oil painting.,1,1312381211,50,12.0,8,512,768,03cb377ebf364ea2761b5710625974c0d2d25285fec982...,2022-08-20 12:28:00+00:00,0.181035,0.030822
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1999995,4a34854e-e117-4b69-b1da-cc74534524c0.png,david bowie giving a piggy back ride to ziggy ...,2000,3003198679,50,7.0,8,512,512,fff4138cde262b5632a036752b40ddec8cb0a53c549123...,2022-08-16 04:51:00+00:00,0.152736,0.022676
1999996,d4ae6d4f-762c-42f0-9481-a1c233e6a541.png,david bowie giving a piggy back ride to ziggy ...,2000,3003198679,50,7.0,8,512,512,fff4138cde262b5632a036752b40ddec8cb0a53c549123...,2022-08-16 04:53:00+00:00,0.127884,0.014115
1999997,67dac228-3631-4036-8498-a40a67a7d354.png,funny computer,2000,3602197422,50,7.0,8,512,512,fff4138cde262b5632a036752b40ddec8cb0a53c549123...,2022-08-16 05:41:00+00:00,0.093476,0.000463
1999998,9d57179b-de12-460b-a699-a66dfca528be.png,hilarious witty computing machine,2000,1604720580,50,7.0,8,512,512,fff4138cde262b5632a036752b40ddec8cb0a53c549123...,2022-08-16 06:47:00+00:00,0.124379,0.002547


In [11]:
# extract embeddings, images, and correlations

In [12]:
import os

image_folder = "./diffusiondb-2m/"

# get all image_name and correspond prompt in metadata_df then filter by images exist in image_folder
image_name = [image_name for image_name in os.listdir(image_folder) if image_name.endswith(".png")]
metadata_df = metadata_df[metadata_df["image_name"].isin(image_name)]

image_paths = [os.path.join(image_folder, image_name) for image_name in metadata_df["image_name"].values]
prompts = metadata_df["prompt"].values


In [13]:
len(prompts)

100000

In [14]:
from sentence_transformers import SentenceTransformer, models
st_model = SentenceTransformer('/home/thanh/shared_disk/thanh/sditp/data/all-MiniLM-L6-v2')

In [15]:
from tqdm import tqdm

embeddings = []

for prompt in tqdm(prompts):
    prompt_embeddings = st_model.encode(prompt).flatten()
    embeddings.append(prompt_embeddings)

100%|██████████| 100000/100000 [07:49<00:00, 212.82it/s]


In [16]:
# generate image_df, embedding_df, and correlation_df
# image_df will have 2 columns: id and path
# embedding_df will have 3 columns: id, text, and emb
# correlation_df will have 2 columns: prompt_id, image_id
import uuid


image_df = pd.DataFrame({
    "id": [os.path.basename(image_path).split(".")[0] for image_path in image_paths],
    "path": image_paths
})

prompt_df = pd.DataFrame({
    "id": [str(uuid.uuid4()) for _ in range(len(prompts))],
    "text": prompts,
    "emb": embeddings
})



In [22]:
correlation_df = pd.DataFrame({
    "prompt_id": prompt_df["id"].values,
    "image_id": image_df["id"].values
})

In [17]:
image_df

Unnamed: 0,id,path
0,2217ccbd-a1c6-47ac-9a2d-79649727c834,./diffusiondb-2m/2217ccbd-a1c6-47ac-9a2d-79649...
1,c78807b7-d55a-4a2d-a6b6-9192b18941ad,./diffusiondb-2m/c78807b7-d55a-4a2d-a6b6-9192b...
2,dc71658a-5e4b-4dca-861a-e1535510348b,./diffusiondb-2m/dc71658a-5e4b-4dca-861a-e1535...
3,48eb7e17-a3cf-4eb8-96a9-d8e3e23fa1af,./diffusiondb-2m/48eb7e17-a3cf-4eb8-96a9-d8e3e...
4,601d9792-eccd-4850-97a7-edbe91d3464c,./diffusiondb-2m/601d9792-eccd-4850-97a7-edbe9...
...,...,...
99995,f091c225-1451-4161-80b2-e4c956c183e9,./diffusiondb-2m/f091c225-1451-4161-80b2-e4c95...
99996,425e532c-1c67-47f8-b758-58cf3fee1bb3,./diffusiondb-2m/425e532c-1c67-47f8-b758-58cf3...
99997,839fcf92-1c2e-4a22-affd-7510b70b6bde,./diffusiondb-2m/839fcf92-1c2e-4a22-affd-7510b...
99998,09019207-7eb2-408c-a30b-c6c028e424a6,./diffusiondb-2m/09019207-7eb2-408c-a30b-c6c02...


In [18]:
prompt_df

Unnamed: 0,id,text,emb
0,e56b25f3-63c0-458a-bece-6764c3e5bc79,"a portrait of a female robot made from code, v...","[-0.06348677, -0.0107782455, -0.01301214, -0.0..."
1,d3cac251-e839-4230-b1df-8df4ac47e632,a portrait of a female robot made from a cloud...,"[-0.047303002, -0.0005425827, 0.027641837, -0...."
2,4f3d2eac-1fe5-46c2-b73d-86fc3d30d18d,"only memories remain, trending on artstation","[0.10905098, -0.06318686, 0.017796768, 0.02220..."
3,1f60516d-a83d-4b42-933a-e867bd7e709f,dream swimming pool with nobody,"[0.04241174, 0.038279288, 0.010689388, -0.0214..."
4,39b2cdd9-d3df-44e9-bd64-75c46cd10d86,a dog doing weights. epic oil painting.,"[-0.05192509, -0.018941695, 0.039370626, 0.070..."
...,...,...,...
99995,f71e4eee-ac92-49dd-ad40-b505098c7887,hyperrealistic portrait of a handsome male hea...,"[-0.007985528, 0.041286927, -0.0148528395, -0...."
99996,b339f122-79a3-4077-a785-be5e36435b39,hyperdetailed portrait of elizabeth olsen as s...,"[0.011702541, -0.02646702, -0.028342813, -0.01..."
99997,2019a846-dad3-434e-9ba8-bd7e9b271fb4,hyperdetailed portrait of henry cavill as a ma...,"[-0.03869299, 0.006565126, -0.00032936322, -0...."
99998,e70b7921-e9c4-467f-92b9-63e5ad8047d7,hyperdetailed portrait of chris evans rugged r...,"[-0.043777753, 0.02233673, 0.0133271385, -0.00..."


In [23]:
correlation_df

Unnamed: 0,prompt_id,image_id
0,e56b25f3-63c0-458a-bece-6764c3e5bc79,2217ccbd-a1c6-47ac-9a2d-79649727c834
1,d3cac251-e839-4230-b1df-8df4ac47e632,c78807b7-d55a-4a2d-a6b6-9192b18941ad
2,4f3d2eac-1fe5-46c2-b73d-86fc3d30d18d,dc71658a-5e4b-4dca-861a-e1535510348b
3,1f60516d-a83d-4b42-933a-e867bd7e709f,48eb7e17-a3cf-4eb8-96a9-d8e3e23fa1af
4,39b2cdd9-d3df-44e9-bd64-75c46cd10d86,601d9792-eccd-4850-97a7-edbe91d3464c
...,...,...
99995,f71e4eee-ac92-49dd-ad40-b505098c7887,f091c225-1451-4161-80b2-e4c956c183e9
99996,b339f122-79a3-4077-a785-be5e36435b39,425e532c-1c67-47f8-b758-58cf3fee1bb3
99997,2019a846-dad3-434e-9ba8-bd7e9b271fb4,839fcf92-1c2e-4a22-affd-7510b70b6bde
99998,e70b7921-e9c4-467f-92b9-63e5ad8047d7,09019207-7eb2-408c-a30b-c6c028e424a6


In [24]:
# create positive pairs then pairs_df from correlation_df
# split correlation_df to k-fold
from sklearn.model_selection import KFold

kf = KFold(n_splits=5, shuffle=True, random_state=42)
correlation_df["fold"] = -1
for fold, (train_idx, val_idx) in enumerate(kf.split(correlation_df)):
    correlation_df.loc[val_idx, "fold"] = fold


pairs_df = pd.DataFrame({
    "prompt_id": correlation_df["prompt_id"].values,
    "image_id": correlation_df["image_id"].values,
    "target": 1
})

In [25]:
# save all dataframes to csv files
image_df.to_csv("image.csv", index=False)
prompt_df.to_csv("prompt.csv", index=False)
pairs_df.to_csv("pairs.csv", index=False)
correlation_df.to_csv("correlation.csv", index=False)


