In [1]:
import os
import torch
import numpy as np
import unicodedata
import pandas as pd
from tqdm.notebook import tqdm
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
def is_english_only(string):
    for s in string:
        cat = unicodedata.category(s)         
        if not cat in ['Ll', 'Lu', 'Nd', 'Po', 'Pd', 'Zs']:
            return False
    return True

# SD_data

In [3]:
df = pd.read_csv("data/sd2_data/metadata.csv")

print("Base data size: ", df.shape[0])

# Only 512x512 images
# df = df[(df['width'] == 512) & (df['height'] == 512)]

# Strip prompts
df['prompt'] = df['prompt'].str.strip()

# Filter NaN prompts
df = df[~df['prompt'].str.contains('^(?:\s*|NULL|null|NaN)$', na=True)]
print("After `filter NaNs`: ", df.shape[0])

# Filter too small prompts
df = df[df['prompt'].map(lambda x: len(x.split())) >= 5]
print("After `filter small`: ", df.shape[0])

# Filter not English prompts
df = df[df['prompt'].apply(is_english_only)]
print("After `filter not Eng`: ", df.shape[0])
 
# Filter head & tail duplicates
df['head'] = df['prompt'].str[:15]
df['tail'] = df['prompt'].str[-15:]
df.drop_duplicates(subset='head', inplace=True)
df.drop_duplicates(subset='tail', inplace=True)
print("After `filter duplicates`: ", df.shape[0])

df.reset_index(drop=True, inplace=True)

print("Filtered data size: ", df.shape[0])

df.to_csv("data/sd2_data/metadata_clean.csv", index=False)

Base data size:  81910
After `filter NaNs`:  81910
After `filter small`:  81807
After `filter not Eng`:  77602
After `filter duplicates`:  5465
Filtered data size:  5465


# SD_DB2M

In [3]:
metadata = pd.read_parquet("/drives/drive4/competitions/SD/metadata.parquet", engine="pyarrow")

In [4]:
# filter exist images
# images_names = [p.name for p in Path("/drives/drive4/competitions/SD/images/").iterdir()]
# images_names += [p.name for p in Path("/drives/drive2/competitions/SD/images/").iterdir()]

print("Base data size: ", metadata.shape[0])
images_paths = list(Path("./data/sd_data_resize224/images").rglob("*.jpg"))
images_names = [p.name for p in images_paths]
metadata["image_name"] = metadata["image_name"].str.replace(".png", ".jpg")

metadata = metadata[metadata.image_name.isin(images_names)]
tmp = pd.DataFrame({"path": images_paths})
tmp["image_name"] = tmp.path.apply(lambda p: p.name)
metadata = metadata.merge(tmp, on="image_name")

print("Only existed images: ", metadata.shape[0])

Base data size:  2000000


  metadata["image_name"] = metadata["image_name"].str.replace(".png", ".jpg")


Only existed images:  2000000


In [6]:
df = metadata.copy()
print("Base size: ", df.shape[0])

# Only 512x512 images
df = df[((df['width']== 512) & (df['height'] == 512)) | ((df['width']== 768) & (df['height'] == 768))]
print("After `filter size`: ", df.shape[0])

# Strip prompts
df['prompt'] = df['prompt'].str.strip()

# Filter NaN prompts
df = df[~df['prompt'].str.contains('^(?:\s*|NULL|null|NaN)$', na=True)]
print("After `filter NaNs`: ", df.shape[0])

# Filter too small prompts
df = df[df['prompt'].map(lambda x: len(x.split())) >= 5]
print("After `filter small`: ", df.shape[0])

# Filter not English prompts
df = df[df['prompt'].apply(is_english_only)]
print("After `filter not Eng`: ", df.shape[0])

# Filter steps
df = df[df['step'] > 0]
print("After `filter steps=0`: ", df.shape[0])


# # Filter head & tail duplicates
# df['head'] = df['prompt'].str[:20]
# df['tail'] = df['prompt'].str[-20:]
# df.drop_duplicates(subset='head', inplace=True)
# df.drop_duplicates(subset='tail', inplace=True)
# print("After `filter duplicates`: ", df.shape[0])

df.reset_index(drop=True, inplace=True)

print("Filtered data size: ", df.shape[0])

# train_idxs, test_idxs = train_test_split(df.index, test_size=0.05, random_state=42)
# print(f"Train: {len(train_idxs)} Test: {len(test_idxs)}")
# df.loc[train_idxs, "stage"] = "train"
# df.loc[test_idxs, "stage"] = "test"
m = pd.read_csv("data/sd_data/metadata.csv")
m = m.loc[m.stage == "test"].reset_index(drop=True)
m = m[["image_name", "stage"]]
m["image_name"] = m["image_name"].str.replace(".png", ".jpg")
df = df.merge(m, "left", on="image_name")
df["stage"] = df["stage"].fillna("train")

print("Filtered data size: ", df.shape[0])

df.to_csv("./metadata_sd2b_resize224.csv", index=False)

Base size:  2000000
After `filter size`:  1057211
After `filter NaNs`:  1056685
After `filter small`:  962590
After `filter not Eng`:  913481
After `filter steps=0`:  913466
Filtered data size:  913466


  m["image_name"] = m["image_name"].str.replace(".png", ".jpg")


Filtered data size:  913466


In [7]:
df = pd.read_csv("./metadata_sd2b_resize224.csv")

In [8]:
from sentence_transformers import SentenceTransformer, models
st_model = SentenceTransformer('all-MiniLM-L6-v2', device="cuda")
embeddings = st_model.encode(df.prompt.values, batch_size=1024, normalize_embeddings=False, show_progress_bar=True)

Batches:   0%|          | 0/893 [00:00<?, ?it/s]

In [9]:
torch.save(torch.from_numpy(embeddings), "embeddings.pth")

In [10]:
import gc
from joblib import Parallel, delayed

# filter.py

def get_ignoring_idxs(embeddings, batch_size=1024, thresh=0.95):
    bs = batch_size
    n_embeddings = len(embeddings)
    idxs_to_ignore = set()
    pbar = tqdm(total=n_embeddings)
    i = 0
    while i < n_embeddings:
        idxs = []
        while len(idxs) < bs and i < n_embeddings:
            if i not in idxs_to_ignore:
                idxs.append(i)
            i += 1
        cos_sim = cosine_similarity(embeddings[idxs], embeddings)
        np.fill_diagonal(cos_sim, 0)
        new_to_ignore = set(np.nonzero(cos_sim > thresh)[1])
        idxs_to_ignore = idxs_to_ignore.union(new_to_ignore)
        pbar.update(len(idxs))
        
    del embeddings
    gc.collect()
    
    return list(idxs_to_ignore)

IGNORING = get_ignoring_idxs(embeddings)

  0%|          | 0/913466 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [10]:
torch.save(torch.tensor(list(dropped)), "duplicates_idxs_sd_2b.pth")

In [20]:
# dropped = torch.load("dropped_idxs.pth").tolist()

In [11]:
filtered_df = df.loc[~df.index.isin(dropped)].reset_index(drop=True)

In [12]:
df.shape[0], filtered_df.shape[0]

(900539, 642738)

In [None]:
filtered_df.to_csv("./metadata_sd2b_resize224_filtered_similar.csv", index=False)

In [44]:
# images_names1 = [p.name for p in Path("/drives/drive4/competitions/SD/images/").iterdir()]
# images_names2 = [p.name for p in Path("/drives/drive2/competitions/SD/images/").iterdir()]
filtered_df.loc[filtered_df.image_name.isin(images_names1), "path"] = ("/drives/drive4/competitions/SD/images/" + filtered_df.loc[filtered_df.image_name.isin(images_names1), "image_name"]).values
filtered_df.loc[filtered_df.image_name.isin(images_names2), "path"] = "/drives/drive2/competitions/SD/images/" + (filtered_df.loc[filtered_df.image_name.isin(images_names2), "image_name"]).values

filtered_df.to_csv("./metadata_sd2b_filtered_similar.csv", index=False)

# Check prompts similarity

In [12]:
from sentence_transformers import SentenceTransformer, models
st_model = SentenceTransformer('all-MiniLM-L6-v2', device="cuda")

p1 = "photorealistic, 8k, superresolution art of trump riding motocross"
p2 = "photorealistic, 8k, superresolution art of girl riding a horse"

e1 = st_model.encode(p1)[None]
e2 = st_model.encode(p2)[None]

cosine_similarity(e1, e2)

array([[0.6388701]], dtype=float32)

# Tests

In [1]:
import pandas as pd
pd.options.display.max_colwidth = 500

df = pd.read_csv("data/sd_data/metadata.csv")
df.shape[0]

323614

In [2]:
df_new = df.loc[df.step >= 1].reset_index(drop=True)
df_new.to_csv("data/sd_data/metadata_min_step=1.csv", index=False)
df_new.shape[0]

323608

# Testing similar filtering

In [18]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# embeddings numpy array
embeddings = np.array([
    [1,1,1],
    [1,1,1],
    [0,1,1],
    [1,1,1],
    [0,0,1],
])

# compute cosine similarity matrix
cos_sim_matrix = cosine_similarity(embeddings, embeddings)

# set diagonal values to False
np.fill_diagonal(cos_sim_matrix, False)

# set threshold for cosine similarity score
threshold = 0.9

# create boolean mask based on threshold
sim_mask = cos_sim_matrix >= threshold
print(np.round(cos_sim_matrix, 2))

# apply boolean masks to get indices of too similar embeddings
similar_indices = np.nonzero(sim_mask)
ignore_indices = np.unique(np.nonzero(sim_mask)[1])

print(similar_indices)


[[0.   1.   0.82 1.   0.58]
 [1.   0.   0.82 1.   0.58]
 [0.82 0.82 0.   0.82 0.71]
 [1.   1.   0.82 0.   0.58]
 [0.58 0.58 0.71 0.58 0.  ]]
(array([0, 0, 1, 1, 3, 3]), array([1, 3, 0, 3, 0, 1]))


In [None]:
thresh = 0.95
bs = 1024
n_embeddings = len(embeddings)
idxs_to_ignore = set()
pbar = tqdm()
i = 0
while i < n_embeddings:
    idxs = []
    while len(idxs) < bs and i < n_embeddings:
        if i not in idxs_to_ignore:
            idxs.append(i)
        i += 1
    cos_sim = cosine_similarity(embeddings[idxs], embeddings)
    np.fill_diagonal(cos_sim, 0)
    mask = cos_sim >= thresh
    indices_to_drop = []
    new_to_ignore = set(np.nonzero(sim_mask)[1])
    idxs_to_ignore = idxs_to_ignore.union(new_to_ignore)
    
    pbar.total = n_embeddings - len(dropped)
    pbar.update(len(idxs))
