In [2]:
""" 
Identify placeholder images:
- Compute pairwise cosine similarity between image embeddings inside a sliding window: each vector is compared only with the ~10k vectors that follow it.
- Images whose embedding has a cosine similarity > 0.99 with another embedding in the window are deemed to be placeholder images.
- This threshold was chosen through experimentation and visual inspection of sample images.
"""
%load_ext autoreload
%autoreload 2
import numpy as np
from tqdm import tqdm

In [3]:
# Dataset selection: every path below is derived from the dataset size.
DATASET_ENTITY_COUNT = 1_900_000
DATASET_BASE_PATH = f"/Users/yavuz/data/LAION-{DATASET_ENTITY_COUNT}/"

# Locations of the individual artefacts inside the dataset directory.
vector_path = f"{DATASET_BASE_PATH}vectors/"
IMAGES_PATH = f"{DATASET_BASE_PATH}images/"
METADATA_PATH = f"{DATASET_BASE_PATH}metadata.parquet"

In [4]:
# Load previously stored placeholder ids from disk. Note: `placeholder_images`
# is reassigned from a different file further down in this notebook.
placeholder_ids_file = vector_path + "placeholder_images.npy"
placeholder_images = np.load(placeholder_ids_file)

In [5]:
# Read the image embedding array from disk and report its dimensions.
embeddings_file = vector_path + "image_vectors.npy"
image_embeddings = np.load(embeddings_file)
image_embeddings.shape

(1187524, 768)

In [6]:
# L2-normalise every embedding (row) so that a plain dot product between two
# rows equals their cosine similarity.
row_norms = np.linalg.norm(image_embeddings, axis=1, keepdims=True)
normalised_image_embeddings = image_embeddings / row_norms

In [7]:
# Sanity check: the row-wise normalisation must leave the array dimensions unchanged.
normalised_image_embeddings.shape

(1187524, 768)

In [8]:
# Flag near-duplicate embeddings: whenever two vectors inside the comparison
# window have a cosine similarity above the threshold, both ids are recorded.
SIMILARITY_THRESHOLD = 0.99  # chosen after some experimentation, by viewing sample images
WINDOW_SIZE = 10000          # vector i is compared with vectors i+1 .. i+WINDOW_SIZE-1
CHECKPOINT_EVERY = 50000     # persist intermediate results every this many rows

near_duplicates = set()
for i in tqdm(range(len(normalised_image_embeddings))):
    # Slicing clamps at the end of the array, so the window simply shrinks
    # (down to empty) for the last rows.
    window = normalised_image_embeddings[i + 1 : i + WINDOW_SIZE]

    # The rows are unit-length, so a single matrix-vector product yields the
    # cosine similarity of vector i against every vector in the window at once
    # (instead of one Python-level np.dot call per pair).
    scores = window @ normalised_image_embeddings[i]
    matches = np.flatnonzero(scores > SIMILARITY_THRESHOLD)
    if matches.size:
        near_duplicates.add(i)
        # `matches` holds offsets relative to the window start (i + 1);
        # convert to absolute ids and store them as plain Python ints.
        near_duplicates.update((matches + i + 1).tolist())

    # Checkpoint the ids found so far.
    if i % CHECKPOINT_EVERY == 0:
        np.save(vector_path + "placeholder_images10kWindow", np.array(list(near_duplicates)))

len(near_duplicates)

100%|██████████| 10000/10000 [05:12<00:00, 31.98it/s]


127

In [9]:
# Persist the final set of flagged vector ids.
flagged_ids = np.array(list(near_duplicates))
np.save(vector_path + "placeholder_images10kWindowFor1M", flagged_ids)

In [22]:
# Reload the flagged vector ids from placeholder_images10kWindowFor1M.npy.
saved_ids_file = vector_path + "placeholder_images10kWindowFor1M.npy"
placeholder_images = np.load(saved_ids_file)
# Author's notes:
# - 164 images were identified in the 20k dataset.
# - 5-10 of the identified images are not placeholders; they are picked up
#   because exact duplicates of them exist in the dataset.
placeholder_images.shape

(127,)

In [23]:
# Show the vector ids that were flagged as placeholder images.
placeholder_images

array([1178627, 1181702, 1183247, 1179669, 1184790, 1182745, 1183261,
       1183262, 1184287, 1178146, 1182755, 1182252, 1181752, 1180729,
       1180731, 1186363, 1183810, 1184329, 1181777, 1179732, 1182292,
       1186901, 1180759, 1180763, 1183835, 1177695, 1184363, 1183851,
       1186923, 1182836, 1179255, 1177723, 1179264, 1180289, 1183362,
       1178755, 1183872, 1179782, 1187468, 1186959, 1181840, 1183897,
       1186476, 1185453, 1177773, 1183922, 1179835, 1182397, 1187521,
       1177795, 1180357, 1184965, 1180870, 1180361, 1179339, 1183446,
       1181919, 1184479, 1184492, 1178349, 1180915, 1182452, 1186049,
       1187074, 1185039, 1184017, 1184030, 1180960, 1186085, 1184550,
       1187111, 1183531, 1183023, 1182515, 1180980, 1182518, 1178423,
       1186103, 1182030, 1187160, 1186152, 1180530, 1178483, 1185657,
       1185147, 1184125, 1178496, 1181576, 1186703, 1185681, 1184145,
       1183125, 1180567, 1180570, 1179035, 1186205, 1178030, 1184687,
       1179055, 1184

In [24]:
from IPython.display import display, Image
def get_image(vector_id: int, images_path: str):
    """
    Load the image that belongs to an embedding vector.

    The file is looked up under `images_path` in a shard directory: the shard
    is `vector_id // 10000` zero-padded to 5 digits, and the file name is the
    shard followed by `vector_id % 10000` zero-padded to 4 digits, plus `.jpg`.

    Args:
        vector_id: id of the vector whose image is wanted.
        images_path: base directory of the images (e.g. IMAGES_PATH).

    Returns:
        An `Image` constructed from the resolved file path.
    """
    shard_number, offset = divmod(vector_id, 10000)
    shard, index = str(shard_number).zfill(5), str(offset).zfill(4)
    return Image(filename=f"{images_path}/{shard}/{shard}{index}.jpg")

In [29]:
# Render the last LIMIT flagged images, printing each vector id above its image.
LIMIT = 10
tail_start = len(placeholder_images) - LIMIT
for vector_id in placeholder_images[tail_start:]:
    print(vector_id)
    display(get_image(vector_id, IMAGES_PATH))