In [None]:
""" 
identify placeholder images by: 
- Compute pairwise similarity between a subset of image embeddings: each vector is compared only with the vectors in a fixed-size window after it (`window_size` in the loop below; the code currently uses 100000 rather than 10k)
- Those with >0.99 cosine similarity are deemed to be placeholder images. 
- This boundary was chosen using some experimentation and visual inspection of sample images.
"""
%load_ext autoreload
%autoreload 2
import numpy as np
from tqdm import tqdm

In [None]:
# Dataset selection and the file-system locations derived from it.
# NOTE(review): the base directory is an absolute, machine-specific path.
DATASET_ENTITY_COUNT = 150  # values left in place by the author: 150, 1900000
DATASET_BASE_PATH = f"/Users/yavuz/data/LAION-{DATASET_ENTITY_COUNT}/"

# Every derived location lives directly under the dataset base directory.
METADATA_PATH = f"{DATASET_BASE_PATH}metadata.parquet"
IMAGES_PATH = f"{DATASET_BASE_PATH}images/"

vector_path = f"{DATASET_BASE_PATH}vectors/"

In [None]:
# Load a previously saved array from the vectors directory; presumably the vector ids
# flagged as placeholder images (later cells pass its elements to get_image) — TODO confirm.
# NOTE(review): the file name read here ("placeholder_images.npy") differs from the
# window-size-suffixed names written by the np.save calls in this notebook — confirm which run produced it.
placeholder_images = np.load(vector_path+"placeholder_images.npy")

In [None]:
# Load the image embedding array from the vectors directory and show its shape as the cell output.
# assumes a 2-D array with one embedding per row (normalisation later runs along axis=1) — TODO confirm
image_embeddings = np.load(vector_path + "image_vectors.npy")
image_embeddings.shape

In [None]:
# Scale every embedding row to unit L2 length so that a dot product between two
# rows equals their cosine similarity (the similarities themselves are computed
# later, window by window).
row_norms = np.linalg.norm(image_embeddings, axis=1, keepdims=True)
# Guard against all-zero rows: dividing by a zero norm would turn the row into NaN
# and emit runtime warnings. Dividing by 1 instead leaves such a row at zero, so its
# dot product with any other row is 0 and it can never exceed the similarity threshold.
row_norms[row_norms == 0] = 1
normalised_image_embeddings = image_embeddings / row_norms

In [None]:
# Sanity check: the normalised array should have the same shape as the raw embeddings.
normalised_image_embeddings.shape

In [None]:
# Sliding-window near-duplicate search.
#
# Row i is compared with rows i+1 .. i+window_size-1 (clipped to the end of the
# array). Because the rows are unit-normalised, each dot product is a cosine
# similarity. Whenever a score exceeds `threshold`, both row i and the matching
# rows are recorded in `near_duplicates`.
near_duplicates = set()
threshold = 0.99
window_size = 100000
checkpoint_interval = 50000  # number of rows between checkpoint saves
num_vectors = len(normalised_image_embeddings)
# np.save appends ".npy" to a file name that does not already end with it.
checkpoint_path = vector_path+"placeholder_images"+str(window_size)+"Window_checkpoint"

for i in tqdm(range(num_vectors)):
    start = i + 1
    end = min(i + window_size, num_vectors)
    if start < end:
        scores = np.dot(normalised_image_embeddings[i], normalised_image_embeddings[start:end].T)

        # np.where indexes into the window slice; shift back to global row ids.
        near_duplicate_indices = np.where(scores > threshold)[0] + start
        if len(near_duplicate_indices) > 0:
            near_duplicates.add(i)
            # tolist() stores plain Python ints, keeping the set homogeneous.
            near_duplicates.update(near_duplicate_indices.tolist())

    # Save a checkpoint every `checkpoint_interval` rows. Row 0 is skipped: nothing
    # useful has been computed yet. The ids are sorted and stored with an explicit
    # integer dtype so the file is deterministic and an empty result is not
    # written as a float64 array.
    if i > 0 and i % checkpoint_interval == 0:
        np.save(checkpoint_path, np.array(sorted(near_duplicates), dtype=np.int64))
len(near_duplicates)

In [None]:
# Persist the final set of flagged row ids (np.save appends ".npy" to the name).
# Sorting makes the file independent of set iteration order, and the explicit
# integer dtype keeps an empty result from being stored as a float64 array.
np.save(
    vector_path+"placeholder_images"+str(window_size)+"Window",
    np.array(sorted(near_duplicates), dtype=np.int64),
)

In [None]:
# 164 images were identified in the 20k dataset.
# 5-10 of them are not placeholders; they were flagged because exact duplicates of them exist in the dataset.
placeholder_images.shape

In [None]:
# Show the loaded array itself as the cell output.
placeholder_images

In [None]:
from IPython.display import display, Image
def get_image(vector_id: int, images_path: str):
    """
    Given a vector id and base images path (IMAGES_PATH), returns the image.

    The file is looked up at ``<images_path>/<shard>/<shard><index>.jpg`` where
    ``shard`` is ``vector_id // 10000`` zero-padded to 5 digits and ``index`` is
    ``vector_id % 10000`` zero-padded to 4 digits.

    Args:
        vector_id: Id of the vector whose image is wanted. NumPy scalars (e.g.
            elements of a loaded ``.npy`` array) are accepted and converted
            with ``int()``.
        images_path: Base images directory, with or without a trailing "/".

    Returns:
        An ``Image`` object created with ``filename`` set to the resolved path.
    """
    # Convert to a plain int so the formatting below does not depend on the
    # dtype of the array the id came from (a float id would format as "0.0").
    shard_number, index_in_shard = divmod(int(vector_id), 10000)
    shard = str(shard_number).zfill(5)
    index = str(index_in_shard).zfill(4)
    # Strip trailing separators so a base path ending in "/" does not yield "//".
    base_path = str(images_path).rstrip("/")
    image_path = f"{base_path}/{shard}/{shard}{index}.jpg"
    return Image(filename=image_path)

In [None]:
# Print the ids and display the images of up to LIMIT entries taken from the end
# of placeholder_images.
LIMIT = 10
# Clamp the start index at 0. A negative start would be read as "count from the
# end", so an array with fewer than LIMIT entries could show fewer images than
# it contains (9 entries with LIMIT = 10 would show only the last one).
first_shown = max(len(placeholder_images) - LIMIT, 0)
for i in placeholder_images[first_shown:]:
    print(i)
    display(get_image(i, IMAGES_PATH))