In [None]:
"""
Identify placeholder images by:
- Computing cosine similarity over a sliding window of image embeddings: each vector is compared against the 10k vectors that follow it.
- Treating images with >0.99 cosine similarity to another image as placeholder images.
- The 0.99 boundary was chosen through experimentation and visual inspection of sample images.
"""
%load_ext autoreload
%autoreload 2
import numpy as np
from tqdm import tqdm
import pandas as pd
import os
import random
from src.dataset_processing.find_duplicates import batch_compute_all_duplicates, batch_compute_all_duplicate_pairs, compute_set_from_duplicate_pairs, compute_all_duplicates_from_placeholder_candidiates, compute_duplicate_set_from_window

In [None]:
# Dataset selection: the entity count picks which LAION subset directory is used.
# NOTE(review): the base directory is a hard-coded absolute path on one machine - adjust before running elsewhere.
DATASET_ENTITY_COUNT = 1900000 #150 #20000 #1900000
DATASET_BASE_PATH = f"/Users/yavuz/data/LAION-{DATASET_ENTITY_COUNT}/"

# All derived locations live directly under DATASET_BASE_PATH (which already ends with a slash).
vector_path = f"{DATASET_BASE_PATH}vectors/"
IMAGES_PATH = f"{DATASET_BASE_PATH}images/"
METADATA_PATH = f"{DATASET_BASE_PATH}metadata.parquet"

# Metadata table for the dataset; later cells read its "TEXT" and "URL" columns by row position.
metadata = pd.read_parquet(METADATA_PATH)

In [None]:
from IPython.display import display, Image
def get_image(vector_id: int, images_path: str):
    """
    Build the on-disk location of the image for `vector_id` and wrap it in an IPython `Image`.

    Ids are grouped in shards of 10,000: the shard number (zero-padded to 5 digits) names the
    directory, and the file name is that shard string followed by the position inside the shard
    (zero-padded to 4 digits) plus ".jpg".

    vector_id: id of the vector / image.
    images_path: base images directory (IMAGES_PATH).
    Returns the `Image` object created from the resulting file path.
    """
    shard_number, position_in_shard = divmod(vector_id, 10000)
    shard = str(shard_number).zfill(5)
    index = str(position_in_shard).zfill(4)
    return Image(filename=f"{images_path}/{shard}/{shard}{index}.jpg")

In [None]:
# Optional cap on the number of embeddings to work with; None means "use every row in the file".
NUM_IMAGE_EMBEDDINGS = None # None 100_000

image_embeddings = np.load(vector_path + "image_vectors.npy")
# Resolve the None default to the real row count so the slice below keeps everything.
NUM_IMAGE_EMBEDDINGS = len(image_embeddings) if NUM_IMAGE_EMBEDDINGS is None else NUM_IMAGE_EMBEDDINGS
image_embeddings = image_embeddings[:NUM_IMAGE_EMBEDDINGS]
image_embeddings.shape

In [None]:
# Scale every embedding row to unit L2 norm, so the dot product of two rows is their cosine similarity.
# keepdims=True keeps the norms as a column so the division broadcasts row by row.
embedding_norms = np.linalg.norm(image_embeddings, axis=1, keepdims=True)
normalised_image_embeddings = image_embeddings / embedding_norms

In [None]:
# Sanity check: normalisation must leave the array dimensions unchanged.
np.shape(normalised_image_embeddings)

In [None]:
# Placeholder candidates via the sliding-window approach.
# The window size is embedded in the output name so runs with different windows do not collide.
window_size = 10_000
placeholder_images_path = os.path.join(vector_path, f"placeholder_images{window_size}_window")
duplicates_for_window = compute_duplicate_set_from_window(
    normalised_image_embeddings,
    placeholder_images_path,
    window_size=window_size,
)

In [None]:
# How many ids the window pass flagged.
num_window_candidates = len(duplicates_for_window)
print(num_window_candidates)

In [None]:
# Update the window candidates by computing their duplicates against the embeddings at the 0.99 threshold.
# (Exact semantics live in src.dataset_processing.find_duplicates - presumably this also catches
# matches that fell outside the window; confirm against the helper.)
updated_placeholder_ids_for_window = compute_all_duplicates_from_placeholder_candidiates(
    duplicates_for_window,
    normalised_image_embeddings,
    threshold=0.99,
)
print(len(updated_placeholder_ids_for_window))

In [None]:
# Now look at the saved placeholder ids from a 50k window size.
# NOTE(review): this file is not written by the 10k-window cell above; it must already exist on disk
# (presumably from an earlier run with a 50k window - confirm).
saved_placeholder_ids_file = vector_path+"placeholder_images50000_window.npy"
saved_placeholder_ids = np.load(saved_placeholder_ids_file)
print(len(saved_placeholder_ids))

In [None]:
# Same update step as for the 10k window, this time seeded with the saved 50k-window ids.
updated_placeholder_ids = compute_all_duplicates_from_placeholder_candidiates(
    saved_placeholder_ids,
    normalised_image_embeddings,
    threshold=0.99,
)
print(len(updated_placeholder_ids))
# previous: 21601 in 28 min

In [None]:
# Ids found starting from the 50k-window candidates but not from the 10k-window candidates.
new_placeholder_ids = set(updated_placeholder_ids) - set(updated_placeholder_ids_for_window)
print(len(new_placeholder_ids))

# Display up to SAMPLE_SIZE random images from this difference.
SAMPLE_SIZE = 10
new_placeholder_id_pool = list(new_placeholder_ids)
# random.sample raises ValueError when asked for more items than the population holds, and the
# difference of the two sets can easily be smaller than SAMPLE_SIZE (or empty), so cap the request.
for i in random.sample(new_placeholder_id_pool, min(SAMPLE_SIZE, len(new_placeholder_id_pool))):
    print(i)
    display(get_image(i, IMAGES_PATH))

In [None]:
# Identify duplicate pairs in the dataset, processed in batches of 1000.
duplicate_pairs = batch_compute_all_duplicate_pairs(
    normalised_image_embeddings,
    batch_size=1000,
)
# Derive a set from the pairs (presumably the ids that occur in at least one pair - confirm in the helper).
set_from_duplicate_pairs = compute_set_from_duplicate_pairs(duplicate_pairs)
print(len(set_from_duplicate_pairs))

In [None]:
# Size of the pair collection first, then size of the set derived from it.
for duplicate_collection in (duplicate_pairs, set_from_duplicate_pairs):
    print(len(duplicate_collection))

In [None]:
# Print sample flagged images - to eyeball how many are placeholders and which are genuine duplicates.
SAMPLE_SIZE = 10
# Select up to SAMPLE_SIZE random ids from set_from_duplicate_pairs. The request is capped at the
# population size because random.sample raises ValueError when the population is smaller than the
# sample (e.g. on the small dataset variants).
duplicate_id_pool = list(set_from_duplicate_pairs)
for i in random.sample(duplicate_id_pool, min(SAMPLE_SIZE, len(duplicate_id_pool))):
    print(i)
    display(get_image(i, IMAGES_PATH))

In [None]:
# Id to inspect in the cells below: it is used as a positional row index into `metadata`
# and as the vector id for the image lookup.
# NOTE(review): 20041 is out of range for the smaller dataset variants (150 / 20000 entries) - confirm
# when changing DATASET_ENTITY_COUNT.
investigation_id = 20041 # id of the flagged non-placeholder image we want to investigate

In [None]:
# find duplicate ids for the investigation id
def find_duplicate_ids(id, duplicate_pairs):
    """
    Collect the partner of `id` from every pair in which `id` appears.

    id: the id to look up.
    duplicate_pairs: iterable of indexable pairs; only positions 0 and 1 are read.
    Returns a list, in pair order, holding the other member of each matching pair.
    A pair whose first element equals `id` always contributes its second element
    (so a self-pair (id, id) contributes `id` once).
    """
    return [
        pair[1] if id == pair[0] else pair[0]
        for pair in duplicate_pairs
        if id == pair[0] or id == pair[1]
    ]

In [None]:
# Gather the duplicates of the investigated id and cap how many will be shown,
# so the display cell that follows stays readable.
LIMIT = 20
duplicate_ids = find_duplicate_ids(investigation_id, duplicate_pairs)

overflow = len(duplicate_ids) - LIMIT
if overflow > 0:
    print(f"Found {len(duplicate_ids)} duplicates, truncating to {LIMIT}")
    duplicate_ids = duplicate_ids[:LIMIT]

In [None]:
# Show id, TEXT, URL and image for the investigated id first, then for each of its
# (possibly truncated) duplicates, in order.
for shown_id in [investigation_id, *duplicate_ids]:
    print(shown_id, metadata["TEXT"].iloc[shown_id], metadata["URL"].iloc[shown_id])
    display(get_image(shown_id, IMAGES_PATH))

In [None]:
# MANUAL COMPUTATION WITH CHECKPOINTING
# Sliding-window scan: row i is compared with rows i+1 .. i+window_size-1 (clipped to the end of
# the array). Whenever at least one score exceeds `threshold`, row i and all matching rows are
# recorded in `near_duplicates`.
near_duplicates = set()
threshold = 0.99
window_size = 100000
num_embeddings = len(normalised_image_embeddings)
checkpoint_file = vector_path+"placeholder_images"+str(window_size)+"Window_checkpoint"
for i in tqdm(range(num_embeddings)):
    window_start = i + 1
    window_stop = min(i + window_size, num_embeddings)
    # The window is empty for the last row (and for every row if window_size were 1).
    if window_start < window_stop:
        scores = np.dot(normalised_image_embeddings[i], normalised_image_embeddings[window_start:window_stop].T)

        # np.where returns offsets inside the window; shift them back to absolute row ids.
        matching_ids = np.where(scores > threshold)[0] + window_start
        if len(matching_ids) > 0:
            near_duplicates.add(i)
            near_duplicates.update(matching_ids)

    # Save checkpoint every 50k iterations (this also fires at i == 0).
    # np.save appends ".npy" to the file name.
    if i % 50000 == 0:
        np.save(checkpoint_file, np.array(list(near_duplicates)))
len(near_duplicates)

In [None]:
# Persist the final id set from the manual scan; the window size is part of the file name
# and np.save appends ".npy".
final_ids_file = vector_path+"placeholder_images"+str(window_size)+"Window"
np.save(final_ids_file, np.array(list(near_duplicates)))

In [None]:
# there are 164 images identified in 20k dataset
# there are 5-10 images which are not placeholder, but are identified because exact duplicates exist in the dataset
# NOTE(review): no cell in this notebook writes "placeholder_images.npy"; it must already exist on disk.
placeholder_images_file = vector_path+"placeholder_images.npy"
placeholder_images = np.load(placeholder_images_file)
np.shape(placeholder_images)

In [None]:
# Show the loaded placeholder ids as the cell output.
placeholder_images

In [None]:
# Display the last LIMIT placeholder images (the tail of the array, not a random sample).
LIMIT = 10
for i in placeholder_images[-LIMIT:]:
    print(i)
    display(get_image(i, IMAGES_PATH))