In [None]:
"""
This notebook generates and saves vector embeddings for the text and image data prepared in dataset_preparation.ipynb. Update the parameters in the cell below. Additionally, the number of threads used to generate text/image embeddings can be amended in their corresponding function calls below. The last part of this notebook identifies placeholder images by computing pairwise similarities across every generated image embedding.
"""
%load_ext autoreload
%autoreload 2
import datetime
import glob
import os

import numpy as np
import pandas as pd

from src.embedding_generation.text_embeddings import *
from src.common.logger import *

In [None]:
# Dataset selection and derived paths. Everything a reader may need to tune lives in this cell.
DATASET_ENTITY_COUNT = 222  # only used to build the dataset directory name below
# NOTE(review): machine-specific absolute path; edit before running on another machine.
DATASET_BASE_PATH = f"/Users/yavuz/data/LAION-{DATASET_ENTITY_COUNT}/"

NUM_VECTORS_TO_GENERATE = None # set to None to generate vectors for all available entities

METADATA_PATH = DATASET_BASE_PATH + "metadata.parquet"
IMAGES_PATH = DATASET_BASE_PATH + "images/"

# Every .npy file produced by this notebook is written below this directory.
vector_path = DATASET_BASE_PATH + "vectors/"
if not os.path.exists(vector_path):
    print("Creating path", vector_path)
    # exist_ok=True keeps the cell from failing if the directory appears between
    # the check above and this call (e.g. a second kernel running the same notebook).
    os.makedirs(vector_path, exist_ok=True)

In [None]:
# Text embedding generation: embed the "TEXT" column of the metadata table.
model = "BAAI/bge-small-en-v1.5"
df = pd.read_parquet(METADATA_PATH)

# Optionally cap how many texts are embedded (None keeps all of them).
texts = list(df["TEXT"])
texts = texts if NUM_VECTORS_TO_GENERATE is None else texts[:NUM_VECTORS_TO_GENERATE]

embedding_generator: TextEmbeddingGenerator = SentenceTransformerEmbeddingGenerator(model)
embeddings = embedding_generator.generate_text_embeddings(
    texts,
    normalize_embeddings=False,
    batch_size=128,
)
embeddings.shape

In [None]:
# Save text embeddings without overwriting the result of an earlier run.
# np.save appends ".npy" to the name it is given, so the existence check and the
# message below both refer to the file name including that suffix.
text_vector_path = vector_path + "text_vectors"
if os.path.exists(text_vector_path + ".npy"):
    # Fall back to a timestamped name; the underscore keeps the base name readable.
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    new_path = f"{text_vector_path}_{timestamp}"
    print(f"Path {text_vector_path}.npy already exists. Instead saving to {new_path}.npy")
    text_vector_path = new_path
np.save(text_vector_path, embeddings)

In [None]:
# Round-trip check: reload the file that was just written and show its shape and dtype.
read_embeddings = np.load(f"{text_vector_path}.npy")
read_embeddings.shape, read_embeddings.dtype

In [None]:
# Image Embedding Generation
from src.embedding_generation.image_embeddings import *

In [None]:
# Collect every .jpg exactly one directory level below IMAGES_PATH, in sorted path order.
image_paths = sorted(glob.glob(IMAGES_PATH+"/*/*.jpg"))

# Slicing with a None limit keeps the full list, so no explicit None check is needed.
image_paths = image_paths[:NUM_VECTORS_TO_GENERATE]

len(image_paths)

In [None]:
# Embed every collected image path; embeddings are requested un-normalised.
image_embedding_generator: ImageEmbeddingGenerator = HFImageEmbeddingGenerator(
    "google/vit-base-patch16-224-in21k"
)
image_embeddings = image_embedding_generator.batch_generate_image_embeddings(
    image_paths,
    normalize_embeddings=False,
    batch_size=128,
)
image_embeddings.shape, image_embeddings.dtype

In [None]:
# Save image embeddings without overwriting the result of an earlier run.
# np.save appends ".npy" to the name it is given, so the existence check and the
# message below both refer to the file name including that suffix.
image_vector_path = vector_path + "image_vectors"
if os.path.exists(image_vector_path + ".npy"):
    # Fall back to a timestamped name; the underscore keeps the base name readable.
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    new_path = f"{image_vector_path}_{timestamp}"
    print(f"Path {image_vector_path}.npy already exists. Instead saving to {new_path}.npy")
    image_vector_path = new_path

np.save(image_vector_path, image_embeddings)

In [None]:
# Round-trip check: reload the file that was just written and show its shape and dtype.
read_embeddings = np.load(f"{image_vector_path}.npy")
read_embeddings.shape, read_embeddings.dtype

In [None]:
# Spot-check: display the first row of the most recently reloaded embeddings array.
read_embeddings[0]

In [None]:
# Optional: identify placeholder images. Note: Performance will be slow for large datasets ( O(n^2) ).

# Identify placeholder images by computing similarity across pairs of image embeddings
# Those with >0.99 are deemed to be placeholder images. 
# This boundary was chosen using some experimentation and visual inspection of sample images.
from IPython.display import display, Image

In [None]:
# Reload the image embeddings from disk so this optional section can run on its own.
# NOTE(review): this always reads the default "image_vectors.npy"; if the save cell
# fell back to a timestamped file name, the vectors loaded here come from an older run.
image_embeddings = np.load(f"{vector_path}image_vectors.npy")
image_embeddings.shape

In [None]:
# Display the loaded image embeddings array for a quick visual sanity check.
image_embeddings

In [None]:
# Normalize embeddings and compute similarity matrix
normalised_image_embeddings = image_embeddings / np.linalg.norm(image_embeddings, axis=1, keepdims=True)
similarity_matrix = np.dot(normalised_image_embeddings, normalised_image_embeddings.T)

In [None]:
# Display the pairwise similarity matrix computed above.
similarity_matrix

In [None]:
# Collect every pair (i, j) with i < j whose similarity exceeds the threshold.
# 0.99 was chosen as the boundary after some experimentation, by viewing sample images.
SIMILARITY_THRESHOLD = 0.99

near_duplicates = []
for i, row in enumerate(similarity_matrix):
    # Only look right of the diagonal so each unordered pair is visited once and
    # self-similarity is skipped. The comparison is vectorised per row instead of
    # indexing the matrix element by element from Python.
    tail = row[i + 1:]
    for offset in np.flatnonzero(tail > SIMILARITY_THRESHOLD):
        j = i + 1 + int(offset)
        near_duplicates.append((i, j, tail[offset]))

near_duplicates.sort(key=lambda x: x[2]) # sort to view sample least similar items
len(near_duplicates)

In [None]:
# extract ids that are near-duplicates, ordered by similarity
near_duplicate_ids = set()
for i, j, _ in near_duplicates:
    near_duplicate_ids.add(i)
    near_duplicate_ids.add(j)
near_duplicate_ids

In [None]:
# Persist the flagged ids in ascending order with an explicit integer dtype, so the
# file is reproducible and an empty result is still an integer array rather than
# the float64 array np.array([]) would produce.
np.save(vector_path+"placeholder_images", np.array(sorted(near_duplicate_ids), dtype=np.int64))

In [None]:
# Reload the saved placeholder ids and show how many were flagged.
# Author's observations from an earlier run:
#   - there are 164 images identified in 20k dataset
#   - there are 5-10 images which are not placeholder, but are identified because
#     exact duplicates exist in the dataset
placeholder_images = np.load(f"{vector_path}placeholder_images.npy")
placeholder_images.shape

In [None]:
def get_image(vector_id: int, images_path: str):
    """
    Build the file path for a vector id and return it wrapped in an IPython Image.

    Files are addressed as ``<images_path>/<shard>/<shard><index>.jpg``, where
    ``shard`` is ``vector_id // 10000`` zero-padded to 5 digits and ``index`` is
    ``vector_id % 10000`` zero-padded to 4 digits.

    NOTE(review): this presumes the position of a vector in the embeddings array
    equals the numeric id in the file name - confirm for datasets with missing images.

    :param vector_id: row index of the embedding
    :param images_path: base images directory (IMAGES_PATH)
    :return: ``Image`` object pointing at the resolved .jpg file
    """
    shard_number, offset_in_shard = divmod(vector_id, 10000)
    shard = str(shard_number).zfill(5)
    file_stem = shard + str(offset_in_shard).zfill(4)
    return Image(filename=f"{images_path}/{shard}/{file_stem}.jpg")

In [None]:
# Show the id and the rendered image for the first LIMIT flagged entries.
LIMIT = 10
for placeholder_id in placeholder_images[:LIMIT]:
    print(placeholder_id)
    display(get_image(placeholder_id, IMAGES_PATH))