In [None]:
%load_ext autoreload
%autoreload 2

import glob
import os

import numpy as np
import pandas as pd

from src.embedding_generation.text_embeddings import *
from src.common.logger import *

In [None]:
# Specify dataset and paths to work on
DATASET_ENTITY_COUNT = 100
# NOTE(review): absolute, user-specific location - adjust when running on another machine.
DATASET_BASE_PATH = f"/Users/yavuz/data/LAION-{DATASET_ENTITY_COUNT}/"

METADATA_PATH = DATASET_BASE_PATH + "metadata.parquet"
IMAGES_PATH = DATASET_BASE_PATH + "images/"

# Directory that receives the generated .npy vector files.
vector_path = DATASET_BASE_PATH + "vectors/"
if not os.path.isdir(vector_path):
    print("Creating path", vector_path)
# exist_ok=True keeps the cell safe to re-run and removes the check-then-create race;
# a non-directory at this location now fails loudly here instead of later in np.save.
os.makedirs(vector_path, exist_ok=True)

In [None]:
# Text Embedding Generation
# Read the TEXT column of the metadata file and embed every entry.
df = pd.read_parquet(METADATA_PATH)
texts = df["TEXT"].tolist()

model = "BAAI/bge-small-en-v1.5"
embedding_generator: TextEmbeddingGenerator = SentenceTransformerEmbeddingGenerator(model)
embeddings = embedding_generator.generate_text_embeddings(
    texts,
    True,  # NOTE(review): meaning of this flag is defined by the generator class - confirm
)
embeddings

In [None]:
# Persist the text embeddings; np.save appends the ".npy" extension to the file name.
np.save(file=vector_path+"text_vectors", arr=embeddings)

In [None]:
# Test text embedding retrieval and confirm shape.
# Only the shape is displayed: it is enough to verify the round trip and avoids
# dumping the whole array into the notebook output.
read_embeddings = np.load(vector_path+"text_vectors.npy")
read_embeddings.shape

In [None]:
#Image Embedding Generation
from src.embedding_generation.image_embeddings import *

In [None]:
# Collect every .jpg one directory level below IMAGES_PATH, in sorted order so
# that the position of a path in the list is stable between runs.
image_paths = sorted(glob.glob(IMAGES_PATH+"/*/*.jpg"))
len(image_paths)

In [None]:
# Embed every collected image with the google/vit-base-patch16-224-in21k model.
image_model = "google/vit-base-patch16-224-in21k"
image_embedding_generator: ImageEmbeddingGenerator = HFImageEmbeddingGenerator(image_model)
image_embeddings = image_embedding_generator.batch_generate_image_embeddings(
    image_paths,
    True,  # NOTE(review): meaning of this flag is defined by the generator class - confirm
)
image_embeddings.shape

In [None]:
# Persist the image embeddings; np.save appends the ".npy" extension to the file name.
np.save(file=vector_path+"image_vectors", arr=image_embeddings)

In [None]:
# Identify placeholder images by computing similarity across pairs of image embeddings.
# Pairs with a similarity > 0.99 are deemed to be placeholder images.
# This boundary was chosen using some experimentation and visual inspection of sample images.
from IPython.display import display, Image

# Read the image vectors back from the file written by the save step.
image_embeddings = np.load(file=vector_path + "image_vectors.npy")
image_embeddings.shape

In [None]:
def get_image(vector_id: int, images_path: str):
    """
    Build the displayable image that belongs to a vector id.

    The id is split into a shard number (``vector_id // 10000``, zero-padded to
    5 digits) and a position inside that shard (``vector_id % 10000``,
    zero-padded to 4 digits). The file is looked up at
    ``{images_path}/{shard}/{shard}{position}.jpg``.

    NOTE(review): this assumes vector ids line up with the numbering of the
    image files on disk - confirm against how the vectors were generated.

    Args:
        vector_id: Id of the vector whose image is wanted.
        images_path: Base images directory (e.g. IMAGES_PATH).

    Returns:
        An ``Image`` created from the resolved file path.
    """
    shard_number, position_in_shard = divmod(vector_id, 10000)
    shard_dir = str(shard_number).zfill(5)
    file_suffix = str(position_in_shard).zfill(4)
    return Image(filename=f"{images_path}/{shard_dir}/{shard_dir}{file_suffix}.jpg")

In [None]:
# Normalize embeddings and compute the pairwise cosine-similarity matrix.
embedding_norms = np.linalg.norm(image_embeddings, axis=1, keepdims=True)
# Rows with zero norm would divide by zero; using 1 keeps them all-zero, so their
# similarity to everything is 0 instead of NaN.
embedding_norms[embedding_norms == 0] = 1.0
# The normalized vectors are stored under their own name: the division returns a
# new array, and image_embeddings keeps the raw vectors so this cell can be re-run.
normalized_image_embeddings = image_embeddings / embedding_norms
# Dot products of unit-length rows are cosine similarities, which is what the
# 0.99 placeholder threshold is compared against.
similarity_matrix = np.dot(normalized_image_embeddings, normalized_image_embeddings.T)

In [None]:
# Pairs whose similarity exceeds this value are treated as near-duplicates.
# 0.99 has been chosen as the boundary after some experimentation, by viewing sample images.
PLACEHOLDER_SIMILARITY_THRESHOLD = 0.99

# Only the strict upper triangle (j > i) is inspected, so every unordered pair is
# reported once and the diagonal (self-similarity) is ignored. The work is done in
# NumPy instead of a Python-level double loop over the whole matrix.
above_threshold = np.triu(similarity_matrix > PLACEHOLDER_SIMILARITY_THRESHOLD, k=1)
first_ids, second_ids = np.nonzero(above_threshold)  # row-major order: by i, then j

# Each entry is (i, j, similarity) with plain Python ints / floats.
near_duplicates = list(
    zip(
        first_ids.tolist(),
        second_ids.tolist(),
        similarity_matrix[first_ids, second_ids].tolist(),
    )
)
near_duplicates.sort(key=lambda pair: pair[2])  # ascending: least similar pairs come first
len(near_duplicates)

In [None]:
# Show the image that belongs to vector id 18 for visual inspection.
sample_image = get_image(vector_id=18, images_path=IMAGES_PATH)
display(sample_image)

In [None]:
# Collect every id that takes part in at least one near-duplicate pair.
# The result is a set: ids are unique and carry no particular order.
near_duplicate_ids = {
    image_id
    for first_id, second_id, _similarity in near_duplicates
    for image_id in (first_id, second_id)
}
near_duplicate_ids

In [None]:
# Persist the placeholder image ids. Sorting makes the saved file deterministic,
# and the explicit integer dtype keeps an empty result usable as an index array
# (np.array([]) would otherwise default to float64).
np.save(vector_path+"placeholder_images", np.array(sorted(near_duplicate_ids), dtype=np.int64))