In [2]:
from vislearnlabpy.embeddings.generate_embeddings import EmbeddingGenerator
from vislearnlabpy.embeddings.embedding_store import EmbeddingStore
from vislearnlabpy.embeddings.utils import display_search_results, zscore_embeddings 
import numpy as np
import pandas as pd
import os
from pathlib import Path



In [7]:
# Table of word/image pairs for experiment 2 (one row per pair).
exp2_pairs = pd.read_csv("exp2_pairs.csv")

# Root folder holding the THINGS images. Set the THINGS_DIR environment
# variable to run this notebook on another machine; when it is unset the
# original local path is used, so existing behaviour is unchanged.
THINGS_DIR = os.environ.get(
    "THINGS_DIR",
    "/Users/visuallearninglab/Documents/visvocab/data/raw/THINGS",
)

In [8]:
# Display the loaded pair table to check its columns and row count.
exp2_pairs

Unnamed: 0.1,Unnamed: 0,text1,text2,source,aoa_word1,aoa_word2,is_animate,image1,image2
0,1,airplane,baby,Kuperman,3.94,3.84,False,airplane.jpg,baby.jpg
1,2,airplane,bag,Kuperman,3.94,4.28,False,airplane.jpg,bag.jpg
2,3,airplane,balloon,Kuperman,3.94,4.37,False,airplane.jpg,balloon.jpg
3,4,airplane,bell,Kuperman,3.94,3.89,False,airplane.jpg,bell.jpg
4,5,airplane,bench,Kuperman,3.94,4.21,False,airplane.jpg,bench.jpg
...,...,...,...,...,...,...,...,...,...
27522,27523,vacuum,watch,Wordbank,2.05,2.12,False,vacuum.jpg,watch.jpg
27523,27524,vacuum,window,Wordbank,2.05,2.06,False,vacuum.jpg,window.jpg
27524,27525,vacuum,zipper,Wordbank,2.05,2.08,False,vacuum.jpg,zipper.jpg
27525,27526,washing machine,watch,Wordbank,2.27,2.12,False,washing_machine.jpg,watch.jpg


In [24]:
# CLIP embedding generator used for both the image and the text embeddings.
# NOTE(review): device="mps" presumably targets Apple-silicon GPUs — confirm
# before running on other hardware.
clip_generator = EmbeddingGenerator(
    model_type="clip",
    device="mps",
    output_type="doc",
)

Keep only the unique images and texts, so that each embedding is computed once instead of once per pair.

In [9]:
# Stack the first and second member of every pair into one (image1, text1)
# table, keep one row per distinct combination, and save it as the input CSV
# for embedding generation.
column_names = {'image2': 'image1', 'text2': 'text1'}
df1 = exp2_pairs.loc[:, ['image1', 'text1']]
df2 = exp2_pairs.loc[:, ['image2', 'text2']].rename(columns=column_names)

stacked = pd.concat([df1, df2], ignore_index=True)
exp2_values_unique = stacked.drop_duplicates()
exp2_values_unique.to_csv("exp2_unique_pairs.csv", index=False)

In [43]:
# Display the de-duplicated image/text table that was written to exp2_unique_pairs.csv.
exp2_values_unique

Unnamed: 0,image1,text1
0,airplane.jpg,airplane
120,alligator.jpg,alligator
146,anchor.jpg,anchor
303,ankle.jpg,ankle
497,ant.jpg,ant
...,...,...
43809,zipper.jpg,zipper
47055,spider.jpg,spider
47262,whale.jpg,whale
86399,walker1.jpg,walker


In [44]:
# Generate CLIP embeddings for every row of exp2_unique_pairs.csv, in batches
# of 100 images keyed by the "image1" column. The image root comes from
# THINGS_DIR rather than a second copy of the path literal, so it cannot drift
# from the paths built later for the similarity lookup.
clip_generator.generate_image_embeddings(
    output_path="exp2_embeddings",
    overwrite=True,  # replace any embeddings already stored under output_path
    input_csv="exp2_unique_pairs.csv",
    input_dir=THINGS_DIR,
    batch_size=100,
    id_column="image1",
)

100it [00:00, 16515.61it/s]:   0%|          | 0/6 [00:00<?, ?it/s]
100it [00:00, 53649.32it/s]:  17%|█▋        | 1/6 [00:07<00:39,  7.98s/it]
100it [00:00, 58278.50it/s]:  33%|███▎      | 2/6 [00:11<00:21,  5.30s/it]
100it [00:00, 30572.96it/s]:  50%|█████     | 3/6 [00:14<00:12,  4.29s/it]
100it [00:00, 79663.89it/s]:  67%|██████▋   | 4/6 [00:17<00:07,  3.89s/it]
50it [00:00, 81696.61it/s]s:  83%|████████▎ | 5/6 [00:21<00:03,  3.91s/it]
Calculating clip embeddings: 100%|██████████| 6/6 [00:23<00:00,  3.86s/it]
Calculating text embeddings: 100%|██████████| 533/533 [00:06<00:00, 76.76it/s]


In [3]:
# Load the saved CLIP text and image embeddings from the exp2_embeddings folder.
text_embedding_store = EmbeddingStore.from_doc(
    "exp2_embeddings/text_embeddings/clip_text_embeddings_doc.docs"
)
image_embedding_store = EmbeddingStore.from_doc(
    "exp2_embeddings/image_embeddings/clip_image_embeddings_doc.docs"
)





In [46]:
import copy

# Deep-copy the image store so the edits below are made on the copy only.
image_embedding_store_sims = copy.deepcopy(image_embedding_store)

# Blank the `url` attribute of every entry in the copy (set to None, not deleted).
for entry in image_embedding_store_sims.EmbeddingList:
    entry.url = None

In [10]:
# One [image1, image2] file-name pair per row of exp2_pairs.
image_pairs = exp2_pairs[['image1', 'image2']].values.tolist()

# Turn each file name into a full path under THINGS_DIR.
full_image_paths = [
    [os.path.join(THINGS_DIR, first_image), os.path.join(THINGS_DIR, second_image)]
    for first_image, second_image in image_pairs
]

# Look up the similarity of every image pair; results are also written to CSV.
image_sims = image_embedding_store.retrieve_similarities(
    output_path="exp2_image_sims.csv",
    text_pairs=full_image_paths,
)

In [11]:
# Look up the similarity of the two words in every pair of exp2_pairs.
text_sims = text_embedding_store.retrieve_similarities(
    text_pairs=exp2_pairs[['text1', 'text2']].values.tolist(),
)

In [12]:
# Spot-check: show every pair whose first word is "soap".
soap_rows = text_sims['text1'] == 'soap'
text_sims.loc[soap_rows]

  has_large_values = (abs_vals > 1e6).any()


Unnamed: 0,cosine_similarity,text1,text2
23792,0.774902,soap,sock
23793,0.776855,soap,toe
23794,0.810059,soap,toilet
23795,0.804688,soap,tooth
23796,0.822754,soap,toothpaste
23797,0.783691,soap,towel
23798,0.824219,soap,toy
23799,0.759277,soap,tree
27425,0.745605,soap,spaghetti
27426,0.729004,soap,stick


Next, combine the image and text similarity values into a single table.

In [70]:
# Step 1: Rename columns so the image and text results can sit side by side
# without clashing column names.
image_sims = image_sims.rename(columns={'text1': 'image1', 'text2': 'image2', 'cosine_similarity': 'image_sim'})
text_sims = text_sims.rename(columns={'cosine_similarity': 'text_sim'})

# Step 2: Replace full paths with only file names (path.name).
# Each column is mapped with Series.map because DataFrame.applymap is
# deprecated and raises a FutureWarning; Series.map is available in every
# pandas version. `Path` comes from the notebook's top import cell.
for path_column in ('image1', 'image2'):
    image_sims[path_column] = image_sims[path_column].map(lambda p: Path(p).name)

# Step 3: Combine image_sims and text_sims column-wise (cbind).
# The join is purely positional, so both tables must hold one row per pair in
# the same order; fail loudly instead of silently producing misaligned rows.
assert len(image_sims) == len(text_sims), (
    f"row count mismatch: {len(image_sims)} image pairs vs {len(text_sims)} text pairs"
)
combined_sims = pd.concat([image_sims.reset_index(drop=True), text_sims.reset_index(drop=True)], axis=1)


  image_sims[['image1', 'image2']] = image_sims[['image1', 'image2']].applymap(lambda p: Path(p).name)


In [71]:
# Save the combined image/text similarity table, without the row index.
combined_sims.to_csv("exp2_sims.csv", index=False)