# Visualizing Concatenated Image and Question Embeddings
I'll be using the DINO vision encoder and the SBERT question encoder to inspect t-SNE plots on the VQA-v2 and VQA-Abstract datasets.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from transformers import BertModel, BertTokenizer
import torch

# BERT with sum of embeddings

In [None]:
# Initialize BERT model and tokenizer
# Both are loaded from the same pretrained checkpoint name. `tokenizer` and
# `bert` are read as module-level globals by get_sentence_embedding().
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
bert = BertModel.from_pretrained(model_name)

# Function to get sentence embeddings
def get_sentence_embedding(sentence):
    """Embed `sentence` by summing BERT's final-layer hidden states.

    Relies on the module-level `tokenizer` and `bert` objects.

    Args:
        sentence: Text handed to `tokenizer(..., return_tensors='pt')`.

    Returns:
        NumPy array produced by squeezing `last_hidden_state` and summing
        over dim 0.
        NOTE(review): assumes a single sentence (batch of one), so the
        squeeze leaves a (tokens, hidden) tensor — confirm.
    """
    encoded = tokenizer(sentence, return_tensors='pt')
    # Inference only: no gradient tracking is needed.
    with torch.no_grad():
        token_states = bert(**encoded).last_hidden_state
        pooled = token_states.squeeze().sum(dim=0)
    return pooled.numpy()

In [None]:
# Sample sentences: a "loves"/"hates" pair for each fruit, followed by a
# "love"/"hate" pair for each non-fruit noun. The verb forms are reproduced
# exactly as they appear in the original hand-written list.
sentences = [
    f'i {verb} {fruit}'
    for fruit in ('mangos', 'apples', 'bananas', 'strawberries', 'blueberries')
    for verb in ('loves', 'hates')
] + [
    f'i {verb} {noun}'
    for noun in ('people', 'dogs', 'cats', 'monkeys')
    for verb in ('love', 'hate')
]

In [None]:
# Stack one BERT embedding per sample sentence into a single array.
embeddings = np.array(list(map(get_sentence_embedding, sentences)))

# Reduce to 2-D with t-SNE. A fixed small perplexity is used for this toy
# set; the adaptive alternative is min(30, len(embeddings) - 1).
perplexity = 4
tsne = TSNE(n_components=2, perplexity=perplexity, random_state=42)
tsne_embeddings = tsne.fit_transform(embeddings)

# One scatter call per point (so each gets its own colour from the cycle),
# with the sentence text drawn slightly to the right of the marker.
plt.figure(figsize=(10, 6))
for (x, y), sentence in zip(tsne_embeddings, sentences):
    plt.scatter(x, y)
    plt.text(x + 0.1, y, sentence, fontsize=9)
plt.title('t-SNE Visualization of BERT Sentence Embeddings')
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
plt.show()

Observation: I'd expect fruits and non-fruits to end up on different sides of the plot, but the layout looks like a mess here.

# SBERT: Better sentence embeddings 

In [None]:
from sentence_transformers import SentenceTransformer

# Load the pretrained "all-MiniLM-L6-v2" checkpoint; `sbert` is used by the
# following cell to encode the sample sentences.
sbert = SentenceTransformer("all-MiniLM-L6-v2")

In [None]:
# Encode every sample sentence with SBERT in a single call.
embeddings = sbert.encode(sentences)

# Apply t-SNE
# perplexity = min(30, len(embeddings) - 1)
perplexity = 4
tsne = TSNE(n_components=2, perplexity=perplexity, random_state=42)
tsne_embeddings = tsne.fit_transform(embeddings)

# Plot the t-SNE reduced embeddings, labelling each point with its sentence.
plt.figure(figsize=(10, 6))
for i, sentence in enumerate(sentences):
    plt.scatter(tsne_embeddings[i, 0], tsne_embeddings[i, 1])
    plt.text(tsne_embeddings[i, 0] + 0.1, tsne_embeddings[i, 1], sentence, fontsize=9)
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
# These points come from `sbert`, not `bert`; the title was previously a
# copy of the BERT cell's title.
plt.title('t-SNE Visualization of SBERT Sentence Embeddings')
plt.show()

In [None]:
# Notebook cell output: shape of the current `embeddings` array.
embeddings.shape

Observation: the separation is clearly much better, although I'd expect 'love' and 'hate' to be separated more. Apparently the model thinks they are more similar than our monkey brains do :P

# DINO vision encoder

In [None]:
import os
from transformers import AutoImageProcessor, AutoModel
from PIL import Image
import torch
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Function to get image embeddings
def get_image_embedding(image_path, processor, model):
    """Return a mean-pooled embedding for the image stored at `image_path`.

    Args:
        image_path: Path passed to `PIL.Image.open`.
        processor: Preprocessor invoked as
            `processor(images=image, return_tensors="pt")`.
        model: Model called with the processor output; its result must
            expose `last_hidden_state`.

    Returns:
        NumPy array obtained by averaging `last_hidden_state` over dim 1
        and squeezing singleton dimensions.
    """
    # The context manager closes the image file once the processor has
    # consumed it, instead of leaving the handle open until garbage
    # collection.
    with Image.open(image_path) as image:
        inputs = processor(images=image, return_tensors="pt")
    # Inference only: no gradient tracking is needed.
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

In [None]:
# Directory of images
# Read as a module-level global by small_embeddings() and base_embeddings().
image_dir = "images/cats_dogs"

# Load pre-trained DINOv2 model and processor
# Two checkpoints are loaded, each with its own processor: 'dinov2-small' and
# 'dinov2-base'.
processor_small = AutoImageProcessor.from_pretrained('facebook/dinov2-small')
dino_small = AutoModel.from_pretrained('facebook/dinov2-small')

processor_base = AutoImageProcessor.from_pretrained('facebook/dinov2-base')
dino_base = AutoModel.from_pretrained('facebook/dinov2-base')

def small_embeddings():
    """Embed every image in `image_dir` with the DINOv2-small encoder.

    Only files whose names end in '.png', '.jpg' or '.jpeg' are used, in the
    order returned by `os.listdir(image_dir)`.

    Returns:
        NumPy array built from one `get_image_embedding` result per image.
    """
    image_files = [
        file_name
        for file_name in os.listdir(image_dir)
        if file_name.endswith(('.png', '.jpg', '.jpeg'))
    ]
    # The previous version also collected file-name stems into a local list
    # that was never returned or read; that dead work is removed.
    return np.array([
        get_image_embedding(os.path.join(image_dir, file_name), processor_small, dino_small)
        for file_name in image_files
    ])


def base_embeddings():
    """Embed every image in `image_dir` with the DINOv2-base encoder.

    Only files whose names end in '.png', '.jpg' or '.jpeg' are used, in the
    order returned by `os.listdir(image_dir)`. The shape of the resulting
    array is printed before it is returned.

    Returns:
        NumPy array built from one `get_image_embedding` result per image.
    """
    image_files = [
        file_name
        for file_name in os.listdir(image_dir)
        if file_name.endswith(('.png', '.jpg', '.jpeg'))
    ]
    # The previous version also collected file-name stems into a local list
    # that was never returned or read; that dead work is removed.
    embeddings = np.array([
        get_image_embedding(os.path.join(image_dir, file_name), processor_base, dino_base)
        for file_name in image_files
    ])
    print(embeddings.shape)
    return embeddings


def plot_tsne(embeddings, perplexity, file_names=None):
    """Project `embeddings` to 2-D with t-SNE and plot each point with a label.

    Args:
        embeddings: Array passed to `TSNE.fit_transform`, one row per image.
        perplexity: Forwarded to `TSNE`.
        file_names: Optional labels, one per row of `embeddings`. When
            omitted, the labels are the extension-less names of the image
            files in `image_dir`, listed and filtered the same way
            small_embeddings()/base_embeddings() enumerate them.

    Raises:
        ValueError: If the number of labels differs from the number of rows.
    """
    if file_names is None:
        # The body used to read a module-level `file_names` that is never
        # defined (the lists inside small_embeddings()/base_embeddings() are
        # locals), which raised NameError. Rebuild the labels instead.
        file_names = [
            os.path.splitext(name)[0]
            for name in os.listdir(image_dir)
            if name.endswith(('.png', '.jpg', '.jpeg'))
        ]
    if len(file_names) != len(embeddings):
        raise ValueError(
            f'got {len(file_names)} labels for {len(embeddings)} embeddings'
        )

    # Apply t-SNE
    tsne = TSNE(n_components=2, perplexity=perplexity, random_state=42)
    tsne_embeddings = tsne.fit_transform(embeddings)

    # Plot the t-SNE reduced embeddings, one scatter call per point so each
    # gets its own colour, with the label drawn just right of the marker.
    plt.figure(figsize=(6,6))
    for i, file_name in enumerate(file_names):
        plt.scatter(tsne_embeddings[i, 0], tsne_embeddings[i, 1])
        plt.text(tsne_embeddings[i, 0] + 0.1, tsne_embeddings[i, 1], file_name, fontsize=9)
    plt.xlabel('t-SNE Component 1')
    plt.ylabel('t-SNE Component 2')
    plt.title('t-SNE Visualization of Image Embeddings')
    plt.show()

In [None]:
# Embed the images with the small encoder and plot them at perplexity 4.
embeddings = small_embeddings()
plot_tsne(embeddings, 4)

In [None]:
# Embed the images with the base encoder and plot them at perplexity 4.
embeddings = base_embeddings()
plot_tsne(embeddings, 4)

Observation: I'd need to check more on image embedding quality

# Vision & Text Embeddings Visualised Together (concat)
## VQA-V2
### 1. Loading datasets

In [None]:
import json

# Path to the VQA-v2 validation questions file.
file_path = 'data/vqa-v2/v2_OpenEnded_mscoco_val2014_questions.json'

# Decode explicitly as UTF-8 so parsing does not depend on the platform's
# default text encoding.
with open(file_path, 'r', encoding='utf-8') as file:
    vqa_v2_data = json.load(file)

# Show the top-level keys to see what the file contains.
print(vqa_v2_data.keys())

In [None]:
# Keep the 'questions' entry and print its first record to inspect the fields.
v2_questions = vqa_v2_data['questions']
print(v2_questions[0])

### 2. Vision and sequence encoders (because I don't want to rerun the cells above :P)

In [None]:
import os
from PIL import Image
import torch
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

from transformers import AutoImageProcessor, AutoModel
from sentence_transformers import SentenceTransformer

In [None]:
# Re-create the text and image encoders so this section can run without
# executing the earlier cells.
sbert = SentenceTransformer("all-MiniLM-L6-v2")

# DINOv2 'small' checkpoint and its processor.
processor_small = AutoImageProcessor.from_pretrained('facebook/dinov2-small')
dino_small = AutoModel.from_pretrained('facebook/dinov2-small')

# DINOv2 'base' checkpoint and its processor.
processor_base = AutoImageProcessor.from_pretrained('facebook/dinov2-base')
dino_base = AutoModel.from_pretrained('facebook/dinov2-base')

In [None]:
def get_image_embedding(image_root, image_id, processor, model, show_img=False):
    """Return a mean-pooled embedding for one image addressed by its id.

    The file path is `image_root` followed by `image_id` left-padded with
    zeros to six characters and the '.jpg' extension.

    Args:
        image_root: Path prefix that the padded id is appended to.
        image_id: Image identifier; converted with `str()` before padding.
        processor: Preprocessor invoked as
            `processor(images=image, return_tensors="pt")`.
        model: Model called with the processor output; its result must
            expose `last_hidden_state`.
        show_img: When true, display a 64x64 thumbnail of the image.

    Returns:
        NumPy array obtained by averaging `last_hidden_state` over dim 1
        and squeezing singleton dimensions.
    """
    image_id = str(image_id).zfill(6)
    image_path = f'{image_root}{image_id}.jpg'

    # The context manager closes the image file once it has been consumed;
    # this function is called in a loop over many images, so handles should
    # not be left open until garbage collection.
    with Image.open(image_path) as image:
        if show_img:
            tiny_image = image.resize((64,64))
            tiny_image.show()

        inputs = processor(images=image, return_tensors="pt")

    # Inference only: no gradient tracking is needed.
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

def plot_tsne(embeddings, plot_labels, perplexity):
    """Scatter-plot a 2-D t-SNE projection of `embeddings` with text labels.

    Args:
        embeddings: Array passed to `TSNE.fit_transform`.
        plot_labels: One label per point to draw; the label at position k is
            placed next to row k of the projection.
        perplexity: Forwarded to `TSNE`.
    """
    reducer = TSNE(n_components=2, perplexity=perplexity, random_state=42)
    points = reducer.fit_transform(embeddings)

    plt.figure(figsize=(6,6))
    for idx, plot_label in enumerate(plot_labels):
        x, y = points[idx, 0], points[idx, 1]
        # One scatter call per point; the label sits just right of the marker.
        plt.scatter(x, y)
        plt.text(x + 0.1, y, plot_label, fontsize=9)
    plt.show()

In [None]:
# Path prefix for the images; get_image_embedding appends the padded id.
image_root = 'data/vqa-v2/val2014/val2014/COCO_val2014_000000'
# Encoder selection: swap in (processor_base, dino_base) for the base model.
processor, image_encoder = processor_small, dino_small

v2_image_embeddings, v2_question_embeddings = [], []
v2_image_labels, v2_question_labels = [], []
# Only the first 1000 question records are embedded.
for i, question in enumerate(v2_questions[:1000]):
    image_id = question['image_id']
    q, q_id = question['question'], question['question_id']

    v2_image_labels.append(image_id)
    v2_question_labels.append(q)

    # Text side first, then the image this question refers to.
    v2_question_embeddings.append(sbert.encode(q))
    v2_image_embeddings.append(
        get_image_embedding(image_root, image_id, processor, image_encoder)
    )

In [None]:
# Turn both per-question lists into arrays.
v2_image_embeddings, v2_question_embeddings = (
    np.array(rows) for rows in (v2_image_embeddings, v2_question_embeddings)
)

# Blank labels keep the plot readable; pass v2_image_labels instead to
# annotate each point with its image id.
empty_labels = ['' for _ in v2_image_labels]

plot_tsne(v2_image_embeddings, empty_labels, 32)

In [None]:
# Plot the same question embeddings at three perplexity settings.
for perp in (16, 32, 64):
    plot_tsne(v2_question_embeddings, empty_labels, perp)

In [None]:
# Join each image embedding with its question embedding along the last axis.
# NOTE(review): the two halves are concatenated without rescaling; if their
# norms differ, one modality may dominate the t-SNE distances — confirm.
v2_cat = np.concatenate((v2_image_embeddings, v2_question_embeddings), axis=-1)

In [None]:
# Plot the concatenated image+question embeddings at perplexity 8.
plot_tsne(v2_cat, empty_labels, 8)

## VQA-Abstract

In [None]:
import json

# Path to the VQA-Abstract training questions file.
file_path = 'data/vqa-abstract/questions_train/OpenEnded_abstract_v002_train2015_questions.json'

# Decode explicitly as UTF-8 so parsing does not depend on the platform's
# default text encoding.
with open(file_path, 'r', encoding='utf-8') as file:
    vqa_abstract_data = json.load(file)

# Keep only the 'questions' entry for the cells that follow.
abs_questions = vqa_abstract_data['questions']

In [None]:
def get_image_embedding(image_root, image_id, processor, model, show_img=False):
    """Return a mean-pooled embedding for one image addressed by its id.

    The file path is `image_root` followed by `image_id` left-padded with
    zeros to five characters and the '.png' extension.

    Args:
        image_root: Path prefix that the padded id is appended to.
        image_id: Image identifier; converted with `str()` before padding.
        processor: Preprocessor invoked as
            `processor(images=image, return_tensors="pt")`.
        model: Model called with the processor output; its result must
            expose `last_hidden_state`.
        show_img: When true, display a 64x64 thumbnail of the image.

    Returns:
        NumPy array obtained by averaging `last_hidden_state` over dim 1
        and squeezing singleton dimensions.
    """
    image_id = str(image_id).zfill(5)
    image_path = f'{image_root}{image_id}.png'

    # The context manager closes the image file once it has been consumed;
    # this function is called in a loop over many images, so handles should
    # not be left open until garbage collection.
    with Image.open(image_path) as image:
        if show_img:
            tiny_image = image.resize((64,64))
            tiny_image.show()

        inputs = processor(images=image, return_tensors="pt")

    # Inference only: no gradient tracking is needed.
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

In [None]:
# Path prefix for the images; get_image_embedding appends the padded id.
image_root = 'data/vqa-abstract/img_train/abstract_v002_train2015_0000000'
# Encoder selection: swap in (processor_base, dino_base) for the base model.
processor, image_encoder = processor_small, dino_small

abs_image_embeddings, abs_question_embeddings = [], []
abs_image_labels, abs_question_labels = [], []
# Only the first 1000 question records are embedded.
for i, question in enumerate(abs_questions[:1000]):
    image_id = question['image_id']
    q, q_id = question['question'], question['question_id']

    abs_image_labels.append(image_id)
    abs_question_labels.append(q)

    # Text side first, then the image this question refers to.
    abs_question_embeddings.append(sbert.encode(q))
    abs_image_embeddings.append(
        get_image_embedding(image_root, image_id, processor, image_encoder)
    )

In [None]:
# Turn both per-question lists into arrays.
abs_image_embeddings, abs_question_embeddings = (
    np.array(rows) for rows in (abs_image_embeddings, abs_question_embeddings)
)

# Blank labels keep the plots readable; pass abs_image_labels instead to
# annotate each point with its image id.
empty_labels = ['' for _ in abs_image_labels]

# Plot the image embeddings at a range of perplexity settings.
for perp in (2, 4, 8, 16, 32, 64):
    plot_tsne(abs_image_embeddings, empty_labels, perp)

In [None]:
# Plot the question embeddings at a range of perplexity settings.
for perp in (4, 8, 16, 32, 64):
    plot_tsne(abs_question_embeddings, empty_labels, perp)

In [None]:
# Join each image embedding with its question embedding along the last axis.
# NOTE(review): the two halves are concatenated without rescaling; if their
# norms differ, one modality may dominate the t-SNE distances — confirm.
abs_cat = np.concatenate((abs_image_embeddings, abs_question_embeddings), axis=-1)

In [None]:
# Plot the concatenated image+question embeddings at several perplexities.
for perp in (4, 8, 16, 32, 64):
    plot_tsne(abs_cat, empty_labels, perp)

## Putting both on the same scale

In [None]:
# Stack the VQA-v2 rows on top of the VQA-Abstract rows (np.concatenate joins
# along axis 0 by default) for each of the three embedding types.
image_embeddings = np.concatenate([v2_image_embeddings, abs_image_embeddings])
question_embeddings = np.concatenate([v2_question_embeddings, abs_question_embeddings])
cat = np.concatenate([v2_cat, abs_cat])

# Dataset tag per row, in the same order as the stacked arrays:
# 'V' for VQA-v2 rows, then 'A' for VQA-Abstract rows.
labels = ['V' for _ in v2_image_embeddings] + ['A' for _ in abs_image_embeddings]

In [None]:
def plot_tsne(embeddings, plot_labels, perplexity):
    """Scatter-plot a 2-D t-SNE projection, coloured by dataset tag.

    Args:
        embeddings: Array passed to `TSNE.fit_transform`.
        plot_labels: One tag per point to draw; points tagged 'V' are drawn
            in blue and every other tag in red.
        perplexity: Forwarded to `TSNE`.
    """
    tsne = TSNE(n_components=2, perplexity=perplexity, random_state=42)
    tsne_embeddings = tsne.fit_transform(embeddings)

    colors = ['blue' if plot_label == 'V' else 'red' for plot_label in plot_labels]
    # As before, only the first len(plot_labels) rows are drawn.
    shown = tsne_embeddings[:len(colors)]

    plt.figure(figsize=(6,6))
    # A single vectorised scatter call replaces one call per point, and the
    # per-point plt.text('') call is dropped because it drew nothing.
    plt.scatter(shown[:, 0], shown[:, 1], c=colors)
    plt.show()

In [None]:
# Image embeddings from both datasets at several perplexity settings.
for perp in (4, 8, 16, 32):
    plot_tsne(image_embeddings, labels, perp)

In [None]:
# Question embeddings from both datasets at several perplexity settings.
for perp in (4, 8, 16, 32):
    plot_tsne(question_embeddings, labels, perp)

In [None]:
# Concatenated embeddings from both datasets at several perplexity settings.
for perp in (4, 8, 16, 32):
    plot_tsne(cat, labels, perp)