# Embedding Visualization

This notebook visualizes embeddings in 2D space using PCA and t-SNE to understand how the fine-tuned model clusters similar sentences together.


In [None]:
# Import helper functions from the project's src package
from src.data.loaders import load_toy_dataset
from src.models.embedding_pipeline import load_embeddinggemma_model, embed_texts
from src.models.lora_setup import setup_lora_model
from src.visualization.embedding_viz import (
    reduce_to_2d_pca,
    reduce_to_2d_tsne,
    plot_paired_embeddings
)
from src.utils.paths import timestamped_path
import numpy as np


## Load Model and Data

Load the fine-tuned model and compute embeddings for all sentences.


In [None]:
# Load the base model and wrap it with LoRA adapters.
# NOTE(review): this cell does not visibly load any trained adapter weights, while the
# plot titles further down say "after fine-tuning" — confirm whether a checkpoint
# should be loaded here.
tokenizer, base_model = load_embeddinggemma_model()
model = setup_lora_model(base_model, r=16, lora_alpha=32, lora_dropout=0.1)

# Load the paired dataset
train_data = load_toy_dataset()

# Collect every sentence: all anchors first, then all positives.
# Duplicates are intentionally kept so that row i (anchor) and row
# i + len(train_data) (positive) stay aligned as a pair.
anchor_sentences = [pair["anchor"] for pair in train_data]
positive_sentences = [pair["positive"] for pair in train_data]
all_sentences = anchor_sentences + positive_sentences

# Embed all sentences in a single call
embeddings = embed_texts(all_sentences, model, tokenizer)

print(f"Computed embeddings for {len(all_sentences)} sentences")
print(f"Embedding shape: {embeddings.shape}")


## PCA Visualization

Use Principal Component Analysis to reduce 768-dimensional embeddings to 2D for visualization.


In [None]:
# Project the embeddings down to two dimensions with PCA
embeddings_2d_pca = reduce_to_2d_pca(embeddings)

# The embedded sentences were ordered anchors-first, positives-second,
# so slicing at the number of pairs splits the two groups apart
num_pairs = len(train_data)
anchor_embeddings_2d, positive_embeddings_2d = (
    embeddings_2d_pca[:num_pairs],
    embeddings_2d_pca[num_pairs:],
)

# Plot each anchor against its positive, passing a timestamped path as save_path
pca_plot_path = timestamped_path("outputs/visualizations", "embedding_pca", "png")
plot_paired_embeddings(
    anchor_embeddings_2d,
    positive_embeddings_2d,
    title="2D PCA of Sentence Embeddings (after fine-tuning)",
    save_path=str(pca_plot_path),
)


## t-SNE Visualization

Use t-SNE for a different perspective on the embedding space. t-SNE preserves local neighborhoods better than PCA.


In [None]:
# Reduce to 2D using t-SNE
# Note: t-SNE can be slow for larger datasets.
# reduce_to_2d_tsne is already imported in the imports cell at the top of the
# notebook, so it is not re-imported here.
# The perplexity is capped at len(embeddings) - 1 so that it stays below the
# number of embedded sentences when the dataset is small.
embeddings_2d_tsne = reduce_to_2d_tsne(embeddings, perplexity=min(30, len(embeddings) - 1))

# Separate anchors and positives.
# num_pairs is recomputed here so this cell does not depend on the PCA cell
# having been run first.
num_pairs = len(train_data)
anchor_embeddings_2d_tsne = embeddings_2d_tsne[:num_pairs]
positive_embeddings_2d_tsne = embeddings_2d_tsne[num_pairs:]

# Plot paired embeddings
plot_paired_embeddings(
    anchor_embeddings_2d_tsne,
    positive_embeddings_2d_tsne,
    title="2D t-SNE of Sentence Embeddings (after fine-tuning)",
    save_path=str(timestamped_path("outputs/visualizations", "embedding_tsne", "png"))
)


## Interpretation

After fine-tuning, we expect:
- Each anchor (X) to be close to its corresponding positive (O) of the same color
- Different pairs (different colors) to be separated in the embedding space
- Semantically similar sentences to cluster together, showing that the model has learned to group them
