# Token Embedding Visualization

**Duration:** ~30 min | **Platform:** Kaggle dual Tesla T4

This notebook explores **text embeddings** — generating embeddings with the
server's embedding endpoint, applying dimensionality reduction for visualization,
clustering, and semantic similarity search.

### What you'll learn
1. Generate text embeddings
2. Dimensionality reduction (PCA/t-SNE)
3. Semantic clustering
4. Similarity search
5. Embedding quality analysis

In [None]:
!pip install -q git+https://github.com/llamatelemetry/llamatelemetry.git@v1.2.0
!pip install -q matplotlib scikit-learn

import llamatelemetry
from llamatelemetry.llama import ServerManager, LlamaCppClient
from huggingface_hub import hf_hub_download

# Initialise llamatelemetry for this notebook under the service name "embedding-viz".
llamatelemetry.init(service_name="embedding-viz")

# Download the Q4_K_M GGUF build of Gemma 3 1B (instruct) from the Hugging Face
# Hub into the given cache directory and keep the returned path for the server.
model_path = hf_hub_download(
    repo_id="bartowski/google_gemma-3-1b-it-GGUF",
    filename="google_gemma-3-1b-it-Q4_K_M.gguf",
    cache_dir="/root/.cache/huggingface",
)

# Start server with embedding support
# NOTE(review): no embedding-specific option is passed to start_server below —
# confirm the server exposes the embedding endpoint with these arguments.
mgr = ServerManager()
mgr.start_server(model_path=model_path, gpu_layers=99, ctx_size=2048)
# Block until the server reports ready (timeout value presumably in seconds — confirm).
mgr.wait_until_ready(timeout=60)
# NOTE(review): the address is hard-coded; presumably port 8090 is what
# ServerManager uses when no port is given — confirm.
client = LlamaCppClient(base_url="http://127.0.0.1:8090")
print("Ready")

## Generating Embeddings

Use the server's embedding endpoint to create vector representations
of diverse texts across multiple categories.

In [None]:
import numpy as np

# Diverse texts across categories
texts = {
    "ML": [
        "Neural networks learn representations from data through backpropagation.",
        "Gradient descent optimizes model parameters by minimizing the loss function.",
        "Convolutional neural networks excel at image classification tasks.",
        "Transformer models use self-attention for sequence processing.",
        "Transfer learning adapts pre-trained models to new downstream tasks.",
    ],
    "GPU": [
        "CUDA enables massively parallel computation on NVIDIA GPUs.",
        "Tensor cores accelerate matrix multiplication operations.",
        "GPU memory bandwidth is crucial for large model inference.",
        "Multi-GPU training distributes model and data across devices.",
        "Flash attention reduces memory usage for long sequences.",
    ],
    "Science": [
        "Photosynthesis converts sunlight into chemical energy in plants.",
        "DNA replication ensures genetic information is copied accurately.",
        "The periodic table organizes elements by atomic number.",
        "Quantum mechanics describes behavior at subatomic scales.",
        "Evolution through natural selection drives species adaptation.",
    ],
}

# Flatten the category -> sentences mapping into parallel lists. Dictionary
# insertion order is kept, so index i refers to the same sentence in
# all_texts, all_labels and all_embeddings.
all_texts = []
all_labels = []
for category, category_texts in texts.items():
    all_texts.extend(category_texts)
    all_labels.extend([category] * len(category_texts))

# Call client.embed once per sentence, in the same order as all_texts.
all_embeddings = [client.embed(sentence) for sentence in all_texts]

# Stack the per-sentence results into one array (one row per sentence).
embeddings = np.array(all_embeddings)
n_vectors, n_dims = len(embeddings), embeddings.shape[1]
print(f"Generated {n_vectors} embeddings, dimension={n_dims}")

## Dimensionality Reduction

Reduce high-dimensional embeddings to 2D for visualization using PCA and t-SNE.

In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

@llamatelemetry.task(name="dimensionality-reduction")
def reduce_dimensions(embeddings, labels):
    """Project the embeddings to 2D with both PCA and t-SNE.

    Args:
        embeddings: Sample-by-feature data accepted by scikit-learn's
            ``fit_transform`` (the notebook passes a 2D NumPy array).
        labels: Unused by the projection; kept so existing callers keep working.

    Returns:
        A ``(pca_2d, tsne_2d, explained_variance_ratio)`` tuple: the two 2D
        projections and the explained-variance ratio of the two PCA components.
    """
    seed = 42
    n_samples = len(embeddings)

    # Linear projection onto two principal components.
    pca_model = PCA(n_components=2, random_state=seed)
    pca_points = pca_model.fit_transform(embeddings)

    # Cap the perplexity at n_samples - 1 so small inputs stay below the
    # sample count.
    tsne_perplexity = min(5, n_samples - 1)
    tsne_model = TSNE(n_components=2, random_state=seed, perplexity=tsne_perplexity)
    tsne_points = tsne_model.fit_transform(embeddings)

    return pca_points, tsne_points, pca_model.explained_variance_ratio_

pca_2d, tsne_2d, variance_ratio = reduce_dimensions(embeddings, all_labels)

# Plot the PCA and t-SNE projections side by side, one colour per category.
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
colors = {"ML": "blue", "GPU": "green", "Science": "red"}
panels = ((axes[0], pca_2d), (axes[1], tsne_2d))

# dict.fromkeys de-duplicates while keeping first-seen order, so the draw and
# legend order is the same on every run (iterating a set of strings is not
# stable across interpreter sessions).
for label in dict.fromkeys(all_labels):
    # Boolean mask selecting the rows that belong to this category.
    mask = [item == label for item in all_labels]
    for ax, points in panels:
        ax.scatter(points[mask, 0], points[mask, 1], c=colors[label], label=label, s=60, alpha=0.8)

axes[0].set_title(f"PCA (var explained: {sum(variance_ratio):.1%})")
axes[1].set_title("t-SNE")
for ax in axes:
    ax.legend()

plt.suptitle("Embedding Space Visualization", fontsize=13)
plt.tight_layout()
plt.show()

## Semantic Clustering

Apply K-means clustering to discover natural groupings in the embedding space.

In [None]:
from sklearn.cluster import KMeans

@llamatelemetry.task(name="semantic-clustering")
def cluster_embeddings(embeddings, n_clusters=3):
    """Group the embeddings with K-means.

    Args:
        embeddings: Sample-by-feature data to cluster.
        n_clusters: Number of clusters to fit (default 3).

    Returns:
        A ``(cluster_labels, inertia)`` tuple: the cluster index assigned to
        each row and the fitted model's ``inertia_`` value.
    """
    # Fixed seed and n_init keep the assignment reproducible between runs.
    model = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    model.fit(embeddings)
    return model.labels_, model.inertia_

from collections import Counter

# Single source of truth for the cluster count: it drives both the fit and the
# purity report, so the two cannot drift apart.
n_clusters = 3
cluster_labels, inertia = cluster_embeddings(embeddings, n_clusters=n_clusters)

# Compare clusters with actual categories
print("Clustering Results:")
print(f"{'Text':<60} {'True':<8} {'Cluster'}")
print("-" * 75)
for text, true_label, cluster in zip(all_texts, all_labels, cluster_labels):
    # Truncate long sentences so the table columns stay aligned.
    shown = text[:57] + "..." if len(text) > 57 else text
    print(f"{shown:<60} {true_label:<8} {cluster}")

# Cluster purity: share of each cluster taken by its most frequent true category.
# Group the true labels by assigned cluster in one pass; clusters that received
# no items keep an empty list and are reported as "N/A".
members_by_cluster = {c: [] for c in range(n_clusters)}
for true_label, cluster in zip(all_labels, cluster_labels):
    members_by_cluster[int(cluster)].append(true_label)

for c, members in members_by_cluster.items():
    if members:
        majority, majority_count = Counter(members).most_common(1)[0]
        purity = majority_count / len(members)
    else:
        majority, purity = "N/A", 0
    print(f"\nCluster {c}: {len(members)} items, majority={majority}, purity={purity:.0%}")

## Semantic Similarity Search

Find the most similar documents to a query using cosine similarity.

In [None]:
from llamatelemetry.embeddings import cosine_similarity

queries = [
    "How do GPUs accelerate machine learning?",
    "What is the role of attention in transformers?",
    "Tell me about biological processes.",
]

for query in queries:
    query_emb = np.array(client.embed(query))

    # Score every stored sentence against the query embedding.
    scored = [
        (cosine_similarity(query_emb, doc_emb), doc_text, doc_label)
        for doc_emb, doc_text, doc_label in zip(embeddings, all_texts, all_labels)
    ]

    # Highest similarity first (tuples compare on the score, then on the text
    # and label for ties); keep the three best matches.
    top_hits = sorted(scored, reverse=True)[:3]
    print(f"\nQuery: \"{query}\"")
    for sim, text, label in top_hits:
        print(f"  [{label}] {sim:.3f} — {text[:70]}")

## Embedding Quality Analysis

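Analyze properties of the embedding space: average cosine similarity between
categories (inter-category), cohesion within each category (intra-category),
and the spread of embeddings around their centroid as a rough isotropy indicator.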

In [None]:
from itertools import combinations, product

# Inter-category similarity (should be low)
# Intra-category similarity (should be high)
categories = list(texts.keys())
label_array = np.asarray(all_labels)
# Row indices of `embeddings` that belong to each category.
rows_by_category = {cat: np.flatnonzero(label_array == cat) for cat in categories}

print(f"{'Cat1':<10} {'Cat2':<10} {'Avg Similarity'}")
print("-" * 35)
for i, cat1 in enumerate(categories):
    # Start at position i so each unordered category pair is reported once.
    for cat2 in categories[i:]:
        same_category = cat1 == cat2
        if same_category:
            # Pair distinct rows by index: a sentence is never compared with
            # itself, yet two different sentences are still compared even if
            # their vectors happen to be identical.
            index_pairs = combinations(rows_by_category[cat1], 2)
        else:
            index_pairs = product(rows_by_category[cat1], rows_by_category[cat2])

        sims = [cosine_similarity(embeddings[a], embeddings[b]) for a, b in index_pairs]

        # A single-item category has no intra-category pairs; report 0 then.
        avg_sim = np.mean(sims) if sims else 0
        relation = "intra" if same_category else "inter"
        print(f"{cat1:<10} {cat2:<10} {avg_sim:.3f} ({relation})")

# Spread around the centroid: mean and standard deviation of each embedding's
# Euclidean distance from the mean vector. This is only a coarse indicator of
# how evenly the embeddings are distributed, not a full isotropy measure.
centroid = embeddings.mean(axis=0)
dists = np.linalg.norm(embeddings - centroid, axis=1)
print("\nEmbedding space statistics:")
print(f"  Mean distance from centroid: {np.mean(dists):.3f}")
print(f"  Std distance: {np.std(dists):.3f}")
print(f"  Dimension: {embeddings.shape[1]}")

## Summary

This notebook demonstrated:
- **Embedding generation** via the llama-server embedding endpoint
- **Visualization** with PCA and t-SNE dimensionality reduction
- **K-means clustering** for unsupervised topic discovery
- **Similarity search** using cosine similarity
- **Quality analysis** of the embedding space

In [None]:
# Clean up at the end of the notebook: stop the server started by `mgr`,
# then shut llamatelemetry down.
mgr.stop_server()
llamatelemetry.shutdown()
print("Done.")