In [6]:
# Scratch cell: evaluates a trivial expression and shows its value.
# NOTE(review): presumably a kernel smoke test; nothing else in the visible cells uses it.
1+2

3

In [18]:
import ollama
import numpy as np
from numpy.linalg import norm

def get_embedding(text):
    """Return the embedding of ``text`` as a NumPy array.

    Sends ``text`` as the prompt to ``ollama.embeddings`` with the
    ``embeddinggemma:300m`` model and wraps the ``'embedding'`` entry of the
    response in ``np.array``.
    """
    embedding = ollama.embeddings(model='embeddinggemma:300m', prompt=text)['embedding']
    return np.array(embedding)

def cosine_similarity(a, b):
    """Return the cosine similarity between vectors ``a`` and ``b``.

    Computed as ``dot(a, b) / (norm(a) * norm(b))``.

    Returns:
        The similarity value, or ``0.0`` when either vector has zero norm.
        Without that guard the division would be 0/0 and produce NaN together
        with a NumPy RuntimeWarning.
    """
    denominator = norm(a) * norm(b)
    if denominator == 0:
        # Direction is undefined for a zero vector; report "no similarity".
        return 0.0
    return np.dot(a, b) / denominator

# 1. Embed the four words (one get_embedding call per word, in this order)
king, man, woman, queen = (
    get_embedding(word) for word in ("king", "man", "woman", "queen")
)

# 2. Analogy arithmetic: King - Man + Woman, written as King - (Man - Woman).
# The (man - woman) difference is treated as a "masculinity" direction;
# removing it from "king" is expected to move the vector toward "queen".
masculinity_vector = man - woman
result_vector = king - masculinity_vector

# 3. Score the result against the embedding of "queen"
similarity = cosine_similarity(result_vector, queen)

print(f"Similarity between (King - Man + Woman) and Queen: {similarity:.4f}")

Similarity between (King - Man + Woman) and Queen: 0.7030


In [None]:
import json
import heapq
from pathlib import Path
from typing import Iterable, List, Optional, Tuple, Union

import numpy as np
import gguf

# This cell reads the model that Ollama already downloaded (no HuggingFace).
# The manifest below belongs to qwen3:0.6b; the GGUF blob it points to is
# looked up under ~/.ollama/models/blobs/.

OLLAMA_MANIFEST = Path.home().joinpath(
    ".ollama",
    "models",
    "manifests",
    "registry.ollama.ai",
    "library",
    "qwen3",
    "0.6b",
)


def _resolve_ollama_gguf_blob(manifest_path: Path) -> Path:
    obj = json.loads(manifest_path.read_text(encoding="utf-8"))
    for layer in obj.get("layers", []):
        if layer.get("mediaType") == "application/vnd.ollama.image.model":
            digest = layer["digest"]  # e.g. sha256:...
            if not digest.startswith("sha256:"):
                raise ValueError(f"Unexpected digest format: {digest}")
            blob = (
                Path.home()
                / ".ollama/models/blobs"
                / ("sha256-" + digest.split(":", 1)[1])
            )
            if not blob.exists():
                raise FileNotFoundError(f"Resolved blob does not exist: {blob}")
            return blob
    raise ValueError(f"No model layer found in manifest: {manifest_path}")


GGUF_PATH = _resolve_ollama_gguf_blob(OLLAMA_MANIFEST)
print("Using GGUF:", GGUF_PATH)

# 1) Load the GGUF and grab the full token embedding matrix (quantized)
reader = gguf.GGUFReader(str(GGUF_PATH))
# Pass a default to next() so a file without this tensor fails with a readable
# error instead of a bare StopIteration.
token_tensor = next(
    (t for t in reader.tensors if t.name == "token_embd.weight"), None
)
if token_tensor is None:
    raise ValueError(f"No 'token_embd.weight' tensor found in {GGUF_PATH}")
# First axis of the tensor data is used as the number of vocabulary rows.
vocab_size = token_tensor.data.shape[0]

# Determine the dequantized embedding dimensionality robustly:
# dequantize a single row and read its width.
embed_dim = int(
    gguf.dequantize(token_tensor.data[:1], token_tensor.tensor_type).shape[1]
)
print(f"Vocab size: {vocab_size}")
print(f"Embedding dim: {embed_dim}")
print(f"Embedding quant type: {token_tensor.tensor_type}")

# 2) Load tokenizer token strings so we can decode token IDs back to text
tok_field = reader.fields["tokenizer.ggml.tokens"]
tokens_raw: Union[List[str], List[bytes]] = tok_field.contents()


def _tok_to_str(x: Union[str, bytes]) -> str:
    if isinstance(x, bytes):
        return x.decode("utf-8", errors="replace")
    return str(x)


# Decode every raw vocabulary entry to text; the list index is used as the token ID.
tokens: List[str] = list(map(_tok_to_str, tokens_raw))
assert len(tokens) == vocab_size, "Token list size must match embedding vocab size"


def find_token_id(word: str) -> int:
    """Look up the ID of a vocabulary entry that matches ``word`` as one token.

    This is a best-effort single-token lookup, not a tokenization. The word is
    tried as-is and then with the prefixes "Ġ", "▁" and " ", in that order; the
    index of the first variant found in ``tokens`` is returned.

    Raises:
        ValueError: If none of the variants is present in ``tokens``.
    """
    for prefix in ("", "Ġ", "▁", " "):
        try:
            return tokens.index(prefix + word)
        except ValueError:
            pass  # variant not in the vocabulary; try the next prefix
    raise ValueError(
        f"Could not find a single-token match for {word!r}. Try another word."
    )


def dequantize_rows(start: int, end: int) -> np.ndarray:
    """Dequantize the embedding rows in the half-open range [start, end).

    Slices ``token_tensor.data[start:end]`` (the quant-packed rows) and passes
    it to ``gguf.dequantize`` with the tensor's quantization type. The result
    is expected to be float32 with shape [N, embed_dim].
    """
    packed_rows = token_tensor.data[start:end]
    return gguf.dequantize(packed_rows, token_tensor.tensor_type)


def get_token_vector(token_id: int) -> np.ndarray:
    """Return the dequantized embedding row for a single token ID."""
    # Dequantize a one-row slice and unwrap it to a 1-D vector.
    single_row = dequantize_rows(token_id, token_id + 1)
    return single_row[0]


def normalize(v: np.ndarray) -> np.ndarray:
    """Scale ``v`` to unit L2 norm.

    A vector whose norm is zero is returned unchanged (the same object), which
    avoids a division by zero.
    """
    length = np.linalg.norm(v)
    if length == 0:
        return v
    return v / length


def find_closest_tokens(
    vector: np.ndarray,
    k: int = 5,
    ignore_ids: Optional[Iterable[int]] = None,
    chunk_size: int = 2048,
) -> List[Tuple[str, float, int]]:
    """
    Approximate 'unembedding': find top-k cosine-similar tokens across the *entire* vocab.
    Works by dequantizing the GGUF embedding matrix in chunks to avoid huge RAM spikes.

    Args:
        vector: Query vector; it is L2-normalized before scoring.
        k: Number of matches to return. ``k <= 0`` returns an empty list.
        ignore_ids: Token IDs to exclude from the results.
        chunk_size: Number of embedding rows dequantized per step; must be >= 1.

    Returns:
        Up to ``k`` tuples ``(token_string, similarity, token_id)``, sorted by
        similarity in descending order.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        # A negative step would make range() empty and silently return no matches.
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")
    if k <= 0:
        # Without this guard the loop below would index heap[0] on an empty heap.
        return []

    ignore = set(ignore_ids or [])
    target = normalize(vector).astype(np.float32, copy=False)

    heap: List[Tuple[float, int]] = []  # (score, token_id), min-heap

    for start in range(0, vocab_size, chunk_size):
        end = min(start + chunk_size, vocab_size)
        emb = dequantize_rows(start, end)  # float32 [N, D]
        norms = np.linalg.norm(emb, axis=1, keepdims=True)
        norms[norms == 0] = 1.0  # leave all-zero rows as zeros instead of dividing by 0
        emb = emb / norms
        scores = emb @ target  # [N]

        for i, score in enumerate(scores):
            tid = start + i
            if tid in ignore:
                continue
            s = float(score)
            if len(heap) < k:
                heapq.heappush(heap, (s, tid))
            elif s > heap[0][0]:
                # heap[0] is the weakest of the current top-k; replace it.
                heapq.heapreplace(heap, (s, tid))

    best = sorted(heap, reverse=True)
    return [(tokens[tid], score, tid) for score, tid in best]


# 3) Demo: King - Man + Woman (single-token lookup)
king_id, man_id, woman_id = (
    find_token_id(word) for word in ("king", "man", "woman")
)

# The three source tokens are excluded from the nearest-neighbour search below.
ignore = [king_id, man_id, woman_id]
king_vec, man_vec, woman_vec = (get_token_vector(token_id) for token_id in ignore)

target_vector = king_vec - man_vec + woman_vec

print("\nMath: king - man + woman")
matches = find_closest_tokens(target_vector, k=5, ignore_ids=ignore, chunk_size=2048)

print("\nClosest matches in the entire vocabulary:")
for i, (tok, score, tid) in enumerate(matches, start=1):
    print(f"{i}. {tok!r} (id={tid}, similarity={score:.4f})")

Using GGUF: /home/max/.ollama/models/blobs/sha256-7f4030143c1c477224c5434f8272c662a8b042079a0a584f0a27a1684fe2e1fa
Vocab size: 151936
Embedding dim: 1024
Embedding quant type: 12

Math: king - man + woman

Closest matches in the entire vocabulary:
1. 'queen' (id=93114, similarity=0.4361)
2. 'ĠKING' (id=73811, similarity=0.4006)
3. 'women' (id=64662, similarity=0.3910)
4. 'King' (id=33555, similarity=0.3866)
5. 'Ġqueen' (id=27906, similarity=0.3709)
