In [6]:
1+2

3

In [65]:
import ollama
import numpy as np
from numpy.linalg import norm

def get_embedding(text):
    response = ollama.embeddings(model='embeddinggemma:300m', prompt=text)
    return np.array(response['embedding'])

def cosine_similarity(a, b):
    return np.dot(a, b) / (norm(a) * norm(b))

# 1. Generate embeddings
king = get_embedding("king")
man = get_embedding("man")
woman = get_embedding("woman")
queen = get_embedding("queen")

# 2. Perform the arithmetic: King - Man + Woman
# We are essentially "subtracting" the concept of masculinity 
# and "adding" the concept of femininity to the concept of royalty.
masculinity_vector = man - woman
result_vector = king - masculinity_vector

# 3. Verify the result
similarity = cosine_similarity(king, queen)

print(f"Similarity between (King - Man + Woman) and Queen: {similarity:.4f}")

Similarity between (King - Man + Woman) and Queen: 0.7330


In [103]:
import json
import heapq
from pathlib import Path
from typing import Iterable, List, Optional, Tuple, Union

import numpy as np
import gguf

# Use an Ollama model reference, e.g. "qwen3:0.6b" or "gemma3:27B".
# This cell reads the local Ollama GGUF blob directly (no HuggingFace downloads).

OLLAMA_MODEL = "granite-embedding:30m"

MANIFESTS_ROOT = Path.home() / ".ollama/models/manifests"
BLOBS_ROOT = Path.home() / ".ollama/models/blobs"

def _parse_ollama_model_ref(model: str) -> Tuple[str, str, str, str]:
    """
    Parse Ollama model reference into (host, namespace, repo_path, tag).

    Supported inputs (best-effort):
    - repo:tag                    -> registry.ollama.ai/library/repo:tag
    - namespace/repo:tag          -> registry.ollama.ai/namespace/repo:tag
    - host/namespace/repo:tag     -> host/namespace/repo:tag

    If :tag is omitted, tag defaults to "latest".
    """
    model = model.strip()
    if not model:
        raise ValueError("Empty OLLAMA_MODEL")

    if ":" in model:
        repo_part, tag = model.rsplit(":", 1)
        tag = tag or "latest"
    else:
        repo_part, tag = model, "latest"

    parts = [p for p in repo_part.split("/") if p]
    if len(parts) >= 3:
        host = parts[0]
        namespace = parts[1]
        repo_path = "/".join(parts[2:])
    elif len(parts) == 2:
        host = "registry.ollama.ai"
        namespace = parts[0]
        repo_path = parts[1]
    else:
        host = "registry.ollama.ai"
        namespace = "library"
        repo_path = parts[0]

    return host, namespace, repo_path, tag

def _candidate_manifest_paths(model: str) -> List[Path]:
    host, namespace, repo_path, tag = _parse_ollama_model_ref(model)
    repo_parts = repo_path.split("/")

    candidates: List[Path] = []
    direct = MANIFESTS_ROOT / host / namespace
    for rp in repo_parts:
        direct = direct / rp
    direct = direct / tag
    candidates.append(direct)

    # Fallback: match any manifest that ends with /<repo_last>/<tag>.
    repo_last = repo_parts[-1]
    candidates.extend(MANIFESTS_ROOT.glob(f"**/{repo_last}/{tag}"))

    seen = set()
    out: List[Path] = []
    for p in candidates:
        if p not in seen:
            seen.add(p)
            out.append(p)
    return out

def _resolve_ollama_manifest(model: str) -> Path:
    candidates = _candidate_manifest_paths(model)
    existing = [p for p in candidates if p.is_file()]
    if not existing:
        raise FileNotFoundError(
            "Could not find an Ollama manifest for model "
            f"{model!r} under {MANIFESTS_ROOT}. Tried:\n"
            + "\n".join(str(p) for p in candidates[:10])
        )
    if len(existing) == 1:
        return existing[0]
    existing.sort(key=lambda p: p.stat().st_mtime, reverse=True)
    return existing[0]

def _resolve_ollama_gguf_blob(manifest_path: Path) -> Path:
    obj = json.loads(manifest_path.read_text(encoding="utf-8"))
    for layer in obj.get("layers", []):
        if layer.get("mediaType") == "application/vnd.ollama.image.model":
            digest = layer["digest"]  # e.g. sha256:...
            if not digest.startswith("sha256:"):
                raise ValueError(f"Unexpected digest format: {digest}")
            blob = BLOBS_ROOT / ("sha256-" + digest.split(":", 1)[1])
            if not blob.exists():
                raise FileNotFoundError(f"Resolved blob does not exist: {blob}")
            return blob
    raise ValueError(f"No model layer found in manifest: {manifest_path}")

def _assert_gguf_file(path: Path) -> None:
    with path.open("rb") as f:
        magic = f.read(4)
    assert magic == b"GGUF", f"Expected GGUF file, got magic={magic!r} at {path}"

manifest_path = _resolve_ollama_manifest(OLLAMA_MODEL)
gguf_path = _resolve_ollama_gguf_blob(manifest_path)
_assert_gguf_file(gguf_path)

print("Using Ollama model:", OLLAMA_MODEL)
print("Using manifest:", manifest_path)
print("Using GGUF:", gguf_path)

# 1) Load the GGUF and access the full token embedding matrix (quantized)
print("1) Load the GGUF and access the full token embedding matrix (quantized)")
reader = gguf.GGUFReader(str(gguf_path))
token_tensor = next(t for t in reader.tensors if t.name == "token_embd.weight")
vocab_size = token_tensor.data.shape[0]

# Determine embedding dimensionality from a dequantized row (robust across architectures).
embed_dim = int(gguf.dequantize(token_tensor.data[:1], token_tensor.tensor_type).shape[1])
print(f"Vocab size: {vocab_size}")
print(f"Embedding dim: {embed_dim}")
print(f"Embedding quant type: {token_tensor.tensor_type}")

# 2) Load tokenizer tokens so we can decode token IDs back to text
print("2) Load tokenizer tokens so we can decode token IDs back to text")
tok_field = reader.fields["tokenizer.ggml.tokens"]
tokens_raw: Union[List[str], List[bytes]] = tok_field.contents()

def _tok_to_str(x: Union[str, bytes]) -> str:
    if isinstance(x, bytes):
        return x.decode("utf-8", errors="replace")
    return str(x)

tokens: List[str] = [_tok_to_str(x) for x in tokens_raw]
# assert len(tokens) == vocab_size, "Token list size must match embedding vocab size"


def find_token_id(word: str) -> int:
    """Best-effort lookup for a *single token* ID (not full tokenization)."""
    candidates = [word, "Ġ" + word, "▁" + word, " " + word]
    for c in candidates:
        try:
            return tokens.index(c)
        except ValueError:
            continue
    raise ValueError(f"Could not find a single-token match for {word!r}. Try another word.")

def dequantize_rows(start: int, end: int) -> np.ndarray:
    """Dequantize embedding rows [start:end) to float32 shape [N, embed_dim]."""
    q = token_tensor.data[start:end]
    return gguf.dequantize(q, token_tensor.tensor_type)

def get_token_vector(token_id: int) -> np.ndarray:
    return dequantize_rows(token_id, token_id + 1)[0]

def get_token_vector_str(token: str) -> np.ndarray:
    tid = find_token_id(token)
    return get_token_vector(tid)

def normalize(v: np.ndarray) -> np.ndarray:
    n = np.linalg.norm(v)
    return v if n == 0 else (v / n)

def find_closest_tokens(
    vector: np.ndarray,
    k: int = 5,
    ignore_ids: Optional[Iterable[int]] = None,
    chunk_size: int = 2048,
) -> List[Tuple[str, float, int]]:
    """
    Find top-k cosine-similar tokens across the *entire* vocab by dequantizing in chunks.
    Returns (token_string, similarity, token_id).
    """
    ignore = set(ignore_ids or [])
    target = normalize(vector).astype(np.float32, copy=False)

    heap: List[Tuple[float, int]] = []  # (score, token_id), min-heap

    for start in range(0, vocab_size, chunk_size):
        end = min(start + chunk_size, vocab_size)
        emb = dequantize_rows(start, end) # float32  [N, embed_dim]
        norms = np.linalg.norm(emb, axis=1, keepdims=True)
        norms[norms == 0] = 1.0
        emb = emb / norms
        scores = emb @ target # float32 [N]

        for i, score in enumerate(scores):
            tid = start + i
            if tid in ignore:
                continue
            s = float(score)
            if len(heap) < k:
                heapq.heappush(heap, (s, tid))
            elif s > heap[0][0]:
                heapq.heapreplace(heap, (s, tid))

    best = sorted(heap, reverse=True)
    return [(tokens[tid], score, tid) for score, tid in best]



Using Ollama model: granite-embedding:30m
Using manifest: /home/max/.ollama/models/manifests/registry.ollama.ai/library/granite-embedding/30m
Using GGUF: /home/max/.ollama/models/blobs/sha256-27d24c87a53d110b95abecbff83f966206857a9dc0ba1efd336d08dbd0afc833
1) Load the GGUF and access the full token embedding matrix (quantized)
Vocab size: 50265
Embedding dim: 384
Embedding quant type: 1
2) Load tokenizer tokens so we can decode token IDs back to text


In [105]:

# 3) Demo: king - man + woman (single-token lookup)
print("3) Demo")
print("3a) find token IDs for inputs")
king_id = find_token_id("king")
man_id = find_token_id("man")
woman_id = find_token_id("woman")

king_vec = get_token_vector(king_id)
queen_vec = get_token_vector(find_token_id("queen"))
boy_vec = get_token_vector(find_token_id("boy"))

man_vec = get_token_vector(man_id)
woman_vec = get_token_vector(woman_id)
masculinity_vector = man_vec - woman_vec

target_vector = queen_vec + masculinity_vector
# target_vector = get_token_vector(find_token_id("The best selling book about wizards was written by JK Rowling and is called "))
target_vector = get_token_vector_str("Paris") - get_token_vector_str("France") + get_token_vector_str("Germany")

ignore = []
print("3b) find closest tokens")
matches = find_closest_tokens(target_vector, k=50, ignore_ids=ignore)

print("\nClosest matches in the entire vocabulary:")
for i, (tok, score, tid) in enumerate(matches, start=1):
    print(f"{i}. {tok!r} (id={tid}, similarity={score:.4f})")

3) Demo
3a) find token IDs for inputs
3b) find closest tokens

Closest matches in the entire vocabulary:
1. 'Paris' (id=32826, similarity=0.6739)
2. 'Germany' (id=24596, similarity=0.5715)
3. 'ĠParis' (id=2201, similarity=0.5415)
4. 'ĠGermany' (id=1600, similarity=0.4718)
5. 'ĠBerlin' (id=5459, similarity=0.4625)
6. 'ĠGerman' (id=1859, similarity=0.4014)
7. 'German' (id=27709, similarity=0.3981)
8. 'Chicago' (id=21897, similarity=0.3699)
9. 'ĠHamburg' (id=17938, similarity=0.3674)
10. 'utsche' (id=35816, similarity=0.3509)
11. 'ĠChicago' (id=1568, similarity=0.3423)
12. 'ĠFrankfurt' (id=14184, similarity=0.3395)
13. 'stanbul' (id=46770, similarity=0.3372)
14. 'ĠDresden' (id=39289, similarity=0.3319)
15. 'ĠDeutsche' (id=8043, similarity=0.3265)
16. 'ĠIstanbul' (id=12275, similarity=0.3245)
17. 'furt' (id=45464, similarity=0.3198)
18. 'ĠMunich' (id=10489, similarity=0.3192)
19. 'ĠGermans' (id=18415, similarity=0.3167)
20. 'London' (id=23122, similarity=0.3167)
21. 'Karl' (id=43397, simil