In [1]:
1 + 2

3

In [None]:
import ollama
import numpy as np
from numpy.linalg import norm


def get_embedding(text):
    response = ollama.embeddings(model="embeddinggemma:300m", prompt=text)
    return np.array(response["embedding"])


def cosine_similarity(a, b):
    return np.dot(a, b) / (norm(a) * norm(b))


# 1. Generate embeddings
king = get_embedding("king")
man = get_embedding("man")
woman = get_embedding("woman")
queen = get_embedding("queen")

# 2. Perform the arithmetic: King - Man + Woman
# We are essentially "subtracting" the concept of masculinity
# and "adding" the concept of femininity to the concept of royalty.
masculinity_vector = man - woman
result_vector = king - masculinity_vector

# 3. Verify the result
similarity = cosine_similarity(king, queen)

print(f"Similarity between (King - Man + Woman) and Queen: {similarity:.4f}")

Similarity between (King - Man + Woman) and Queen: 0.7330


In [None]:
import json
import heapq
import os
from pathlib import Path
from typing import Iterable, List, Optional, Tuple, Union

import numpy as np
import gguf

# Use an Ollama model reference, e.g. "qwen3:0.6b" or "gemma3:27B".
# This notebook reads the local Ollama GGUF blob directly (no HuggingFace downloads).

OLLAMA_MODEL = "gemma3:270M"

MANIFESTS_ROOT = Path.home() / ".ollama/models/manifests"
BLOBS_ROOT = Path.home() / ".ollama/models/blobs"


def _parse_ollama_model_ref(model: str) -> Tuple[str, str, str, str]:
    """Parse Ollama model reference into (host, namespace, repo_path, tag)."""
    model = model.strip()
    if not model:
        raise ValueError("Empty OLLAMA_MODEL")

    if ":" in model:
        repo_part, tag = model.rsplit(":", 1)
        tag = tag or "latest"
    else:
        repo_part, tag = model, "latest"

    parts = [p for p in repo_part.split("/") if p]
    if len(parts) >= 3:
        host = parts[0]
        namespace = parts[1]
        repo_path = "/".join(parts[2:])
    elif len(parts) == 2:
        host = "registry.ollama.ai"
        namespace = parts[0]
        repo_path = parts[1]
    else:
        host = "registry.ollama.ai"
        namespace = "library"
        repo_path = parts[0]

    return host, namespace, repo_path, tag


def _candidate_manifest_paths(model: str) -> List[Path]:
    host, namespace, repo_path, tag = _parse_ollama_model_ref(model)
    repo_parts = repo_path.split("/")

    candidates: List[Path] = []
    direct = MANIFESTS_ROOT / host / namespace
    for rp in repo_parts:
        direct = direct / rp
    direct = direct / tag
    candidates.append(direct)

    # Fallback: match any manifest that ends with /<repo_last>/<tag>.
    repo_last = repo_parts[-1]
    candidates.extend(MANIFESTS_ROOT.glob(f"**/{repo_last}/{tag}"))

    # De-duplicate preserving order
    seen: set[Path] = set()
    out: List[Path] = []
    for p in candidates:
        if p not in seen:
            seen.add(p)
            out.append(p)
    return out


def _resolve_ollama_manifest(model: str) -> Path:
    candidates = _candidate_manifest_paths(model)
    existing = [p for p in candidates if p.is_file()]
    if not existing:
        raise FileNotFoundError(
            "Could not find an Ollama manifest for model "
            f"{model!r} under {MANIFESTS_ROOT}. Tried:\n"
            + "\n".join(str(p) for p in candidates[:10])
        )
    if len(existing) == 1:
        return existing[0]
    existing.sort(key=lambda p: p.stat().st_mtime, reverse=True)
    return existing[0]


def _resolve_ollama_gguf_blob(manifest_path: Path) -> Path:
    obj = json.loads(manifest_path.read_text(encoding="utf-8"))
    for layer in obj.get("layers", []):
        if layer.get("mediaType") == "application/vnd.ollama.image.model":
            digest = layer["digest"]  # e.g. sha256:...
            if not digest.startswith("sha256:"):
                raise ValueError(f"Unexpected digest format: {digest}")
            blob = BLOBS_ROOT / ("sha256-" + digest.split(":", 1)[1])
            if not blob.exists():
                raise FileNotFoundError(f"Resolved blob does not exist: {blob}")
            return blob
    raise ValueError(f"No model layer found in manifest: {manifest_path}")


def _assert_gguf_file(path: Path) -> None:
    with path.open("rb") as f:
        magic = f.read(4)
    assert magic == b"GGUF", f"Expected GGUF file, got magic={magic!r} at {path}"


def _tok_to_str(x: Union[str, bytes]) -> str:
    if isinstance(x, bytes):
        return x.decode("utf-8", errors="replace")
    return str(x)


def _get_chat_template_from_gguf(reader: gguf.GGUFReader) -> Optional[str]:
    field = reader.fields.get("tokenizer.chat_template")
    if not field:
        return None
    data = getattr(field, "data", None)
    if isinstance(data, list) and data:
        return str(data[0])
    return None


def load_ollama_gguf_context(model: str) -> dict:
    manifest_path = _resolve_ollama_manifest(model)
    gguf_path = _resolve_ollama_gguf_blob(manifest_path)
    _assert_gguf_file(gguf_path)

    print("Using Ollama model:", model)
    print("Using manifest:", manifest_path)
    print("Using GGUF:", gguf_path)

    # ---- Embedding-matrix utilities (GGUF) ----
    print("1) Load the GGUF and access the full token embedding matrix (quantized)")
    reader = gguf.GGUFReader(str(gguf_path))

    # token_embd.weight = embedding table (token id -> vector)
    token_tensor = next(t for t in reader.tensors if t.name == "token_embd.weight")
    vocab_size = token_tensor.data.shape[0]

    # Determine embedding dimensionality from a dequantized row (robust across architectures).
    embed_dim = int(
        gguf.dequantize(token_tensor.data[:1], token_tensor.tensor_type).shape[1]
    )

    print(f"Vocab size: {vocab_size}")
    print(f"Embedding dim: {embed_dim}")
    print(f"Embedding quant type: {token_tensor.tensor_type}")

    print("2) Load tokenizer tokens so we can decode token IDs back to text")
    tok_field = reader.fields["tokenizer.ggml.tokens"]
    tokens_raw: Union[List[str], List[bytes]] = tok_field.contents()

    tokens: List[str] = [_tok_to_str(x) for x in tokens_raw]

    chat_template = _get_chat_template_from_gguf(reader)

    return {
        "reader": reader,
        "gguf_path": gguf_path,
        "vocab_size": vocab_size,
        "embed_dim": embed_dim,
        "tokens": tokens,
        "token_tensor": token_tensor,
        "chat_template": chat_template,
    }


# %load_ext line_profiler
# %lprun -f load_ollama_gguf_context context = load_ollama_gguf_context(OLLAMA_MODEL)
context = load_ollama_gguf_context(OLLAMA_MODEL)
_CHAT_TEMPLATE = context["chat_template"]


def find_token_id(word: str, tokens: List[str]) -> int:
    """Best-effort lookup for a *single token* ID (not full tokenization)."""
    candidates = [word, "Ġ" + word, "▁" + word, " " + word]
    for c in candidates:
        try:
            return tokens.index(c)
        except ValueError:
            continue
    raise ValueError(
        f"Could not find a single-token match for {word!r}. Try another word."
    )


def dequantize_rows(start: int, end: int, token_tensor) -> np.ndarray:
    """Dequantize embedding rows [start:end) to float32 shape [N, embed_dim]."""
    q = token_tensor.data[start:end]
    return gguf.dequantize(q, token_tensor.tensor_type)


def get_token_vector(token_id: int, token_tensor) -> np.ndarray:
    return dequantize_rows(token_id, token_id + 1, token_tensor)[0]


def get_token_vector_str(token: str, tokens: List[str], token_tensor) -> np.ndarray:
    tid = find_token_id(token, tokens)
    return get_token_vector(tid, token_tensor)


def normalize(v: np.ndarray) -> np.ndarray:
    n = np.linalg.norm(v)
    return v if n == 0 else (v / n)


def find_closest_tokens(
    vector: np.ndarray,
    k: int = 5,
    ignore_ids: Optional[Iterable[int]] = None,
    chunk_size: int = 2048,
    vocab_size: int = None,
    token_tensor=None,
    tokens: List[str] = None,
) -> List[Tuple[str, float, int]]:
    """
    Find top-k cosine-similar tokens across the *entire* vocab by dequantizing in chunks.
    Returns (token_string, similarity, token_id).
    """
    ignore = set(ignore_ids or [])
    target = normalize(vector).astype(np.float32, copy=False)

    heap: List[Tuple[float, int]] = []  # (score, token_id), min-heap

    for start in range(0, vocab_size, chunk_size):
        end = min(start + chunk_size, vocab_size)
        emb = dequantize_rows(start, end, token_tensor)  # float32  [N, embed_dim]
        norms = np.linalg.norm(emb, axis=1, keepdims=True)
        norms[norms == 0] = 1.0
        emb = emb / norms
        scores = emb @ target  # float32 [N]

        for i, score in enumerate(scores):
            tid = start + i
            if tid in ignore:
                continue
            s = float(score)
            if len(heap) < k:
                heapq.heappush(heap, (s, tid))
            elif s > heap[0][0]:
                heapq.heapreplace(heap, (s, tid))

    best = sorted(heap, reverse=True)
    return [(tokens[tid], score, tid) for score, tid in best]


# ---- Chat-like function (local GGUF) ----
# Note: `gguf` provides reading/dequantization utilities but does not execute the model forward pass.
# For inference on a GGUF model we use llama.cpp via `llama-cpp-python`, still pointing at Ollama's local GGUF blob.

SYSTEM_PROMPT = "You are a helpful assistant."

_CHAT_MESSAGES: List[dict[str, str]] = [{"role": "system", "content": SYSTEM_PROMPT}]
_LLAMA = None


def _render_prompt(messages: List[dict[str, str]]) -> str:
    """Render a prompt using GGUF's chat template when available; fallback to a simple ChatML-ish format."""
    if _CHAT_TEMPLATE:
        try:
            from jinja2 import Environment, StrictUndefined

            def raise_exception(msg: str):
                raise RuntimeError(msg)

            env = Environment(
                undefined=StrictUndefined, trim_blocks=True, lstrip_blocks=True
            )
            j = env.from_string(_CHAT_TEMPLATE)
            return j.render(
                messages=messages,
                add_generation_prompt=True,
                raise_exception=raise_exception,
            )
        except Exception:
            pass

    parts: List[str] = []
    for m in messages:
        parts.append(f"<|im_start|>{m['role']}\n{m['content']}\n<|im_end|>")
    parts.append("<|im_start|>assistant\n")
    return "\n".join(parts)


def _get_llama(gguf_path: Path):
    global _LLAMA
    if _LLAMA is not None:
        return _LLAMA
    try:
        from llama_cpp import Llama
    except Exception as e:
        raise RuntimeError(
            "Missing dependency: llama-cpp-python. Install it in this notebook kernel, e.g.\n"
            "  pip install llama-cpp-python\n\n"
            "This still uses the Ollama-downloaded GGUF file locally and does not use the ollama Python package."
        ) from e

    n_ctx = 4096
    n_threads = max(1, (os.cpu_count() or 4) - 1)
    _LLAMA = Llama(
        model_path=str(gguf_path),
        n_ctx=n_ctx,
        n_threads=n_threads,
        logits_all=False,
        vocab_only=False,
    )
    return _LLAMA


def next_message(user_message: str, messages: List[dict[str, str]]) -> str:
    """
    Chat-like helper similar to `ollama.chat`, but using the local Ollama-downloaded GGUF blob.
    Does not import or call the `ollama` Python package.
    """
    msg = user_message.strip()
    if not msg:
        return ""

    messages.append({"role": "user", "content": msg})
    prompt = _render_prompt(messages)

    llm = _get_llama(context["gguf_path"])
    out = llm.create_completion(
        prompt=prompt,
        max_tokens=256,
        temperature=0.7,
        top_p=0.95,
        stop=["<|im_end|>", "</s>"],
    )
    text = (out.get("choices", [{}])[0].get("text") or "").strip()
    messages.append({"role": "assistant", "content": text})
    return text

Using Ollama model: gemma3:270M
Using manifest: /home/max/.ollama/models/manifests/registry.ollama.ai/library/gemma3/270M
Using GGUF: /home/max/.ollama/models/blobs/sha256-735af2139dc652bf01112746474883d79a52fa1c19038265d363e3d42556f7a2
1) Load the GGUF and access the full token embedding matrix (quantized)
Vocab size: 262144
Embedding dim: 640
Embedding quant type: 8
2) Load tokenizer tokens so we can decode token IDs back to text


In [15]:
# Example usage of `next_message` (chat-like helper backed by the local GGUF)

messages = [{"role": "system", "content": SYSTEM_PROMPT}]

prompt = "Give me a 1-sentence summary of what you can do."
print("User:", prompt)
print("Assistant:", next_message(prompt, messages))
# print()

# print(
#     "User: Write a Python function that computes cosine similarity between two numpy vectors."
# )
# print(
#     "Assistant:",
#     next_message(
#         "Write a Python function that computes cosine similarity between two numpy vectors.",
#         messages
#     ),
# )
# print()

# print("User: Now explain in 2 bullet points how the prompt is constructed here.")
# print(
#     "Assistant:",
#     next_message(
#         "Now explain in 2 bullet points how the prompt is constructed here.",
#         messages
#     ),
# )

llama_model_loader: loaded meta data with 36 key-value pairs and 236 tensors from /home/max/.ollama/models/blobs/sha256-735af2139dc652bf01112746474883d79a52fa1c19038265d363e3d42556f7a2 (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = gemma3
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                         general.size_label str              = 268M
llama_model_loader: - kv   3:                            general.license str              = gemma
llama_model_loader: - kv   4:                   general.base_model.count u32              = 1
llama_model_loader: - kv   5:                  general.base_model.0.name str              = Gemma 3 270m
llama_model_loader: - kv   6:          general.base_model.0.organization str              = Google


User: Give me a 1-sentence summary of what you can do.


load: control token: 261779 '<unused5877>' is not marked as EOG
load: control token: 261778 '<unused5876>' is not marked as EOG
load: control token: 261777 '<unused5875>' is not marked as EOG
load: control token: 261776 '<unused5874>' is not marked as EOG
load: control token: 261775 '<unused5873>' is not marked as EOG
load: control token: 261774 '<unused5872>' is not marked as EOG
load: control token: 261773 '<unused5871>' is not marked as EOG
load: control token: 261771 '<unused5869>' is not marked as EOG
load: control token: 261767 '<unused5865>' is not marked as EOG
load: control token: 261766 '<unused5864>' is not marked as EOG
load: control token: 261763 '<unused5861>' is not marked as EOG
load: control token: 261762 '<unused5860>' is not marked as EOG
load: control token: 261761 '<unused5859>' is not marked as EOG
load: control token: 261759 '<unused5857>' is not marked as EOG
load: control token: 261758 '<unused5856>' is not marked as EOG
load: control token: 261756 '<unused5854

In [None]:
# 3) Demo: king - man + woman (single-token lookup)
print("3) Demo")
print("3a) find token IDs for inputs")
king_id = find_token_id("king", context["tokens"])
man_id = find_token_id("man", context["tokens"])
woman_id = find_token_id("woman", context["tokens"])

king_vec = get_token_vector(king_id, context["token_tensor"])
queen_vec = get_token_vector(
    find_token_id("queen", context["tokens"]), context["token_tensor"]
)
boy_vec = get_token_vector(
    find_token_id("boy", context["tokens"]), context["token_tensor"]
)

man_vec = get_token_vector(man_id, context["token_tensor"])
woman_vec = get_token_vector(woman_id, context["token_tensor"])
masculinity_vector = man_vec - woman_vec

target_vector = queen_vec + masculinity_vector
target_vector = get_token_vector_str(
    "emoji", context["tokens"], context["token_tensor"]
) - get_token_vector_str(
    "emoji", context["tokens"], context["token_tensor"]
)  # + get_token_vector_str("apple", context['tokens'], context['token_tensor']) #+ get_token_vector_str("Playstation", context['tokens'], context['token_tensor'])

ignore = []
print("3b) find closest tokens")
matches = find_closest_tokens(
    target_vector,
    k=25,
    ignore_ids=ignore,
    vocab_size=context["vocab_size"],
    token_tensor=context["token_tensor"],
    tokens=context["tokens"],
)

print("\nClosest matches in the entire vocabulary:")
for i, (tok, score, tid) in enumerate(matches, start=1):
    print(f"{i}. {tok!r} (id={tid}, similarity={score:.4f})")

longest_token = max(context["tokens"], key=lambda t: len(t))
print(
    f"\nLongest token in vocabulary is {len(longest_token)} characters: {longest_token!r}"
)

ten_longest_tokens = heapq.nlargest(10, context["tokens"], key=lambda t: len(t))
print("\nTop 10 longest tokens in vocabulary:")
for i, tok in enumerate(ten_longest_tokens, start=1):
    print(f"{i}. {len(tok)} chars: {tok!r}")

3) Demo
3a) find token IDs for inputs
3b) find closest tokens

Closest matches in the entire vocabulary:
1. '<unused18>' (id=24, similarity=0.0000)
2. '<unused17>' (id=23, similarity=0.0000)
3. '<unused16>' (id=22, similarity=0.0000)
4. '<unused15>' (id=21, similarity=0.0000)
5. '<unused14>' (id=20, similarity=0.0000)
6. '<unused13>' (id=19, similarity=0.0000)
7. '<unused12>' (id=18, similarity=0.0000)
8. '<unused11>' (id=17, similarity=0.0000)
9. '<unused10>' (id=16, similarity=0.0000)
10. '<unused9>' (id=15, similarity=0.0000)
11. '<unused8>' (id=14, similarity=0.0000)
12. '<unused7>' (id=13, similarity=0.0000)
13. '<unused6>' (id=12, similarity=0.0000)
14. '<unused5>' (id=11, similarity=0.0000)
15. '<unused4>' (id=10, similarity=0.0000)
16. '<unused3>' (id=9, similarity=0.0000)
17. '<unused2>' (id=8, similarity=0.0000)
18. '<unused1>' (id=7, similarity=0.0000)
19. '<unused0>' (id=6, similarity=0.0000)
20. '[multimodal]' (id=5, similarity=0.0000)
21. '<mask>' (id=4, similarity=0.0000