# Export token→embedding map (LLM backend)

Use this notebook on a machine with OpenAI API access (and `OPENAI_API_KEY` set) to embed the vocab you need (cluster words + transcript tokens), then save an `npz` map for the LLM backend.

After exporting, copy the resulting file (e.g., `embeddings/gpt_tokens.npz`) to Diego and run day26 with:

```
--category-embedding-backend llm \
--lm-embedding-path embeddings/gpt_tokens.npz \
[--no-lm-lowercase-tokens if you keep case]
```

In [None]:
# Configuration
from pathlib import Path
import json
import numpy as np
import pandas as pd
from typing import Iterable, List, Set
from openai import OpenAI

# Repository root, taken as the parent of the current working directory.
project_root = Path.cwd().parent  # assumes notebook lives in notebooks/
# CSV of cluster words; its "word" column is read when building the vocab.
cluster_csv = project_root / "configs" / "cluster_words.csv"
# Point this to your transcript/TextGrid root (or replace with explicit file list)
transcript_roots = [Path("/flash/PaoU/seann/ds003020_copy/derivative/TextGrids")]

# Consulted by normalize_token() for every token before it enters the vocab.
lowercase_tokens = True  # must match runtime flag (--lm-lowercase-tokens default True)
# OpenAI embedding model (API). Options: text-embedding-3-large (quality) or text-embedding-3-small (cheaper)
model_name = "text-embedding-3-large"
# Number of vocab entries sent per embeddings request.
batch_size = 128

# Destination of the token/embedding archive and of its JSON metadata file.
output_npz = project_root / "embeddings" / "gpt_tokens.npz"
metadata_path = project_root / "embeddings" / "gpt_tokens.meta.json"

# Create the output directory up front so the save step does not fail on a
# missing folder after the embeddings have already been computed.
output_npz.parent.mkdir(parents=True, exist_ok=True)

# Echo the resolved locations so a wrong working directory is easy to spot.
print("project_root:", project_root)
print("cluster_csv:", cluster_csv)
print("transcript_roots:", transcript_roots)
print("output_npz:", output_npz)


In [None]:
# Helpers
import re

# Token pattern: maximal runs of ASCII letters, digits, apostrophes, or
# underscores. Anything else (whitespace, punctuation, non-ASCII characters)
# acts as a separator.
word_re = re.compile(r"[A-Za-z0-9'_]+")

def normalize_token(tok: str) -> str:
    """Return ``tok`` with surrounding whitespace removed.

    The result is additionally lowercased when the module-level
    ``lowercase_tokens`` switch is truthy.
    """
    trimmed = tok.strip()
    return trimmed.lower() if lowercase_tokens else trimmed

def tokens_from_text(text: str) -> List[str]:
    """Split ``text`` into normalized tokens, in order of appearance.

    Every match of ``word_re`` is passed through ``normalize_token``;
    duplicates are kept.
    """
    # The pattern has no capture groups, so findall() yields whole matches.
    return list(map(normalize_token, word_re.findall(text)))

def extract_tokens_from_textgrid(path: Path) -> List[str]:
    """Tokenize a TextGrid file by treating it as plain text.

    This is a lightweight fallback: the whole file is read as text and run
    through the token regex, with no TextGrid-specific parsing. Swap in a
    real TextGrid reader if full parsing is preferred.

    Returns an empty list when the file cannot be read.
    """
    try:
        contents = path.read_text(errors="ignore")
    except Exception:
        # Best-effort: an unreadable file simply contributes no tokens.
        return []
    else:
        return tokens_from_text(contents)

def extract_tokens_from_file(path: Path) -> List[str]:
    """Return the normalized tokens found in ``path``, chosen by file suffix.

    * ``.textgrid`` / ``.tg``: delegated to ``extract_tokens_from_textgrid``.
    * ``.txt`` or no suffix: the file is read as text and regex-tokenized.
    * ``.csv`` / ``.tsv``: each non-null cell of the first text column is
      normalized as one token (cells are not split further).

    Any other suffix, an unreadable or unparsable file, or a table without a
    text column yields an empty list.
    """
    suffix = path.suffix.lower()
    if suffix in {".textgrid", ".tg"}:
        return extract_tokens_from_textgrid(path)
    if suffix in {".txt", ""}:
        try:
            text = path.read_text(errors="ignore")
        except OSError:
            # Decoding errors are already ignored, so only I/O can fail here.
            return []
        return tokens_from_text(text)
    if suffix in {".csv", ".tsv"}:
        # A .tsv parsed with the default comma separator collapses each row
        # into a single column, so pick the delimiter from the suffix.
        delimiter = "\t" if suffix == ".tsv" else ","
        try:
            df = pd.read_csv(path, sep=delimiter)
        except (OSError, ValueError):
            # pandas' parser, empty-data, and decode errors derive from ValueError.
            return []
        for col in df.columns:
            column = df[col]
            # Accept both classic object columns and pandas' dedicated string dtype.
            is_text = pd.api.types.is_object_dtype(column) or isinstance(
                column.dtype, pd.StringDtype
            )
            if is_text:
                # Drop missing cells first; astype(str) would otherwise turn
                # them into the literal token "nan".
                cells = column.dropna().astype(str)
                return [normalize_token(cell) for cell in cells]
    return []

def collect_transcript_tokens(roots: Iterable[Path]) -> Set[str]:
    """Gather the unique non-empty tokens from every transcript under ``roots``.

    Each root may be a single file, which is used as given, or a directory,
    which is searched recursively. Directory searches match the suffixes
    ``.textgrid``, ``.tg``, ``.txt``, ``.csv`` and ``.tsv`` case-insensitively,
    so they cover the same formats ``extract_tokens_from_file`` can read.
    Extensionless files are only read when listed explicitly as a root.

    A root that is neither a file nor a directory is skipped with a warning
    rather than silently producing a smaller vocab.
    """
    import warnings

    supported_suffixes = {".textgrid", ".tg", ".txt", ".csv", ".tsv"}
    files: List[Path] = []
    for root in roots:
        if root.is_file():
            files.append(root)
        elif root.is_dir():
            files.extend(
                candidate
                for candidate in root.rglob("*")
                if candidate.is_file()
                and candidate.suffix.lower() in supported_suffixes
            )
        else:
            warnings.warn(f"Transcript root not found, skipping: {root}")

    vocab: Set[str] = set()
    for path in files:
        vocab.update(tok for tok in extract_tokens_from_file(path) if tok)
    return vocab


In [None]:
# Build vocab from cluster CSV
# Every non-null entry of the "word" column is normalized once; entries that
# normalize to an empty string are discarded.
cluster_vocab: Set[str] = set()
df_clusters = pd.read_csv(cluster_csv)
cluster_vocab.update(
    filter(None, map(normalize_token, df_clusters["word"].dropna().astype(str)))
)
print(f"Cluster vocab size: {len(cluster_vocab)}")


In [None]:
# Build vocab from transcripts
# Collects the set of unique tokens from every configured transcript root.
transcript_vocab = collect_transcript_tokens(transcript_roots)
print(f"Transcript vocab size: {len(transcript_vocab)}")


In [None]:
# Final vocab
# Union of both sources with empty tokens removed, sorted so the token order
# is the same on every run.
vocab = sorted(filter(None, cluster_vocab | transcript_vocab))
print(f"Final vocab size: {len(vocab)}")
# Preview the first few entries in the notebook output.
vocab[:10]


In [None]:
# Embed via OpenAI API
from openai import OpenAI

# The client reads its credentials from the environment (OPENAI_API_KEY).
client = OpenAI()
embeddings: List[List[float]] = []

for start in range(0, len(vocab), batch_size):
    chunk = vocab[start : start + batch_size]
    resp = client.embeddings.create(model=model_name, input=chunk)
    # Validate each batch as it arrives so a short response fails immediately
    # instead of after every remaining batch has been requested.
    if len(resp.data) != len(chunk):
        raise RuntimeError(
            f"Embedding API returned {len(resp.data)} vectors for "
            f"{len(chunk)} inputs (batch starting at {start})"
        )
    # Each returned item carries the index of the input it belongs to; order
    # by it so row i of the final matrix always corresponds to vocab[i].
    ordered = sorted(resp.data, key=lambda item: item.index)
    embeddings.extend(item.embedding for item in ordered)
    print(f"Embedded {len(embeddings)}/{len(vocab)}")

# Explicit check rather than `assert`, which is stripped under `python -O`.
if len(embeddings) != len(vocab):
    raise RuntimeError(
        f"Got {len(embeddings)} embeddings for {len(vocab)} vocab entries"
    )
emb_array = np.asarray(embeddings, dtype=float)
print("Embedding array shape:", emb_array.shape)


In [None]:
# Save outputs
# The archive stores the token strings and the embedding matrix under the
# keys "tokens" and "embeddings"; row i of the matrix belongs to token i.
np.savez(output_npz, tokens=np.array(vocab), embeddings=emb_array)

# Metadata file recording how this export was produced.
meta = dict(
    model=model_name,
    lowercase_tokens=lowercase_tokens,
    batch_size=batch_size,
    vocab_size=len(vocab),
    cluster_vocab_size=len(cluster_vocab),
    transcript_vocab_size=len(transcript_vocab),
    output_npz=str(output_npz),
)
metadata_path.write_text(json.dumps(meta, indent=2))

print("Wrote", output_npz)
print("Wrote", metadata_path)
