# Final SID metrics + SID-space visualization

完成两件事：

1. **最终版 SID 指标计算**（collision / bucket stats / prefix / CUR / entropy / top1_share / PAS + 类别混合度）
2. 基于 **SID（离散 token）本身** 的降维可视化：把每个 item 的 SID 当作 4 个离散特征，用 OneHotEncoder → TruncatedSVD 得到 2D 表示，在 2D 平面上用 **类别上色**，看不同品类是否在 SID 空间被分到不同区域。


In [1]:
import json
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Input locations used by the cells below.
SID_JSON_PATH = Path("./data/Amazon2018/All_Amazon/SIDs/debug.index.json")   # SID index JSON
EMB_DIR = Path("./data/Amazon2018/All_Amazon")  # embedding directory (4 .npy files)

## 0) 读入 embeddings（并自动生成类别标签）

In [2]:
def load_embeddings_dir_with_splits(emb_dir: Path):
    """Load every ``.npy`` / ``.npz`` file of a directory and stack them row-wise.

    Files are processed in sorted path order. The category of a file is the
    part of its name before the first ``.emb-`` (for example
    ``Arts_Crafts_and_Sewing.emb-xxx.npy`` -> ``Arts_Crafts_and_Sewing``);
    when the name has no ``.emb-`` marker the file stem is used instead.

    Args:
        emb_dir: Directory that directly contains the embedding files.

    Returns:
        A tuple ``(X, y, splits)``. ``X`` is the float32 matrix obtained by
        concatenating all files along axis 0 (NaN/Inf replaced by 0), ``y``
        holds one category string per row of ``X``, and ``splits`` has one
        dict per file with keys ``file``, ``cat``, ``start``, ``end``, ``n``
        where ``start`` / ``end`` are 0-based inclusive row indices.

    Raises:
        FileNotFoundError: If the directory has no ``.npy`` / ``.npz`` file.
        KeyError: If an ``.npz`` archive does not hold exactly one array and
            has no ``arr_0`` entry.
        ValueError: If an array is not 1-D/2-D, or its width differs from the
            width of the first file.
    """
    files = sorted([*emb_dir.glob("*.npy"), *emb_dir.glob("*.npz")])
    if not files:
        raise FileNotFoundError(f"No .npy/.npz files in: {emb_dir}")

    arrays = []
    y = []
    splits = []
    start = 0
    dim = None

    print(f"[Emb] Loading directory: {emb_dir} (files={len(files)})")
    for f in files:
        obj = np.load(f, allow_pickle=False)
        if isinstance(obj, np.lib.npyio.NpzFile):
            # An NpzFile keeps the archive open until closed; read the array
            # inside ``with`` so the file handle is always released.
            with obj:
                keys = list(obj.files)
                if len(keys) == 1:
                    arr = obj[keys[0]]
                elif "arr_0" in keys:
                    arr = obj["arr_0"]
                else:
                    raise KeyError(
                        f"{f.name}: expected a single array or 'arr_0', found keys {keys}"
                    )
        else:
            arr = obj
        arr = np.asarray(arr)
        if arr.ndim == 1:
            # A lone vector is treated as a single-row matrix.
            arr = arr.reshape(1, -1)
        if arr.ndim != 2:
            raise ValueError(f"{f} must be 1D/2D, got {arr.shape}")
        if arr.dtype != np.float32:
            arr = arr.astype(np.float32, copy=False)

        # Replace NaN / +-Inf by 0 in place.
        if not np.isfinite(arr).all():
            np.nan_to_num(arr, copy=False, nan=0.0, posinf=0.0, neginf=0.0)

        if dim is None:
            dim = arr.shape[1]
        elif arr.shape[1] != dim:
            raise ValueError(f"Dim mismatch: expected {dim}, got {arr.shape[1]} in {f.name}")

        n = arr.shape[0]
        end = start + n - 1  # inclusive end row

        # Category name: text before `.emb-`, otherwise the file stem.
        name = f.name
        cat = name.split(".emb-")[0] if ".emb-" in name else f.stem

        print(f"[Emb] rows {start} ~ {end}: {f.name} (n={n}, dim={arr.shape[1]})  ->  cat={cat}")

        arrays.append(arr)
        y.extend([cat] * n)
        splits.append({"file": f.name, "cat": cat, "start": start, "end": end, "n": n})
        start = end + 1

    X = np.concatenate(arrays, axis=0)
    y = np.asarray(y)
    print(f"[Emb] Loaded embeddings shape: {X.shape}")
    return X, y, splits

# Load all embedding files and remember the matrix dimensions.
X, y_cat, splits = load_embeddings_dir_with_splits(emb_dir=EMB_DIR)
N = X.shape[0]
D = X.shape[1]
# Cell output: number of labels plus the per-file row ranges.
len(y_cat), splits


[Emb] Loading directory: data/Amazon2018/All_Amazon (files=4)
[Emb] rows 0 ~ 8093: Arts_Crafts_and_Sewing.emb-Qwen3-Embedding-4B-td.npy (n=8094, dim=2560)  ->  cat=Arts_Crafts_and_Sewing
[Emb] rows 8094 ~ 11526: Industrial_and_Scientific.emb-Qwen3-Embedding-4B-td.npy (n=3433, dim=2560)  ->  cat=Industrial_and_Scientific
[Emb] rows 11527 ~ 19595: Office_Products.emb-Qwen3-Embedding-4B-td.npy (n=8069, dim=2560)  ->  cat=Office_Products
[Emb] rows 19596 ~ 22740: Video_Games.emb-Qwen3-Embedding-4B-td.npy (n=3145, dim=2560)  ->  cat=Video_Games
[Emb] Loaded embeddings shape: (22741, 2560)


(22741,
 [{'file': 'Arts_Crafts_and_Sewing.emb-Qwen3-Embedding-4B-td.npy',
   'cat': 'Arts_Crafts_and_Sewing',
   'start': 0,
   'end': 8093,
   'n': 8094},
  {'file': 'Industrial_and_Scientific.emb-Qwen3-Embedding-4B-td.npy',
   'cat': 'Industrial_and_Scientific',
   'start': 8094,
   'end': 11526,
   'n': 3433},
  {'file': 'Office_Products.emb-Qwen3-Embedding-4B-td.npy',
   'cat': 'Office_Products',
   'start': 11527,
   'end': 19595,
   'n': 8069},
  {'file': 'Video_Games.emb-Qwen3-Embedding-4B-td.npy',
   'cat': 'Video_Games',
   'start': 19596,
   'end': 22740,
   'n': 3145}])

## 1) 读入 SID json

从 SID json 构建 codes_sem (N,3) + suffix (N,)

In [3]:
import re

# Matches one whole SID token such as `<a_77>`:
# group 1 is the alphabetic layer name, group 2 the numeric code.
TOKEN_RE = re.compile(r"^<([a-zA-Z]+)_(\d+)>$")

def load_sid_json(path: Path):
    """Read the SID index JSON and report its largest integer key.

    Args:
        path: UTF-8 JSON file holding a mapping whose keys are parsed with
            ``int`` (so they may be numeric strings) and whose values are the
            SID token lists.

    Returns:
        ``(sid_map, max_k)``: the decoded mapping exactly as stored in the
        file, and the largest key interpreted as an integer.

    Raises:
        ValueError: If the mapping is empty or a key is not an integer.
    """
    sid_map = json.loads(path.read_text(encoding="utf-8"))
    if not sid_map:
        # Fail with an explicit message instead of the opaque error that
        # `max()` raises on an empty sequence.
        raise ValueError(f"SID json has no entries: {path}")
    max_k = max(int(k) for k in sid_map)
    return sid_map, max_k

sid_map, max_k = load_sid_json(SID_JSON_PATH)
print(f"[SID] loaded entries: {len(sid_map)}, max key={max_k}")

# Number of rows that both the embeddings and the SID map can cover.
M = min(N, max_k + 1)

# Warn when the SID key range and the embedding row count disagree.
if not (max_k + 1 == N):
    print(f"[WARN] SID size ({max_k+1}) != embedding N ({N}). "
          f"后续会按 min(N, max_k+1) 对齐。")

def parse_tokens(tokens):
    """Split SID tokens into parallel lists of layer names and integer codes.

    Each token must match ``TOKEN_RE`` in full; the first token that does not
    raises ``ValueError``.

    Returns:
        ``(layers, ids)``: the layer name and the integer code of every token,
        in input order.
    """
    parsed = []
    for tok in tokens:
        hit = TOKEN_RE.match(tok)
        if hit is None:
            raise ValueError(f"Bad token format: {tok}")
        parsed.append((hit.group(1), int(hit.group(2))))

    layers = [name for name, _ in parsed]
    ids = [code for _, code in parsed]
    return layers, ids

# Parse the first couple of entries as a quick sanity check.
for k in ("0", "1"):
    if k not in sid_map:
        continue
    layers, ids = parse_tokens(sid_map[k])
    print(k, sid_map[k], "->", layers, ids)


[SID] loaded entries: 22741, max key=22740
0 ['<a_77>', '<b_50>', '<c_50>'] -> ['a', 'b', 'c'] [77, 50, 50]
1 ['<a_188>', '<b_229>', '<c_253>', '<d_1>'] -> ['a', 'b', 'c', 'd'] [188, 229, 253, 1]


In [4]:
import re
import numpy as np

# Same pattern as the TOKEN_RE defined in the previous cell, redefined here:
# matches one whole SID token such as `<a_77>` (group 1 = layer name, group 2 = numeric code).
TOKEN_RE = re.compile(r"^<([a-zA-Z]+)_(\d+)>$")

def parse_token(t: str):
    """Parse one SID token into ``(layer_name, code)``.

    Raises:
        ValueError: If ``t`` does not match ``TOKEN_RE``.
    """
    matched = TOKEN_RE.match(t)
    if matched is None:
        raise ValueError(f"Bad token format: {t}")
    layer, code = matched.groups()
    return layer, int(code)

def build_semantic_codes_and_suffix(sid_map: dict, M: int, X: np.ndarray, y_cat: np.ndarray):
    """Build aligned semantic-code / suffix arrays from the SID map.

    The semantic SID of an item is its first three tokens; a fourth token,
    when present, is recorded as the suffix. Entries may therefore have three
    or four (or more) tokens. Rows ``0 .. M-1`` are visited by their string
    key; a row is skipped and counted as *missing* when the key is absent,
    and as *bad* when the value is not a list, has fewer than three tokens,
    or contains a token that ``parse_token`` rejects.

    Args:
        sid_map: Mapping from the string form of a row index to a token list.
        M: Number of leading rows to consider.
        X: Embedding matrix; only rows of kept items are returned.
        y_cat: Per-row labels, indexed the same way as ``X``.

    Returns:
        ``(codes3, suffix, has_suffix, X_ok, y_ok, layer_names3)`` where
        ``codes3`` is int32 of shape ``(kept, 3)``, ``suffix`` is int32 of
        shape ``(kept,)`` with ``-1`` where no fourth token exists,
        ``has_suffix`` is the matching boolean mask, ``X_ok`` / ``y_ok`` are
        the rows of ``X`` / ``y_cat`` for kept items, and ``layer_names3`` is
        the first three layer names of the first kept entry (``None`` when
        nothing was kept).
    """
    codes3 = []
    suffix = []
    kept_idx = []

    n_missing = 0
    n_bad = 0

    layer_names3 = None

    for i in range(M):
        k = str(i)
        if k not in sid_map:
            n_missing += 1
            continue

        toks = sid_map[k]
        if not isinstance(toks, list) or len(toks) < 3:
            n_bad += 1
            continue

        try:
            parsed = [parse_token(t) for t in toks]
        except (ValueError, TypeError):
            # ValueError: malformed token text; TypeError: token is not a string.
            n_bad += 1
            continue

        ids = [code for _, code in parsed]

        # Semantic part: the first three layers.
        if layer_names3 is None:
            layer_names3 = [name for name, _ in parsed[:3]]

        codes3.append(ids[:3])
        kept_idx.append(i)

        # Suffix: the fourth layer when present, -1 otherwise.
        suffix.append(ids[3] if len(ids) >= 4 else -1)

    # reshape keeps the (kept, 3) layout even when nothing was kept, so
    # callers can always unpack `codes3.shape` into two values.
    codes3 = np.asarray(codes3, dtype=np.int32).reshape(-1, 3)
    suffix = np.asarray(suffix, dtype=np.int32)
    has_suffix = suffix >= 0 if False else np.asarray(
        [len(sid_map[str(i)]) >= 4 for i in kept_idx], dtype=bool
    )
    kept_idx = np.asarray(kept_idx, dtype=np.int32)

    n_len4 = int(has_suffix.sum())
    n_len3 = int(has_suffix.size) - n_len4
    # Avoid the nan + RuntimeWarning that mean() produces on an empty mask.
    suffix_rate = float(has_suffix.mean()) if has_suffix.size else 0.0

    X_ok = X[:M][kept_idx]
    y_ok = y_cat[:M][kept_idx]

    print(f"[SID] M={M}, kept={len(kept_idx)}")
    print(f"[SID] missing={n_missing}, bad={n_bad}, len3={n_len3}, len4={n_len4}")
    print(f"[SID] codes3 shape={codes3.shape}, suffix shape={suffix.shape}, has_suffix={suffix_rate:.3f}")
    print(f"[SID] layer_names3={layer_names3}")

    return codes3, suffix, has_suffix, X_ok, y_ok, layer_names3

# Usage (replaces the earlier build_codes_matrix call).
(
    codes_sem,
    suffix_d,
    has_suffix,
    X_aligned,
    y,
    layer_names3,
) = build_semantic_codes_and_suffix(sid_map=sid_map, M=M, X=X, y_cat=y_cat)
N2 = codes_sem.shape[0]
L_sem = codes_sem.shape[1]
print("N2, L_sem =", N2, L_sem)


[SID] M=22741, kept=22741
[SID] missing=0, bad=0, len3=16718, len4=6023
[SID] codes3 shape=(22741, 3), suffix shape=(22741,), has_suffix=0.265
[SID] layer_names3=['a', 'b', 'c']
N2, L_sem = 22741 3


## 2) 最终指标计算（含类别混合度）

### 语义指标只用 codes_sem（前三层）

In [None]:
from collections import Counter, defaultdict
import numpy as np
import math

def entropy_and_perplexity(counter: Counter):
    """Return the Shannon entropy (in nats) and perplexity of a count table.

    Args:
        counter: Mapping from symbol to its count.

    Returns:
        ``(H, exp(H))``. An empty table, or one whose counts sum to zero,
        yields ``(0.0, 0.0)``.
    """
    total = sum(counter.values())
    if total == 0:
        return 0.0, 0.0
    H = 0.0
    for c in counter.values():
        if c <= 0:
            # A zero count contributes nothing (p * log p -> 0); skipping it
            # removes the need for an epsilon inside the log, which used to
            # push the entropy of a single-symbol table below zero.
            continue
        p = c / total
        H -= p * math.log(p)
    return H, math.exp(H)

def bucket_stats_from_counter(cnt: Counter):
    """Summarise the bucket sizes stored as the values of ``cnt``.

    Returns:
        A dict with the largest bucket, the 95th / 50th percentile and mean
        bucket size, and the number of buckets. All values are zero when
        ``cnt`` is empty.
    """
    sizes = np.asarray(list(cnt.values()), dtype=np.int32)
    n_buckets = len(sizes)

    if n_buckets == 0:
        return {
            "max_bucket": 0,
            "p95_bucket": 0.0,
            "p50_bucket": 0.0,
            "mean_bucket": 0.0,
            "num_buckets": 0,
        }

    largest = int(sizes.max())
    p95 = float(np.percentile(sizes, 95))
    median = float(np.percentile(sizes, 50))
    average = float(sizes.mean())
    return {
        "max_bucket": largest,
        "p95_bucket": p95,
        "p50_bucket": median,
        "mean_bucket": average,
        "num_buckets": int(n_buckets),
    }

def cosine_normalize(X):
    """Scale every row of ``X`` to (approximately) unit L2 norm, as float32.

    A small constant is added to each row norm so all-zero rows stay zero
    instead of dividing by zero.
    """
    rows = X.astype(np.float32, copy=False)
    row_norms = np.linalg.norm(rows, axis=1, keepdims=True)
    return rows / (row_norms + 1e-12)

def compute_pas(bucket_keys, X_normed, pairs_per_bucket=20, max_keep_per_bucket=30, seed=2024):
    """Average dot product of randomly drawn item pairs that share a bucket.

    Args:
        bucket_keys: One hashable bucket key per row of ``X_normed``.
        X_normed: Row vectors whose pairwise dot products are averaged.
        pairs_per_bucket: Upper bound on pairs drawn from one bucket.
        max_keep_per_bucket: Only the first this-many rows of a bucket are
            eligible for sampling.
        seed: Seed for the NumPy random generator.

    Returns:
        Mean similarity over all drawn pairs, or ``nan`` when no bucket has
        at least two eligible rows.
    """
    members = {}
    for row, key in enumerate(bucket_keys):
        kept = members.setdefault(key, [])
        if len(kept) < max_keep_per_bucket:
            kept.append(row)

    rng = np.random.default_rng(seed)
    sims = []
    for kept in members.values():
        size = len(kept)
        if size < 2:
            continue
        # Never draw more pairs than the bucket has distinct pairs.
        n_draws = min(pairs_per_bucket, size * (size - 1) // 2)
        for _ in range(n_draws):
            first, second = rng.choice(kept, size=2, replace=False)
            sims.append(float(np.dot(X_normed[first], X_normed[second])))

    if not sims:
        return float("nan")
    return float(np.mean(sims))

def compute_bucket_purity(bucket_keys, labels):
    """Measure how label-homogeneous the buckets are.

    The purity of one bucket is the share of its items that carry the
    bucket's most frequent label.

    Args:
        bucket_keys: One hashable bucket key per item.
        labels: One label per item, aligned with ``bucket_keys``; any
            array-like is accepted.

    Returns:
        A dict with ``purity_weighted`` (bucket purities weighted by bucket
        size, i.e. majority-label items divided by all items) and
        ``purity_unweighted`` (plain mean of the bucket purities). Both are
        ``nan`` when there are no items.
    """
    # Fancy indexing below needs an ndarray; this also accepts plain lists.
    labels = np.asarray(labels)

    by = defaultdict(list)
    for i, k in enumerate(bucket_keys):
        by[k].append(i)

    total = len(bucket_keys)
    if total == 0:
        # No buckets at all: report "undefined" instead of dividing by zero.
        return {"purity_weighted": float("nan"), "purity_unweighted": float("nan")}

    majority_items = 0
    unweighted = 0.0
    for idxs in by.values():
        cnt = Counter(labels[idxs].tolist())
        top = max(cnt.values())
        # purity * bucket_size is exactly the majority count; summing the
        # integer avoids a float round trip.
        majority_items += top
        unweighted += top / len(idxs)
    return {"purity_weighted": majority_items / total, "purity_unweighted": unweighted / len(by)}

def compute_semantic_metrics(codes_sem: np.ndarray, X: np.ndarray, y: np.ndarray, K_list=None):
    """Compute collision, prefix, per-layer and bucket-quality metrics.

    Args:
        codes_sem: Integer array of shape ``(N, L)`` with one semantic code
            per layer for every item.
        X: Embeddings aligned with ``codes_sem``; cosine-normalised here
            before the pairwise-similarity score is computed.
        y: Per-item labels aligned with ``codes_sem``, used for bucket purity.
        K_list: Optional codebook size per layer. When omitted each layer
            uses ``max(256, max_id + 1)``.

    Returns:
        A dict with ``N``, ``L_sem``, ``K_list``; collision / unique rate and
        bucket statistics of the full semantic key (``*_sem``); collision
        rate and largest bucket for every prefix length
        (``*_prefix_{d}``); per-layer ``CUR_{l}``, ``entropy_{l}``,
        ``perplexity_{l}``, ``top1_share_{l}``; and ``PAS_sem`` plus the
        weighted / unweighted bucket purity.
    """
    N, L = codes_sem.shape

    # K_list can be given explicitly; otherwise apply the "at least 256"
    # policy. A largest id of mx needs mx + 1 slots (ids presumably start at
    # 0, as the original `mx + 1` branch assumed), so mx == 256 must give 257,
    # not 256.
    if K_list is None:
        K_list = [max(256, int(codes_sem[:, l].max()) + 1) for l in range(L)]

    out = {"N": int(N), "L_sem": int(L), "K_list": [int(k) for k in K_list]}

    # Full semantic SID keys.
    sem_keys = [tuple(row) for row in codes_sem.tolist()]
    cnt = Counter(sem_keys)
    uniq = len(cnt)

    out["collision_rate_sem"] = (N - uniq) / N
    out["unique_rate_sem"] = uniq / N
    out.update({f"{k}_sem": v for k, v in bucket_stats_from_counter(cnt).items()})

    # Prefix@1..@L: collisions when only the first d layers are used.
    for d in range(1, L + 1):
        pc = Counter(tuple(row) for row in codes_sem[:, :d].tolist())
        out[f"collision_rate_prefix_{d}"] = (N - len(pc)) / N
        out[f"max_bucket_prefix_{d}"] = int(max(pc.values()))

    # Per-layer code distribution.
    for l in range(L):
        K = int(K_list[l])
        c = Counter(codes_sem[:, l].tolist())
        H, ppl = entropy_and_perplexity(c)
        out[f"CUR_{l+1}"] = len(c) / K  # share of the codebook actually used
        out[f"entropy_{l+1}"] = H
        out[f"perplexity_{l+1}"] = ppl
        out[f"top1_share_{l+1}"] = max(c.values()) / N

    # PAS + purity, with the semantic bucket as the unit.
    Xn = cosine_normalize(X)
    out["PAS_sem"] = compute_pas(sem_keys, Xn)
    pur = compute_bucket_purity(sem_keys, y)
    out["bucket_purity_weighted_sem"] = pur["purity_weighted"]
    out["bucket_purity_unweighted_sem"] = pur["purity_unweighted"]

    return out

# Semantic metrics on the aligned codes / embeddings / labels.
metrics_sem = compute_semantic_metrics(codes_sem=codes_sem, X=X_aligned, y=y)
metrics_sem

{'N': 22741,
 'L_sem': 3,
 'K_list': [256, 256, 256],
 'collision_rate_sem': 0.16305351567653137,
 'unique_rate_sem': 0.8369464843234686,
 'max_bucket_sem': 28,
 'p95_bucket_sem': 2.0,
 'p50_bucket_sem': 1.0,
 'mean_bucket_sem': 1.1948195239846582,
 'num_buckets_sem': 19033,
 'collision_rate_prefix_1': 0.9925684886328657,
 'max_bucket_prefix_1': 526,
 'collision_rate_prefix_2': 0.5696759157468889,
 'max_bucket_prefix_2': 46,
 'collision_rate_prefix_3': 0.16305351567653137,
 'max_bucket_prefix_3': 28,
 'CUR_1': 0.66015625,
 'entropy_1': 4.769737214658913,
 'perplexity_1': 117.88825858361827,
 'top1_share_1': 0.02313002946220483,
 'CUR_2': 1.0,
 'entropy_2': 5.485639766293805,
 'perplexity_2': 241.20320832885255,
 'top1_share_2': 0.008486873928147398,
 'CUR_3': 1.0,
 'entropy_3': 5.485301971313853,
 'perplexity_3': 241.12174485568147,
 'top1_share_3': 0.008926608328569544,
 'PAS_sem': 0.9693664501283891,
 'bucket_purity_weighted_sem': 0.9981531155182269,
 'bucket_purity_unweighted_sem': 

In [6]:
def compute_disambiguation_metrics(codes_sem: np.ndarray, suffix_d: np.ndarray, has_suffix: np.ndarray):
    """Check whether the suffix layer separates items that share a semantic SID.

    Items are grouped by their full semantic code. A group with more than one
    item is "ambiguous"; it is counted as "bad" when none of its items has a
    suffix, or when the suffix values that are present are not all distinct.
    Items without a suffix are ignored in that uniqueness check.

    Returns:
        A dict with the suffix presence rate, the share of ambiguous items
        and buckets, the max / 95th-percentile bucket size, the share of bad
        buckets among the ambiguous ones, and the number of ambiguous
        buckets checked.
    """
    n_items = codes_sem.shape[0]

    # semantic bucket -> member row indices
    groups = defaultdict(list)
    for row_idx, row in enumerate(codes_sem.tolist()):
        groups[tuple(row)].append(row_idx)

    # Bucket size distribution.
    sizes = np.asarray([len(members) for members in groups.values()], dtype=np.int32)

    # Buckets (and their items) that need disambiguation.
    collided = [members for members in groups.values() if len(members) > 1]
    n_collided = len(collided)
    n_collided_items = sum(len(members) for members in collided)

    # Are the present suffix values unique inside each collided bucket?
    n_bad = 0
    for members in collided:
        present = suffix_d[members][has_suffix[members]]
        if len(present) == 0 or len(np.unique(present)) != len(present):
            n_bad += 1

    has_sizes = len(sizes) > 0
    return {
        "suffix_present_rate": float(has_suffix.mean()),
        "ambiguous_item_rate": float(n_collided_items / n_items),
        "ambiguous_bucket_rate": float(n_collided / len(groups)),
        "max_bucket_sem": int(sizes.max()) if has_sizes else 0,
        "p95_bucket_sem": float(np.percentile(sizes, 95)) if has_sizes else 0.0,
        "suffix_bad_bucket_rate": float(n_bad / max(1, n_collided)),
        "checked_ambiguous_buckets": int(n_collided),
    }

# Disambiguation metrics for the suffix layer.
metrics_disamb = compute_disambiguation_metrics(
    codes_sem=codes_sem,
    suffix_d=suffix_d,
    has_suffix=has_suffix,
)
metrics_disamb


{'suffix_present_rate': 0.26485202937425795,
 'ambiguous_item_rate': 0.26485202937425795,
 'ambiguous_bucket_rate': 0.12163085167866337,
 'max_bucket_sem': 28,
 'p95_bucket_sem': 2.0,
 'suffix_bad_bucket_rate': 0.0,
 'checked_ambiguous_buckets': 2315}

In [7]:
from collections import Counter

# Count items per semantic SID and list the buckets holding more than one item,
# largest first (ties keep their first-seen order).
sem_keys = list(map(tuple, codes_sem.tolist()))
cnt = Counter(sem_keys)
dups = sorted(
    ((key, size) for key, size in cnt.items() if size > 1),
    key=lambda pair: pair[1],
    reverse=True,
)

print("num collided semantic SIDs:", len(dups))
print("top 10 collided buckets:")
for k, v in dups[:10]:
    print(k, v)

num collided semantic SIDs: 2315
top 10 collided buckets:
(181, 196, 63) 28
(129, 52, 166) 23
(218, 200, 166) 23
(129, 51, 189) 21
(129, 34, 63) 20
(122, 67, 97) 16
(131, 232, 247) 15
(178, 62, 189) 14
(213, 225, 40) 14
(196, 61, 19) 14


In [8]:
# Persist the final semantic metrics as pretty-printed UTF-8 JSON.
OUT_JSON = Path("sid_metrics.json")
_metrics_text = json.dumps(metrics_sem, ensure_ascii=False, indent=2)
OUT_JSON.write_text(_metrics_text, encoding="utf-8")
print("Saved:", OUT_JSON.resolve())

Saved: /mnt/sdb1/sdb1_xiaojinsong/tiny-onerec/sid_metrics.json
