In [2]:
import os
# ==== SPPMI-SVD: runnable, self-contained ====


# 1) 准备数据：加载 text8
import numpy as np
import gensim.downloader as api

USE_TOY = False  # True: 用内置玩具数据；False: 读取本地语料文件

save_dir = "./models_sppmi"
os.makedirs(save_dir, exist_ok=True)

if USE_TOY:
    # 已分词的小数据（演示）
    sentences = [
        ["I", "love", "natural", "language", "processing"],
        ["word2vec", "is", "a", "popular", "model", "for", "embeddings"],
        ["we", "can", "learn", "word", "relationships"],
        ["word", "embeddings", "capture", "semantic", "meanings"],
        ["I", "enjoy", "teaching", "word2vec", "to", "students"]
    ]
else:
    # 这是一个迭代器，产出的是一串词（没有真正的句子边界）
    text8 = api.load("text8")  # 第一次会自动下载到本地缓存
    # text8 是类似 ["anarchism", "originated", "as", ...] 的token流

    # 可选：为了方便共现窗口，你可以把它切成伪句子（比如每2k个词一段）
    def make_sentences(tokens, sent_len=2048):
        buf, cnt = [], 0
        for w in tokens:
            buf.append(w)
            cnt += 1
            if cnt >= sent_len:
                yield buf
                buf, cnt = [], 0
        if buf:
            yield buf

    sentences = list(make_sentences(text8, sent_len=2048))[0]  # 注意：会占用内存，想省内存就边流式边处理

len(sentences), len(sentences[0][:10])

(1701, 10)

In [3]:
import json
# -----------------------------
# 2) 参数
# -----------------------------
from collections import Counter
import itertools
import math
from scipy.sparse import coo_matrix, save_npz, load_npz
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize
from collections import defaultdict
from scipy.sparse import coo_matrix, save_npz, load_npz
import numpy as np



min_count = 1        # 频次阈值，调大可省内存
shift_k     = 5       # SPPMI 的 k（通常与负采样 k 对应）
window_sizes = [4, 6, 8]        # 共现窗口（左右各 window_size）
embed_dims   = [100, 200]       # 词向量维度（SVD 维度）


# 2) 构建词表（限制词表规模，避免内存过大）
max_vocab = 50000     # 也可用 top-K 限制
V = 50000
cnt = Counter(w.lower() for s in sentences for w in s)

# 过滤 & 截断
vocab_words = [w for w,c in cnt.items() if c >= min_count]
vocab_words = sorted(vocab_words, key=lambda w: cnt[w], reverse=True)[:max_vocab]

vocab = {w:i for i,w in enumerate(vocab_words)}
id2word = {i:w for w,i in vocab.items()}

with open("id2word.json", "w", encoding="utf-8") as f:
    json.dump(id2word, f, ensure_ascii=False, indent=2)

# 3) 共现统计（对称窗口）

def co_statistics(win, dim, USE_LOCAL_CO = True):

    fname = os.path.join(save_dir, f"CO_win{win}_dim{dim}.npz")

    if not USE_LOCAL_CO:
        co_counts = defaultdict(float)
        total_tokens = 0
        for sent in sentences:
            tokens = [w.lower() for w in sent if w.lower() in vocab]
            n = len(tokens)
            total_tokens += n
            for i, w in enumerate(tokens):
                wi = vocab[w]
                start = max(0, i - win)
                end   = min(n, i + win + 1)
                for j in range(start, end):
                    if j == i:
                        continue
                    c = tokens[j]
                    if c not in vocab:
                        continue
                    wj = vocab[c]
                    co_counts[(wi, wj)] += 1.0
        if not co_counts:
            raise ValueError("Co-occurrence counts are empty. Increase data/window or lower min_count.")

        # 稀疏矩阵
        V = max_vocab
        r, c, v = zip(*[(i,j,x) for (i,j),x in co_counts.items()])
        CO = coo_matrix((v, (r, c)), shape=(V, V), dtype=np.float64).tocsr()
        save_npz(fname, CO)
    else:
        CO = load_npz(fname)

    return CO



def SPPMI_calculation(win, dim, CO, USE_LOCAL_SPPMI = True):
    # 4) 从 CO 计算 SPPMI

    fname = os.path.join(save_dir, f"SPPMI_win{win}_dim{dim}.npz")

    if not USE_LOCAL_SPPMI:
        sum_co = CO.sum()
        pi = np.array(CO.sum(axis=1)).ravel() / sum_co
        pj = np.array(CO.sum(axis=0)).ravel() / sum_co

        CO_coo = CO.tocoo()
        spmi_data = []
        shift_k = 5.0  # 可调

        for i, j, x in zip(CO_coo.row, CO_coo.col, CO_coo.data):
            pij = x / sum_co
            denom = pi[i] * pj[j]
            if denom == 0 or pij == 0:
                continue
            pmi = math.log(pij / denom)
            val = max(pmi - math.log(shift_k), 0.0)  # SPPMI
            if val > 0:
                spmi_data.append((i, j, val))

        if not spmi_data:
            raise ValueError("SPPMI is empty. Try a smaller k, larger window, or larger vocab.")

        ri, rj, rv = zip(*spmi_data)
        SPPMI = coo_matrix((rv, (ri, rj)), shape=(V, V), dtype=np.float64).tocsr()
        save_npz(fname, SPPMI)
    else:
        SPPMI = load_npz(fname)
    return SPPMI


def embedding_matrix(win, dim, SPPMI, USE_LOCAL_EMB = True):
    # 5) SVD 得到词向量 + 相似词查询

    fname = os.path.join(save_dir, f"emb_win{win}_dim{dim}.npy")

    if not USE_LOCAL_EMB:
        svd = TruncatedSVD(n_components=min(dim, max(2, min(SPPMI.shape)-1)), random_state=42)
        U = svd.fit_transform(SPPMI)  # (V, d)

        Sigma_sqrt = np.sqrt(svd.singular_values_)
        emb = U * Sigma_sqrt
        emb = normalize(emb)
        np.save(fname, emb)
    else:
        emb = np.load(fname)

    return emb

In [4]:
import itertools

for win, dim in itertools.product(window_sizes, embed_dims):
    print(f"\n[INFO] Running with window={win}, dim={dim}")

    # 1) 共现矩阵
    CO = co_statistics(win, dim, USE_LOCAL_CO=True)

    # 2) SPPMI
    SPPMI = SPPMI_calculation(win, dim, CO, USE_LOCAL_SPPMI=True)

    # 3) Embedding
    emb = embedding_matrix(win, dim, SPPMI, USE_LOCAL_EMB=True)

    # 4) Top-4 最近邻查询
    word = "science"
    if word in vocab:
        qv = emb[vocab[word]]
        sims = emb @ qv
        idx = np.argpartition(-sims, range(5))[:5]  # 取前5（包含自己）
        idx = idx[np.argsort(-sims[idx])]
        print(f"\nTop-4 nearest neighbors of '{word}' (win={win}, dim={dim}):")
        count = 0
        for i in idx:
            if id2word[i] == word:
                continue
            print(f"   {id2word[i]:>12s}  {sims[i]:.4f}")
            count += 1
            if count >= 4:
                break
    else:
        print("[WARN] word not in vocabulary.")



[INFO] Running with window=4, dim=100

Top-4 nearest neighbors of 'science' (win=4, dim=100):
          study  0.8829
        studies  0.8790
       research  0.8766
     scientific  0.8736

[INFO] Running with window=4, dim=200

Top-4 nearest neighbors of 'science' (win=4, dim=200):
     scientific  0.8398
          study  0.8395
        studies  0.8302
       research  0.8286

[INFO] Running with window=6, dim=100

Top-4 nearest neighbors of 'science' (win=6, dim=100):
       research  0.8779
     scientific  0.8776
          study  0.8735
        journal  0.8719

[INFO] Running with window=6, dim=200

Top-4 nearest neighbors of 'science' (win=6, dim=200):
     scientific  0.8402
        journal  0.8388
       research  0.8370
          study  0.8343

[INFO] Running with window=8, dim=100

Top-4 nearest neighbors of 'science' (win=8, dim=100):
     scientific  0.8905
        journal  0.8856
       research  0.8819
          study  0.8814

[INFO] Running with window=8, dim=200

Top-4