In [1]:
import os
import torch, numpy as np
from scipy.sparse import load_npz

# ---- Device: prefer CUDA when it is available ----
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

V = 50000                 # vocabulary size (number of rows of the embedding tables built later)
win = window_size = 8     # window size; part of the co-occurrence file name
d = 200                   # embedding dimension; also part of the file name

save_dir = "./models_glove"
os.makedirs(save_dir, exist_ok=True)

fname = os.path.join(save_dir, f"CO_win{win}_dim{d}.npz")

# ---- Load the co-occurrence matrix as COO and extract (row, col, val) ----
CO = load_npz(fname).tocoo()
# A COO matrix may legally contain repeated (row, col) entries and explicitly
# stored zeros. Repeated entries would be trained on as separate, smaller
# counts, and a stored zero yields log(0) = -inf with weight 0, i.e.
# 0 * inf = NaN in the loss. Canonicalise once here (both calls are in-place).
CO.sum_duplicates()
CO.eliminate_zeros()

rows_np = CO.row.astype(np.int64)      # int64: index dtype expected by nn.Embedding lookups
cols_np = CO.col.astype(np.int64)
vals_np = CO.data.astype(np.float32)

# ---- Move tensors to the device (minimal change: only indices and values) ----
rows = torch.from_numpy(rows_np).to(device)
cols = torch.from_numpy(cols_np).to(device)
x_ij = torch.from_numpy(vals_np).to(device)      # co-occurrence counts
nnz = x_ij.numel()
print("NNZ:", nnz)


Device: cpu
NNZ: 37294295


In [66]:
import torch.nn as nn
import math

xmax = 10      # GloVe weighting cutoff x_max; tune as needed (the GloVe paper used 100)
alpha = 0.75   # exponent of the GloVe weighting function

# ---- Embedding tables: target/context vectors plus one scalar bias per word ----
W  = nn.Embedding(V, d).to(device)   # target-word vectors
C  = nn.Embedding(V, d).to(device)   # context-word vectors
bW = nn.Embedding(V, 1).to(device)   # target-word biases
bC = nn.Embedding(V, 1).to(device)   # context-word biases

# Initialisation (simple and stable): small uniform vectors, zero biases
init_bound = 0.5 / d
nn.init.uniform_(W.weight, -init_bound, init_bound)
nn.init.uniform_(C.weight, -init_bound, init_bound)
for bias in (bW, bC):
    nn.init.zeros_(bias.weight)

# ---- Optimizer (a one-line change is enough to swap it) ----
# Adam has the best support on CUDA; if the embeddings use sparse=True,
# SparseAdam can be used instead.
trainable = [p for table in (W, C, bW, bC) for p in table.parameters()]
opt = torch.optim.Adam(trainable, lr=2e-3)


In [67]:
##import torch
# Select the "high" float32 matmul precision mode for the rest of the session.
torch.set_float32_matmul_precision("high")  # optional, PyTorch 2.x

def glove_weight(x, x_max=None, power=None):
    """GloVe weighting function f(x) = min(1, (x / x_max) ** power).

    Args:
        x: tensor of co-occurrence counts.
        x_max: saturation cutoff; counts at or above it get weight 1.
            When omitted, the notebook-level ``xmax`` is used.
        power: exponent applied to ``x / x_max``. When omitted, the
            notebook-level ``alpha`` is used.

    Returns:
        Tensor with the same shape as ``x`` holding weights capped at 1.0.
    """
    # Resolve the defaults at call time so that re-running the hyperparameter
    # cell with new values is picked up without redefining this function.
    if x_max is None:
        x_max = xmax
    if power is None:
        power = alpha
    return torch.clamp(torch.pow(x / x_max, power), max=1.0)

# ---- Training hyperparameters (tunable) ----
epochs = 20
batch_size = 65536  # reduce if device memory is tight (64k~256k)

# Mini-batch training over the non-zero co-occurrence entries.
# The permutation is drawn inside the epoch loop only; allocating one up
# front would create a full nnz-sized index tensor that is never used.
for ep in range(1, epochs+1):
    total_loss = 0.0
    # Reshuffle the entries once per epoch
    perm = torch.randperm(nnz, device=device)
    for start in range(0, nnz, batch_size):
        end = min(start + batch_size, nnz)
        idx = perm[start:end]

        i = rows[idx]      # target-word ids of this batch
        j = cols[idx]      # context-word ids of this batch
        x = x_ij[idx]      # their co-occurrence counts

        wi = W(i)          # (B, d)
        cj = C(j)          # (B, d)
        bi = bW(i).squeeze(-1)  # (B,)
        bj = bC(j).squeeze(-1)  # (B,)

        # Prediction and regression target
        pred = (wi * cj).sum(dim=1) + bi + bj
        logx = torch.log(x)

        # Weighted squared error, averaged over the batch
        w = glove_weight(x)
        loss = (w * (pred - logx)**2).mean()

        opt.zero_grad(set_to_none=True)
        loss.backward()
        opt.step()

        # Re-scale the batch mean by the batch size so that the epoch average
        # below is a per-entry average even when the last batch is shorter.
        total_loss += loss.item() * (end - start)

    avg = total_loss / nnz
    print(f"[Epoch {ep}/{epochs}] avg loss = {avg:.6f}")

# window = 4, dim = 100, train_time = 3m 4s
# window = 4, dim = 200, train_time = 5m 26s
# window = 6, dim = 100, train_time = 4m 18s
# window = 6, dim = 200, train_time = 6m 59s
# window = 8, dim = 100, train_time = 5m 0s
# window = 8, dim = 200, train_time = 8m 25s

[Epoch 1/20] avg loss = 0.250381
[Epoch 2/20] avg loss = 0.132865
[Epoch 3/20] avg loss = 0.112785
[Epoch 4/20] avg loss = 0.098450
[Epoch 5/20] avg loss = 0.085816
[Epoch 6/20] avg loss = 0.074914
[Epoch 7/20] avg loss = 0.066874
[Epoch 8/20] avg loss = 0.061716
[Epoch 9/20] avg loss = 0.058472
[Epoch 10/20] avg loss = 0.056327
[Epoch 11/20] avg loss = 0.054814
[Epoch 12/20] avg loss = 0.053694
[Epoch 13/20] avg loss = 0.052787
[Epoch 14/20] avg loss = 0.052030
[Epoch 15/20] avg loss = 0.051384
[Epoch 16/20] avg loss = 0.050839
[Epoch 17/20] avg loss = 0.050375
[Epoch 18/20] avg loss = 0.049970
[Epoch 19/20] avg loss = 0.049625
[Epoch 20/20] avg loss = 0.049315


In [68]:
import json
import numpy as np
from sklearn.preprocessing import normalize

# Final word vectors: sum of target and context embeddings, one row per word.
with torch.no_grad():
    final = normalize((W.weight + C.weight).detach().cpu().numpy())   # (V, d), rows normalised for cosine similarity

# ---- Save the matrix as .npy ----
fname = os.path.join(save_dir, f"glove_win{window_size}_dim{d}.npy")
np.save(fname, final)

# ---- Load the id -> word mapping (the JSON keys are stringified ids) ----
with open("id2word.json", "r", encoding="utf-8") as f:
    id2word = json.load(f)

# ---- Save as .txt: one "word v1 v2 ... vd" line per word ----
fname = os.path.join(save_dir, f"glove_win{window_size}_dim{d}.txt")
with open(fname, "w", encoding="utf-8") as out:
    for word_id in range(V):
        token = id2word[str(word_id)]
        components = " ".join(f"{x:.6f}" for x in final[word_id])
        out.write(f"{token} {components}\n")

# ---- Save as Gensim KeyedVectors (handy for evaluate_word_pairs / most_similar) ----
from gensim.models import KeyedVectors
vocab = [id2word[str(word_id)] for word_id in range(V)]
kv = KeyedVectors(vector_size=d)
kv.add_vectors(vocab, final)

fname = os.path.join(save_dir, f"glove_win{window_size}_dim{d}.kv")
kv.save(fname)
print("[SAVED] npy/txt/kv exported.")


[SAVED] npy/txt/kv exported.


In [69]:
# Shape of the exported embedding matrix
print("Embedding shape:", final.shape)   # (V, d)

# First 8 dimensions of the first 3 words (sample)
for word_id in range(3):
    print(f"{id2word[str(word_id)]:>12s} : {final[word_id, :8]}")

# Nearest-neighbour lookup (via kv; plain numpy would work as well)
def most_similar_word(word, topn=10):
    """Print the ``topn`` nearest neighbours of ``word`` with their scores.

    Looks the word up in the module-level ``kv``. If the word is not in
    ``kv``, an OOV notice is printed instead. Always returns None.
    """
    if word in kv:
        for neighbour, sim in kv.most_similar(word, topn=topn):
            print(f"{neighbour:>15s}  {sim:.4f}")
    else:
        print(f"'{word}' OOV.")

# Demo query: print the 10 nearest neighbours of "king".
print("\nTop-10 similar to 'king':")
most_similar_word("king", topn=10)


Embedding shape: (50000, 200)
         the : [ 0.09024875  0.06032214 -0.05287563 -0.04998    -0.0284281   0.08411795
 -0.03706627  0.03375566]
          of : [ 0.05926024  0.03618823 -0.09434661 -0.07992446 -0.03884495  0.04664911
 -0.05659388  0.03590697]
         and : [-0.02759532  0.02639293 -0.0614111  -0.00533459 -0.03225466  0.01557078
 -0.01585256  0.02070977]

Top-10 similar to 'king':
            iii  0.6073
            son  0.6000
          queen  0.5920
          henry  0.5607
        kingdom  0.5564
             ii  0.5522
          kings  0.5482
        himself  0.5398
            vii  0.5368
        charles  0.5266
