In [6]:
# Example 1 (enhanced)
# Import necessary libraries
from gensim.models import Word2Vec
import gensim.downloader as api
from pprint import pprint

# ---------- Step 1: Load a larger dataset ----------
print("[INFO] Loading dataset: text8 (≈17MB Wikipedia subset)")
# Fetch the text8 corpus through the gensim downloader (imported as `api`).
dataset = api.load("text8")   # a roughly 17M subset of Wikipedia
# Group a stream of items into fixed-size chunks (gensim expects a list of lists).
def make_sentences(tokens, sent_len=2048):
    """Lazily split ``tokens`` into consecutive lists of at most ``sent_len`` items.

    Args:
        tokens: Any iterable; its items are copied into the chunks unchanged.
        sent_len: Number of items after which the current chunk is emitted.

    Yields:
        A fresh list for every full chunk, followed by one shorter list holding
        the leftover items when the input length is not a multiple of
        ``sent_len``. Nothing is yielded for an empty input.
    """
    chunk = []
    for item in tokens:
        chunk.append(item)
        if len(chunk) >= sent_len:
            yield chunk
            # Start a new list so chunks already handed out are never mutated.
            chunk = []
    # Emit the trailing partial chunk, if any.
    if chunk:
        yield chunk

# The gensim downloader yields text8 as an iterable of token lists (one list
# per sentence), which is already the "list of lists" layout Word2Vec expects,
# so materialise it directly.
# Do not route it through make_sentences(...)[0]: that groups whole sentences
# (not tokens) into chunks and keeps only the first chunk, silently dropping
# the rest of the corpus whenever the iterable has more than `sent_len` items.
sentences = list(dataset)

[INFO] Loading dataset: text8 (≈17MB Wikipedia subset)


In [7]:
import os
import itertools
# ---------- Step 2: Train a Word2Vec model ----------

# Hyperparameter values to sweep
window_sizes = [4, 6, 8]
embed_dims   = [100, 200]

# Directory that receives one saved model per combination
save_dir = "./models_word2vec"
os.makedirs(save_dir, exist_ok=True)

# Grid search over every (window, dimension) pair; the window size varies
# slowest, the embedding dimension fastest.
for win in window_sizes:
    for dim in embed_dims:
        params = {
            "vector_size": dim,
            "window": win,
            "min_count": 5,
            "sg": 1,
            "workers": 4,
        }
        print(f"[INFO] Training Word2Vec with window={win}, dim={dim}")
        model = Word2Vec(sentences, **params)

        # Encode the combination in the file name so runs do not overwrite each other
        fname = os.path.join(save_dir, f"word2vec_win{win}_dim{dim}.model")
        model.save(fname)
        print(f"[SAVE] {fname}")

[INFO] Training Word2Vec with window=4, dim=100
[SAVE] ./models_word2vec/word2vec_win4_dim100.model
[INFO] Training Word2Vec with window=4, dim=200
[SAVE] ./models_word2vec/word2vec_win4_dim200.model
[INFO] Training Word2Vec with window=6, dim=100
[SAVE] ./models_word2vec/word2vec_win6_dim100.model
[INFO] Training Word2Vec with window=6, dim=200
[SAVE] ./models_word2vec/word2vec_win6_dim200.model
[INFO] Training Word2Vec with window=8, dim=100
[SAVE] ./models_word2vec/word2vec_win8_dim100.model
[INFO] Training Word2Vec with window=8, dim=200
[SAVE] ./models_word2vec/word2vec_win8_dim200.model


In [10]:
# ---------- Step 3: Explore the Word2Vec model ----------

# Pick which saved model to inspect; the values must match a combination
# that the training step wrote to `save_dir`.
win = 4     # one of 4/6/8
dim = 100   # one of 100/200

# Rebuild the file name used when saving and load that model from disk.
path = os.path.join(save_dir, f"word2vec_win{win}_dim{dim}.model")
model = Word2Vec.load(path)
print("\n[INFO] Vocabulary size:", len(model.wv))
print("[INFO] Top-20 most frequent tokens:")
# Show the first 20 entries of the vocabulary index.
pprint(model.wv.index_to_key[:20])


[INFO] Vocabulary size: 71290
[INFO] Top-20 most frequent tokens:
['the',
 'of',
 'and',
 'one',
 'in',
 'a',
 'to',
 'zero',
 'nine',
 'two',
 'is',
 'as',
 'eight',
 'for',
 's',
 'five',
 'three',
 'was',
 'by',
 'that']


In [12]:
# ---------- Step 4: Safe vector lookup ----------
def safe_vec(w):
    """Print the embedding of ``w`` from the global ``model``, or a warning if absent.

    Args:
        w: The token to look up in ``model.wv``.

    Returns:
        None. The result is reported via ``print`` only.
    """
    keyed_vectors = model.wv
    # Guard clause: report out-of-vocabulary tokens instead of raising on lookup.
    if w not in keyed_vectors:
        print(f"\n[WARN] '{w}' not in vocabulary (OOV). Try another word.")
        return
    vec = keyed_vectors[w]
    print(f"\n[OK] Vector for '{w}': shape={vec.shape}")
    print(vec)
# The original example looked up the word 'Word2Vec', but that token almost
# never occurs in text8, so it would most likely be out of vocabulary (OOV).
safe_vec("language")     # almost certainly present in text8


[OK] Vector for 'language': shape=(100,)
[-5.63780665e-01 -1.43342018e-01 -1.93605721e-01  5.01701832e-02
  3.68403435e-01  1.69639453e-01  4.76629049e-01  2.73675889e-01
 -6.46183074e-01 -1.00742295e-01 -3.78163218e-01  1.89057052e-01
 -6.51260853e-01  7.71197602e-02 -2.48700157e-02 -1.76573202e-01
  1.03793316e-01 -4.54686470e-02  4.56223696e-01 -1.94364607e-01
  9.44484547e-02 -6.38125837e-01 -3.12224925e-02 -2.82439440e-01
 -4.39857006e-01 -2.64926553e-02 -1.51290402e-01 -1.51182428e-01
 -2.04791531e-01 -1.30243093e-01 -3.66244942e-01 -4.15785983e-03
  5.57571888e-01 -9.31952074e-02 -8.81810784e-02  4.43528026e-01
 -2.47798681e-01  1.30837619e-01  9.93213733e-04 -1.66155040e-01
 -2.56508291e-01 -1.27962202e-01 -8.27650949e-02 -8.22350234e-02
 -3.64964336e-01 -4.02844667e-01  1.84474811e-01  1.51095107e-01
  8.68820429e-01  5.69558069e-02  2.24656343e-01  2.09192812e-01
 -1.06547043e-01 -4.16797876e-01 -4.21094030e-01 -3.47633153e-01
  3.10274929e-01 -2.74279341e-02  1.94999631e-02