In [2]:
# 词向量训练（Skip-Gram模式）
import pandas as pd
import jieba
from gensim.models.word2vec import Word2Vec
import logging  # 添加日志记录

# Send timestamped log records at INFO level and above to the notebook output.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

# 1. Data preprocessing
# Deletion table for the punctuation stripped before segmentation. It is built
# once so each comment is cleaned in a single pass instead of one replace()
# call per punctuation character.
_PUNCTUATION = "，。！？、；：“”‘’【】（）《》~@#￥%……&*"
_PUNCTUATION_TABLE = str.maketrans("", "", _PUNCTUATION)


def preprocess_text(text):
    """Clean a raw comment and segment it into a list of word tokens.

    Args:
        text: The comment as a ``str``.

    Returns:
        list[str]: The tokens returned by ``jieba.lcut`` once the punctuation
        marks in ``_PUNCTUATION`` have been removed. Whitespace-only tokens
        (spaces, tabs, newlines) and empty tokens are dropped so they do not
        end up in the training corpus as if they were words.
    """
    cleaned = text.translate(_PUNCTUATION_TABLE)
    # Keep only tokens with visible content; the segmenter may hand back
    # whitespace runs as tokens of their own.
    return [token for token in jieba.lcut(cleaned) if token.strip()]

# Load the training set and tokenize every comment.
data = pd.read_csv('train.csv')
# Drop missing comments before tokenizing: str(NaN) is the literal text "nan",
# which would otherwise be segmented and trained on as if it were a real word.
# str() is kept so non-string cell values are still accepted.
corpus = [preprocess_text(str(comment)) for comment in data['comment'].dropna()]

# 2. Train the Skip-Gram model
model = Word2Vec(
    sentences=corpus,
    # Architecture / objective
    sg=1,               # Skip-Gram (sg=0 would select CBOW)
    hs=0,               # hierarchical softmax off; negative sampling is used instead
    negative=5,         # number of negative samples
    # Vocabulary and vector shape
    vector_size=300,    # dimensionality of the word vectors
    window=5,           # context window size
    min_count=3,        # ignore words that occur fewer than 3 times
    # Optimisation
    alpha=0.025,        # initial learning rate
    min_alpha=0.0001,   # minimum learning rate
    workers=4,          # number of worker threads
)

# 3. Persist the trained model
# Reload later with: model = Word2Vec.load("word2vec_skipgram.model")
model.save("word2vec_skipgram.model")

# 4. Sanity-check the trained model: architecture, vocabulary size, corpus size
print(
    '\n===== 模型参数 =====',
    f"模型架构: {'Skip-Gram' if model.sg else 'CBOW'}",
    f"词表大小: {len(model.wv)}",
    f"训练总词数: {model.corpus_total_words}\n",
    sep='\n',
)

  import pkg_resources
Building prefix dict from the default dictionary ...
2025-06-23 16:10:20,372 : DEBUG : Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\CMH\AppData\Local\Temp\jieba.cache
2025-06-23 16:10:20,374 : DEBUG : Loading model from cache C:\Users\CMH\AppData\Local\Temp\jieba.cache
Loading model cost 1.053 seconds.
2025-06-23 16:10:21,426 : DEBUG : Loading model cost 1.053 seconds.
Prefix dict has been built successfully.
2025-06-23 16:10:21,427 : DEBUG : Prefix dict has been built successfully.
2025-06-23 16:10:23,207 : INFO : collecting all words and their counts
2025-06-23 16:10:23,208 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2025-06-23 16:10:23,254 : INFO : collected 12099 word types from a corpus of 188848 raw words and 10000 sentences
2025-06-23 16:10:23,255 : INFO : Creating a fresh vocabulary
2025-06-23 16:10:23,273 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=3 retains 4028 


===== 模型参数 =====
模型架构: Skip-Gram
词表大小: 4028
训练总词数: 188848



In [3]:
# Semantic similarity lookup: show the 3 nearest neighbours of each probe word
test_words = ['点赞', '不错', '难吃', '推荐', '地道']
for word in test_words:
    if word in model.wv:
        print(f"与'{word}'最相似的词：{model.wv.most_similar(word, topn=3)}")
    else:
        # Report out-of-vocabulary probes instead of skipping them silently,
        # using the same warning as the vector lookup below.
        print(f"警告：'{word}'不在词表中")

# Example of fetching a raw vector (first 10 components only)
if '地道' in model.wv:
    print(f"\n'地道'的词向量（前10维）:\n{model.wv['地道'][:10]}")
else:
    print("\n警告：'地道'不在词表中")

与'点赞'最相似的词：[('人超', 0.9842818975448608), ('帅', 0.9838330149650574), ('耿直', 0.9832215309143066)]
与'不错'最相似的词：[('挺不错', 0.9037325978279114), ('齐全', 0.8998403549194336), ('嗯', 0.8991395831108093)]
与'难吃'最相似的词：[('咸', 0.8957402110099792), ('垃圾', 0.8782604336738586), ('简直', 0.8750356435775757)]
与'推荐'最相似的词：[('值得', 0.8852267265319824), ('一试', 0.880580723285675), ('一去', 0.8680894374847412)]
与'地道'最相似的词：[('很赞', 0.98487788438797), ('正宗', 0.9749028086662292), ('很香', 0.969668447971344)]

'地道'的词向量（前10维）:
[-0.01735038  0.0398961   0.01903399  0.02288429 -0.03887833 -0.0492439
  0.05986579  0.3933846  -0.04652843 -0.08235199]


In [4]:
# Look up the vector for "环境" and print its first 5 components and its shape
if '环境' not in model.wv:
    print("警告：'环境'不在词表中")
else:
    env_vector = model.wv['环境']
    print(f"'环境'的词向量（前5维）:\n{env_vector[:5]}")
    print(f"词向量形状: {env_vector.shape}")  # expected to print (300,)

'环境'的词向量（前5维）:
[-0.02304971  0.10635543 -0.04023348 -0.00408974 -0.03905172]
词向量形状: (300,)


In [5]:
# Print the 3 words most similar to "好吃"
# The membership check is done once and reused by the pairwise loop below.
target_in_vocab = '好吃' in model.wv
if target_in_vocab:
    print("\n与'好吃'最相似的3个词:")
    for word, similarity in model.wv.most_similar('好吃', topn=3):
        print(f"{word}: {similarity:.4f}")
else:
    print("警告：'好吃'不在词表中")

# Pairwise similarity between "好吃" and each comparison word
similarity_results = []
for word in ['美味', '蟑螂']:
    if word not in model.wv:
        # Warn only about the word that is actually missing; a missing "好吃"
        # has already been reported above and must not be blamed on `word`.
        print(f"警告：'{word}'不在词表中")
    elif target_in_vocab:
        sim = model.wv.similarity('好吃', word)
        similarity_results.append((word, sim))

print("\n词语相似度:")
for word, sim in similarity_results:
    print(f"'好吃' vs '{word}': {sim:.4f}")


与'好吃'最相似的3个词:
入味: 0.8613
棒: 0.8519
油腻: 0.8326

词语相似度:
'好吃' vs '美味': 0.8244
'好吃' vs '蟑螂': 0.2673


In [6]:
# Vector analogy: nearest word to (餐厅 + 聚会 - 安静)
if any(word not in model.wv for word in ['餐厅', '聚会', '安静']):
    # At least one operand is out of vocabulary, so the arithmetic is skipped.
    print("警告：计算所需的词未全部存在于词表中")
else:
    result = model.wv.most_similar(
        topn=1,
        negative=['安静'],
        positive=['餐厅', '聚会'],
    )
    print(f"\n向量运算 '餐厅 + 聚会 - 安静' ≈ '{result[0][0]}' (相似度: {result[0][1]:.4f})")


向量运算 '餐厅 + 聚会 - 安静' ≈ '家庭聚会' (相似度: 0.9598)
