In [1]:
# Use the %pip magic (not "!pip") so packages are installed into the
# environment of the running kernel rather than whatever pip is on PATH.
%pip install numpy gensim

Looking in indexes: https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
Collecting numpy
  Downloading https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/a3/dd/4b822569d6b96c39d1215dbae0582fd99954dcbcf0c1a13c61783feaca3f/numpy-2.2.6-cp310-cp310-win_amd64.whl (12.9 MB)
     ---------------------------------------- 0.0/12.9 MB ? eta -:--:--
     -- ------------------------------------- 0.8/12.9 MB 35.0 MB/s eta 0:00:01
     --------------------------------------  12.8/12.9 MB 62.0 MB/s eta 0:00:01
     --------------------------------------- 12.9/12.9 MB 47.7 MB/s eta 0:00:00
Collecting gensim
  Downloading https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/cd/4a/f07e2f255aedd6bb4bd0ae420a465f228a4a91bc78ac359216ea20557be6/gensim-4.3.3-cp310-cp310-win_amd64.whl (24.0 MB)
     ---------------------------------------- 0.0/24.0 MB ? eta -:--:--
     ------------------------- ------------- 15.7/24.0 MB 82.7 MB/s eta 0:00:01
     --------------------------------------- 24.0/24.0 MB 5

In [6]:
# 词向量训练（Skip-Gram模式）
import pandas as pd
import jieba
from gensim.models.word2vec import Word2Vec
import logging  # 添加日志记录

# Configure root logging so that gensim's training progress is visible.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# The "jieba" logger emits its start-up messages at DEBUG level and also has
# a handler of its own, so each message would otherwise be printed twice
# (once raw, once through the root format). Raise its threshold to INFO.
logging.getLogger('jieba').setLevel(logging.INFO)

# 1. 数据预处理
def preprocess_text(text):
    """Strip punctuation from ``text`` and segment it into words with jieba.

    Args:
        text: Raw comment string.

    Returns:
        list[str]: Tokens returned by ``jieba.lcut`` after the punctuation
        characters listed below have been removed. Whitespace-only tokens
        are discarded so that spaces and line breaks never reach the
        Word2Vec vocabulary.
    """
    # Chinese / full-width punctuation plus a few symbols to delete outright.
    punctuation = "，。！？、；：“”‘’【】（）《》~@#￥%……&*"
    # A single translate pass replaces one str.replace call per character.
    cleaned = text.translate(str.maketrans("", "", punctuation))
    # The tokenizer returns whitespace runs as tokens of their own; drop them.
    return [token for token in jieba.lcut(cleaned) if token.strip()]

# Load the training set and tokenize every review comment.
data = pd.read_csv('train.csv')
# Drop missing comments before converting to str: str(NaN) would otherwise
# feed the literal text "nan" to the tokenizer as if it were a review.
corpus = [preprocess_text(comment) for comment in data['comment'].dropna().astype(str)]

# 2. Train the Skip-Gram model
# Hyper-parameters are collected in one mapping so they can be read at a glance.
skipgram_params = dict(
    sg=1,              # 1 selects Skip-Gram; 0 (the default) is CBOW
    vector_size=300,   # dimensionality of the word vectors
    window=5,          # context window size
    min_count=3,       # ignore words seen fewer than 3 times
    workers=4,         # number of training threads
    negative=5,        # negative-sample count (5-20 suggested for Skip-Gram)
    hs=0,              # hierarchical softmax off; negative sampling is used instead
    alpha=0.025,       # initial learning rate
    min_alpha=0.0001,  # final learning rate
)
model = Word2Vec(corpus, **skipgram_params)

# 3. Persist the trained model
model.save("word2vec_skipgram.model")
# To reload later: model = Word2Vec.load("word2vec_skipgram.model")

# 4. Sanity-check the trained model: architecture, vocabulary size, corpus size
architecture = 'Skip-Gram' if model.sg else 'CBOW'
print(
    '\n===== 模型参数 =====',
    f"模型架构: {architecture}",
    f"词表大小: {len(model.wv)}",
    f"训练总词数: {model.corpus_total_words}\n",
    sep='\n',
)

  import pkg_resources
Building prefix dict from the default dictionary ...
2025-06-25 19:11:48,917 : DEBUG : Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\CMH\AppData\Local\Temp\jieba.cache
2025-06-25 19:11:48,921 : DEBUG : Loading model from cache C:\Users\CMH\AppData\Local\Temp\jieba.cache
Loading model cost 0.828 seconds.
2025-06-25 19:11:49,749 : DEBUG : Loading model cost 0.828 seconds.
Prefix dict has been built successfully.
2025-06-25 19:11:49,752 : DEBUG : Prefix dict has been built successfully.
2025-06-25 19:11:51,771 : INFO : collecting all words and their counts
2025-06-25 19:11:51,772 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2025-06-25 19:11:51,819 : INFO : collected 12099 word types from a corpus of 188848 raw words and 10000 sentences
2025-06-25 19:11:51,820 : INFO : Creating a fresh vocabulary
2025-06-25 19:11:51,843 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=3 retains 4028 


===== 模型参数 =====
模型架构: Skip-Gram
词表大小: 4028
训练总词数: 188848



In [7]:
# Semantic-similarity queries: show the 3 nearest neighbours of each probe word.
test_words = ['点赞', '不错', '难吃', '推荐', '地道']
for word in test_words:
    if word in model.wv:
        print(f"与'{word}'最相似的词：{model.wv.most_similar(word, topn=3)}")
    else:
        # Report out-of-vocabulary probes instead of skipping them silently,
        # matching the warnings printed by the other lookups in this notebook.
        print(f"警告：'{word}'不在词表中")

# Example of fetching a raw vector (first 10 dimensions only).
if '地道' in model.wv:
    print(f"\n'地道'的词向量（前10维）:\n{model.wv['地道'][:10]}")
else:
    print("\n警告：'地道'不在词表中")

与'点赞'最相似的词：[('气氛', 0.9861142635345459), ('人超', 0.9856061339378357), ('特', 0.9842628836631775)]
与'不错'最相似的词：[('好极了', 0.9139845967292786), ('挺不错', 0.9026724100112915), ('纯正', 0.8975269198417664)]
与'难吃'最相似的词：[('垃圾', 0.8909440040588379), ('咸', 0.8723955154418945), ('实在', 0.8503879904747009)]
与'推荐'最相似的词：[('值得', 0.8977064490318298), ('一去', 0.8875811100006104), ('一试', 0.8784179091453552)]
与'地道'最相似的词：[('正', 0.9781695604324341), ('很赞', 0.9704442620277405), ('依旧', 0.9675837755203247)]

'地道'的词向量（前10维）:
[-0.03942199  0.01611122 -0.01001185  0.04299872 -0.06971505 -0.05957232
  0.14781     0.40237567 -0.09764312 -0.11671848]


In [8]:
# Show the embedding of "环境" together with its shape.
if '环境' not in model.wv:
    print("警告：'环境'不在词表中")
else:
    env_vector = model.wv['环境']
    # Shape should be (300,), matching vector_size used for training.
    print(
        f"'环境'的词向量（前5维）:\n{env_vector[:5]}",
        f"词向量形状: {env_vector.shape}",
        sep='\n',
    )

'环境'的词向量（前5维）:
[-0.14039193  0.07566755 -0.1379349   0.06719865 -0.07074745]
词向量形状: (300,)


In [9]:
# Three nearest neighbours of "好吃".
# The vocabulary check is done once and reused by the pairwise loop below.
base_in_vocab = '好吃' in model.wv
if base_in_vocab:
    print("\n与'好吃'最相似的3个词:")
    for word, similarity in model.wv.most_similar('好吃', topn=3):
        print(f"{word}: {similarity:.4f}")
else:
    print("警告：'好吃'不在词表中")

# Pairwise cosine similarity between "好吃" and each comparison word.
similarity_results = []
if base_in_vocab:
    # When the base word itself is missing, that has already been reported
    # above; entering the loop would wrongly blame the comparison word.
    for word in ['美味', '蟑螂']:
        if word in model.wv:
            sim = model.wv.similarity('好吃', word)
            similarity_results.append((word, sim))
        else:
            print(f"警告：'{word}'不在词表中")

print("\n词语相似度:")
for word, sim in similarity_results:
    print(f"'好吃' vs '{word}': {sim:.4f}")


与'好吃'最相似的3个词:
棒: 0.8560
入味: 0.8390
好看: 0.8320

词语相似度:
'好吃' vs '美味': 0.8133
'好吃' vs '蟑螂': 0.2728


In [10]:
# Vector arithmetic: 餐厅 + 聚会 - 安静, reporting the single closest word.
positive_terms = ['餐厅', '聚会']
negative_terms = ['安静']
if any(term not in model.wv for term in positive_terms + negative_terms):
    # At least one operand is out of vocabulary, so the query cannot run.
    print("警告：计算所需的词未全部存在于词表中")
else:
    result = model.wv.most_similar(
        positive=positive_terms,
        negative=negative_terms,
        topn=1,
    )
    best_word, best_score = result[0]
    print(f"\n向量运算 '餐厅 + 聚会 - 安静' ≈ '{best_word}' (相似度: {best_score:.4f})")


向量运算 '餐厅 + 聚会 - 安静' ≈ '酒店' (相似度: 0.9638)
