In [10]:
import pandas as pd
import jieba
import re
from gensim.models import Word2Vec

# 加载数据（假设train.csv在同一目录）
data = pd.read_csv('train.csv')
corpus = data['comment'].values.astype(str)

# 预处理：分词+去标点
def preprocess(text):
    text = re.sub(r"[，。！!~；？?#【】]", "", text)
    return jieba.lcut(text)

corpus = [preprocess(text) for text in corpus]
print("数据预处理完成，示例：", corpus[0][:5])  # 显示第一条评论的前5个词

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\hp\AppData\Local\Temp\jieba.cache
Loading model cost 0.882 seconds.
Prefix dict has been built successfully.


数据预处理完成，示例： ['一如既往', '地', '好吃', '希望', '可以']


In [11]:
# Skip-Gram模型训练
skipgram_model = Word2Vec(
    corpus,
    sg=1,               # sg=1表示Skip-Gram
    vector_size=300,
    window=5,
    min_count=3,
    workers=4
)
print("Skip-Gram模型训练完成！")

Skip-Gram模型训练完成！


In [12]:
# 获取词向量
env_vector = skipgram_model.wv['环境']
print("“环境”的词向量（前5维）:", env_vector[:5])
print("词向量形状:", env_vector.shape)

“环境”的词向量（前5维）: [ 0.17611246  0.16745846 -0.1169007   0.11330535 -0.12056729]
词向量形状: (300,)


In [13]:
# 语义相似词
similar_words = skipgram_model.wv.most_similar('好吃', topn=3)
print("与“好吃”最接近的3个词:", similar_words)

与“好吃”最接近的3个词: [('好看', 0.8585113286972046), ('入味', 0.8494536876678467), ('美味', 0.8298680186271667)]


In [14]:
# 计算相似度
sim1 = skipgram_model.wv.similarity('好吃', '美味')
sim2 = skipgram_model.wv.similarity('好吃', '蟑螂')
print("“好吃”和“美味”的相似度:", round(sim1, 2))
print("“好吃”和“蟑螂”的相似度:", round(sim2, 2))

“好吃”和“美味”的相似度: 0.83
“好吃”和“蟑螂”的相似度: 0.28


In [15]:
# 词向量类比
result = skipgram_model.wv.most_similar(
    positive=['餐厅', '聚会'],
    negative=['安静'],
    topn=1
)
print("“餐厅 + 聚会 - 安静”最相关结果:", result[0])

“餐厅 + 聚会 - 安静”最相关结果: ('家庭聚会', 0.9471422433853149)
