In [1]:
import pandas as pd
import jieba
from gensim.models.word2vec import Word2Vec

# Load the training set; the path is relative to the notebook's working directory.
data = pd.read_csv(filepath_or_buffer='train.csv')

In [2]:
# Pull the raw comments out as a NumPy array of strings.
# NOTE(review): astype(str) renders a missing value as the literal text 'nan',
# which is then tokenized like any other comment. Confirm whether train.csv
# contains empty comments and, if so, whether they should be filtered first.
comments = data['comment'].values.astype(str)

# Deletion table for the punctuation that is stripped before segmentation.
# A single translate() pass removes exactly the characters the previous
# chain of eleven .replace(..., "") calls removed, without rescanning the
# string once per character.
PUNCT_TO_STRIP = str.maketrans('', '', "，!！。~；？?【】#")

# Segment every cleaned comment with jieba. `corpus` ends up as a list of
# token lists (one list per comment), which is the input Word2Vec expects.
corpus = [jieba.lcut(comment.translate(PUNCT_TO_STRIP)) for comment in comments]

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\yzx\AppData\Local\Temp\jieba.cache
Loading model cost 0.562 seconds.
Prefix dict has been built successfully.


In [3]:
# Word2Vec hyperparameters: sg=1 selects Skip-Gram, vectors are 300-dimensional,
# the context window is 5 tokens, and tokens seen fewer than 3 times are dropped.
# NOTE(review): per the gensim docs, training with more than one worker thread
# is not fully deterministic, so the numbers printed below may vary between runs.
w2v_params = dict(sg=1, vector_size=300, window=5, min_count=3, workers=4)

# Train the embedding model on the tokenized comments.
model = Word2Vec(sentences=corpus, **w2v_params)

# Show the model summary (vocabulary size, vector size, learning rate).
print('模型参数：', model, '\n')

模型参数： Word2Vec<vocab=4036, vector_size=300, alpha=0.025> 



In [4]:
# Look up the trained embedding for the word "环境" and report it with its shape.
keyed_vectors = model.wv
env_vector = keyed_vectors['环境']
env_vector_shape = env_vector.shape

print('“环境”的词向量：', env_vector)
print('“环境”词向量的形状：', env_vector_shape)

“环境”的词向量： [ 0.24793099  0.11586279 -0.05771534  0.15198332 -0.07688954 -0.14096543
 -0.12353875  0.23698929 -0.22737318 -0.12559749  0.00768888 -0.10519144
 -0.10363474  0.09207348 -0.18506446  0.10547119  0.39047042  0.06819883
  0.23046814 -0.3682349  -0.10112085 -0.07517746 -0.21865714 -0.03256328
 -0.09899133  0.09034736  0.08119465  0.06867903  0.04742669 -0.04535075
  0.24809794 -0.07218164  0.13347983  0.1079419  -0.15300177  0.00830391
  0.08384697 -0.06833746  0.0790541  -0.10494325  0.08692354  0.01080894
  0.3011219  -0.11105656  0.09023183  0.23090413  0.02576946 -0.14214496
 -0.06746858 -0.02585439 -0.02783542  0.06054003 -0.26324758  0.15974241
  0.03347721  0.15479024 -0.26134932 -0.16245985  0.04006618 -0.16969936
 -0.00827062 -0.05310436  0.04834816  0.14686418 -0.06353551  0.06236472
  0.1311701  -0.08245474 -0.29974404  0.07335623 -0.03917816 -0.1565386
 -0.04692876 -0.11049825  0.2426559   0.0468522   0.02767964  0.07917938
 -0.15964463  0.03523874 -0.08761709 -0.04

In [5]:
# Find the 3 vocabulary words whose vectors are closest to "好吃".
similar_words = model.wv.most_similar(positive=['好吃'], topn=3)

print('与“好吃”语义最接近的3个词：', similar_words)

与“好吃”语义最接近的3个词： [('棒', 0.8372853398323059), ('入味', 0.8348568081855774), ('好看', 0.8288472294807434)]


In [6]:
# Compare "好吃" against a near-synonym ("美味") and an unrelated word ("蟑螂").
# Both similarities come from the same call, so compute them in one pass.
sim_delicious, sim_cockroach = (
    model.wv.similarity('好吃', other) for other in ('美味', '蟑螂')
)

print('“好吃”和“美味”的相似度：', sim_delicious)
print('“好吃”和“蟑螂”的相似度：', sim_cockroach)

“好吃”和“美味”的相似度： 0.82293105
“好吃”和“蟑螂”的相似度： 0.29948843


In [8]:
# Vector arithmetic "餐厅 + 聚会 - 安静": words in `added_words` contribute
# positively, words in `removed_words` negatively; keep only the best match.
added_words = ['餐厅', '聚会']
removed_words = ['安静']
result = model.wv.most_similar(
    positive=added_words,
    negative=removed_words,
    topn=1,
)

print('餐厅+聚会-安静的结果：', result)

餐厅+聚会-安静的结果： [('家庭聚会', 0.9511897563934326)]
