In [1]:
#词向量
import pandas as pd
import jieba
from gensim.models.word2vec import Word2Vec

# Load the training set; the review text is read from the 'comment' column.
data = pd.read_csv('train.csv')
# Drop missing comments before casting to str: astype(str) would otherwise turn
# NaN into the literal string "nan", which would be tokenized and learned as a word.
comments = data['comment'].dropna().astype(str)
# Translation table that deletes the punctuation marks we do not want as tokens
# (both full-width and ASCII variants).
punctuation_table = str.maketrans('', '', '，!！。~；？?【】#')
# Strip punctuation, then segment each comment with jieba.
# The result is a list of token lists, which is the input Word2Vec iterates over.
corpus = [jieba.lcut(comment.translate(punctuation_table)) for comment in comments]
# Train the word-vector model with sg=0 (CBOW in gensim): 300-dimensional vectors,
# context window of 5, words seen fewer than 3 times are ignored.
# NOTE(review): with workers=4 training is multi-threaded, so vectors may differ
# slightly between runs — confirm whether exact reproducibility is required.
model = Word2Vec(corpus, sg=0, vector_size=300, window=5, min_count=3, workers=4)
# Show the model summary (vocabulary size, vector size, learning rate).
print('模型参数：',model,'\n')

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\HONOR\AppData\Local\Temp\jieba.cache
Loading model cost 0.896 seconds.
Prefix dict has been built successfully.


模型参数： Word2Vec<vocab=4036, vector_size=300, alpha=0.025> 



In [2]:
# Best matches: words closest to '点赞' + '不错' with '难吃' subtracted.
analogy_matches = model.wv.most_similar(
    negative=['难吃'],
    positive=['点赞', '不错'],
)
print('最匹配的词是：', analogy_matches, '\n')

最匹配的词是： [('无可挑剔', 0.9576396942138672), ('好找', 0.9427543878555298), ('可以', 0.9352957010269165), ('适合', 0.9310916662216187), ('推荐', 0.9297167658805847), ('供选择', 0.9271414875984192), ('位置', 0.9268225431442261), ('高', 0.9185799360275269), ('足下', 0.9180248379707336), ('团购', 0.9176540970802307)] 



In [3]:
# Semantic similarity between the two words according to the model.
recommend_vs_tasty = model.wv.similarity('推荐', '好吃')
print('相似度为=', recommend_vs_tasty, '\n')

相似度为= 0.80467325 



In [4]:
# Print the embedding vector learned for the word '地道'.
# Use subscript access on the keyed vectors instead of calling the
# __getitem__ dunder by hand; both resolve to the same lookup.
print(model.wv['地道'])

[ 0.01512641  0.13053448  0.02732472  0.05621429 -0.04699517 -0.08870899
  0.08707423  0.29099858  0.00113476 -0.05026208  0.00989338 -0.12539138
 -0.01916232 -0.01675975 -0.13585266 -0.04247999  0.1024887  -0.00361826
  0.05035571 -0.06281035 -0.06292713 -0.00939744  0.03806112  0.02033504
  0.07093426 -0.0353511  -0.16279799  0.04140955 -0.04616684 -0.108353
  0.09449728 -0.06255464  0.04351    -0.01121015 -0.0753372   0.03248082
  0.0645622  -0.15147203  0.03157796  0.02245488 -0.06892373  0.02729325
  0.02687319 -0.11434247  0.07636221  0.1324291   0.05424308  0.01279882
 -0.01366138  0.1038764   0.04831009 -0.01101197 -0.02765924  0.03020534
 -0.0253619   0.09101205  0.04751582  0.00495797  0.00810997  0.01765076
 -0.03953578 -0.041846   -0.00269819  0.05797637 -0.03422213  0.06802765
  0.00222682  0.06941713 -0.09436832 -0.0524837  -0.00773204  0.03257536
  0.11696862 -0.11609308  0.04873702  0.06573881 -0.08733592  0.03108968
 -0.05163511  0.07524239 -0.09126388 -0.10415883  0.0

In [5]:
# Train a second Word2Vec model on the same corpus using Skip-Gram (sg=1).
# The remaining hyperparameters are collected in one place for readability.
skipgram_params = {
    'sg': 1,
    'vector_size': 300,
    'window': 5,
    'min_count': 3,
    'workers': 4,
}
sg_model = Word2Vec(corpus, **skipgram_params)
print('Skip-Gram模型参数：', sg_model)

Skip-Gram模型参数： Word2Vec<vocab=4036, vector_size=300, alpha=0.025>


In [6]:
# Fetch the Skip-Gram embedding for the word '环境', then show it and its shape.
skipgram_vectors = sg_model.wv
env_vector = skipgram_vectors['环境']
print('"环境"的词向量：\n', env_vector)
env_vector_shape = env_vector.shape
print('词向量形状：', env_vector_shape)

"环境"的词向量：
 [ 1.62444144e-01  5.12923822e-02  1.28743676e-02  1.02511533e-01
 -5.49425334e-02 -1.71599209e-01 -1.14423875e-03  3.30003887e-01
 -2.53876328e-01 -1.60749018e-01  8.02204609e-02 -2.23388255e-01
 -6.53935522e-02  5.81463240e-02 -2.45007738e-01  1.06048115e-01
  4.09319609e-01  9.64477584e-02  2.19919801e-01 -4.24362808e-01
 -1.33282974e-01  1.72450375e-02 -1.01402216e-01 -2.58177798e-02
 -1.69649079e-01  5.90251200e-02  9.81324092e-02  4.38115709e-02
  6.16299361e-02 -2.26423424e-02  2.54213154e-01 -2.25200281e-02
  8.51259604e-02  1.92080796e-01 -1.39245063e-01 -3.01884711e-02
 -4.96174814e-03 -1.08901277e-01  1.22046500e-01 -4.40527499e-02
  1.28939927e-01 -6.89938068e-02  2.81941831e-01 -7.64180049e-02
  5.40926047e-02  2.46446952e-01  7.86538646e-02 -1.85088426e-01
  1.17267251e-01  1.35297060e-01  1.03733554e-01 -9.83005464e-02
 -7.35878572e-02  1.47482455e-01  4.07570563e-02  8.29810426e-02
 -1.46392539e-01 -2.34872401e-01  1.40690207e-01 -8.32440555e-02
 -7.29365721e-

In [7]:
# Query the three words most similar to '好吃' and list each with its score
# rounded to four decimal places.
similar_words = sg_model.wv.most_similar('好吃', topn=3)
print('与"好吃"最相似的3个词：')
for pair in similar_words:
    word, similarity = pair
    print('{word}: {similarity:.4f}'.format(word=word, similarity=similarity))

与"好吃"最相似的3个词：
好看: 0.8305
棒: 0.8303
入味: 0.8223


In [8]:
# Compare '好吃' against a related word ('美味') and an unrelated one ('蟑螂').
# Each score is computed and printed in turn, in the same order as before.
tasty_vs_delicious = sg_model.wv.similarity('好吃', '美味')
print('"好吃"和"美味"的相似度:', tasty_vs_delicious)
tasty_vs_cockroach = sg_model.wv.similarity('好吃', '蟑螂')
print('"好吃"和"蟑螂"的相似度:', tasty_vs_cockroach)

"好吃"和"美味"的相似度: 0.8050233
"好吃"和"蟑螂"的相似度: 0.28619593


In [9]:
# Vector arithmetic: '餐厅' + '聚会' - '安静'; keep only the single best match.
result = sg_model.wv.most_similar(
    negative=['安静'],
    positive=['餐厅', '聚会'],
    topn=1,
)
# Each entry is a (word, score) pair; only the word is reported.
top_word, _top_score = result[0]
print('\n向量运算"餐厅+聚会-安静="最相关结果:', top_word)


向量运算"餐厅+聚会-安静="最相关结果: 家庭聚会
