## 代码部署

In [1]:
# 词向量
import pandas as pd
import jieba
from gensim.models.word2vec import Word2Vec

# Load the training set; the review text is read from the 'comment' column.
data = pd.read_csv('train.csv')
# Drop missing reviews BEFORE casting to str: casting NaN yields the literal
# string "nan", which would be tokenised and trained on as if it were a word.
comments = data['comment'].dropna().astype(str)
# A single deletion table strips every unwanted punctuation mark in one pass
# (the same 11 characters the chained .replace() calls used to remove).
punctuation_table = str.maketrans('', '', "，!！。~；？?【】#")
# Tokenise each cleaned review with jieba; the result is a list of token
# lists, which is the sentence format Word2Vec is given below.
corpus = [jieba.lcut(comment.translate(punctuation_table)) for comment in comments]
# Train the word-vector model. sg=0 selects CBOW (the Skip-Gram variant with
# sg=1 is trained in a later cell); min_count=3 is the frequency threshold
# below which words are ignored.
model = Word2Vec(corpus, sg=0, vector_size=300, window=5, min_count=3, workers=4)
# Show the model summary (vocabulary size, vector size, learning rate).
print('模型参数：',model,'\n')
# Analogy-style query: words close to '点赞' + '不错' and far from '难吃'.
print('最匹配的词是：',model.wv.most_similar(positive=['点赞', '不错'], negative=['难吃']),'\n')
# Odd-one-out query, left disabled as in the original notebook.
#print('最不匹配的词是：',model.wv.doesnt_match("点赞 好吃 支持 难吃".split()),'\n')
# Similarity score between two words.
print('相似度为=',model.wv.similarity('推荐','好吃'),'\n')
# Raw embedding of a single word (index syntax instead of calling the
# __getitem__ dunder directly).
print(model.wv['地道'])

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\72740\AppData\Local\Temp\jieba.cache
Loading model cost 0.516 seconds.
Prefix dict has been built successfully.


模型参数： Word2Vec<vocab=4036, vector_size=300, alpha=0.025> 

最匹配的词是： [('无可挑剔', 0.9476285576820374), ('小弟', 0.9437561631202698), ('好找', 0.9264233112335205), ('位置', 0.9218050837516785), ('值得', 0.9203643202781677), ('高', 0.9197431802749634), ('丰富', 0.9182590842247009), ('团购', 0.9180325269699097), ('拥有', 0.9169207215309143), ('推荐', 0.9159362316131592)] 

相似度为= 0.79333323 

[ 1.50901694e-02  1.09742545e-01  3.23524512e-02  5.77193797e-02
 -4.80722152e-02 -7.80649707e-02  8.85137916e-02  2.78760970e-01
  3.23988288e-03 -4.36470546e-02 -2.44259508e-03 -1.19602844e-01
 -4.00138088e-02 -2.08969060e-02 -1.33488476e-01 -4.20169048e-02
  1.12857297e-01  2.84011080e-03  5.66594414e-02 -7.23656267e-02
 -6.70827553e-02 -1.12318071e-02  1.32260146e-02  3.07967402e-02
  7.77630657e-02 -3.87130417e-02 -1.55605257e-01  4.68544215e-02
 -3.09741553e-02 -9.55468193e-02  9.84860957e-02 -4.38954271e-02
  2.55813021e-02 -3.11054918e-03 -7.22712949e-02  8.87671299e-03
  6.65071607e-02 -1.67182758e-01  5.11582382e

## 1. 使用 Skip-Gram 训练 Word2Vec 模型

In [2]:
# 词向量
import pandas as pd
import jieba
from gensim.models.word2vec import Word2Vec

# Load the training set; the review text is read from the 'comment' column.
data = pd.read_csv('train.csv')
# Drop missing reviews BEFORE casting to str: casting NaN yields the literal
# string "nan", which would be tokenised and trained on as if it were a word.
comments = data['comment'].dropna().astype(str)
# A single deletion table strips every unwanted punctuation mark in one pass
# (the same 11 characters the chained .replace() calls used to remove).
punctuation_table = str.maketrans('', '', "，!！。~；？?【】#")
# Tokenise each cleaned review with jieba; the result is a list of token
# lists, which is the sentence format the Word2Vec call that follows is given.
corpus = [jieba.lcut(comment.translate(punctuation_table)) for comment in comments]
# Train the Skip-Gram variant (sg=1) on the tokenised corpus. The keyword
# settings are collected in a dict so they can be read at a glance.
skipgram_settings = dict(sg=1, vector_size=300, window=5, min_count=3, workers=4)
model = Word2Vec(corpus, **skipgram_settings)
# Show the model summary (vocabulary size, vector size, learning rate).
print('模型参数：',model,'\n')

模型参数： Word2Vec<vocab=4036, vector_size=300, alpha=0.025> 



## 2. 输出“环境”的词向量及其形状

In [3]:
# 输出“环境”的词向量及其形状
import numpy as np

# Look up the learned embedding of '环境' and record its shape tuple.
word_vector = model.wv.get_vector('环境')
word_vector_shape = np.shape(word_vector)

# Report the vector itself first, then its shape.
print("'环境'的词向量：", word_vector)
print("'环境'的词向量形状：", word_vector_shape)

'环境'的词向量： [ 0.15954813  0.12948781 -0.05851304  0.09182726 -0.11029695 -0.14895657
 -0.07623646  0.34476575 -0.1823972  -0.14919832  0.07066109 -0.13151382
 -0.17095688  0.15991732 -0.15437618  0.0886275   0.32360107  0.07315033
  0.3084609  -0.39761615 -0.13082027 -0.09740936 -0.14684512 -0.06076075
 -0.07541888  0.02468818 -0.00571914  0.05008976  0.06059223  0.00107256
  0.23833986 -0.08422097  0.09189817  0.16458702 -0.19572376  0.02649102
  0.04091755 -0.12085357  0.01891434 -0.11578597  0.16593482 -0.03231172
  0.30259824 -0.11637963  0.07726106  0.25337666  0.01995787 -0.17351307
 -0.02906262 -0.0254773  -0.03019513 -0.03430745 -0.2239513   0.13363498
  0.00874617  0.08570673 -0.20962444 -0.18491554  0.06927156 -0.16897772
 -0.01736644 -0.00956455  0.06553503  0.1707377  -0.08537266  0.04981199
  0.12191371 -0.13918047 -0.3404668   0.08454912 -0.11060489 -0.1687502
 -0.05790257 -0.07648024  0.22072922  0.02555029  0.06041231  0.06765262
 -0.10549769  0.02378939  0.02578619 -0.06

## 3. 输出与“好吃”语义最接近的3个词

In [4]:
# Query the trained model for the three words most similar to '好吃';
# the result is a list of (word, similarity score) pairs.
similar_words = model.wv.most_similar(positive=['好吃'], topn=3)
print("与'好吃'语义最接近的3个词：", similar_words)

与'好吃'语义最接近的3个词： [('入味', 0.8587936162948608), ('好看', 0.8412091135978699), ('棒', 0.8212128281593323)]


## 4. 计算“好吃”和“美味”、“好吃”和“蟑螂”的相似度

In [5]:
# Score '好吃' against a near-synonym ('美味') and an unrelated word ('蟑螂').
# Both scores are computed, in that order, before anything is printed.
similarity_delicious, similarity_cockroach = (
    model.wv.similarity('好吃', other_word) for other_word in ('美味', '蟑螂')
)

print("'好吃'和'美味'的相似度：", similarity_delicious)
print("'好吃'和'蟑螂'的相似度：", similarity_cockroach)

'好吃'和'美味'的相似度： 0.7937404
'好吃'和'蟑螂'的相似度： 0.23263259


## 5. 执行向量运算“餐厅+聚会-安静=？”

In [7]:
# Word-vector arithmetic '餐厅 + 聚会 - 安静': the added terms go in as
# positive keys, the subtracted term as a negative key, and only the single
# best match is requested.
added_words = ['餐厅', '聚会']
removed_words = ['安静']
result = model.wv.most_similar(positive=added_words, negative=removed_words, topn=1)
print("向量运算 '餐厅 + 聚会 - 安静' 的结果：", result)


向量运算 '餐厅 + 聚会 - 安静' 的结果： [('百货', 0.9511458277702332)]
