In [3]:
# 1. 导入必要的库  
import gensim  
from gensim.models import Word2Vec  
import numpy as np  
import logging  

# Configure logging so gensim's INFO-level progress messages are shown.
LOG_FORMAT = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(
    level=logging.INFO,
    format=LOG_FORMAT,
)

In [4]:
# 2. Prepare the data
# Toy corpus of pre-tokenized sentences; swap in your own dataset here.
# Each sentence is a list of string tokens.
sentence_ambience = ['环境', '优雅', '美食', '好吃', '舒适']
sentence_gathering = ['餐厅', '氛围', '安静', '聚会']
sentence_hygiene = ['蟑螂', '肮脏', '食物', '味道']
sentence_taste = ['好吃', '美味', '佳肴']

sentences = [
    sentence_ambience,
    sentence_gathering,
    sentence_hygiene,
    sentence_taste,
]

# 3. Train the Word2Vec model (Skip-Gram)
# Reproducibility: the seed is stated explicitly (1 is gensim's default, so
# this does not alter the initialization) and training is restricted to a
# single worker thread. With several workers the order in which threads
# update the weights is not deterministic, so the vectors can differ from
# run to run even with a fixed seed. The gensim documentation additionally
# mentions PYTHONHASHSEED for reproducibility across interpreter launches.
#
# NOTE(review): the corpus has only 4 sentences and the default epoch count
# is used, so the vectors probably stay close to their random initialization
# and the similarity scores computed later are unlikely to be meaningful.
# Consider a larger corpus or a higher `epochs` value -- confirm intent.
word2vec_model = Word2Vec(
    sentences,
    vector_size=100,  # dimensionality of each word vector
    window=5,         # maximum distance between target and context word
    min_count=1,      # keep every token, even those that occur once
    sg=1,             # 1 selects Skip-Gram
    seed=1,           # explicit seed for the random initialization
    workers=1,        # single thread so training order is deterministic
)

# Report that training has finished
print("Word2Vec Skip-Gram模型训练完成。")

2025-04-11 10:35:14,607 : INFO : collecting all words and their counts
2025-04-11 10:35:14,608 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2025-04-11 10:35:14,608 : INFO : collected 15 word types from a corpus of 16 raw words and 4 sentences
2025-04-11 10:35:14,609 : INFO : Creating a fresh vocabulary
2025-04-11 10:35:14,609 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=1 retains 15 unique words (100.00% of original 15, drops 0)', 'datetime': '2025-04-11T10:35:14.609791', 'gensim': '4.3.1', 'python': '3.9.21 (main, Dec 11 2024, 16:35:24) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22631-SP0', 'event': 'prepare_vocab'}
2025-04-11 10:35:14,610 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=1 leaves 16 word corpus (100.00% of original 16, drops 0)', 'datetime': '2025-04-11T10:35:14.610799', 'gensim': '4.3.1', 'python': '3.9.21 (main, Dec 11 2024, 16:35:24) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-1

Word2Vec Skip-Gram模型训练完成。


In [5]:
# 4. Show the word vector for "环境" together with its shape
keyed_vectors = word2vec_model.wv
vector_environment = keyed_vectors['环境']

print("“环境”的词向量:", vector_environment)
print("形状:", vector_environment.shape)

“环境”的词向量: [ 7.6966463e-03  9.1206422e-03  1.1355019e-03 -8.3250795e-03
  8.4250160e-03 -3.6962307e-03  5.7421732e-03  4.3915794e-03
  9.6899448e-03 -9.2934975e-03  9.2084054e-03 -9.2815282e-03
 -6.9077122e-03 -9.1021946e-03 -5.5471100e-03  7.3688962e-03
  9.1644777e-03 -3.3253515e-03  3.7230505e-03 -3.6252034e-03
  7.8814710e-03  5.8668759e-03  2.0861626e-07 -3.6286747e-03
 -7.2243060e-03  4.7686161e-03  1.4529788e-03 -2.6131857e-03
  7.8378068e-03 -4.0496145e-03 -9.1489861e-03 -2.2554707e-03
  1.2514711e-04 -6.6392552e-03 -5.4866159e-03 -8.4997769e-03
  9.2298733e-03  7.4240281e-03 -2.9524326e-04  7.3676636e-03
  7.9507884e-03 -7.8357337e-04  6.6120909e-03  3.7675237e-03
  5.0768424e-03  7.2529912e-03 -4.7393893e-03 -2.1855331e-03
  8.7312341e-04  4.2362059e-03  3.3043313e-03  5.0958274e-03
  4.5864857e-03 -8.4385090e-03 -3.1838394e-03 -7.2367596e-03
  9.6814223e-03  5.0065992e-03  1.7084122e-04  4.1129780e-03
 -7.6561309e-03 -6.2946510e-03  3.0763936e-03  6.5346383e-03
  3.9498745e-0

In [6]:
# 5. Show the 3 words whose vectors are most similar to "好吃"
similar_words = word2vec_model.wv.most_similar(
    positive=['好吃'],
    topn=3,
)
print("与“好吃”最接近的3个词:", similar_words)

与“好吃”最接近的3个词: [('氛围', 0.21617145836353302), ('美食', 0.0931011214852333), ('聚会', 0.09291718155145645)]


In [7]:
# 6. Compute pairwise similarities against "好吃"
keyed_vectors = word2vec_model.wv

# "好吃" vs. "美味"
similarity_delicious_delicacy = keyed_vectors.similarity('好吃', '美味')
# "好吃" vs. "蟑螂"
similarity_delicious_cockroach = keyed_vectors.similarity('好吃', '蟑螂')

print("“好吃”和“美味”的相似度:", similarity_delicious_delicacy)
print("“好吃”和“蟑螂”的相似度:", similarity_delicious_cockroach)

“好吃”和“美味”的相似度: -0.052346736
“好吃”和“蟑螂”的相似度: 0.016134689


In [8]:
# 7. Vector arithmetic: "餐厅" + "聚会" - "安静" = ?
# Words in `positive` are added to the query and words in `negative` are
# subtracted; only the single best match is requested.
analogy_positive = ['餐厅', '聚会']
analogy_negative = ['安静']
result = word2vec_model.wv.most_similar(
    positive=analogy_positive,
    negative=analogy_negative,
    topn=1,
)
print("“餐厅+聚会-安静=？”的结果:", result)

“餐厅+聚会-安静=？”的结果: [('舒适', 0.09841553866863251)]
