# 範例 : 載入 GloVe 模型, 並使用 gensim 套件觀察模型表現

# [教學目標]
- 載入並觀察 GloVe 模型的效果

# [學習重點]
- 使用 GloVe 官方預訓練詞向量, 並使用 gensim 觀察訓練結果

# Step 1
- 執行前請先下載 GloVe 預訓練詞向量檔 http://nlp.stanford.edu/data/glove.6B.zip
- 解壓縮後, 將 glove.6B.300d.txt 複製到本程式同一執行目錄中, 再執行後續程式

In [1]:
# 載入 gensim 與 GloVe 模型容器
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

# Silence every warning for the rest of the session so the notebook output
# stays readable (this installs a blanket "ignore" filter for all categories).
import warnings
warnings.simplefilter("ignore")

In [2]:
# Convert the raw GloVe text file into word2vec text format so that gensim
# can load it. Both paths are relative to the current working directory;
# `output_file` is reused by the loading step that follows.
input_file, output_file = 'glove.6B.300d.txt', 'gensim_glove.6B.300d.txt'
glove2word2vec(glove_input_file=input_file, word2vec_output_file=output_file)

FileNotFoundError: [Errno 2] No such file or directory: 'glove.6B.300d.txt'

In [3]:
# Load the converted vectors (plain-text word2vec format, hence binary=False)
# into a KeyedVectors instance used by all later queries.
model = KeyedVectors.load_word2vec_format(fname=output_file, binary=False)

# Step 2
- 檢驗預訓練詞向量的效果

In [4]:
# List the words whose vectors are closest to "woman", as (word, score) pairs.
model.most_similar(positive=['woman'])

[('girl', 0.7296419143676758),
 ('man', 0.6998662948608398),
 ('mother', 0.689943790435791),
 ('she', 0.6433225870132446),
 ('her', 0.6327142715454102),
 ('female', 0.6251603364944458),
 ('herself', 0.6215280890464783),
 ('person', 0.6170897483825684),
 ('women', 0.6047609448432922),
 ('wife', 0.5986992716789246)]

In [5]:
# Word-analogy query: words closest to (woman + king - man).
# `positive` words pull the result towards them, `negative` words push it
# away; topn=5 keeps only the five best matches.
model.most_similar(negative=['man'], positive=['woman', 'king'], topn=5)

[('queen', 0.6713277101516724),
 ('princess', 0.5432624816894531),
 ('throne', 0.5386105179786682),
 ('monarch', 0.5347574949264526),
 ('daughter', 0.498025119304657)]

In [6]:
# Pick the word that fits least well with the others in the list.
# `model` is already a KeyedVectors object, so the method is called on it
# directly: the `.wv` alias on KeyedVectors is deprecated in gensim 3.x and
# removed in gensim 4.0, where it raises AttributeError.
model.doesnt_match("breakfast cereal dinner lunch".split())

'cereal'

In [7]:
# Show the similarity score between two words.
# `model` is already a KeyedVectors object, so the method is called on it
# directly: the `.wv` alias on KeyedVectors is deprecated in gensim 3.x and
# removed in gensim 4.0, where it raises AttributeError.
model.similarity('woman', 'man')

0.69986635

In [8]:
# Show the embedding vector stored for a single word. Indexing the model with
# a word returns its vector as a NumPy array (presumably 300 values, going by
# the "300d" in the source file name).
model['computer']

array([-2.7628e-01,  1.3999e-01,  9.8519e-02, -6.4019e-01,  3.1988e-02,
        1.0066e-01, -1.8673e-01, -3.7129e-01,  5.9740e-01, -2.0405e+00,
        2.2368e-01, -2.6314e-02,  7.2408e-01, -4.3829e-01,  4.8886e-01,
       -3.5486e-03, -1.0006e-01, -3.0587e-01, -1.5621e-01, -6.8136e-02,
        2.1104e-01,  2.9287e-01, -8.8861e-02, -2.0462e-01, -5.7602e-01,
        3.4526e-01,  4.1390e-01,  1.7917e-01,  2.5143e-01, -2.2678e-01,
       -1.0103e-01,  1.4576e-01,  2.0127e-01,  3.1810e-01, -7.8907e-01,
       -2.2194e-01, -2.4833e-01, -1.5103e-02, -2.0050e-01, -2.6441e-02,
        1.8551e-01,  3.3782e-01, -3.3543e-01,  8.6117e-01, -4.7083e-02,
       -1.7009e-01,  3.0438e-01,  9.4119e-02,  3.2435e-01, -8.1171e-01,
        8.8966e-01, -3.9149e-01,  1.6828e-01,  1.4316e-01,  3.6339e-03,
       -6.4557e-02,  4.5777e-02, -3.2248e-01,  4.8943e-02,  1.6817e-01,
        6.8344e-02,  5.4227e-01,  1.2493e-01,  6.9742e-01, -3.7194e-02,
        3.3080e-01, -4.2194e-01,  3.3970e-01,  2.7646e-01, -1.60