# 作業 : 變更不同的 GloVe 模型, 並使用 gensim 套件觀察模型表現

# [作業目標]
- 載入不同版本的 GloVe 模型, 並觀察模型間有何差異

# [作業重點]
- 觀察 GloVe 不同的預訓練詞向量, 效果間的差異

# glove.6B.50d

## Step 1
- 到 GloVe 官方網站 (https://github.com/stanfordnlp/GloVe), 由四組預訓練模型選擇一項下載
- 將模型檔解壓縮後, 選擇並複製詞向量檔到本程式同一執行目錄中
- 依照你所選擇的詞向量檔, 修改設定模型中的"input_file"與"output_file", 再執行後續程式

In [1]:
# Load gensim's KeyedVectors container and the GloVe-to-word2vec converter script
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

# Suppress warning messages
# NOTE(review): this blanket "ignore" filter also hides library deprecation warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Model setup: path of the raw GloVe vectors and path of the converted copy.
input_file = './glove.6B/glove.6B.50d.txt'
output_file = './glove.6B/gensim_glove.6B.50d.txt'
# Convert the GloVe text file into word2vec text format so gensim can load it;
# the returned tuple is displayed as the cell output.
glove2word2vec(glove_input_file=input_file, word2vec_output_file=output_file)

(400000, 50)

In [3]:
# Load the converted (word2vec text format) vectors into a KeyedVectors model.
# The conversion itself was already done by glove2word2vec in the previous cell.
model = KeyedVectors.load_word2vec_format(fname=output_file, binary=False)

## Step 2
- 觀察變更預訓練詞向量的效果

In [4]:
# Show the words most similar to 'woman', together with their similarity scores.
model.most_similar(positive=['woman'])

[('girl', 0.906528115272522),
 ('man', 0.8860336542129517),
 ('mother', 0.8763704299926758),
 ('her', 0.8613135814666748),
 ('boy', 0.8596119284629822),
 ('she', 0.8430695533752441),
 ('herself', 0.8224567770957947),
 ('child', 0.8108214139938354),
 ('wife', 0.8037394285202026),
 ('old', 0.7982393503189087)]

In [5]:
# Analogy query: top 5 words closest to 'woman' + 'king' - 'man'
# (positive words are added to the query, negative words are subtracted).
model.most_similar(
    positive=['woman', 'king'],
    negative=['man'],
    topn=5,
)

[('queen', 0.8523603677749634),
 ('throne', 0.7664333581924438),
 ('prince', 0.7592144012451172),
 ('daughter', 0.7473883032798767),
 ('elizabeth', 0.7460219860076904)]

In [6]:
# Pick the word that fits least with the others in the list.
# `model` is already a KeyedVectors object, so call doesnt_match on it directly:
# the `.wv` alias on KeyedVectors is deprecated in gensim 3.x and removed in
# gensim 4.0, where `model.wv` raises AttributeError.
model.doesnt_match("breakfast cereal dinner lunch".split())

'cereal'

In [7]:
# Show the similarity score between two words.
# `model` is already a KeyedVectors object, so call similarity on it directly:
# the `.wv` alias on KeyedVectors is deprecated in gensim 3.x and removed in
# gensim 4.0, where `model.wv` raises AttributeError.
model.similarity('woman', 'man')

0.8860338

In [8]:
# Show the embedding vector stored for the word 'computer'
model['computer']

array([ 0.079084, -0.81504 ,  1.7901  ,  0.91653 ,  0.10797 , -0.55628 ,
       -0.84427 , -1.4951  ,  0.13418 ,  0.63627 ,  0.35146 ,  0.25813 ,
       -0.55029 ,  0.51056 ,  0.37409 ,  0.12092 , -1.6166  ,  0.83653 ,
        0.14202 , -0.52348 ,  0.73453 ,  0.12207 , -0.49079 ,  0.32533 ,
        0.45306 , -1.585   , -0.63848 , -1.0053  ,  0.10454 , -0.42984 ,
        3.181   , -0.62187 ,  0.16819 , -1.0139  ,  0.064058,  0.57844 ,
       -0.4556  ,  0.73783 ,  0.37203 , -0.57722 ,  0.66441 ,  0.055129,
        0.037891,  1.3275  ,  0.30991 ,  0.50697 ,  1.2357  ,  0.1274  ,
       -0.11434 ,  0.20709 ], dtype=float32)

# glove.42B.300d

## Step 1

In [9]:
# Model setup: path of the raw GloVe vectors and path of the converted copy.
input_file = './glove.42B.300d/glove.42B.300d.txt'
output_file = './glove.42B.300d/gensim_glove.42B.300d.txt'
# Convert the GloVe text file into word2vec text format so gensim can load it;
# the returned tuple is displayed as the cell output.
glove2word2vec(glove_input_file=input_file, word2vec_output_file=output_file)

(1917494, 300)

In [10]:
# Load the converted (word2vec text format) vectors into a KeyedVectors model.
# The conversion itself was already done by glove2word2vec in the previous cell.
model = KeyedVectors.load_word2vec_format(fname=output_file, binary=False)

## Step 2

In [11]:
# Show the words most similar to 'woman', together with their similarity scores.
model.most_similar(positive=['woman'])

[('man', 0.8047993183135986),
 ('girl', 0.7628219127655029),
 ('women', 0.7070599794387817),
 ('she', 0.697038471698761),
 ('lady', 0.6870704293251038),
 ('mother', 0.6809671521186829),
 ('wife', 0.666800856590271),
 ('female', 0.659212589263916),
 ('her', 0.6492205858230591),
 ('person', 0.6446772813796997)]

In [12]:
# Analogy query: top 5 words closest to 'woman' + 'king' - 'man'
# (positive words are added to the query, negative words are subtracted).
model.most_similar(
    positive=['woman', 'king'],
    negative=['man'],
    topn=5,
)

[('queen', 0.7833545804023743),
 ('prince', 0.5993281006813049),
 ('princess', 0.5823438167572021),
 ('elizabeth', 0.553723931312561),
 ('daughter', 0.544403076171875)]

In [13]:
# Pick the word that fits least with the others in the list.
# `model` is already a KeyedVectors object, so call doesnt_match on it directly:
# the `.wv` alias on KeyedVectors is deprecated in gensim 3.x and removed in
# gensim 4.0, where `model.wv` raises AttributeError.
model.doesnt_match("breakfast cereal dinner lunch".split())

'cereal'

In [14]:
# Show the similarity score between two words.
# `model` is already a KeyedVectors object, so call similarity on it directly:
# the `.wv` alias on KeyedVectors is deprecated in gensim 3.x and removed in
# gensim 4.0, where `model.wv` raises AttributeError.
model.similarity('woman', 'man')

0.8047993

In [15]:
# Show the embedding vector stored for the word 'computer'
model['computer']

array([-3.1564e-02,  1.2363e-01, -1.7034e-01,  1.7133e-01,  3.6865e-01,
       -7.9461e-01, -3.5579e+00,  1.7796e-01,  7.5441e-01,  5.8339e-01,
       -1.1223e-01, -4.2855e-02,  3.0289e-01,  7.3101e-02, -5.7529e-01,
        5.1862e-01, -6.3629e-02, -4.7917e-01, -2.7654e-01, -1.3803e-01,
       -2.3805e-01, -1.0256e-01,  1.9888e-01, -5.4682e-02, -2.4799e-01,
        9.8878e-02,  4.0410e-01,  2.0282e-01, -1.6771e-01, -5.6312e-01,
       -1.1060e-01, -9.1475e-02, -2.2039e-01, -3.4514e-01, -1.7298e-01,
        2.6311e-01, -2.1555e-01,  4.3417e-01,  1.7068e-01, -3.2459e-01,
       -3.0770e-01, -5.4136e-01, -2.1150e-01, -7.7317e-02,  1.9324e-02,
        2.3751e-01,  1.3064e-01,  3.1118e-01,  8.6216e-04, -1.7932e-01,
       -2.5800e-01,  3.5938e-01,  2.5808e-01,  2.1604e-01,  2.6730e-02,
       -2.1068e-01,  8.7728e-02, -1.6411e-01, -7.4360e-01, -1.6145e-01,
       -9.5305e-02,  3.0688e-01,  2.4271e-01, -3.9576e-01, -3.8444e-02,
        4.4282e-01, -2.4518e-01, -1.5674e-02, -1.2109e-01, -2.76

# glove.840B.300d

## Step 1

In [16]:
# Model setup: path of the raw GloVe vectors and path of the converted copy.
input_file = './glove.840B.300d/glove.840B.300d.txt'
output_file = './glove.840B.300d/gensim_glove.840B.300d.txt'
# Convert the GloVe text file into word2vec text format so gensim can load it;
# the returned tuple is displayed as the cell output.
glove2word2vec(glove_input_file=input_file, word2vec_output_file=output_file)

(2196017, 300)

In [17]:
# Load the converted (word2vec text format) vectors into a KeyedVectors model.
# The conversion itself was already done by glove2word2vec in the previous cell.
model = KeyedVectors.load_word2vec_format(fname=output_file, binary=False)

## Step 2

In [18]:
# Show the words most similar to 'woman', together with their similarity scores.
model.most_similar(positive=['woman'])

[('lady', 0.7816672921180725),
 ('girl', 0.7700793743133545),
 ('man', 0.740174412727356),
 ('women', 0.7178930640220642),
 ('mother', 0.6826747059822083),
 ('female', 0.6689871549606323),
 ('she', 0.6653774976730347),
 ('wife', 0.6570038795471191),
 ('pregnant', 0.642966628074646),
 ('herself', 0.6276910305023193)]

In [19]:
# Analogy query: top 5 words closest to 'woman' + 'king' - 'man'
# (positive words are added to the query, negative words are subtracted).
model.most_similar(
    positive=['woman', 'king'],
    negative=['man'],
    topn=5,
)

[('queen', 0.775162398815155),
 ('prince', 0.6123065948486328),
 ('princess', 0.6016970872879028),
 ('kings', 0.5996101498603821),
 ('queens', 0.5655797719955444)]

In [20]:
# Pick the word that fits least with the others in the list.
# `model` is already a KeyedVectors object, so call doesnt_match on it directly:
# the `.wv` alias on KeyedVectors is deprecated in gensim 3.x and removed in
# gensim 4.0, where `model.wv` raises AttributeError.
model.doesnt_match("breakfast cereal dinner lunch".split())

'cereal'

In [21]:
# Show the similarity score between two words.
# `model` is already a KeyedVectors object, so call similarity on it directly:
# the `.wv` alias on KeyedVectors is deprecated in gensim 3.x and removed in
# gensim 4.0, where `model.wv` raises AttributeError.
model.similarity('woman', 'man')

0.74017453

In [22]:
# Show the embedding vector stored for the word 'computer'
model['computer']

array([-0.26657  , -0.13717  ,  0.23549  , -0.26712  ,  0.093324 ,
        0.17563  , -0.33309  , -0.81744  ,  0.52655  ,  1.588    ,
        0.078138 , -0.093094 , -0.27167  , -0.31207  , -0.35018  ,
        0.072898 ,  0.032007 ,  2.5085   , -0.35727  , -0.018104 ,
        0.26102  , -0.3157   , -0.46466  ,  0.029905 ,  0.576    ,
       -0.34603  , -0.1502   , -0.23481  ,  0.20626  ,  0.28202  ,
       -0.6897   , -0.17632  ,  0.40369  ,  0.40672  ,  0.13068  ,
       -0.059988 , -0.3405   ,  0.46314  , -0.21149  ,  0.033152 ,
        0.36526  ,  0.049521 , -0.096128 , -0.087093 , -0.12957  ,
        0.22673  , -0.50435  , -0.50732  ,  0.27695  ,  0.14523  ,
       -0.11455  , -0.13968  ,  0.38549  ,  0.26088  , -0.43217  ,
       -0.45305  ,  0.072362 , -0.53383  ,  0.23317  ,  0.10471  ,
        0.51278  , -0.05899  ,  0.06051  ,  0.22083  ,  0.63098  ,
       -0.03232  , -0.36192  ,  0.45576  ,  0.037796 ,  0.21651  ,
        0.44988  , -0.33074  ,  0.32553  , -0.30153  , -0.2041

# glove.twitter.27B.50d

## Step 1

In [24]:
# Model setup: path of the raw GloVe vectors and path of the converted copy.
input_file = './glove.twitter.27B/glove.twitter.27B.50d.txt'
output_file = './glove.twitter.27B/gensim_glove.twitter.27B.50d.txt'
# Convert the GloVe text file into word2vec text format so gensim can load it;
# the returned tuple is displayed as the cell output.
glove2word2vec(glove_input_file=input_file, word2vec_output_file=output_file)

(1193514, 50)

In [25]:
# Load the converted (word2vec text format) vectors into a KeyedVectors model.
# The conversion itself was already done by glove2word2vec in the previous cell.
model = KeyedVectors.load_word2vec_format(fname=output_file, binary=False)

## Step 2

In [26]:
# Show the words most similar to 'woman', together with their similarity scores.
model.most_similar(positive=['woman'])

[('person', 0.8589048981666565),
 ('guy', 0.8574362397193909),
 ('child', 0.8568395376205444),
 ('wife', 0.8564570546150208),
 ('women', 0.852713942527771),
 ('female', 0.8499904274940491),
 ('girl', 0.8392189741134644),
 ('mother', 0.8352892994880676),
 ('being', 0.8168318867683411),
 ('called', 0.8135411739349365)]

In [27]:
# Analogy query: top 5 words closest to 'woman' + 'king' - 'man'
# (positive words are added to the query, negative words are subtracted).
model.most_similar(
    positive=['woman', 'king'],
    negative=['man'],
    topn=5,
)

[('prince', 0.759779691696167),
 ('stone', 0.7595877051353455),
 ('queen', 0.7546269297599792),
 ('meets', 0.7404001951217651),
 ('royal', 0.7376409769058228)]

In [28]:
# Pick the word that fits least with the others in the list.
# `model` is already a KeyedVectors object, so call doesnt_match on it directly:
# the `.wv` alias on KeyedVectors is deprecated in gensim 3.x and removed in
# gensim 4.0, where `model.wv` raises AttributeError.
model.doesnt_match("breakfast cereal dinner lunch".split())

'cereal'

In [29]:
# Show the similarity score between two words.
# `model` is already a KeyedVectors object, so call similarity on it directly:
# the `.wv` alias on KeyedVectors is deprecated in gensim 3.x and removed in
# gensim 4.0, where `model.wv` raises AttributeError.
model.similarity('woman', 'man')

0.70595735

In [30]:
# Show the embedding vector stored for the word 'computer'
model['computer']

array([ 0.65572  ,  0.50502  , -0.64935  ,  0.14142  , -0.73858  ,
       -0.060631 ,  0.5229   , -1.4541   ,  0.79533  , -0.05805  ,
        0.6463   ,  0.57013  , -3.7753   , -0.2414   , -0.16999  ,
       -0.57056  ,  0.45857  ,  0.53634  , -0.45337  , -0.13149  ,
       -0.94989  ,  0.063493 ,  0.47575  ,  0.34153  , -1.2699   ,
       -0.0061569, -0.47168  , -0.68425  , -0.31045  ,  0.5198   ,
       -0.3573   , -0.97569  , -0.37267  , -1.3218   ,  0.27268  ,
       -0.72485  ,  0.46634  ,  0.40591  ,  0.17259  , -0.61372  ,
       -0.39455  ,  1.406    , -1.3402   ,  0.21337  ,  0.30572  ,
       -0.043278 , -0.019297 ,  0.62462  , -0.78825  ,  0.29806  ],
      dtype=float32)