# Exporting fastText

In [1]:
import fasttext
import gensim
from mecab import MeCab
import torch
from torch.nn import Embedding

## MeCab

In [2]:
# Split the example sentence into morphemes with MeCab.
mecab = MeCab()
document = '우리에게 굵직한 지혜들을 건네는 너무나 멋진 영화'
# `morphs` returns the list of morpheme strings shown in the next cell.
morphemes = mecab.morphs(document)

In [3]:
morphemes

['우리', '에게', '굵직', '한', '지혜', '들', '을', '건네', '는', '너무나', '멋진', '영화']

In [4]:
' '.join(morphemes) 

'우리 에게 굵직 한 지혜 들 을 건네 는 너무나 멋진 영화'

## fastText

In [5]:
model = fasttext.load_model('../baseline/tmp/fasttext.bin')



In [6]:
model.get_sentence_vector(' '.join(morphemes))

array([-0.16013725,  0.09005935,  0.00400428, -0.02762699, -0.11783235,
        0.07976735, -0.03836356,  0.0338936 ,  0.12256839,  0.04571179,
        0.0437787 ,  0.00462348,  0.16793722, -0.02188173,  0.05547931,
        0.06301126, -0.00247431, -0.02743533,  0.0066749 ,  0.00479955,
       -0.00394412, -0.01312571, -0.01293556,  0.047396  , -0.01472647,
       -0.05767235,  0.08567697,  0.04963464, -0.02022251, -0.04452684,
        0.06156359, -0.03003763, -0.108027  , -0.06451722,  0.04826152,
       -0.00611135, -0.02877227,  0.15473019,  0.01656478,  0.05878403,
        0.06016641,  0.00974289, -0.00591775,  0.09609663, -0.01499486,
       -0.08712989,  0.01524194,  0.0886748 , -0.06767763,  0.11617516,
        0.01487919, -0.02622127, -0.14212397,  0.0141244 , -0.10621323,
       -0.00182579,  0.03622719, -0.02664652, -0.00377709,  0.01688775,
       -0.08327223,  0.20424649,  0.03198184,  0.01031437,  0.01716161,
        0.05074051,  0.09522699,  0.0571514 , -0.09904211, -0.00

In [7]:
# Look up one fastText word vector per morpheme, then add them element-wise
# into a single sentence-level vector. Note that this plain sum gives different
# values from `model.get_sentence_vector` (compare the two cell outputs).
embeddings = [model.get_word_vector(morpheme) for morpheme in morphemes]
sentence_embedding = sum(embeddings)

In [8]:
sentence_embedding

array([-5.3881803e+00,  3.3339934e+00, -1.9899374e-01, -1.3103043e+00,
       -3.9608002e+00,  2.8865838e+00, -1.2759537e+00,  8.7179995e-01,
        3.7926445e+00,  1.6915550e+00,  9.2647779e-01,  1.7089513e-01,
        5.7450757e+00, -9.4605929e-01,  1.8335358e+00,  1.7263796e+00,
        8.6635843e-02, -6.0403693e-01,  3.5892546e-01,  1.9706249e-01,
       -2.7951634e-01, -5.7524288e-01, -3.7349406e-01,  1.2886776e+00,
       -5.2843738e-01, -1.6650205e+00,  2.9132798e+00,  1.8467942e+00,
       -7.1334571e-01, -1.3275107e+00,  2.0068963e+00, -1.0071089e+00,
       -3.7656507e+00, -2.7547455e+00,  1.5657277e+00, -3.7755519e-03,
       -1.2180982e+00,  5.3885221e+00,  9.0594316e-01,  1.7410032e+00,
        2.2105443e+00, -1.5293247e-01, -5.7573795e-01,  3.2544551e+00,
       -1.6215771e-01, -3.0973606e+00,  2.2906888e-01,  3.1046381e+00,
       -2.2722075e+00,  3.9370999e+00,  3.6998588e-01, -1.0924687e+00,
       -4.8797350e+00,  1.3474625e-01, -3.7131810e+00,  5.5600509e-02,
      

## gensim

### Loading fastText with gensim

In [9]:
# Load the same fastText binary through gensim.
# NOTE: this rebinds `model`, which referred to the `fasttext` model until now;
# the cells below use the gensim object (accessed via `model.wv`).
model = gensim.models.fasttext.load_facebook_model('../baseline/tmp/fasttext.bin')

In [10]:
model

<gensim.models.fasttext.FastText at 0x7ffa00dfa850>

### Vocabulary-to-index dictionary

In [11]:
model.wv.key_to_index

{'1': 0,
 '.': 1,
 '0': 2,
 '2': 3,
 '의': 4,
 ',': 5,
 '이': 6,
 '는': 7,
 '다': 8,
 ')': 9,
 '(': 10,
 '9': 11,
 '년': 12,
 '에': 13,
 '을': 14,
 '하': 15,
 '3': 16,
 '5': 17,
 '4': 18,
 '8': 19,
 '은': 20,
 '6': 21,
 '7': 22,
 ':': 23,
 '를': 24,
 '월': 25,
 '분류': 26,
 '일': 27,
 '고': 28,
 '-': 29,
 '가': 30,
 '있': 31,
 '에서': 32,
 '으로': 33,
 '로': 34,
 '한': 35,
 '되': 36,
 '었': 37,
 '과': 38,
 '들': 39,
 '와': 40,
 '도': 41,
 '했': 42,
 '적': 43,
 '인': 44,
 '였': 45,
 '</s>': 46,
 '그': 47,
 '어': 48,
 '기': 49,
 '《': 50,
 '제': 51,
 '것': 52,
 '*': 53,
 '~': 54,
 '게': 55,
 '지': 56,
 '"': 57,
 '》': 58,
 '여': 59,
 '한다': 60,
 '수': 61,
 '역': 62,
 '된': 63,
 '등': 64,
 '/': 65,
 '며': 66,
 '대': 67,
 '·': 68,
 '회': 69,
 '선수': 70,
 '영화': 71,
 '대한민국': 72,
 '할': 73,
 '던': 74,
 '해': 75,
 '아': 76,
 '만': 77,
 '%': 78,
 '명': 79,
 '않': 80,
 '자': 81,
 '시': 82,
 '에게': 83,
 '중': 84,
 '주': 85,
 '까지': 86,
 '미국': 87,
 '았': 88,
 '나': 89,
 '번': 90,
 '면': 91,
 '지만': 92,
 '일본': 93,
 '없': 94,
 '사람': 95,
 '받': 96,
 '성': 97,
 '위': 98,
 '

### Pretrained embeddings

In [12]:
model.wv.vectors

array([[-0.19089553, -0.04864176, -0.12746184, ..., -0.20128623,
        -0.05452869,  0.21092379],
       [-0.04622136,  0.09959393,  0.03069987, ..., -0.21032026,
        -0.0100284 , -0.01041338],
       [-0.0235911 ,  0.09469731, -0.1185493 , ..., -0.18746178,
         0.04212563,  0.16035339],
       ...,
       [-0.50652236,  0.14685082, -0.24950482, ..., -0.15726815,
         0.6935291 ,  0.5601857 ],
       [-0.38853964,  0.02418062,  0.0767933 , ..., -0.3630212 ,
        -0.27333122,  0.4337093 ],
       [-0.26303595, -0.38537705, -0.3572211 , ...,  0.04461757,
         0.09841006, -0.07057536]], dtype=float32)

### Is the length of `wv.key_to_index` the same as the length of `wv.vectors`?

In [13]:
assert len(model.wv.key_to_index) == len(model.wv.vectors)

### Sentence embedding

Background on a gensim equivalent of fastText's `get_sentence_vector`:

1. https://github.com/RaRe-Technologies/gensim/issues/3015
2. https://github.com/RaRe-Technologies/gensim/pull/3188
3. https://stackoverflow.com/questions/65397810/whats-the-equivalent-to-get-sentence-vector-for-gensims-fasttext

In [14]:
# Same tokenizer and example sentence as in the MeCab section at the top.
mecab = MeCab()
document = '우리에게 굵직한 지혜들을 건네는 너무나 멋진 영화'

In [15]:
document in model.wv.key_to_index

False

In [16]:
morphemes = mecab.morphs(document)
morphemes

['우리', '에게', '굵직', '한', '지혜', '들', '을', '건네', '는', '너무나', '멋진', '영화']

In [17]:
# Look up one gensim word vector per morpheme and add them element-wise into
# a single sentence-level vector; the last line displays the result.
embeddings = [model.wv.get_vector(morpheme) for morpheme in morphemes]
sentence_embedding = sum(embeddings)
sentence_embedding

array([-5.3881803e+00,  3.3339934e+00, -1.9899373e-01, -1.3103043e+00,
       -3.9608002e+00,  2.8865838e+00, -1.2759537e+00,  8.7179995e-01,
        3.7926445e+00,  1.6915551e+00,  9.2647779e-01,  1.7089513e-01,
        5.7450757e+00, -9.4605929e-01,  1.8335358e+00,  1.7263796e+00,
        8.6635843e-02, -6.0403693e-01,  3.5892546e-01,  1.9706249e-01,
       -2.7951634e-01, -5.7524288e-01, -3.7349406e-01,  1.2886776e+00,
       -5.2843738e-01, -1.6650205e+00,  2.9132798e+00,  1.8467942e+00,
       -7.1334577e-01, -1.3275107e+00,  2.0068963e+00, -1.0071089e+00,
       -3.7656507e+00, -2.7547455e+00,  1.5657277e+00, -3.7755519e-03,
       -1.2180982e+00,  5.3885221e+00,  9.0594304e-01,  1.7410032e+00,
        2.2105443e+00, -1.5293247e-01, -5.7573795e-01,  3.2544551e+00,
       -1.6215771e-01, -3.0973606e+00,  2.2906888e-01,  3.1046381e+00,
       -2.2722075e+00,  3.9370999e+00,  3.6998582e-01, -1.0924687e+00,
       -4.8797350e+00,  1.3474619e-01, -3.7131810e+00,  5.5600509e-02,
      

In [18]:
sentence_embedding = model.wv.get_sentence_vector(morphemes)

In [19]:
sentence_embedding

array([-0.16013725,  0.09005935,  0.00400428, -0.02762699, -0.11783233,
        0.07976735, -0.03836356,  0.0338936 ,  0.12256839,  0.04571179,
        0.0437787 ,  0.00462348,  0.16793722, -0.02188173,  0.05547931,
        0.06301126, -0.00247431, -0.02743533,  0.0066749 ,  0.00479955,
       -0.00394412, -0.01312572, -0.01293556,  0.047396  , -0.01472647,
       -0.05767235,  0.08567697,  0.04963464, -0.02022251, -0.04452683,
        0.0615636 , -0.03003763, -0.10802699, -0.06451722,  0.04826152,
       -0.00611135, -0.02877227,  0.15473019,  0.01656478,  0.05878403,
        0.06016641,  0.00974289, -0.00591775,  0.09609661, -0.01499486,
       -0.08712989,  0.01524194,  0.08867479, -0.06767762,  0.11617515,
        0.01487919, -0.02622127, -0.14212398,  0.0141244 , -0.10621322,
       -0.00182579,  0.03622719, -0.02664652, -0.00377709,  0.01688775,
       -0.08327223,  0.20424648,  0.03198184,  0.01031437,  0.01716161,
        0.05074051,  0.09522698,  0.0571514 , -0.09904211, -0.00

## PyTorch

In [20]:
# Copy gensim's vector matrix into a float32 tensor and wrap it in an
# `Embedding` layer. `torch.tensor(..., dtype=...)` replaces the legacy
# `torch.FloatTensor(...)` constructor.
embedding = Embedding.from_pretrained(torch.tensor(model.wv.vectors, dtype=torch.float32))

In [21]:
embedding

Embedding(358043, 100)

### Indices of the tokens

In [22]:
# Map each morpheme to its row index in the vocabulary. `key_to_index` is a
# plain dict lookup, so a morpheme that is not in the vocabulary raises
# KeyError here.
tokens = [model.wv.key_to_index[morpheme] for morpheme in morphemes]

In [23]:
tokens

[359, 83, 61033, 35, 5257, 39, 14, 13637, 7, 6661, 5975, 71]

In [24]:
# Fetch the embedding row for every token index. The indices are passed as an
# int64 tensor; `torch.tensor(..., dtype=torch.long)` replaces the legacy
# `torch.LongTensor(...)` constructor.
embeddings = embedding(torch.tensor(tokens, dtype=torch.long))

In [25]:
embeddings

tensor([[-0.4039,  0.4243, -0.2641,  ..., -0.3232, -0.1522,  0.1071],
        [-0.3133,  0.2569, -0.2068,  ..., -0.1537,  0.4309,  0.0933],
        [-0.3159, -0.1389,  0.2980,  ..., -0.6312, -0.3703,  0.5739],
        ...,
        [-0.8329,  0.3622, -0.2046,  ..., -0.3303,  0.1547, -0.2323],
        [-0.9337, -0.0201, -0.1198,  ..., -0.1091,  0.1686,  0.5696],
        [-0.2876,  0.6111,  0.0328,  ..., -0.5097, -0.0914,  0.4112]])

In [26]:
embeddings.shape

torch.Size([12, 100])

In [27]:
# Sum over the morpheme dimension (dim 0) to get a single sentence vector.
embeddings.sum(dim=0)

tensor([-5.3882e+00,  3.3340e+00, -1.9899e-01, -1.3103e+00, -3.9608e+00,
         2.8866e+00, -1.2760e+00,  8.7180e-01,  3.7926e+00,  1.6916e+00,
         9.2648e-01,  1.7090e-01,  5.7451e+00, -9.4606e-01,  1.8335e+00,
         1.7264e+00,  8.6636e-02, -6.0404e-01,  3.5893e-01,  1.9706e-01,
        -2.7952e-01, -5.7524e-01, -3.7349e-01,  1.2887e+00, -5.2844e-01,
        -1.6650e+00,  2.9133e+00,  1.8468e+00, -7.1335e-01, -1.3275e+00,
         2.0069e+00, -1.0071e+00, -3.7657e+00, -2.7547e+00,  1.5657e+00,
        -3.7756e-03, -1.2181e+00,  5.3885e+00,  9.0594e-01,  1.7410e+00,
         2.2105e+00, -1.5293e-01, -5.7574e-01,  3.2545e+00, -1.6216e-01,
        -3.0974e+00,  2.2907e-01,  3.1046e+00, -2.2722e+00,  3.9371e+00,
         3.6999e-01, -1.0925e+00, -4.8797e+00,  1.3475e-01, -3.7132e+00,
         5.5601e-02,  1.4174e+00, -9.6063e-01, -2.3184e-01,  2.6176e-01,
        -3.3821e+00,  6.2837e+00,  1.0648e+00,  5.0091e-01,  5.4767e-01,
         1.6379e+00,  2.9534e+00,  1.5720e+00, -3.3