# Exporting fastText

In [1]:
import fasttext
import numpy as np
import gensim
from mecab import MeCab
import torch
from torch.nn import Embedding
from torch.nn.functional import normalize

## MeCab

- Recall that it’s more appropriate to tokenize a Korean corpus by morphemes, because Korean is an agglutinative language and morphemes are the units that carry meaning
- That's why we don't simply pass the document to the fastText model
- Instead, we tokenize it based on morphemes and pass the tokens to the model

In [2]:
# Split a sample Korean sentence into morphemes with MeCab.
# `morphemes` is reused by the fastText, gensim and PyTorch cells below.
mecab = MeCab()
document = '우리에게 굵직한 지혜들을 건네는 너무나 멋진 영화'
morphemes = mecab.morphs(document)

In [3]:
morphemes

['우리', '에게', '굵직', '한', '지혜', '들', '을', '건네', '는', '너무나', '멋진', '영화']

In [4]:
' '.join(morphemes) 

'우리 에게 굵직 한 지혜 들 을 건네 는 너무나 멋진 영화'

## fastText

In [5]:
model = fasttext.load_model('../baseline/tmp/fasttext.bin')



- Be aware `get_sentence_vector` normalizes each word vector before taking a mean
- That's why `get_sentence_vector` and a plain (unnormalized) mean of the `get_word_vector` outputs give different results

In [6]:
model.get_sentence_vector(' '.join(morphemes))

array([-0.16013725,  0.09005935,  0.00400428, -0.02762699, -0.11783235,
        0.07976735, -0.03836356,  0.0338936 ,  0.12256839,  0.04571179,
        0.0437787 ,  0.00462348,  0.16793722, -0.02188173,  0.05547931,
        0.06301126, -0.00247431, -0.02743533,  0.0066749 ,  0.00479955,
       -0.00394412, -0.01312571, -0.01293556,  0.047396  , -0.01472647,
       -0.05767235,  0.08567697,  0.04963464, -0.02022251, -0.04452684,
        0.06156359, -0.03003763, -0.108027  , -0.06451722,  0.04826152,
       -0.00611135, -0.02877227,  0.15473019,  0.01656478,  0.05878403,
        0.06016641,  0.00974289, -0.00591775,  0.09609663, -0.01499486,
       -0.08712989,  0.01524194,  0.0886748 , -0.06767763,  0.11617516,
        0.01487919, -0.02622127, -0.14212397,  0.0141244 , -0.10621323,
       -0.00182579,  0.03622719, -0.02664652, -0.00377709,  0.01688775,
       -0.08327223,  0.20424649,  0.03198184,  0.01031437,  0.01716161,
        0.05074051,  0.09522699,  0.0571514 , -0.09904211, -0.00

In [9]:
model.get_word_vector('한국종합주가지수')

array([-0.3404287 ,  0.14629197, -0.08776837, -0.24680458, -0.2814848 ,
        0.15057775, -0.0491869 , -0.04219627, -0.16823529,  0.3390175 ,
        0.26483497, -0.088466  ,  0.23041442, -0.10376953,  0.27699685,
       -0.02443741,  0.11224408, -0.24032238,  0.00901352,  0.1358867 ,
        0.169552  ,  0.0714559 , -0.03221908,  0.1070381 ,  0.06202681,
        0.01233821,  0.0925159 , -0.07816944, -0.14903975, -0.12694903,
        0.09238172, -0.20401032, -0.09378432, -0.17411737, -0.01845991,
       -0.2746254 ,  0.16057947,  0.44116694,  0.16157727, -0.02768847,
        0.28650057, -0.05532373, -0.05640784,  0.29480362, -0.19458799,
       -0.00232764,  0.3086277 , -0.02296171, -0.07182577,  0.01512193,
       -0.1581458 ,  0.10353798, -0.09793069, -0.17010964, -0.50876594,
       -0.1894337 ,  0.08700848, -0.22082461, -0.22423746, -0.27011713,
       -0.1915692 ,  0.22655883, -0.03149575, -0.07704592, -0.15466662,
       -0.14191377,  0.11327878,  0.09035682, -0.05632119,  0.04

## gensim

### Loading fastText with gensim

In [22]:
# Load the same .bin file through gensim.
# NOTE: this rebinds `model`, which previously held the object returned by
# fasttext.load_model; the cells below access the vectors through `model.wv`.
model = gensim.models.fasttext.load_facebook_model('../baseline/tmp/fasttext.bin')

In [23]:
model

<gensim.models.fasttext.FastText at 0x7fd4fc5d2c50>

### Vocabulary-id dictionary

In [24]:
model.wv.key_to_index

{'1': 0,
 '.': 1,
 '0': 2,
 '2': 3,
 '의': 4,
 ',': 5,
 '이': 6,
 '는': 7,
 '다': 8,
 ')': 9,
 '(': 10,
 '9': 11,
 '년': 12,
 '에': 13,
 '을': 14,
 '하': 15,
 '3': 16,
 '5': 17,
 '4': 18,
 '8': 19,
 '은': 20,
 '6': 21,
 '7': 22,
 ':': 23,
 '를': 24,
 '월': 25,
 '분류': 26,
 '일': 27,
 '고': 28,
 '-': 29,
 '가': 30,
 '있': 31,
 '에서': 32,
 '으로': 33,
 '로': 34,
 '한': 35,
 '되': 36,
 '었': 37,
 '과': 38,
 '들': 39,
 '와': 40,
 '도': 41,
 '했': 42,
 '적': 43,
 '인': 44,
 '였': 45,
 '</s>': 46,
 '그': 47,
 '어': 48,
 '기': 49,
 '《': 50,
 '제': 51,
 '것': 52,
 '*': 53,
 '~': 54,
 '게': 55,
 '지': 56,
 '"': 57,
 '》': 58,
 '여': 59,
 '한다': 60,
 '수': 61,
 '역': 62,
 '된': 63,
 '등': 64,
 '/': 65,
 '며': 66,
 '대': 67,
 '·': 68,
 '회': 69,
 '선수': 70,
 '영화': 71,
 '대한민국': 72,
 '할': 73,
 '던': 74,
 '해': 75,
 '아': 76,
 '만': 77,
 '%': 78,
 '명': 79,
 '않': 80,
 '자': 81,
 '시': 82,
 '에게': 83,
 '중': 84,
 '주': 85,
 '까지': 86,
 '미국': 87,
 '았': 88,
 '나': 89,
 '번': 90,
 '면': 91,
 '지만': 92,
 '일본': 93,
 '없': 94,
 '사람': 95,
 '받': 96,
 '성': 97,
 '위': 98,
 '

### Pretrained embeddings

In [25]:
model.wv.vectors

array([[-0.19089553, -0.04864176, -0.12746184, ..., -0.20128623,
        -0.05452869,  0.21092379],
       [-0.04622136,  0.09959393,  0.03069987, ..., -0.21032026,
        -0.0100284 , -0.01041338],
       [-0.0235911 ,  0.09469731, -0.1185493 , ..., -0.18746178,
         0.04212563,  0.16035339],
       ...,
       [-0.50652236,  0.14685082, -0.24950482, ..., -0.15726815,
         0.6935291 ,  0.5601857 ],
       [-0.38853964,  0.02418062,  0.0767933 , ..., -0.3630212 ,
        -0.27333122,  0.4337093 ],
       [-0.26303595, -0.38537705, -0.3572211 , ...,  0.04461757,
         0.09841006, -0.07057536]], dtype=float32)

### Is the length of `wv.key_to_index` the same as the length of `wv.vectors`?

In [26]:
assert len(model.wv.key_to_index) == len(model.wv.vectors)

### Sentence embedding

1. https://github.com/RaRe-Technologies/gensim/issues/3015
2. https://github.com/RaRe-Technologies/gensim/pull/3188
3. https://stackoverflow.com/questions/65397810/whats-the-equivalent-to-get-sentence-vector-for-gensims-fasttext

- Be aware `get_sentence_vector` normalizes each word vector before taking a mean

In [27]:
sentence_embedding = model.wv.get_sentence_vector(morphemes)

In [28]:
sentence_embedding

array([-0.16013725,  0.09005935,  0.00400428, -0.02762699, -0.11783233,
        0.07976735, -0.03836356,  0.0338936 ,  0.12256839,  0.04571179,
        0.0437787 ,  0.00462348,  0.16793722, -0.02188173,  0.05547931,
        0.06301126, -0.00247431, -0.02743533,  0.0066749 ,  0.00479955,
       -0.00394412, -0.01312572, -0.01293556,  0.047396  , -0.01472647,
       -0.05767235,  0.08567697,  0.04963464, -0.02022251, -0.04452683,
        0.0615636 , -0.03003763, -0.10802699, -0.06451722,  0.04826152,
       -0.00611135, -0.02877227,  0.15473019,  0.01656478,  0.05878403,
        0.06016641,  0.00974289, -0.00591775,  0.09609661, -0.01499486,
       -0.08712989,  0.01524194,  0.08867479, -0.06767762,  0.11617515,
        0.01487919, -0.02622127, -0.14212398,  0.0141244 , -0.10621322,
       -0.00182579,  0.03622719, -0.02664652, -0.00377709,  0.01688775,
       -0.08327223,  0.20424648,  0.03198184,  0.01031437,  0.01716161,
        0.05074051,  0.09522698,  0.0571514 , -0.09904211, -0.00

### Character N-grams in Gensim

In [33]:
model.wv.get_vector('한국종합주가지수')

array([-0.34042868,  0.14629196, -0.08776836, -0.24680457, -0.28148478,
        0.15057775, -0.04918689, -0.04219627, -0.16823529,  0.3390175 ,
        0.26483494, -0.088466  ,  0.2304144 , -0.10376953,  0.27699685,
       -0.02443741,  0.11224408, -0.24032237,  0.00901352,  0.13588668,
        0.169552  ,  0.0714559 , -0.03221908,  0.1070381 ,  0.0620268 ,
        0.01233821,  0.0925159 , -0.07816944, -0.14903975, -0.12694901,
        0.09238172, -0.20401032, -0.09378432, -0.17411736, -0.01845991,
       -0.27462536,  0.16057946,  0.4411669 ,  0.16157725, -0.02768847,
        0.28650057, -0.05532373, -0.05640784,  0.29480362, -0.19458798,
       -0.00232764,  0.30862767, -0.02296171, -0.07182577,  0.01512193,
       -0.1581458 ,  0.10353798, -0.09793068, -0.17010964, -0.50876594,
       -0.18943368,  0.08700848, -0.22082461, -0.22423744, -0.2701171 ,
       -0.19156918,  0.22655883, -0.03149575, -0.07704592, -0.1546666 ,
       -0.14191376,  0.11327878,  0.09035682, -0.05632119,  0.04

In [34]:
ngram_weights = model.wv.vectors_ngrams

In [35]:
ngram_weights

array([[ 0.00961216,  0.00857213, -0.00182109, ...,  0.00089249,
         0.00933384,  0.00446054],
       [-0.06039548,  0.25845215,  0.62752146, ...,  0.28242078,
        -0.18507813, -0.28522423],
       [-0.13343883, -0.07715016, -0.43124828, ..., -0.21491931,
         0.18529506,  0.11182162],
       ...,
       [ 0.06084336, -0.23052943, -0.2995869 , ...,  0.34839487,
        -0.3797674 , -0.07417264],
       [-0.09971589,  0.15364054, -0.30051127, ...,  0.11669045,
         0.03423443, -0.1164437 ],
       [-0.25392744,  0.40449083, -0.46832526, ..., -0.05358298,
         0.27670175,  0.00591091]], dtype=float32)

In [36]:
ngram_weights.shape

(2000000, 100)

In [37]:
'한국종합주가지수' in model.wv.key_to_index

False

In [38]:
from gensim.models.fasttext_inner import (  # noqa: F401
        compute_ngrams,
        compute_ngrams_bytes,
)

In [39]:
# List the character n-grams of the out-of-vocabulary word, for lengths
# model.wv.min_n through model.wv.max_n. As the output below shows, the word
# is wrapped in '<' and '>' boundary markers before being sliced.
ngrams = compute_ngrams(
    '한국종합주가지수', 
    model.wv.min_n, 
    model.wv.max_n,
)

In [40]:
ngrams

['<한국',
 '한국종',
 '국종합',
 '종합주',
 '합주가',
 '주가지',
 '가지수',
 '지수>',
 '<한국종',
 '한국종합',
 '국종합주',
 '종합주가',
 '합주가지',
 '주가지수',
 '가지수>',
 '<한국종합',
 '한국종합주',
 '국종합주가',
 '종합주가지',
 '합주가지수',
 '주가지수>',
 '<한국종합주',
 '한국종합주가',
 '국종합주가지',
 '종합주가지수',
 '합주가지수>']

In [41]:
len(ngrams)

26

- https://github.com/RaRe-Technologies/gensim/blob/fdf40eb309f5b188b8ef173cff80c7727bfa36ff/gensim/models/fasttext.py#L1326

In [42]:
# Hash every character n-gram of the word into one of `model.wv.bucket` slots.
# Each returned integer is used below as a row index into `ngram_weights`.
ngram_hashes = gensim.models.fasttext.ft_ngram_hashes(
    '한국종합주가지수', 
    model.wv.min_n, 
    model.wv.max_n, 
    model.wv.bucket
)

In [43]:
ngram_hashes

[1124397,
 910738,
 202451,
 1938434,
 808344,
 1504293,
 882540,
 834962,
 1012641,
 880192,
 359134,
 564985,
 1776378,
 276608,
 300123,
 1362093,
 498541,
 1169236,
 1189864,
 169826,
 1539173,
 873363,
 417623,
 462288,
 1126698,
 1207226]

In [44]:
len(ngram_hashes)

26

In [45]:
# Rebuild the out-of-vocabulary word vector by hand: accumulate the n-gram
# rows selected by the hashes. `result` holds the SUM; the next cell divides
# it by the number of n-grams to obtain the mean.
result = np.zeros(ngram_weights.shape[1], dtype=np.float32)
for ngram_hash in ngram_hashes:
    result += ngram_weights[ngram_hash]

In [52]:
result / len(ngram_hashes)

array([-0.34042868,  0.14629196, -0.08776836, -0.24680457, -0.28148478,
        0.15057775, -0.04918689, -0.04219627, -0.16823529,  0.3390175 ,
        0.26483494, -0.088466  ,  0.2304144 , -0.10376953,  0.27699685,
       -0.02443741,  0.11224408, -0.24032237,  0.00901352,  0.13588668,
        0.169552  ,  0.0714559 , -0.03221908,  0.1070381 ,  0.0620268 ,
        0.01233821,  0.0925159 , -0.07816944, -0.14903975, -0.12694901,
        0.09238172, -0.20401032, -0.09378432, -0.17411736, -0.01845991,
       -0.27462536,  0.16057946,  0.4411669 ,  0.16157725, -0.02768847,
        0.28650057, -0.05532373, -0.05640784,  0.29480362, -0.19458798,
       -0.00232764,  0.30862767, -0.02296171, -0.07182577,  0.01512193,
       -0.1581458 ,  0.10353798, -0.09793068, -0.17010964, -0.50876594,
       -0.18943368,  0.08700848, -0.22082461, -0.22423744, -0.2701171 ,
       -0.19156918,  0.22655883, -0.03149575, -0.07704592, -0.1546666 ,
       -0.14191376,  0.11327878,  0.09035682, -0.05632119,  0.04

In [50]:
model.wv.get_vector('한국종합주가지수')

array([-0.34042868,  0.14629196, -0.08776836, -0.24680457, -0.28148478,
        0.15057775, -0.04918689, -0.04219627, -0.16823529,  0.3390175 ,
        0.26483494, -0.088466  ,  0.2304144 , -0.10376953,  0.27699685,
       -0.02443741,  0.11224408, -0.24032237,  0.00901352,  0.13588668,
        0.169552  ,  0.0714559 , -0.03221908,  0.1070381 ,  0.0620268 ,
        0.01233821,  0.0925159 , -0.07816944, -0.14903975, -0.12694901,
        0.09238172, -0.20401032, -0.09378432, -0.17411736, -0.01845991,
       -0.27462536,  0.16057946,  0.4411669 ,  0.16157725, -0.02768847,
        0.28650057, -0.05532373, -0.05640784,  0.29480362, -0.19458798,
       -0.00232764,  0.30862767, -0.02296171, -0.07182577,  0.01512193,
       -0.1581458 ,  0.10353798, -0.09793068, -0.17010964, -0.50876594,
       -0.18943368,  0.08700848, -0.22082461, -0.22423744, -0.2701171 ,
       -0.19156918,  0.22655883, -0.03149575, -0.07704592, -0.1546666 ,
       -0.14191376,  0.11327878,  0.09035682, -0.05632119,  0.04

## PyTorch

In [28]:
# Wrap gensim's in-vocabulary word vectors (`model.wv.vectors`) in a torch
# Embedding layer so rows can be looked up by vocabulary index.
embedding = Embedding.from_pretrained(torch.FloatTensor(model.wv.vectors))

In [29]:
embedding

Embedding(358043, 100)

### Indices of the tokens

In [30]:
# Map each morpheme to its vocabulary index (its row in the embedding matrix).
# A morpheme that is not in the vocabulary raises KeyError here.
tokens = [model.wv.key_to_index[morpheme] for morpheme in morphemes]

In [31]:
tokens

[359, 83, 61033, 35, 5257, 39, 14, 13637, 7, 6661, 5975, 71]

In [32]:
# Look up one embedding row per token index (int64 indices, as Embedding requires).
embeddings = embedding(torch.tensor(tokens, dtype=torch.long))

In [33]:
embeddings

tensor([[-0.4039,  0.4243, -0.2641,  ..., -0.3232, -0.1522,  0.1071],
        [-0.3133,  0.2569, -0.2068,  ..., -0.1537,  0.4309,  0.0933],
        [-0.3159, -0.1389,  0.2980,  ..., -0.6312, -0.3703,  0.5739],
        ...,
        [-0.8329,  0.3622, -0.2046,  ..., -0.3303,  0.1547, -0.2323],
        [-0.9337, -0.0201, -0.1198,  ..., -0.1091,  0.1686,  0.5696],
        [-0.2876,  0.6111,  0.0328,  ..., -0.5097, -0.0914,  0.4112]])

- Recall that
> - Be aware `get_sentence_vector` normalizes each word vector before taking a mean
- That's why I normalize `embeddings` before taking the mean

In [34]:
# L2-normalize each token vector (row-wise), then average over the tokens.
normalize(embeddings, p=2.0, dim=1).mean(dim=0)

tensor([-0.1601,  0.0901,  0.0040, -0.0276, -0.1178,  0.0798, -0.0384,  0.0339,
         0.1226,  0.0457,  0.0438,  0.0046,  0.1679, -0.0219,  0.0555,  0.0630,
        -0.0025, -0.0274,  0.0067,  0.0048, -0.0039, -0.0131, -0.0129,  0.0474,
        -0.0147, -0.0577,  0.0857,  0.0496, -0.0202, -0.0445,  0.0616, -0.0300,
        -0.1080, -0.0645,  0.0483, -0.0061, -0.0288,  0.1547,  0.0166,  0.0588,
         0.0602,  0.0097, -0.0059,  0.0961, -0.0150, -0.0871,  0.0152,  0.0887,
        -0.0677,  0.1162,  0.0149, -0.0262, -0.1421,  0.0141, -0.1062, -0.0018,
         0.0362, -0.0266, -0.0038,  0.0169, -0.0833,  0.2042,  0.0320,  0.0103,
         0.0172,  0.0507,  0.0952,  0.0572, -0.0990, -0.0084, -0.0120,  0.0411,
         0.0362, -0.0298, -0.0245,  0.1409,  0.0134, -0.0293,  0.0620, -0.0176,
         0.0552,  0.0541,  0.0854,  0.0437, -0.0140,  0.0086,  0.0023, -0.0035,
         0.0326, -0.0993, -0.1320, -0.0617,  0.0263, -0.0177, -0.0263,  0.0392,
         0.0344, -0.1125,  0.0015,  0.06