# Character N-grams

In [1]:
import fasttext
import numpy as np
import gensim
from mecab import MeCab
import torch
from torch.nn import Embedding, EmbeddingBag
from torch.nn.functional import normalize
from torch.autograd import Variable

In [2]:
# Location of the pretrained fastText binary model.
model_path = '../baseline/tmp/fasttext.bin'
# Word whose embedding is reconstructed throughout this notebook.
query = '한국종합주가지수'

## fastText

In [3]:
# Load the fastText model stored at `model_path`.
model = fasttext.load_model(model_path)



In [4]:
# Membership test of `query` against the model's word list.
query in model.words

False

### Checking if `query` is Out-Of-Vocabulary (OOV)

In [5]:
# Ask the model for the word id of `query`; the output below is -1.
model.get_word_id(query)

-1

### Getting a copy of the full input matrix of a Model

In [6]:
# Fetch the model's input matrix as a NumPy array.
embeddings = model.get_input_matrix()

In [7]:
# Display the matrix (NumPy abbreviates the repr of large arrays).
embeddings

array([[-0.15613616, -0.00356458, -0.12930149, ..., -0.10728721,
        -0.04252381,  0.18029688],
       [ 0.25154757, -0.15255873,  0.1887694 , ..., -0.26557404,
         0.0757494 , -0.19356793],
       [-0.05408973,  0.14642444, -0.08972213, ..., -0.22458382,
         0.08341924,  0.19547725],
       ...,
       [ 0.06084336, -0.23052943, -0.2995869 , ...,  0.34839487,
        -0.3797674 , -0.07417264],
       [-0.09971589,  0.15364054, -0.30051127, ...,  0.11669045,
         0.03423443, -0.1164437 ],
       [-0.25392744,  0.40449083, -0.46832526, ..., -0.05358298,
         0.27670175,  0.00591091]], dtype=float32)

In [8]:
# Shape of the input matrix as (rows, columns).
embeddings.shape

(2358043, 100)

### Getting subwords and their indices given `query`

In [9]:
# Returns a pair: subwords[0] holds the subword strings and subwords[1]
# their integer indices (see the displayed value in the next cell).
subwords = model.get_subwords(query)

In [10]:
# Display the (subword strings, subword indices) pair.
subwords

(['<한국',
  '<한국종',
  '<한국종합',
  '<한국종합주',
  '한국종',
  '한국종합',
  '한국종합주',
  '한국종합주가',
  '국종합',
  '국종합주',
  '국종합주가',
  '국종합주가지',
  '종합주',
  '종합주가',
  '종합주가지',
  '종합주가지수',
  '합주가',
  '합주가지',
  '합주가지수',
  '합주가지수>',
  '주가지',
  '주가지수',
  '주가지수>',
  '가지수',
  '가지수>',
  '지수>'],
 array([1482440, 1268781,  560494, 2296477, 1166387, 1862336, 1240583,
        1193005, 1370684, 1238235,  717177,  923028, 2134421,  634651,
         658166, 1720136,  856584, 1527279, 1547907,  527869, 1897216,
        1231406,  775666,  820331, 1484741, 1565269]))

### Calculating an embedding given `query` ourselves

In [11]:
# Reconstruct the word vector by hand: select the rows of the input matrix
# addressed by the subword indices (subwords[1]) and average them.
# A single fancy-index + mean replaces the row-by-row Python accumulation;
# the result keeps the float32 dtype of `embeddings`.
result = embeddings[subwords[1]].mean(axis=0)

### Verification

In [12]:
# The hand-computed vector is compared with fastText's own vector for `query`.
np.allclose(result, model.get_word_vector(query))

True

## PyTorch

### `Embedding`

In [13]:
# Reuse the `embeddings` array fetched earlier rather than calling
# model.get_input_matrix() again and copying it into a new FloatTensor:
# torch.from_numpy wraps the existing float32 buffer without duplicating
# the (2358043, 100) matrix. from_pretrained freezes the weights by default,
# so the shared buffer is not updated by training.
torch_embedding = Embedding.from_pretrained(torch.from_numpy(embeddings))

In [14]:
# Display the module repr: Embedding(num_embeddings, embedding_dim).
torch_embedding

Embedding(2358043, 100)

In [15]:
# Look up one vector per subword index, then average across the subwords
# (dimension 0) to obtain a single vector for the word.
torch_result = torch.mean(
    torch_embedding(torch.LongTensor(subwords[1])),
    dim=0,
)

In [16]:
# The PyTorch Embedding result is compared with the hand-computed vector.
np.allclose(result, torch_result.numpy())

True

### `EmbeddingBag`

In [17]:
# https://github.com/facebookresearch/fastText/blob/a20c0d27cd0ee88a25ea0433b7f03038cd728459/python/doc/examples/FastTextEmbeddingBag.py#L27


class FastTextEmbeddingBag(EmbeddingBag):
    """``EmbeddingBag`` whose weights are copied from a fastText input matrix.

    Calling the module with a sequence of words looks up each word's subword
    indices through the wrapped fastText model and lets ``EmbeddingBag`` pool
    the corresponding rows into one vector per word. The pooling mode is the
    ``EmbeddingBag`` default because no mode is passed to the parent
    constructor.
    """

    def __init__(self, model_path):
        """Load a fastText model and copy its input matrix into ``weight``.

        Args:
            model_path: Path handed to ``fasttext.load_model``.
        """
        # Keep the model in a local until nn.Module has been initialised, so
        # no attribute is assigned on a half-constructed module.
        model = fasttext.load_model(model_path)
        input_matrix = model.get_input_matrix()
        num_embeddings, embedding_dim = input_matrix.shape
        super().__init__(num_embeddings, embedding_dim)
        self.model = model
        # In-place copy without autograd tracking (replaces the `.data` access).
        with torch.no_grad():
            self.weight.copy_(torch.as_tensor(input_matrix))

    def forward(self, words):
        """Embed every word in ``words``.

        Args:
            words: Iterable of strings; each is passed to the fastText
                model's ``get_subwords``.

        Returns:
            The tensor produced by ``EmbeddingBag.forward`` for the flattened
            subword indices, with one bag (row) per input word.
        """
        index_chunks = []
        bag_offsets = []
        cursor = 0
        for word in words:
            _, subinds = self.model.get_subwords(word)
            index_chunks.append(subinds)
            # Each bag starts where the previous one ended.
            bag_offsets.append(cursor)
            cursor += len(subinds)

        # Concatenate once at the end; np.concatenate rejects an empty list,
        # so an empty `words` falls back to an empty int64 index array.
        if index_chunks:
            flat_indices = np.concatenate(index_chunks)
        else:
            flat_indices = np.empty(0, dtype=np.int64)

        # Plain tensors replace the deprecated Variable wrapper; they are
        # created on the weight's device so the module also works after .to().
        device = self.weight.device
        ind = torch.as_tensor(flat_indices, dtype=torch.long, device=device)
        offsets = torch.as_tensor(bag_offsets, dtype=torch.long, device=device)
        return super().forward(ind, offsets)

In [18]:
# Build the module; its constructor loads the fastText model from
# `model_path` itself, independently of the `model` object created earlier.
torch_embeddingbag = FastTextEmbeddingBag(model_path)



In [19]:
# Display the module repr, which includes the pooling mode.
torch_embeddingbag

FastTextEmbeddingBag(2358043, 100, mode=mean)

In [20]:
# Embed a one-word batch, take its only row, detach it from autograd and
# compare it with fastText's own vector for `query`.
np.allclose(torch_embeddingbag([query])[0].detach().numpy(), model.get_word_vector(query))

True