In [None]:
# REF https://frhyme.github.io/python-libs/gensim2_fasttext/#when-to-use-fasttext

In [4]:
import pandas as pd
import numpy as np
import ast
import string
from tqdm import tqdm

from sklearn.pipeline import Pipeline
from sklearn.linear_model import *
from sklearn.model_selection import *
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.naive_bayes import *
from sklearn.metrics import *

from gensim.models import *

In [3]:
sentences = ["I am a boy", "you are a girl"]*10
sentences = [s.lower().strip().split(" ") for s in sentences]

In [7]:
# Word2vec Model.
WVmodel = Word2Vec(min_count=1, vector_size=50)
WVmodel.build_vocab(sentences)
WVmodel.train(sentences, total_examples=len(sentences), epochs=300)

(2156, 24000)

In [12]:
# FastTest Model.
FTmodel = FastText(min_count=1, vector_size=50)  # size => vector_size
FTmodel.build_vocab(sentences)
FTmodel.train(sentences, total_examples=len(sentences), epochs=300)

(2156, 24000)

In [14]:
# WVmodel, FTmodel 모두 같은 corpus로부터 생성되었기 때문에 vocabulary는 같습니다.
assert set(WVmodel.wv.index_to_key) == set(FTmodel.wv.index_to_key)

#### out of vocab

In [17]:
# Infer Vector for Out-Of-Vocabulary
# 현재 vocab에서 가장 유사하다고 생각되어지는 word와 그 유사도. 
out_of_vocabs = ['boys', 'girls', 'your', 'bare']
for oov in out_of_vocabs:
    print(f"Does WVmodel have <{oov}> in vocab?: {oov in WVmodel.wv.key_to_index}")
    try: # Word2vec의 경우 vocab에 없는 word의 경우 vector로 표현해주지 못함.
        print(f"vector of <{oov}> => {WVmodel.wv[oov]}")
    except Exception as e:
        print(f"- Exception: {e}")
#     print("--"*20)        
    print(f"Does FTmodel have <{oov}> in vocab?: {oov in FTmodel.wv.key_to_index }")
    print(f"- vector of <{oov}> => {FTmodel.wv[oov][:2]}")
    most_similar_vocab = [f"{w}: {sim:.2f}" for w, sim in FTmodel.wv.most_similar(positive=[oov])]
    print(f"- {most_similar_vocab}")
#     print("=="*20)  # out of vocab에 대해서 word2vec, FastText의 결과가 다름을 보임.

Does WVmodel have <boys> in vocab?: False
- Exception: "Key 'boys' not present"
Does FTmodel have <boys> in vocab?: False
- vector of <boys> => [0.00119801 0.00336338]
- ['boy: 0.62', 'girl: 0.49', 'are: 0.40', 'a: 0.39', 'i: 0.30', 'am: 0.28', 'you: 0.28']
Does WVmodel have <girls> in vocab?: False
- Exception: "Key 'girls' not present"
Does FTmodel have <girls> in vocab?: False
- vector of <girls> => [-0.00148499  0.00092695]
- ['girl: 0.71', 'i: 0.53', 'are: 0.50', 'a: 0.47', 'you: 0.41', 'boy: 0.34', 'am: 0.34']
Does WVmodel have <your> in vocab?: False
- Exception: "Key 'your' not present"
Does FTmodel have <your> in vocab?: False
- vector of <your> => [-0.00329254 -0.00294157]
- ['girl: 0.48', 'you: 0.44', 'i: 0.27', 'am: 0.27', 'a: 0.23', 'boy: 0.22', 'are: 0.13']
Does WVmodel have <bare> in vocab?: False
- Exception: "Key 'bare' not present"
Does FTmodel have <bare> in vocab?: False
- vector of <bare> => [ 0.00242721 -0.00344649]
- ['boy: 0.33', 'girl: 0.32', 'a: 0.30', 'am: 0.

In [19]:
out_of_vocabs = ['hi']
for oov in out_of_vocabs:
    print(f"Does FTmodel have <{oov}> in vocab?: {oov in FTmodel.wv.key_to_index}")
    print(f"- vector of <{oov}> => {FTmodel.wv[oov][:2]}")
    most_similar_vocab = [f"{w}: {sim:.2f}" for w, sim in FTmodel.wv.most_similar(positive=[oov])]
    print(f"- {most_similar_vocab}")
#     print("=="*20)

Does FTmodel have <hi> in vocab?: False
- vector of <hi> => [ 0.00193439 -0.00188542]
- ['are: 0.08', 'a: 0.05', 'boy: 0.01', 'i: 0.01', 'am: -0.01', 'girl: -0.06', 'you: -0.06']


#### in vocab

In [20]:
sentences = ["I am a boy", "you are a girl"]*100 
sentences.append("we are boys")  # "boys" 빈도 +1 = rare word, almost oov
sentences = [s.lower().strip().split(" ") for s in sentences]

In [22]:
# Word2vec Model.
WVmodel = Word2Vec(min_count=1, vector_size=50)
WVmodel.build_vocab(sentences)
WVmodel.train(sentences, total_examples=len(sentences), epochs=30)

(2183, 24090)

In [25]:
FTmodel = FastText(min_count=1, vector_size=50)
FTmodel.build_vocab(sentences)
FTmodel.train(sentences, total_examples=len(sentences), epochs=30)

(2183, 24090)

In [26]:
assert set(WVmodel.wv.index_to_key) == set(FTmodel.wv.index_to_key)

In [27]:
print(f"Word2vec similarity: {WVmodel.wv.similarity('boy', 'boys'):.4f}")
print(f"FastText similarity: {FTmodel.wv.similarity('boy', 'boys'):.4f}")

Word2vec similarity: 0.4560
FastText similarity: 0.8681


In [None]:
# REF https://radimrehurek.com/gensim/auto_examples/tutorials/run_fasttext.html#sphx-glr-auto-examples-tutorials-run-fasttext-py

In [1]:
from pprint import pprint as print
from gensim.models.fasttext import FastText
from gensim.test.utils import datapath

In [2]:
# Set file names for train and test data
corpus_file = datapath('lee_background.cor')

model = FastText(vector_size=100)

# build the vocabulary
model.build_vocab(corpus_file=corpus_file)

# train the model
model.train(
    corpus_file=corpus_file, epochs=model.epochs,
    total_examples=model.corpus_count, total_words=model.corpus_total_words,
)

print(model)

<gensim.models.fasttext.FastText object at 0x000001CEDB60BD48>


In [12]:
print(corpus_file)

'D:\\Anaconda3\\envs\\sd_env\\lib\\site-packages\\gensim\\test\\test_data\\lee_background.cor'


In [3]:
wv = model.wv
print(wv)

#
# FastText models support vector lookups for out-of-vocabulary words by summing up character ngrams belonging to the word.
#
print('night' in wv.key_to_index)

<gensim.models.fasttext.FastTextKeyedVectors object at 0x000001CEDB60BF48>
True


In [13]:
wv.key_to_index

{'the': 0,
 'to': 1,
 'of': 2,
 'in': 3,
 'and': 4,
 'a': 5,
 'is': 6,
 'for': 7,
 'The': 8,
 'on': 9,
 'he': 10,
 'has': 11,
 'says': 12,
 'was': 13,
 'have': 14,
 'that': 15,
 'be': 16,
 'are': 17,
 'will': 18,
 'with': 19,
 'Mr': 20,
 'said.': 21,
 'at': 22,
 'from': 23,
 'by': 24,
 'been': 25,
 'not': 26,
 'as': 27,
 'his': 28,
 'an': 29,
 'it': 30,
 'were': 31,
 'had': 32,
 'after': 33,
 'but': 34,
 'they': 35,
 'said': 36,
 'who': 37,
 'this': 38,
 'Australian': 39,
 'we': 40,
 'Palestinian': 41,
 'their': 42,
 'which': 43,
 'people': 44,
 'two': 45,
 'there': 46,
 'up': 47,
 'about': 48,
 'also': 49,
 'its': 50,
 'out': 51,
 'South': 52,
 'into': 53,
 'would': 54,
 'US': 55,
 'when': 56,
 'against': 57,
 'first': 58,
 'New': 59,
 'more': 60,
 'last': 61,
 'I': 62,
 'He': 63,
 'A': 64,
 'Israeli': 65,
 'Australia': 66,
 'if': 67,
 'one': 68,
 'over': 69,
 'United': 70,
 'Government': 71,
 'or': 72,
 'than': 73,
 'no': 74,
 'all': 75,
 'could': 76,
 'three': 77,
 'before': 78,
 't

In [4]:
print('nights' in wv.key_to_index)

False


In [5]:
print(wv['night'])

array([-0.15475433,  0.1252425 , -0.27081397, -0.10889962,  0.04203569,
        0.37108007,  0.27523392,  0.52786773,  0.28581938, -0.1986254 ,
        0.03456262, -0.14185578, -0.21298781,  0.5713253 , -0.3888784 ,
       -0.5917308 ,  0.18881075, -0.2336333 , -0.49938384, -0.567006  ,
       -0.49670756, -0.01644066, -0.49384695, -0.10753794, -0.12664177,
       -0.3076506 , -0.666508  , -0.09729929, -0.3132778 ,  0.27637705,
       -0.3458514 ,  0.31331167,  0.80769545, -0.2884593 ,  0.18809943,
        0.34015375,  0.3516266 , -0.12963241, -0.42162904, -0.31708774,
        0.44845504, -0.44526386,  0.03168629, -0.39517945, -0.5213216 ,
       -0.34871668, -0.08258637,  0.17523675,  0.36536118,  0.0229767 ,
        0.35191676, -0.50156254,  0.30075848, -0.38963047, -0.17609014,
       -0.20503609, -0.19204898, -0.20033792,  0.06501315, -0.32075122,
       -0.32703844, -0.3937584 , -0.15649399,  0.31981346, -0.07401659,
        0.6701349 ,  0.03683931,  0.07132513,  0.47971678,  0.25

In [6]:
print(wv['nights'])

array([-0.134322  ,  0.10915444, -0.23417717, -0.09382299,  0.03511144,
        0.31954968,  0.23910838,  0.45785618,  0.24755444, -0.17339875,
        0.03164794, -0.12100703, -0.18521912,  0.491595  , -0.33745897,
       -0.5117655 ,  0.1624686 , -0.2013433 , -0.430313  , -0.49060196,
       -0.42613953, -0.01538588, -0.4266895 , -0.09435295, -0.10810266,
       -0.26445672, -0.5746753 , -0.08173843, -0.27051112,  0.24022208,
       -0.29692358,  0.27018812,  0.6961611 , -0.24896628,  0.16269405,
        0.2934013 ,  0.30509531, -0.11197305, -0.36439785, -0.27455863,
        0.38682416, -0.38366386,  0.0269192 , -0.34092772, -0.45134073,
       -0.30012524, -0.06844437,  0.15201978,  0.31714198,  0.02100454,
        0.30557156, -0.43357727,  0.2605747 , -0.33645216, -0.15178265,
       -0.1759117 , -0.1676968 , -0.1712804 ,  0.05748541, -0.27453104,
       -0.2814604 , -0.34052545, -0.13479911,  0.27569476, -0.06336423,
        0.5800043 ,  0.0320218 ,  0.05910176,  0.41412053,  0.22

In [9]:
print(wv.similarity("night", "nights"))

0.999992


In [10]:
print(wv.most_similar("nights"))

[('night', 0.9999919533729553),
 ('rights', 0.9999876022338867),
 ('flights', 0.9999876022338867),
 ('overnight', 0.999987006187439),
 ('fighting', 0.9999858140945435),
 ('entered', 0.9999855160713196),
 ('fight', 0.9999850988388062),
 ('fighters', 0.9999850988388062),
 ('fighter', 0.9999844431877136),
 ('treated', 0.9999844431877136)]
