In [1]:
from tokenizers import BertWordPieceTokenizer
from pathlib import Path
from gensim.models import Word2Vec

In [2]:
out_base = Path('./polished/models/v2bert/')

In [3]:
data_file = './no_en_data/'

In [107]:
wb_tokenizer = BertWordPieceTokenizer(str(out_base / 'wordpiece' / 'vocab.txt' ), 
                                      clean_text=True, handle_chinese_chars=True,
                                      strip_accents=True, lowercase=False)

Remember to remove special tokens during w2v training

In [108]:
wb_tokenizer.encode('შემომეჭამა').ids

[2, 6399, 1909, 2096, 3]

In [110]:
wb_tokenizer.decode(wb_tokenizer.encode('შემომეჭამა').ids[1:-1])

'შემომეჭამა'

In [4]:
from geotok import _basic_georgian_normalize

In [9]:
epochs = 1
def get_subword_sentences():
    for i in range(epochs):
        with open(data_file) as f:
            for line in f:
                yield _basic_georgian_normalize(line)
        print(f'epoch #{i} done')

In [10]:
model = Word2Vec(sg=1, 
                 vector_size=300,  
                 workers=12,
                 window=12, # subwords need more context
                 epochs=1, # epochs are controlled above so we can use effient generator
                 seed=1337)

In [11]:
epochs = 1 # 1 epoch for vocab training
model.build_vocab(get_subword_sentences())
total_sentences = sum(1 for _ in get_subword_sentences())

epoch #0 done
epoch #0 done


In [12]:
total_sentences

747262

In [13]:
epochs = 10 # n epoch for vocab training
model.train(get_subword_sentences(), 
            total_examples=total_sentences, 
            epochs=model.epochs
           )

epoch #0 done
epoch #1 done
epoch #2 done
epoch #3 done
epoch #4 done
epoch #5 done
epoch #6 done
epoch #7 done
epoch #8 done
epoch #9 done


(87335322, 116475660)

In [14]:
model.save("./polished/models/word2vec/fullword.model")

In [16]:
model.wv.most_similar('მეფე')

[('სოლომონ', 0.8507850170135498),
 ('ბაგრატ', 0.8443211913108826),
 ('პაპი', 0.8345552682876587),
 ('მოსე', 0.824222207069397),
 ('რომის', 0.8208573460578918),
 ('ერეკლე', 0.8186022639274597),
 ('მეფის', 0.8166657090187073),
 ('ეპისკოპოსი', 0.811444878578186),
 ('დედოფალი', 0.8107984066009521),
 ('მეფედ', 0.8099412322044373)]

In [20]:
model.wv.most_similar(positive=['სანდრო', 'ქალი'], negative=['კაცი'])

[('კოტე', 0.7473993301391602),
 ('სოფო', 0.7397592663764954),
 ('მომღერალი', 0.7386395335197449),
 ('მაკა', 0.7351353764533997),
 ('ელენე', 0.7272740006446838),
 ('ლადო', 0.7189862132072449),
 ('ლალი', 0.7176459431648254),
 ('მაია', 0.7154039144515991),
 ('ფოტოები', 0.7124801874160767),
 ('მანანა', 0.7120293974876404)]

In [18]:
model.wv.most_similar('ღორი')

[('პაწია', 0.9774032831192017),
 ('ჭუჭყიანი', 0.9709874987602234),
 ('თუთიყუში', 0.9700448513031006),
 ('პერანგი', 0.9688372015953064),
 ('ცხენები', 0.9684747457504272),
 ('პაწაწინა', 0.9683911800384521),
 ('თაროზე', 0.966964840888977),
 ('ხარი', 0.9656761288642883),
 ('თაგვი', 0.9652339220046997),
 ('იღიმება', 0.96466463804245)]