In [134]:
from tokenizers import BertWordPieceTokenizer
from pathlib import Path
from gensim.models import Word2Vec

In [135]:
out_base = Path('./polished/models/v2bert/')

In [106]:
data_file = './no_en_data/ka_nse_train.txt'

In [107]:
wb_tokenizer = BertWordPieceTokenizer(str(out_base / 'wordpiece' / 'vocab.txt' ), 
                                      clean_text=True, handle_chinese_chars=True,
                                      strip_accents=True, lowercase=False)

Remember to remove special tokens during w2v training

In [108]:
wb_tokenizer.encode('შემომეჭამა').ids

[2, 6399, 1909, 2096, 3]

In [110]:
wb_tokenizer.decode(wb_tokenizer.encode('შემომეჭამა').ids[1:-1])

'შემომეჭამა'

In [111]:
epochs: int
def get_subword_sentences():
    for i in range(epochs):
        with open(data_file) as f:
            for line in f:
                yield wb_tokenizer.encode(line).tokens[1:-1]
        print(f'epoch #{i} done')

In [112]:
#sentences = [sent for sent in get_subword_sentences()]

In [113]:
model = Word2Vec(sg=1, 
                 vector_size=300,  
                 workers=12,
                 window=12, # subwords need more context
                 epochs=1, # epochs are controlled above so we can use effient generator
                 seed=1337)

In [114]:
epochs = 1 # 1 epoch for vocab training
model.build_vocab(get_subword_sentences())
total_sentences = sum(1 for _ in get_subword_sentences())

epoch #0 done
epoch #0 done


In [115]:
total_sentences

747262

In [116]:
epochs = 10 # n epoch for vocab training
model.train(get_subword_sentences(), 
            total_examples=total_sentences, 
            epochs=model.epochs
           )

epoch #0 done
epoch #1 done
epoch #2 done
epoch #3 done
epoch #4 done
epoch #5 done
epoch #6 done
epoch #7 done
epoch #8 done
epoch #9 done


(134444292, 158616630)

In [117]:
model.save("./polished/models/word2vec/subword.model")

In [130]:
model.wv.most_similar('##შვილი')

[('##იაშვილი', 0.7265536785125732),
 ('##აძე', 0.7166693210601807),
 ('##ძე', 0.6936086416244507),
 ('##იშვილი', 0.690089225769043),
 ('##აშვილი', 0.6892117857933044),
 ('##რაშვილი', 0.6885238885879517),
 ('წიკლაური', 0.6833595037460327),
 ('##უაშვილი', 0.6818857192993164),
 ('##უნაშვილი', 0.680444061756134),
 ('##ლიშვილი', 0.6801262497901917)]

In [131]:
model.wv.most_similar('##ები')

[('##ებიც', 0.6364842653274536),
 ('##ების', 0.49715572595596313),
 ('##ებისთვის', 0.48254188895225525),
 ('##ებისგან', 0.46793222427368164),
 ('##ურები', 0.42871180176734924),
 ('##ერები', 0.41608086228370667),
 ('##ებია', 0.4138261675834656),
 ('##კები', 0.41119346022605896),
 ('##ეები', 0.4096052944660187),
 ('ღირ', 0.4076472520828247)]

In [133]:
model.wv.most_similar('ღორ')

[('ძროხ', 0.8630732893943787),
 ('##ატარე', 0.8326977491378784),
 ('##ყრილი', 0.8293331861495972),
 ('ძაფ', 0.8266956210136414),
 ('მღვ', 0.8252092599868774),
 ('ქუდ', 0.8243218660354614),
 ('ღილ', 0.8230547308921814),
 ('გადაქ', 0.8226965069770813),
 ('##ალაგ', 0.8214482665061951),
 ('ლოყ', 0.819601833820343)]