# Utils Function 

In [1]:
def get_sentence(file, sent_segment, tokenizer):
    f = open(file, mode='r', encoding='utf8')
    data = f.read()
    f.close()
    # remove @highlight in data
    r = re.search('@highlight', data)
    if r is not None:
        data = data[:r.start(0)]

    # print(data)
    sentences = sent_segment(data)

    if tokenizer is not None:
        for sent in sentences:
            if sent != '':
                yield tokenizer(sent)
    else:
        for sent in sentences:
            if sent != '':
                yield '<s> ' + sent + ' </s>'

class LineIter:
    def __init__(self, link: list, sent_segment=None, tokenizer=None):
        self.link = link
        self.sent_segment = sent_segment
        self.tokenizer = tokenizer

    def __iter__(self):
        for file in self.link:
            for line in get_sentence(file, self.sent_segment, self.tokenizer):
                yield line.split()

In [2]:
import logging
logging.basicConfig(
    level=logging.INFO # allow INFO level messages to pass through the logger
    )

# CNN DailyMail for Word2vec


## Preparing Data

In [None]:
!git clone https://github.com/abisee/cnn-dailymail.git

Cloning into 'cnn-dailymail'...
remote: Enumerating objects: 61, done.[K
remote: Total 61 (delta 0), reused 0 (delta 0), pack-reused 61[K
Unpacking objects: 100% (61/61), done.


In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
%cd 'cnn-dailymail'
# !unzip  'finished_files.zip'

/content/cnn-dailymail


In [None]:
!cp '../gdrive/MyDrive/DeepLearning/MT/cnn_stories.tgz' './'
!cp '../gdrive/MyDrive/DeepLearning/MT/dailymail_stories.tgz' './'

In [None]:
!tar -xzf cnn_stories.tgz
!tar -xzf dailymail_stories.tgz

In [None]:
from os import listdir
from os.path import isfile, join
cnn_path = 'cnn/stories'
dm_path = 'dailymail/stories'
cnn = [join(cnn_path, f) for f in listdir(cnn_path) if isfile(join(cnn_path, f))]
dm = [join(dm_path, f) for f in listdir(dm_path) if isfile(join(dm_path, f))]
link = cnn + dm

In [None]:
from tqdm import tqdm
import re

In [None]:
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
def token(tokentool=None):
    print(tokentool('The allegations were first revealed in a story in Rolling Stone magazine that was initially widely circulated and then increasingly widely doubted... It is not credible, Goldberg wrote in the Los Angeles Times'))
token(sent_tokenize)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
['The allegations were first revealed in a story in Rolling Stone magazine that was initially widely circulated and then increasingly widely doubted...', 'It is not credible, Goldberg wrote in the Los Angeles Times']


In [None]:
!git clone https://github.com/TanHM-1211/Machine-Translation.git
%cd Machine-Translation

Cloning into 'Machine-Translation'...
remote: Enumerating objects: 57, done.[K
remote: Counting objects: 100% (57/57), done.[K
remote: Compressing objects: 100% (35/35), done.[K
remote: Total 57 (delta 19), reused 52 (delta 14), pack-reused 0[K
Unpacking objects: 100% (57/57), done.
/content/cnn-dailymail/Machine-Translation


In [None]:
from bpe.BPE_EN import BPE_EN
bpe = BPE_EN(padding=False)

In [None]:
link = ['../' + i for i in link]

## Train space word2vec

In [None]:
from gensim.models import word2vec
from time import time

INFO:summarizer.preprocessing.cleaner:'pattern' package not found; tag filters are not available for English


In [None]:
N = 10
EMBEDDING_SIZE = 128
VOCAB_SIZE = 60000
EPOCHS = 10
DIR = '../../gdrive/MyDrive/DeepLearning/MT/space_en/word2vec.kv'

In [None]:
lineIter = LineIter(link, sent_segment=sent_tokenize)

In [None]:
link[:2]

['../cnn/stories/962e0924971cea4b310d2c83e87df917b22c706c.story',
 '../cnn/stories/b8489841b83ca2e1f8396bb85b9e4533b9328055.story']

In [None]:
for i in lineIter:
    print(i)
    break

['<s>', '(CNN)', '--', 'A', 'jet', 'crashed', 'Sunday', 'near', 'Freeport,', 'a', 'city', 'on', 'the', 'island', 'of', 'Grand', 'Bahama,', 'killing', 'all', 'nine', 'people', 'on', 'board,', 'authorities', 'said.', '</s>']


In [None]:
model = word2vec.Word2Vec(size=EMBEDDING_SIZE, window=10, iter=EPOCHS,
                               min_count=N, compute_loss=True, seed=22)

In [None]:
# build vocab
s1 = time()
model.build_vocab(sentences=lineIter, progress_per=1000000)
s2 = time()
print(f'Vocab was built in {s2 - s1}')

INFO:gensim.models.word2vec:collecting all words and their counts
INFO:gensim.models.word2vec:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #1000000, processed 22070986 words, keeping 497695 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #2000000, processed 44128369 words, keeping 723535 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #3000000, processed 66240292 words, keeping 897312 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #4000000, processed 93926542 words, keeping 1243052 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #5000000, processed 121684438 words, keeping 1481860 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #6000000, processed 149443702 words, keeping 1682222 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #7000000, processed 177236811 words, keeping 1860612 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence

Vocab was built in 557.7005567550659


In [None]:
model.train(lineIter, total_examples=model.corpus_count, epochs=model.iter, report_delay=300)

  """Entry point for launching an IPython kernel.
INFO:gensim.models.base_any2vec:training model with 3 workers on 323912 vocabulary and 128 features, using sg=0 hs=0 sample=0.001 negative=5 window=10
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 0.13% examples, 189520 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 37.29% examples, 186374 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 69.10% examples, 192576 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 2 more threads
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 1 more threads
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 0 more threads
INFO:gensim.models.base_any2vec:EPOCH - 1 : training on 230124973 raw words (173748517 effective words) took 892.3s, 194712 effective words/s
INFO:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: 

(1737444606, 2301249730)

In [None]:
print(model.n_similarity('football', 'stadium'))
print(model.n_similarity('football', 'music'))
print(model.n_similarity('cat', 'dog'))
model.get_latest_training_loss()

0.74665844
0.49071336
0.14459415


  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):
  
  This is separate from the ipykernel package so we can avoid doing imports until


0.0

In [None]:
model.wv.save(DIR)

INFO:gensim.utils:saving Word2VecKeyedVectors object under ../../gdrive/MyDrive/DeepLearning/MT/space_en/word2vec.kv, separately None
INFO:gensim.utils:storing np array 'vectors' to ../../gdrive/MyDrive/DeepLearning/MT/space_en/word2vec.kv.vectors.npy
INFO:gensim.utils:not storing attribute vectors_norm
INFO:gensim.utils:saved ../../gdrive/MyDrive/DeepLearning/MT/space_en/word2vec.kv


In [None]:
model.wv['football']

array([-9.7590822e-01,  3.8036153e+00,  6.9648600e-01,  3.7148672e-01,
        2.2378893e+00, -1.4140592e+00, -2.8256812e+00, -4.0949845e+00,
       -5.6873918e-01,  2.2923908e+00, -5.3688502e-01,  3.0435446e-01,
       -1.1201771e+00, -2.4728308e+00, -5.2242222e+00, -9.8943681e-01,
       -2.1075423e+00, -1.3357676e+00,  1.5095940e-01, -2.5818129e+00,
       -7.7626598e-01, -1.4187355e+00,  1.2761979e+00,  1.0963104e+00,
        1.2149917e+00,  7.8583080e-01, -4.1722913e+00, -1.9624556e+00,
        2.5844502e+00,  9.5476890e-01,  1.5299875e+00,  1.0961572e+00,
       -1.6290165e+00,  3.7860224e+00, -2.9448275e-03, -1.9659894e+00,
       -3.0364484e-01, -2.1589589e+00,  1.5922133e+00, -2.4424024e-01,
        1.7634314e+00, -1.6875887e+00, -6.3435917e+00,  4.4455889e-01,
       -5.7625151e-01,  1.1510919e+00,  3.2200816e-01, -1.5640926e+00,
       -6.6915178e-01,  2.9786596e+00, -3.8710158e+00, -7.5155044e-01,
       -2.3971994e+00, -4.1999406e-01, -2.3999141e-01, -2.1911139e+00,
      

## Train bpe word2vec

In [None]:
import json
from gensim.models import word2vec
from time import time

In [None]:
vocab = json.load(open('./bpe/resources/vocab_en.json', 'r', encoding='utf8'))
vocab = {k: 2 for k, v in vocab.items()}

In [None]:
N = 2
EMBEDDING_SIZE = 128
VOCAB_SIZE = len(vocab)
EPOCHS = 10
DIR = '../../gdrive/MyDrive/DeepLearning/MT/bpe_en/word2vec.kv'

In [None]:
!ls

bpe  bpe_test.py  embedding  model  model_test.py


In [None]:
lineIter = LineIter(link, sent_segment=sent_tokenize, tokenizer=bpe.tokenizer)

In [None]:
for i in lineIter:
    print(i)
    break

['<s>', '(', 'CNN', ')', 'Ġ--', 'ĠA', 'Ġjet', 'Ġcrashed', 'ĠSunday', 'Ġnear', 'ĠFree', 'port', ',', 'Ġa', 'Ġcity', 'Ġon', 'Ġthe', 'Ġisland', 'Ġof', 'ĠGrand', 'ĠBah', 'ama', ',', 'Ġkilling', 'Ġall', 'Ġnine', 'Ġpeople', 'Ġon', 'Ġboard', ',', 'Ġauthorities', 'Ġsaid', '.', '</s>']


In [None]:
model = word2vec.Word2Vec(size=EMBEDDING_SIZE, window=10, iter=EPOCHS,
                               min_count=N, compute_loss=True, seed=22)
s1 = time()
model.build_vocab_from_freq(word_freq=vocab)
model.build_vocab(sentences=lineIter, progress_per=1000000, update=True)
s2 = time()

INFO:gensim.models.base_any2vec:Processing provided word frequencies
INFO:gensim.models.base_any2vec:collected 50265 different raw word, with total frequency of 100530
INFO:gensim.models.word2vec:Loading a fresh vocabulary
INFO:gensim.models.word2vec:effective_min_count=2 retains 50265 unique words (100% of original 50265, drops 0)
INFO:gensim.models.word2vec:effective_min_count=2 leaves 100530 word corpus (100% of original 100530, drops 0)
INFO:gensim.models.word2vec:deleting the raw counts dictionary of 50265 items
INFO:gensim.models.word2vec:sample=0.001 downsamples 0 most-common words
INFO:gensim.models.word2vec:downsampling leaves estimated 100530 word corpus (100.0% of prior 100530)
INFO:gensim.models.base_any2vec:estimated required memory for 50265 words and 128 dimensions: 76603860 bytes
INFO:gensim.models.word2vec:resetting layer weights
INFO:gensim.models.word2vec:collecting all words and their counts
INFO:gensim.models.word2vec:PROGRESS: at sentence #0, processed 0 words, ke

In [None]:
len(model.wv.vocab)

50265

In [None]:
# print(f'Vocab was built in {s2 - s1}')
model.train(lineIter, total_examples=model.corpus_count, epochs=model.iter, report_delay=300)

  
INFO:gensim.models.base_any2vec:training model with 3 workers on 50265 vocabulary and 128 features, using sg=0 hs=0 sample=0.001 negative=5 window=10
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 0.09% examples, 154479 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 26.21% examples, 158485 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 48.89% examples, 160727 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 70.04% examples, 161786 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 91.33% examples, 162669 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 2 more threads
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 1 more threads
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 0 more threads
INFO:gensim.models

IsADirectoryError: ignored

In [None]:
print(model.wv.n_similarity('Ġfootball', 'Ġstadium'))
print(model.wv.n_similarity('Ġfootball', 'Ġmusic'))
print(model.wv.n_similarity('Ġcat', 'Ġcar'))
model.get_latest_training_loss()

0.8612573
0.7877873
0.9201586


  if np.issubdtype(vec.dtype, np.int):


0.0

In [None]:
model.wv.vocab

{'<s>': <gensim.models.keyedvectors.Vocab at 0x7f59ca7347f0>,
 '<pad>': <gensim.models.keyedvectors.Vocab at 0x7f59ca734860>,
 '</s>': <gensim.models.keyedvectors.Vocab at 0x7f59ca734828>,
 '<unk>': <gensim.models.keyedvectors.Vocab at 0x7f59ca744390>,
 '.': <gensim.models.keyedvectors.Vocab at 0x7f59ca744128>,
 'Ġthe': <gensim.models.keyedvectors.Vocab at 0x7f59ca7442e8>,
 ',': <gensim.models.keyedvectors.Vocab at 0x7f59ca744c18>,
 'Ġto': <gensim.models.keyedvectors.Vocab at 0x7f59ca7449e8>,
 'Ġand': <gensim.models.keyedvectors.Vocab at 0x7f59ca744240>,
 'Ġof': <gensim.models.keyedvectors.Vocab at 0x7f59ca744f28>,
 'Ġa': <gensim.models.keyedvectors.Vocab at 0x7f59ca744908>,
 'Ġin': <gensim.models.keyedvectors.Vocab at 0x7f59ca744320>,
 '-': <gensim.models.keyedvectors.Vocab at 0x7f59ca7445c0>,
 'Ġfor': <gensim.models.keyedvectors.Vocab at 0x7f59ca744780>,
 'Ġthat': <gensim.models.keyedvectors.Vocab at 0x7f59ca7447f0>,
 'Ġon': <gensim.models.keyedvectors.Vocab at 0x7f59ca7448d0>,
 'Ġis

In [None]:
model.wv.save(DIR)

INFO:gensim.utils:saving Word2VecKeyedVectors object under ../../gdrive/MyDrive/DeepLearning/MT/bpe_en/word2vec.kv, separately None
INFO:gensim.utils:not storing attribute vectors_norm
INFO:gensim.utils:saved ../../gdrive/MyDrive/DeepLearning/MT/bpe_en/word2vec.kv


In [None]:
from gensim.models import KeyedVectors
tmp = KeyedVectors.load(DIR, mmap='r')

INFO:gensim.utils:loading Word2VecKeyedVectors object from ../../gdrive/MyDrive/DeepLearning/MT/bpe_en/word2vec.kv
INFO:gensim.utils:setting ignored attribute vectors_norm to None
INFO:gensim.utils:loaded ../../gdrive/MyDrive/DeepLearning/MT/bpe_en/word2vec.kv


gensim.models.keyedvectors.Word2VecKeyedVectors

# VLSP Vietnamese 3Gb Text

## Preparing Data

In [3]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [4]:
from tqdm import tqdm
import re

In [5]:
!git clone https://github.com/TanHM-1211/Machine-Translation.git
%cd Machine-Translation

Cloning into 'Machine-Translation'...
remote: Enumerating objects: 57, done.[K
remote: Counting objects: 100% (57/57), done.[K
remote: Compressing objects: 100% (35/35), done.[K
remote: Total 57 (delta 19), reused 52 (delta 14), pack-reused 0[K
Unpacking objects: 100% (57/57), done.
/content/Machine-Translation


In [6]:
!cp '../gdrive/MyDrive/DeepLearning/MT/corpus.tokened.2M.shuf.txt.zip' './'
!mkdir vi_data
!unzip corpus.tokened.2M.shuf.txt.zip -d vi_data

Archive:  corpus.tokened.2M.shuf.txt.zip
  inflating: vi_data/corpus.tokened.2M.shuf.txt  


In [7]:
# f_w = open('corpus.tokened.2M.shuf.txt', 'w', encoding='utf8')
# for file in tqdm(link):
#     f = open(file, 'r', encoding='utf8')
#     a = f.read().strip().split('\n')
#     for i in a:
#         s = ''
#         tmp = segmentNLP.tokenize(i)
#         for j in tmp:
#             s += ' '.join(j) + ' '
#         f_w.write(s + '\n')
#         # f_w.write(' '.join(tmp) + '\n')
#     f.close()
# f_w.close()
!ls

bpe	     corpus.tokened.2M.shuf.txt.zip  model	    vi_data
bpe_test.py  embedding			     model_test.py


In [8]:
!head vi_data/corpus.tokened.2M.shuf.txt

Chuyện ngược_lại diễn ra ở những nơi mại_dâm là bất_hợp_pháp . 
Cánh thuỷ_quân của quân Nguyên hoàn_toàn bị tiêu_diệt . 
Giám_đốc Bệnh_viện Bạch_Mai chỉ_đạo đoàn lên công_tác phải bằng mọi giá tập_trung nhân_lực vật_lực hỗ_trợ cho Bệnh_viện Đa_khoa Hoà_Bình , chuẩn_bị giường , máy , các phương_tiện để hỗ_trợ bệnh_nhân . 
Nghi can Duy_thực nghiệm lại hành_vi siết cổ tài_xế taxi trên XLHN để cướp tài_sản . 
Phân_chuồng có_thể làm phân_bón gốc cũng có_thể làm phân_bón thêm , phân_vô_cơ làm phân_bón thêm là tốt . 
Ra đón TBT là toàn_thể cán_bộ nhân_viên Đại_sứ_quán Việt_Nam tại LB Nga và hơn 400 người Việt sinh_sống tại thủ_đô Moskva . 
Theo Tân_Hoa xã , cảnh_sát địa_phương cho biết ngày 5/10 , nhà_chức_trách Philippines đã phá tan âm_mưu đánh bom nhằm vào một bến_xe buýt tại tỉnh Bắc_Cotabato , miền Nam_Philippines sau khi bắt một nghi can . 
Thái_độ của Zeng cũng tương_tự khi được hỏi về chiếc máy_bay vận_tải Y-20 . 
Thủ_tướng Israel phát_biểu tại Quốc_hội Mỹ Trong cử_chỉ nhằm xoa_dịu nh

In [9]:
%cd './vi_data'
!split -l 1000 './corpus.tokened.2M.shuf.txt' 'vi'
%cd '..'

/content/Machine-Translation/vi_data
/content/Machine-Translation


In [10]:
from os import listdir
from os.path import isfile, join
_path = 'vi_data'
link = [join(_path, f) for f in listdir(_path) if isfile(join(_path, f)) and 'vi' in f]

In [11]:
link[1]

'vi_data/vizlez'

In [12]:
# !pip install vncorenlp
# from vncorenlp import VnCoreNLP
# !wget 'https://github.com/vncorenlp/VnCoreNLP/archive/v1.1.1.zip' -O ./models.$$ && unzip -o ./models.$$ && rm -r ./models.$$.
# from vncorenlp import VnCoreNLP
# segmentNLP = VnCoreNLP('./VnCoreNLP-1.1.1/VnCoreNLP-1.1.1.jar', port=9001, annotators="wseg,pos,ner,parse", quiet=False)

In [13]:
# text = "Ông Nguyễn Khắc Chúc đang làm việc tại Đại học Quốc gia Hà Nội."
# segmentNLP.tokenize(text)

In [14]:
def sentence_segment(text):
    tmps = text.strip('\n').split('\n')
    return tmps

## Train Space Word2vec

In [29]:
from gensim.models import word2vec
from time import time
import numpy as np

In [35]:
N = 10
EMBEDDING_SIZE = 128
VOCAB_SIZE = 60000
EPOCHS = 10
DIR = '../gdrive/MyDrive/DeepLearning/MT/space_vi/word2vec.kv'

In [31]:
len(link)

20000

In [32]:
model = word2vec.Word2Vec(size=EMBEDDING_SIZE, window=10, iter=EPOCHS, max_final_vocab=VOCAB_SIZE,
                               min_count=N, compute_loss=True, seed=22)

In [33]:
lineIter = LineIter(link, sent_segment=sentence_segment)
model.build_vocab(sentences=lineIter, progress_per=2000000)
print(f'vocab size = {len(model.wv.vocab)}')

INFO:gensim.models.word2vec:collecting all words and their counts
INFO:gensim.models.word2vec:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #2000000, processed 56192220 words, keeping 458764 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #4000000, processed 112377602 words, keeping 682503 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #6000000, processed 168561125 words, keeping 857126 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #8000000, processed 224728912 words, keeping 1005732 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #10000000, processed 280886637 words, keeping 1135682 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #12000000, processed 337014678 words, keeping 1253776 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #14000000, processed 393192737 words, keeping 1362179 word types
INFO:gensim.models.word2vec:PROGRESS: at se

vocab size = 59814


In [36]:
# model.train(lineIter, total_examples=model.corpus_count, epochs=model.iter, report_delay=600)
model.wv.save(DIR)
print('Space model saved complete!') 

INFO:gensim.utils:saving Word2VecKeyedVectors object under ../gdrive/MyDrive/DeepLearning/MT/space_vi/word2vec.kv, separately None
INFO:gensim.utils:not storing attribute vectors_norm
INFO:gensim.utils:saved ../gdrive/MyDrive/DeepLearning/MT/space_vi/word2vec.kv


Space model saved complete!


In [43]:
model.wv.most_similar('xe_máy')

  if np.issubdtype(vec.dtype, np.int):


[('xe_gắn_máy', 0.9317652583122253),
 ('xe', 0.7999663949012756),
 ('xe_đạp', 0.7632004022598267),
 ('ôtô', 0.705029308795929),
 ('ô_tô', 0.7026943564414978),
 ('xe_tải', 0.6789513230323792),
 ('xe_khách', 0.6348327994346619),
 ('xe_con', 0.6344842910766602),
 ('ô-tô', 0.5954999327659607),
 ('Exciter', 0.590166449546814)]

## Train BPE Word2vec

In [15]:
import json
from gensim.models import word2vec
from time import time

INFO:summarizer.preprocessing.cleaner:'pattern' package not found; tag filters are not available for English


In [25]:
vocab = json.load(open('./bpe/resources/vocab_vi.json', 'r', encoding='utf8'))
vocab = {k: 2 for k, v in vocab.items()}
len(vocab)

64000

In [18]:
N = 2
EMBEDDING_SIZE = 128
VOCAB_SIZE = len(vocab)
EPOCHS = 10
DIR = '../gdrive/MyDrive/DeepLearning/MT/bpe_vi/word2vec.kv'

In [19]:
from bpe.BPE_VI import BPE_VI
bpe = BPE_VI(padding=False)

In [20]:
lineIter = LineIter(link, sent_segment=sentence_segment, tokenizer=bpe.tokenizer)

In [22]:
for j, i in enumerate(lineIter):
    print(i)
    if j > 10:
        break

['<s>', 'Tại_sao', 'sự_việc', 'chỉ', 'xảy', 'ra', 'trong', 'một', 'khoảng', 'thời_gian', 'vô_cùng', 'ngắn', '(', '17', 'giây', ')', 'mà', 'quân_đội', 'Thổ_Nhĩ_Kỳ', 'đã', 'kịp', 'liên_lạc', 'và', 'báo_cáo', 'tình_hình', 'với', 'Thủ_tướng', ',', 'sau', 'đó', 'nhận', 'lệnh', 'và', 'thực_thi', 'mệnh_lệnh', '?', '</s>']
['<s>', 'Vấn_đề', 'không', 'phải', 'có', 'thoả_thuận', 'hay', 'không', 'mà', 'là', 'còn', 'thiếu', 'một_số', 'điều', 'cụ_thể', '"', '.', '</s>']
['<s>', 'Vận_tốc', 'cực_đại', 'của', 'tiêm_kích', 'Y@@', 'F-@@', '23', '1.650', '+', 'mph', 'hay_là', 'Mach', '2.1', '.', '</s>']
['<s>', 'Ông', 'Nguyễn_@@', 'Hữu', 'Hoà@@', 'ng-@@', 'Giám', 'đốc', 'DNTN', 'Toàn', 'Ni', ',', 'cho', 'biết', ':', 'Xưởng', 'chúng_tôi', 'có', 'khoảng', 'hơn', '20', 'công_nhân', 'đang', 'làm_việc', ',', 'khi', 'sự_việc', 'xảy', 'ra', 'chỉ', 'có', '3-4', 'công_nhân', 'đang', 'trông_coi', 'xưởng', 'và', 'tất_cả', 'các', 'công_nhân', 'này', 'đều', 'mới', 'từ', 'nơi', 'khác', 'đến', 'làm_việc', ',', 'nên', '

In [28]:
model = word2vec.Word2Vec(size=EMBEDDING_SIZE, window=10, iter=EPOCHS,
                               min_count=N, compute_loss=True, seed=22)
s1 = time()
model.build_vocab_from_freq(word_freq=vocab)
model.build_vocab(sentences=lineIter, progress_per=2000000, update=True)
s2 = time()

INFO:gensim.models.base_any2vec:Processing provided word frequencies
INFO:gensim.models.base_any2vec:collected 64000 different raw word, with total frequency of 128000
INFO:gensim.models.word2vec:Loading a fresh vocabulary
INFO:gensim.models.word2vec:effective_min_count=2 retains 64000 unique words (100% of original 64000, drops 0)
INFO:gensim.models.word2vec:effective_min_count=2 leaves 128000 word corpus (100% of original 128000, drops 0)
INFO:gensim.models.word2vec:deleting the raw counts dictionary of 64000 items
INFO:gensim.models.word2vec:sample=0.001 downsamples 0 most-common words
INFO:gensim.models.word2vec:downsampling leaves estimated 128000 word corpus (100.0% of prior 128000)
INFO:gensim.models.base_any2vec:estimated required memory for 64000 words and 128 dimensions: 97536000 bytes
INFO:gensim.models.word2vec:resetting layer weights
INFO:gensim.models.word2vec:collecting all words and their counts
INFO:gensim.models.word2vec:PROGRESS: at sentence #0, processed 0 words, ke

In [29]:
model.train(lineIter, total_examples=model.corpus_count, epochs=model.iter, report_delay=600)

  """Entry point for launching an IPython kernel.
INFO:gensim.models.base_any2vec:training model with 3 workers on 64000 vocabulary and 128 features, using sg=0 hs=0 sample=0.001 negative=5 window=10
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 0.05% examples, 247465 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 33.35% examples, 259797 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 66.69% examples, 259905 words/s, in_qsize 1, out_qsize 0
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 2 more threads
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 1 more threads
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 0 more threads
INFO:gensim.models.base_any2vec:EPOCH - 1 : training on 583370376 raw words (468169462 effective words) took 1799.0s, 260237 effective words/s
INFO:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: 

(4681678179, 5833703760)

In [30]:
model.wv.save(DIR)

INFO:gensim.utils:saving Word2VecKeyedVectors object under ../gdrive/MyDrive/DeepLearning/MT/bpe_vi/word2vec.kv, separately None
INFO:gensim.utils:not storing attribute vectors_norm
INFO:gensim.utils:saved ../gdrive/MyDrive/DeepLearning/MT/bpe_vi/word2vec.kv


In [37]:
model.wv.most_similar(['để'])

  if np.issubdtype(vec.dtype, np.int):


[('nhằm', 0.6676191091537476),
 ('Để', 0.6332790851593018),
 ('và', 0.6203718185424805),
 ('hòng', 0.5820088386535645),
 ('đồng_thời', 0.5814229249954224),
 ('thay_vì', 0.5605562925338745),
 ('.', 0.5516546964645386),
 ('cách_thức', 0.5438627600669861),
 ('việc', 0.5355466604232788),
 ('rồi', 0.5204837918281555)]