In [15]:
import spacy
from bs4 import BeautifulSoup
import lxml
from pathlib import Path
from tqdm import tqdm
import codecs
import sys
from collections import defaultdict, Counter
from learn_bpe import learn_bpe

### Take a look at the contents of the sgm file

In [2]:
!head -n 5 /home/uih/JYL/GitHub/Transformer/data/datasets/UN/en-zh/UNv1.0.en-zh.en

<refset setid="newsdev2017" srclang="any" trglang="zh">
<doc sysid="ref" docid="abcnews.199680" genre="news" origlang="en">
<p>
<seg id="1">加利福尼亚州水务工程的新问题</seg>
<seg id="2">在加利福尼亚州一个主要水务管理区披露州长杰瑞·布朗领导的行政当局将提供政府资金以完成两条巨型输水隧道的规划之后，有一些评论家和一位州议员表示，他们想进一步了解由谁来为州长所支持的拟耗资160亿美元的水务工程承担费用。</seg>


In [3]:
!head -n 5 /home/uih/JYL/GitHub/Transformer/data/datasets/UN/en-zh/UNv1.0.en-zh.zh

<refset setid="newsdev2017" srclang="any" trglang="en">
<doc sysid="ref" docid="abcnews.199680" genre="news" origlang="en">
<p>
<seg id="1">New Questions Over California Water Project</seg>
<seg id="2">Critics and a state lawmaker say they want more explanations on who's paying for a proposed $16 billion water project backed by Gov. Jerry Brown, after a leading California water district said Brown's administration was offering government funding to finish the planning for the two giant water tunnels.</seg>


In [4]:
# Parse the Chinese reference SGM file into a BeautifulSoup tree for inspection.
# The file is decoded explicitly as UTF-8 instead of the platform default, so the
# Chinese text is read the same way regardless of the machine's locale.
with open("/home/uih/JYL/Dataset/Others/Translation/dev/newsdev2017-enzh-ref.zh.sgm", encoding="utf-8") as f:
    bs = BeautifulSoup(f, 'lxml')

In [5]:
# Sanity check: print the text and the `id` attribute of the first <seg> of the
# first <doc>. Both loops break immediately, so at most one segment is printed.
for doc in bs.find_all('doc'):
    for seg in doc.find_all('seg'):
        print(seg.get_text(), seg['id'])
        break
    break

加利福尼亚州水务工程的新问题 1


### Parse .sgm files into plain-text (.txt) files

In [6]:
def sgm2txt(filepath, save_dir):
    """Extract the text of every <seg> in an SGM file into a plain-text file.

    The output file is ``<save_dir>/<name of filepath>.txt`` and contains one
    line per segment, in document order. ``save_dir`` is created (including
    parents) when it does not exist yet.

    Args:
        filepath: Path of the SGM file to read.
        save_dir: Directory that receives the ``.txt`` file.

    Returns:
        None. The file path and the number of segments written are printed.
    """
    save_dir = Path(save_dir)
    # exist_ok avoids the check-then-create race of a separate exists() test.
    save_dir.mkdir(parents=True, exist_ok=True)
    save_path = save_dir / f"{Path(filepath).name}.txt"

    total_seg_count = 0
    # Explicit UTF-8 on both sides so non-ASCII text does not depend on the
    # platform's default locale encoding.
    with open(filepath, encoding='utf-8') as fin, \
            open(save_path, 'w', encoding='utf-8') as fout:
        bs = BeautifulSoup(fin, 'lxml')
        for doc in tqdm(bs.find_all('doc')):
            for seg in doc.find_all('seg'):
                total_seg_count += 1
                # NOTE: the segment text is written verbatim; a segment with an
                # embedded newline would span several output lines.
                fout.write(f"{seg.get_text()}\n")

    print(f"file: {filepath}, total count: {total_seg_count}")

In [7]:
# Extract the segments of the `.en.sgm` dev file into data/dev_en, one per line.
filepath = '/home/uih/JYL/Dataset/Others/Translation/dev/newsdev2017-zhen-ref.en.sgm'
save_dir = "/home/uih/JYL/GitHub/Transformer/data/dev_en"
sgm2txt(filepath, save_dir)

100%|██████████| 215/215 [00:00<00:00, 16453.35it/s]

file: /home/uih/JYL/Dataset/Others/Translation/dev/newsdev2017-zhen-ref.en.sgm, total count: 2002





In [8]:
# Extract the segments of the `.zh.sgm` dev file into data/dev_zh, one per line.
filepath = '/home/uih/JYL/Dataset/Others/Translation/dev/newsdev2017-enzh-ref.zh.sgm'
save_dir = "/home/uih/JYL/GitHub/Transformer/data/dev_zh"
sgm2txt(filepath, save_dir)

100%|██████████| 215/215 [00:00<00:00, 13209.34it/s]

file: /home/uih/JYL/Dataset/Others/Translation/dev/newsdev2017-enzh-ref.zh.sgm, total count: 2002





### Build vocabulary: tokenize with spaCy, then count tokens

In [9]:
def build_vocabylary(model_name, text_filepath):
    """Tokenize a text file with a spaCy model's tokenizer.

    Each line is stripped and tokenized on its own, so the last word of one
    line can never fuse with the first word of the next (which happened when
    all lines were concatenated without a separator before tokenizing).
    Blank lines contribute no tokens.

    Args:
        model_name: Name of the spaCy pipeline to load (passed to ``spacy.load``).
        text_filepath: Path of the text file to tokenize; must exist.

    Returns:
        A flat list of token strings in file order.

    Raises:
        AssertionError: If ``text_filepath`` does not exist.
    """
    model = spacy.load(model_name)
    assert Path(text_filepath).exists()
    tokens = []
    # Explicit UTF-8 so decoding does not depend on the platform default.
    with open(text_filepath, 'r', encoding='utf-8') as f:
        # readlines() keeps the total known, so tqdm can show a full progress bar.
        for line in tqdm(f.readlines()):
            sentence = line.strip()
            if not sentence:
                continue
            tokens.extend(tok.text for tok in model.tokenizer(sentence))
    return tokens

# Tokenize the extracted Chinese dev text with the `zh_core_web_md` spaCy model.
# `tokenize` receives the list of token strings returned by build_vocabylary.
model_name = 'zh_core_web_md'
text_filepath = "/home/uih/JYL/GitHub/Transformer/data/dev_zh/newsdev2017-enzh-ref.zh.sgm.txt" 
tokenize = build_vocabylary(model_name, text_filepath)

100%|██████████| 2002/2002 [00:00<00:00, 649922.34it/s]


In [10]:
tokenize[:5]

['加利福尼亚州', '水务', '工程', '的', '新']

In [11]:
# Write the tokens to a file, each one followed by a single space separator.
# The file is written as UTF-8 explicitly because it is read back with
# encoding='utf-8'; relying on the platform default could produce a mismatch.
with open('/home/uih/JYL/GitHub/Transformer/data/dev_zh/tokenizes_zh.txt', 'w', encoding='utf-8') as f:
    # A single join avoids growing a string one token at a time.
    tokens = "".join(f"{token} " for token in tokenize)
    f.write(tokens)

In [12]:
# Count how often each space-separated token occurs in the tokenized file.
vocab = Counter()

with codecs.open('/home/uih/JYL/GitHub/Transformer/data/dev_zh/tokenizes_zh.txt', encoding='utf-8') as fobj:
    for line in fobj:
        pieces = line.strip('\r\n ').split(' ')
        # Empty strings (from consecutive spaces) are not counted.
        vocab.update(piece for piece in pieces if piece)

In [21]:
# Show the first five vocabulary entries (in insertion order) with their counts.
for entry, count in list(vocab.items())[:5]:
    print(f"{entry}: {count}")

加利福尼亚州: 8
水务: 4
工程: 22
的: 2556
新: 51


In [13]:
def tokenization(sgmtxt_filepath, spacy_model_name, save_dir):
    """Tokenize a text file and save the tokens to ``<save_dir>/tokenizes.txt``.

    The tokens come from ``build_vocabylary`` and are written back-to-back,
    each one followed by a single space.

    Args:
        sgmtxt_filepath: Path of the plain-text file to tokenize.
        spacy_model_name: Name of the spaCy pipeline used for tokenizing.
        save_dir: Directory that receives ``tokenizes.txt``; created if missing.

    Returns:
        None.
    """
    tokenize = build_vocabylary(spacy_model_name, sgmtxt_filepath)

    save_dir = Path(save_dir)
    # Create the target directory if needed, as sgm2txt does for its output.
    save_dir.mkdir(parents=True, exist_ok=True)
    save_path = save_dir / 'tokenizes.txt'

    # Written as UTF-8 explicitly: update_vocabulary reads token files as UTF-8,
    # so the platform default encoding must not be used here.
    with open(str(save_path), 'w', encoding='utf-8') as f:
        # A single join avoids growing a string one token at a time.
        f.write("".join(f"{token} " for token in tokenize))

In [14]:
# Tokenize the extracted English dev text with `en_core_web_md`; the result is
# written to data/dev_en/tokenizes.txt by tokenization().
tokenization(sgmtxt_filepath="/home/uih/JYL/GitHub/Transformer/data/dev_en/newsdev2017-zhen-ref.en.sgm.txt", 
             spacy_model_name="en_core_web_md", 
             save_dir="/home/uih/JYL/GitHub/Transformer/data/dev_en/")

100%|██████████| 2002/2002 [00:00<00:00, 617108.59it/s]


### Update vocabulary: count tokens and explore BPE pair statistics

In [23]:
def update_vocabulary(tokenize_filepath, is_dict=False):
    """Read a UTF-8 text file and return a Counter of word frequencies.

    Args:
        tokenize_filepath: Path of the file to read.
        is_dict: When False (default), the file is treated as running text and
            every non-empty space-separated token is counted once per
            occurrence. When True, every line must have the form
            ``"<word> <count>"`` and ``count`` is added to that word's total.

    Returns:
        collections.Counter mapping each word to its frequency.

    Raises:
        SystemExit: With exit code 1 (after printing the offending line) when
            ``is_dict`` is True and a line does not split into exactly two
            space-separated fields.
        ValueError: When ``is_dict`` is True and the count field is not an
            integer.
    """
    vocab = Counter()
    with codecs.open(tokenize_filepath, encoding='utf-8') as fobj:
        for i, line in tqdm(enumerate(fobj)):
            fields = line.strip('\r\n ').split(' ')
            if is_dict:
                try:
                    word, count = fields
                except ValueError:
                    # Only the unpacking failure is handled here; a bare except
                    # would also trap KeyboardInterrupt and SystemExit.
                    print('Failed reading vocabulary file at line {0}: {1}'.format(i, line))
                    sys.exit(1)
                vocab[word] += int(count)
            else:
                # Consecutive spaces yield empty strings, which are skipped.
                vocab.update(word for word in fields if word)
    return vocab

In [39]:
tokenize_filepath = '/home/uih/JYL/GitHub/Transformer/data/dev_zh/tokenizes_zh.txt'

# Re-key the word counts by tuples of characters, with the end-of-word marker
# '</w>' appended to the last character of every word.
vocab = {
    tuple(word[:-1]) + (word[-1] + '</w>',): count
    for word, count in update_vocabulary(tokenize_filepath).items()
}

# Show the first five entries.
for symbols, count in list(vocab.items())[:5]:
    print(f"{symbols}: {count}")

1it [00:00, 31.10it/s]

('加', '利', '福', '尼', '亚', '州</w>'): 8
('水', '务</w>'): 4
('工', '程</w>'): 22
('的</w>',): 2556
('新</w>',): 51





In [35]:
# Order the (symbols, count) entries by count, highest first, then display the
# last five, i.e. the least frequent entries.
sorted_vocab = sorted(vocab.items(), key=lambda x: x[1], reverse=True)
sorted_vocab[-5:]

[(('工', '业', '化</w>'), 1),
 (('一', '如', '既', '往</w>'), 1),
 (('真', '诚</w>'), 1),
 (('附', '带</w>'), 1),
 (('东', '非</w>'), 1)]

In [38]:
"""Count frequency of all symbol pairs, and create index"""

# stats: (left symbol, right symbol) -> summed frequency of the words in which
# the two symbols are adjacent, counted once per occurrence.
stats = defaultdict(int)

# indices: (left symbol, right symbol) -> {position in sorted_vocab: number of
# times the pair occurs in that word}.
indices = defaultdict(lambda: defaultdict(int))

for i, (word, freq) in enumerate(sorted_vocab):
    # Walk over the word's symbols, pairing every symbol with its predecessor.
    prev_char = word[0]
    for char in word[1:]:
        stats[prev_char, char] += freq
        indices[prev_char, char][i] += 1
        prev_char = char

In [44]:
stats

defaultdict(int,
            {('表', '示</w>'): 182,
             ('中', '国</w>'): 163,
             ('我', '们</w>'): 146,
             ('进', '行</w>'): 137,
             ('他', '们</w>'): 101,
             ('一', '个</w>'): 97,
             ('没', '有</w>'): 99,
             ('政', '府</w>'): 94,
             ('可', '能</w>'): 83,
             ('公', '司</w>'): 76,
             ('美', '国</w>'): 71,
             ('发', '生</w>'): 71,
             ('可', '以</w>'): 70,
             ('文', '化</w>'): 71,
             ('工', '作</w>'): 68,
             ('发', '展</w>'): 67,
             ('经', '济</w>'): 66,
             ('这', '些</w>'): 65,
             ('不', '过</w>'): 63,
             ('之', '后</w>'): 62,
             ('英', '国</w>'): 62,
             ('美', '元</w>'): 61,
             ('问', '题</w>'): 59,
             ('国', '家</w>'): 58,
             ('已', '经</w>'): 57,
             ('通', '过</w>'): 56,
             ('成', '为</w>'): 54,
             ('项', '目</w>'): 54,
             ('时', '间</w>'): 54,
             ('合', '作

In [46]:
# Pick the pair with the highest frequency. The sort key is (frequency, pair),
# so ties on frequency are resolved in favour of the larger pair tuple.
most_frequent = max(stats, key=lambda x: (stats[x], x))
most_frequent

('表', '示</w>')

In [40]:
# Symbols that occur in a non-final position of some word ...
uniq_char_internal = {symbol for word in vocab for symbol in word[:-1]}
# ... and the symbols that end a word (the last element of each key).
uniq_char_final = {word[-1] for word in vocab}

In [62]:
# Demonstrates merging the symbol pair ('A', 'B') in a space-separated word.
import re
# (?<!\S) and (?!\S) require the match to be delimited by whitespace or the
# string boundaries, so only the standalone symbols "A B" are merged; the
# trailing "A B</w>" is left alone because "B" is followed by "<".
pattern = re.compile(r'(?<!\S)' + re.escape('A' + ' ' + 'B') + r'(?!\S)')
new_word = pattern.sub('AB', r"C A B D A B</w>")
# Split back into a tuple of symbols.
new_word = tuple(new_word.split(' '))

In [63]:
new_word

('C', 'AB', 'D', 'A', 'B</w>')

In [1]:
# Run learn_bpe (from the local learn_bpe module) on the extracted Chinese dev
# text and write its result to the .bpe.txt file.
# NOTE(review): the input here is the raw .sgm.txt file, not the spaCy-tokenized
# tokenizes_zh.txt; merges such as "， 这" in the recorded log suggest that
# punctuation and neighbouring words are treated as one unit -- confirm this is
# intended.
# NOTE(review): parameter semantics presumably follow subword-nmt's learn_bpe;
# verify against the local module.
learn_bpe(infile_names=['/home/uih/JYL/GitHub/Transformer/data/dev_zh/newsdev2017-enzh-ref.zh.sgm.txt'], 
          outfile_name='/home/uih/JYL/GitHub/Transformer/data/dev_zh/newsdev2017-enzh-ref.zh.bpe.txt', 
          num_symbols=32000, 
          verbose=True, 
          total_symbols=True)

Collecting vocab from /home/uih/JYL/GitHub/Transformer/data/dev_zh/newsdev2017-enzh-ref.zh.sgm.txt
Number of word-internal characters: 2562
Number of word-final characters: 93
Reducing number of merge operations by 2655
Write vocab file to /home/uih/JYL/GitHub/Transformer/data/dev_zh/newsdev2017-enzh-ref.zh.bpe.txtpair 0: 表 示 -> 表示 (frequency 182)
pair 1: 2 0 -> 20 (frequency 170)
pair 2: 中 国 -> 中国 (frequency 166)
pair 3: 我 们 -> 我们 (frequency 146)
pair 4: 进 行 -> 进行 (frequency 137)
pair 5: 。 ”</w> -> 。”</w> (frequency 137)
pair 6: ， 这 -> ，这 (frequency 119)
pair 7: 0 0 -> 00 (frequency 115)
pair 8: ： “ -> ：“ (frequency 104)
pair 9: 他 们 -> 他们 (frequency 101)
pair 10: 一 个 -> 一个 (frequency 100)
pair 11: 20 1 -> 201 (frequency 100)
pair 12: 没 有 -> 没有 (frequency 99)
pair 13: 政 府 -> 政府 (frequency 94)
pair 14: 表示 ， -> 表示， (frequency 93)
pair 15: ， 而 -> ，而 (frequency 92)
pair 16: 时 ， -> 时， (frequency 88)
pair 17: 可 能 -> 可能 (frequency 88)
pair 18: 后 ， -> 后， (frequency 87)
pair 19: ， 在 -> ，在 (freq

In [16]:
# Run learn_bpe (from the local learn_bpe module) on the extracted English dev
# text and write its result to the .learn_bpe.txt file.
# NOTE(review): parameter semantics presumably follow subword-nmt's learn_bpe;
# verify against the local module.
learn_bpe(infile_names=['/home/uih/JYL/GitHub/Transformer/data/dev_en/newsdev2017-zhen-ref.en.sgm.txt'], 
          outfile_name='/home/uih/JYL/GitHub/Transformer/data/dev_en/newsdev2017-zhen-ref.en.learn_bpe.txt', 
          num_symbols=32000, 
          verbose=True, 
          total_symbols=True)

Collecting vocab from /home/uih/JYL/GitHub/Transformer/data/dev_en/newsdev2017-zhen-ref.en.sgm.txt
Number of word-internal characters: 90
Number of word-final characters: 71
Reducing number of merge operations by 161
Write vocab file to /home/uih/JYL/GitHub/Transformer/data/dev_en/newsdev2017-zhen-ref.en.learn_bpe.txtpair 0: t h -> th (frequency 5717)
pair 1: i n -> in (frequency 4230)
pair 2: th e</w> -> the</w> (frequency 3624)
pair 3: a n -> an (frequency 3288)
pair 4: e r -> er (frequency 2680)
pair 5: e n -> en (frequency 2422)
pair 6: r e -> re (frequency 2279)
pair 7: t i -> ti (frequency 2225)
pair 8: o n -> on (frequency 2133)
pair 9: a r -> ar (frequency 1782)
pair 10: o f</w> -> of</w> (frequency 1779)
pair 11: e d</w> -> ed</w> (frequency 1776)
pair 12: in g</w> -> ing</w> (frequency 1675)
pair 13: o r -> or (frequency 1607)
pair 14: an d</w> -> and</w> (frequency 1566)
pair 15: o u -> ou (frequency 1565)
pair 16: t o</w> -> to</w> (frequency 1534)
pair 17: s t -> st (frequ