In [1]:
from datasets import load_dataset

# Name of the Hugging Face dataset; reused further down to name output files.
dataset_name = "iwslt2017"

# Load the zh-en configuration, caching downloaded files under ./cache.
dataset = load_dataset(
    dataset_name,
    "iwslt2017-zh-en",
    cache_dir="./cache",
)

In [2]:
# Display the available splits together with their features and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 231266
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 8549
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 879
    })
})

In [5]:
# Converter built from OpenCC's 's2t.json' configuration
import opencc
converter = opencc.OpenCC('s2t.json')

# Build the two monolingual sentence lists used for BPE learning:
# the converted "zh" side and the untouched "en" side of every training pair.
train_pairs = dataset["train"]["translation"]
trad_chinese_list = [converter.convert(pair["zh"]) for pair in train_pairs]
eng_list = [pair["en"] for pair in train_pairs]

In [None]:
# # Disabled alternative: strokify variant that also keeps non-Chinese characters (Latin letters, punctuation)
# def is_chinese_character(char):
#     return 0x4e00 <= ord(char) <= 0x9fff

# cnPunct2enPunct = {
#     "。": '.', 
#     "，": ',', 
#     '？': '?', 
#     '‘': '\'', "’": '\'', 
#     '“': '\"', '”': '\"', 
#     '《': "<", "》": ">", 
#     "、":',', 
#     " ！":"!",
#     "：": ":",
#     "；": ";",
#     "·": "`",
#     "（": "(",
#     "）":")",
# }

# def strokify(text):
#     new_text = str()
#     is_english = False
#     is_start = True
#     for char in text:
#         if not is_start and not is_english: new_text += " "
#         is_start = False
#         if is_chinese_character(char):
#             if is_english: new_text += " "
#             new_text += zh2letter[char]
#             is_english = False
#         elif char.isalpha() and ('A' <= char <= 'Z' or 'a' <= char <= 'z'):
#             new_text += char
#             is_english = True
#         elif char in cnPunct2enPunct:
#             new_text += cnPunct2enPunct[char]
#         else:
#             new_text += char
#     return new_text

In [7]:
# Keep only the Chinese characters of every line
import re

# Matches runs of characters in the U+4E00..U+9FFF range
chinese_character_pattern = re.compile(r'[\u4e00-\u9fff]+')

# For each line of each sentence, glue the matched runs together;
# lines without any match (including empty lines) are dropped.
all_text = [
    ''.join(chinese_runs)
    for text in trad_chinese_list
    for chinese_runs in map(chinese_character_pattern.findall, text.splitlines())
    if chinese_runs
]

In [9]:
import json
# Persist both sentence lists as JSON arrays.
# The encoding is given explicitly so the result does not depend on the
# platform default, matching the other open() calls in this notebook.
with open('./traditional_chinese_sentences_iwslt.json', 'w', encoding="utf-8") as f:
    json.dump(all_text, f)

with open('./english_sentences_iwslt.json', 'w', encoding="utf-8") as f:
    json.dump(eng_list, f)

In [None]:
from collections import defaultdict
# Build lookup tables between Chinese characters and their stroke codes.
# Every non-blank line of zh2letter.txt must hold exactly two
# whitespace-separated fields: the character, then its stroke code.
with open("zh2letter.txt", 'r', encoding="utf-8") as f:
    conversions = f.read().splitlines()

# defaultdict(str): looking up an unknown key yields "" instead of raising.
zh2letter = defaultdict(str)
letter2zh = defaultdict(str)
for line in conversions:
    fields = line.split()
    if not fields:
        # Blank line (e.g. trailing newline padding): nothing to unpack.
        continue
    chinese_char, strokes = fields
    zh2letter[chinese_char] = strokes
    # If several characters share a stroke code, the last one listed wins.
    letter2zh[strokes] = chinese_char

In [19]:
def is_chinese_character(char):
    """Return True if `char` is a single character in the range U+4E00..U+9FFF."""
    code_point = ord(char)
    return code_point >= 0x4E00 and code_point <= 0x9FFF

def strokify(text, mapping=None):
    """Convert a string into space-separated stroke codes.

    Each character of `text` is looked up in the character-to-code table;
    characters without a (non-empty) code are skipped and reported once.

    Parameters
    ----------
    text : str
        The string to convert, processed character by character.
    mapping : dict, optional
        Character -> stroke-code table. Defaults to the notebook-level
        `zh2letter` table.

    Returns
    -------
    str
        The stroke codes of the mapped characters joined by single spaces
        (an empty string if nothing could be mapped).
    """
    table = zh2letter if mapping is None else mapping
    strokes = []
    unmapped = []
    for char in text:
        # .get() instead of indexing: zh2letter is a defaultdict, and indexing
        # would insert an empty entry for every unknown character.
        code = table.get(char, "")
        if code:
            strokes.append(code)
        else:
            unmapped.append(char)
    if unmapped:
        # Surface missing table entries instead of silently dropping them.
        print(f"strokify: no stroke code for {unmapped!r} in {text!r}")
    return " ".join(strokes)

In [20]:
# Stroke-encode every Chinese-only sentence, preserving order.
stroke_sentences = list(map(strokify, all_text))

In [21]:
# Write the stroke-encoded sentences to a UTF-8 text file, one per line.
filename = f"./{dataset_name}_strokes.txt"
with open(filename, 'w', encoding="utf-8") as f:
    f.writelines(sentence + '\n' for sentence in stroke_sentences)

In [22]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

In [23]:
import os

# Directory that receives the trained tokenizer files.
save_path = f"./{dataset_name}/"
# exist_ok=True replaces the separate existence check: the cell is safe to
# re-run and there is no window between checking and creating the directory.
os.makedirs(save_path, exist_ok=True)

In [24]:
# Train one BPE tokenizer per target vocabulary size on the stroke file
# written earlier (`filename`) and save each as BPE_<vocab_size>.json.
vocab_sizes = [500, 1000, 5000, 10000]
for vocab_size in vocab_sizes:
    print(vocab_size)
    # NOTE(review): the "##" / "_" affixes are set on the model only. BpeTrainer
    # accepts continuing_subword_prefix / end_of_word_suffix as well - confirm
    # that the model-level settings are still in effect after training.
    tokenizer = Tokenizer(BPE(unk_token="[UNK]", continuing_subword_prefix="##", end_of_word_suffix="_"))
    trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"], vocab_size=vocab_size)
    tokenizer.pre_tokenizer = Whitespace()
    tokenizer.train([filename], trainer)
    # save_path already starts with "./" and ends with "/", so it is used
    # as-is rather than being prefixed with a second "./".
    tokenizer.save(f"{save_path}BPE_{vocab_size}.json")

500
1000
5000
10000
