In [None]:
from datasets import load_dataset

# Load the "iwslt2017-zh-en" configuration of the IWSLT 2017 dataset.
# Downloaded/prepared files are cached under ./cache.
dataset_name = "iwslt2017"
dataset = load_dataset(dataset_name, "iwslt2017-zh-en", cache_dir="./cache")

In [None]:
import opencc
# OpenCC converter built from the 's2t.json' configuration. It is applied
# below to the dataset's Chinese text when the traditional variant is wanted.
converter = opencc.OpenCC('s2t.json')

In [None]:
# Display the loaded dataset object as this cell's output.
dataset

In [None]:
from collections import defaultdict
# Count occurrences over the Chinese side of the training pairs.
# Each "zh" entry is iterated element by element (character by character for
# a string), so despite the name the keys are single characters, not words.
word_freq = defaultdict(int)
for zh_sentence in (pair["zh"] for pair in dataset["train"]["translation"]):
    for char in zh_sentence:
        word_freq[char] += 1

In [None]:
from collections import defaultdict
# Build the character -> stroke-letter lookup from the vocab file.
# Every non-blank line is expected to hold exactly two whitespace-separated
# fields: the character and its letter sequence. Only this direction is
# built here; no reverse (letters -> character) table is created.
with open("./vocab/zh2letter.txt", 'r', encoding="utf-8") as f:
    conversions = f.read().splitlines()

# defaultdict(str): indexing an unknown character yields "" (and stores it);
# use dic.get(...) for lookups that must not grow the table.
dic = defaultdict(str)
for line in conversions:
    if not line.strip():
        # Blank lines (e.g. trailing ones) would break the two-field unpacking.
        continue
    chinese_char, strokes = line.split()
    dic[chinese_char] = strokes

Strokify

In [None]:
from functools import partial

def is_chinese(uchar):
    """Return True if ``uchar`` lies in the range U+4E00..U+9FA5 (inclusive).

    This is the range the notebook treats as "Chinese character". The check
    is a plain string comparison, so it is intended for single characters.
    """
    return u'\u4e00' <= uchar <= u'\u9fa5'

def zh2letter(dictionary, line):
    """Replace every Chinese character in ``line`` by its mapped letter string.

    Args:
        dictionary: Mapping from a single character to its letter sequence.
            Looked up with ``.get`` so a defaultdict is never grown; a
            character with no entry contributes nothing to the output.
        line: One line of text. Characters for which ``is_chinese`` is false
            are kept unchanged.

    Returns:
        The converted line with each replacement delimited by spaces, all
        runs of whitespace collapsed to a single space, leading/trailing
        whitespace removed, and exactly one trailing newline appended.
    """
    pieces = []
    # Single pass over the input: each character is translated exactly once,
    # so the result never depends on the order in which characters are
    # handled, and the cost is linear rather than one full-string replace
    # per distinct character.
    for char in line:
        if is_chinese(char):
            pieces.append(' ' + dictionary.get(char, '') + ' ')
        else:
            pieces.append(char)
    # split()/join normalises whitespace (including any newline already in
    # the input) before the single terminating newline is added.
    return ' '.join(''.join(pieces).split()) + '\n'

In [None]:
# (character, count) pairs for Chinese characters only, most frequent first.
# The sort is stable, so filtering before sorting gives the same order as
# sorting everything and filtering afterwards.
sorted_freq = sorted(
    ((char, count) for char, count in word_freq.items() if is_chinese(char)),
    key=lambda item: item[1],
    reverse=True,
)

In [None]:
# Mean letter-sequence length per distinct Chinese character (each character
# counts once; frequencies are not used as weights).
# dic.get() is used instead of dic[...]: `dic` is a defaultdict, so indexing
# a missing character would insert an empty entry into the lookup table.
# Characters without an entry still count as length 0.
total_len = sum(len(dic.get(w, "")) for w, _ in sorted_freq)
avg_len = total_len / len(sorted_freq)
avg_len

In [None]:
# Same statistic, but each character is first passed through the OpenCC
# converter before the lookup (each character counts once; frequencies are
# not used as weights).
# dic.get() is used instead of dic[...]: `dic` is a defaultdict, so indexing
# a missing character would insert an empty entry into the lookup table.
# Characters without an entry still count as length 0.
total_len = sum(len(dic.get(converter.convert(w), "")) for w, _ in sorted_freq)
avg_len = total_len / len(sorted_freq)
avg_len

In [None]:
# Dump the frequency table, one "<character> <count> " record per line
# (the space before the newline is part of the existing file format).
with open("./frequency.txt", 'w', encoding='utf-8') as freq_file:
    freq_file.writelines(f'{word} {freq} \n' for word, freq in sorted_freq)

In [None]:
# Source-side language tag used in the output file names, indexed by TYPE.
# NOTE(review): "tz" presumably tags the traditional-script variant — confirm
# it is intentional and not a typo.
TYPES = ["zh", "tz"]
# Output sub-directory name, indexed by TYPE.
NAMES = ["simp", "trad"]
TYPE = 0 # 0 for simplified, 1 for traditional

In [None]:
# Collect the parallel sentences of one split as two aligned lists.
split = "train"

# Fetch the translation column once; the original looked it up separately
# for the source and the target side.
pairs = dataset[split]["translation"]

if TYPE == 0:
    # Simplified: use the Chinese text as provided by the dataset.
    src_text = [pair["zh"] for pair in pairs]
else:
    # Traditional: run every Chinese sentence through the OpenCC converter.
    src_text = [converter.convert(pair["zh"]) for pair in pairs]
trg_text = [pair["en"] for pair in pairs]

In [None]:
from tqdm import tqdm

# Language tags used to build the output file names.
src = TYPES[TYPE]
trg = "en"

# zh2letter with the lookup table pre-bound; the result takes one line of text.
func = partial(zh2letter, dic)
# NOTE: this name shadows the builtin `iter`. `map` is lazy and single-use:
# nothing is converted until the iterator is consumed, and it can only be
# consumed once.
iter = map(func, src_text)

In [None]:
import os

# Output directory for the selected script variant; created on demand so a
# fresh checkout can run this cell.
path = f"./data/NIST/{NAMES[TYPE]}"
os.makedirs(path, exist_ok=True)

# Source side: the converted Chinese text (zh2letter already terminates each
# line with a newline). The conversion is built here from src_text rather
# than taken from a previously created one-shot iterator, so re-running this
# cell rewrites the full file instead of an empty one.
with open(f"{path}/{split}.{src}-{trg}.{src}", 'w', encoding="utf-8") as f:
    for k in tqdm(map(partial(zh2letter, dic), src_text), total=len(src_text)):
        f.write(k)

# Target side: the English sentences, one per line.
with open(f"{path}/{split}.{src}-{trg}.{trg}", 'w', encoding="utf-8") as f:
    for k in tqdm(trg_text): f.write(f"{k}\n")

Cipher

In [None]:
def shift_vocab(vocab, key):
    """Build a substitution table that shifts ``vocab`` cyclically by ``key``.

    Args:
        vocab: Indexable sequence of symbols (e.g. a string of letters).
        key: Integer offset; the symbol at position ``i`` maps to the symbol
            at position ``(i + key) % len(vocab)``.

    Returns:
        dict mapping each symbol of ``vocab`` to its shifted counterpart.
        An empty ``vocab`` yields an empty dict.
    """
    size = len(vocab)
    return {symbol: vocab[(pos + key) % size] for pos, symbol in enumerate(vocab)}

def monophonic(vocab, shifted_vocab, plain_text):
    """Apply a one-to-one substitution to ``plain_text``.

    Args:
        vocab: Collection of symbols that should be substituted.
        shifted_vocab: Mapping from each symbol of ``vocab`` to its
            replacement.
        plain_text: Text to encipher.

    Returns:
        The text with every character found in ``vocab`` replaced through
        ``shifted_vocab``; all other characters (spaces, newlines, ...) are
        copied unchanged.
    """
    return ''.join(
        shifted_vocab[symbol] if symbol in vocab else symbol
        for symbol in plain_text
    )

In [None]:
def read_text(path):
    """Read a UTF-8 text file and return its lines as a list.

    Line endings are kept on each element, as with ``readlines()``.
    """
    with open(path, encoding="utf-8") as handle:
        return list(handle)

def write_file(src, trg):
    """Copy the UTF-8 text file ``src`` to ``trg``, overwriting ``trg``.

    Both files are opened in text mode and the content is transferred line
    by line.
    """
    with open(src, encoding="utf-8") as reader, open(trg, 'w', encoding="utf-8") as writer:
        writer.writelines(reader)

In [None]:
import os
# Candidate splits; only "test" is processed by the loop below.
splits = ['train', 'validation']
# Cipher alphabet: the 26 lowercase letters in a fixed order. Characters
# outside this alphabet (spaces, newlines, ...) pass through unchanged.
vocab = 'etaoinshrdlcumwfgypbvkjxqz'
src = TYPES[TYPE]
trg = "en"

# Relies on `path` from the file-writing cell and on the
# "<split>.<src>-<trg>.*" files already existing in that directory.
# for split in splits:
for split in ["test"]:
    src_name = f"{split}.{src}-{trg}.{src}"
    trg_name = f"{split}.{src}-{trg}.{trg}"
    src_file = os.path.join(path, src_name)
    trg_file = os.path.join(path, trg_name)
    text = read_text(src_file)
    for key in [1,2]:
        shifted_vocab = shift_vocab(vocab, key)
        # Named locally so the globals `func` and `iter` are not rebound
        # to unrelated values by this cell.
        encipher = partial(monophonic, vocab, shifted_vocab)
        print(f'Generating ciphered-text of {split} data with key {key}.')

        # Pair "<src><key>-<trg>": ciphered source with an unchanged copy of the target.
        write_file(trg_file, os.path.join(path, f"{split}.{src}{key}-{trg}.{trg}"))
        save_src_cipher = os.path.join(path, f"{split}.{src}{key}-{trg}.{src}{key}")

        # Iterate the list itself so tqdm knows the total and shows real progress.
        with open(save_src_cipher, 'w', encoding="utf-8") as f:
            for line in tqdm(text):
                f.write(encipher(line))

        # Pair "<src><key>-<src>": plain source alongside its ciphered version.
        write_file(src_file, os.path.join(path, f"{split}.{src}{key}-{src}.{src}"))
        write_file(save_src_cipher, os.path.join(path, f"{split}.{src}{key}-{src}.{src}{key}"))
        print('done \n')