# Prepare the datasets


## Import Packages

In [4]:
import random
random.seed(1)
import re
import os
import csv
from tqdm import tqdm

## Load the Training and Test Dataset

In [5]:
class text_loader:
    """Load tab-separated sentence pairs and build character/byte lookup tables.

    Every line of an input file is read as ``left<TAB>right``. Files are
    decoded as latin-1, so each character corresponds to exactly one byte.

    Attributes set by ``__init__``:
        en_train, unknown_train: left/right sentences of the training file.
        en_val, unknown_val: validation sentences (only when ``split`` is truthy).
        left_test, right_test: left/right sentences of the test file.
        char2byte, byte2char: character -> code point mapping and its inverse.
    """

    def __init__(self, train_address, test_address, split):
        """Read both files and build the lookup tables.

        Args:
            train_address: path of the training file.
            test_address: path of the test file.
            split: when truthy, shuffle the training pairs and hold out the
                last 10% as a validation set.

        Raises:
            FileNotFoundError: if either path does not exist. (Previously a
                message was printed and a half-initialised object was
                returned, which only failed later with AttributeError.)
        """
        for address in (train_address, test_address):
            if not os.path.exists(address):
                raise FileNotFoundError(f"Path: {address} does not exist")
        self.train_address = train_address
        self.test_address = test_address
        self.split = split
        if self.split:
            self.en_train, self.unknown_train, self.en_val, self.unknown_val = self.read_sentences(train_address, split=self.split)
        else:
            self.en_train, self.unknown_train = self.read_sentences(train_address, split=self.split)
        self.left_test, self.right_test = self.read_sentences(test_address)
        self.char2byte = self.get_char2byte()
        self.byte2char = self.get_byte2char()

    @staticmethod
    def _clean_line(raw: bytes) -> str:
        """Decode one raw line and drop its line terminator."""
        line = raw.decode("latin-1")
        # Remove literal backslash-n sequences (two characters), as before.
        line = line.replace("\\n", "")
        # Only strip a real trailing newline; the last line of a file may not
        # have one, and blindly cutting the last character would lose data.
        if line.endswith("\n"):
            line = line[:-1]
        return line

    @staticmethod
    def _parse_pair(line: str):
        """Return ``(left, right)`` for a usable line, otherwise ``None``.

        Lines without a tab (e.g. blank lines) and lines whose two sides are
        identical are rejected.
        """
        fields = line.split("\t")
        if len(fields) < 2 or fields[0] == fields[1]:
            return None
        return fields[0], fields[1]

    def read_sentences(self, file_address, split=False):
        """Read sentence pairs from ``file_address``.

        Args:
            file_address: path of a latin-1 encoded, tab-separated file.
            split: when truthy, shuffle the lines (using the module-level
                ``random`` state) and split the kept pairs 90/10.

        Returns:
            ``(left, right)`` when ``split`` is falsy, otherwise
            ``(left_train, right_train, left_val, right_val)``.
        """
        with open(file_address, 'rb') as f:
            lines = [self._clean_line(raw) for raw in f]
        if split:
            # Shuffle the raw lines (not the filtered pairs) so that a given
            # seed yields the same permutation as it did before filtering.
            random.shuffle(lines)

        l_sentences = []
        r_sentences = []
        for line in lines:
            pair = self._parse_pair(line)
            if pair is not None:
                l_sentences.append(pair[0])
                r_sentences.append(pair[1])

        if not split:
            return l_sentences, r_sentences

        # The cut is computed on the pairs that were actually kept; using the
        # unfiltered line count would shrink the validation share whenever
        # lines are dropped.
        split_ind = int(0.9 * len(l_sentences))
        en_train = l_sentences[:split_ind]
        en_val = l_sentences[split_ind:]
        unknown_train = r_sentences[:split_ind]
        unknown_val = r_sentences[split_ind:]
        return en_train, unknown_train, en_val, unknown_val

    def get_char2byte_from_raw(self, lines, char2byte: dict):
        """Add every character found in ``lines`` to ``char2byte`` in place.

        The stored value is ``ord(char)``. Nothing is returned; the mapping
        passed in is mutated. Progress is reported through ``tqdm``.
        """
        for line in tqdm(lines):
            for char in line:
                char2byte[char] = ord(char)

    def get_char2byte(self):
        """Build the character -> code point mapping.

        Only the training and test sentences are scanned; the validation
        sentences are not.
        """
        char2byte = dict()
        self.get_char2byte_from_raw(self.en_train, char2byte)
        self.get_char2byte_from_raw(self.unknown_train, char2byte)
        self.get_char2byte_from_raw(self.left_test, char2byte)
        self.get_char2byte_from_raw(self.right_test, char2byte)
        return char2byte

    def get_byte2char(self):
        """Return the inverse of ``self.char2byte`` (code point -> character)."""
        return {b: c for c, b in self.char2byte.items()}

    def get_data(self):
        """Return the loaded data as a tuple.

        With ``split`` truthy the tuple has eight items (train pair,
        validation pair, test pair, char2byte, byte2char); otherwise six
        (the validation pair is omitted).
        """
        if self.split:
            return self.en_train, self.unknown_train, self.en_val, self.unknown_val, self.left_test, self.right_test, self.char2byte, self.byte2char
        else:
            return self.en_train, self.unknown_train, self.left_test, self.right_test, self.char2byte, self.byte2char

## Load Dataset

In [6]:
# Locations of the raw tab-separated training and test files.
train_address = "./data/src/train.txt"
test_address = "./data/src/test.rand.txt"

# split=True: the loader also returns a validation pair, so get_data()
# yields eight values.
data = text_loader(train_address, test_address, True)
(
    en_train,
    unknown_train,
    en_val,
    unknown_val,
    left_test,
    right_test,
    char2byte,
    byte2char,
) = data.get_data()

100%|██████████| 900000/900000 [00:06<00:00, 141669.88it/s]
100%|██████████| 900000/900000 [00:06<00:00, 138194.25it/s]
100%|██████████| 100000/100000 [00:00<00:00, 140314.89it/s]
100%|██████████| 100000/100000 [00:00<00:00, 141084.27it/s]


### bytes.txt

In [7]:
# Write the sorted set of byte values seen in the data, one per line.
my_vocab = sorted(set(char2byte.values()))
vocab_path = "./data/bytes.txt"
# Mode "w" truncates an existing file, so the vocabulary is rewritten on
# every run. The previous remove-or-write logic deleted an existing file
# without recreating it.
with open(vocab_path, "w") as f:
    for b in tqdm(my_vocab):
        f.write(f"{b}\n")

### char2byte.pkl

In [8]:
import pickle

# Persist the character -> byte mapping in the current working directory.
with open('char2byte.pkl', 'wb') as pkl_file:
    pickle.dump(char2byte, pkl_file, protocol=pickle.HIGHEST_PROTOCOL)

In [10]:
def store_lines_into_file(filename: str, lines: list):
    """Write ``lines`` to ``filename`` as latin-1 text, one entry per line.

    Each entry is followed by a newline; an existing file is overwritten.
    """
    with open(filename, "w", encoding="latin-1") as out:
        out.writelines(entry + '\n' for entry in lines)

def run_store(filename: str, lines: list):
    """Replace ``filename`` with ``lines``.

    Any existing file at that path is removed first, then the lines are
    written through ``store_lines_into_file``.
    """
    already_there = os.path.exists(filename)
    if already_there:
        os.remove(filename)
    store_lines_into_file(filename, lines)


# Output paths, in the order: en_train, en_val, unknown_train, unknown_val,
# left_test, right_test.
filenames = [
    "./data/en_train.txt",
    "./data/en_val.txt",
    "./data/unknown_train.txt",
    "./data/unknown_val.txt",
    "./data/left_test.txt",
    "./data/right_test.txt",
]

# Only the training and test splits are written; the two validation files
# (filenames[1] and filenames[3]) are not stored here.
for out_path, out_lines in (
    (filenames[0], en_train),
    (filenames[2], unknown_train),
    (filenames[4], left_test),
    (filenames[5], right_test),
):
    run_store(out_path, out_lines)

In [11]:
# Build the byte-level rows for the training split.
#
# For every (en, unknown) sentence pair two kinds of rows are produced:
#   * train_ltb: the full byte sequence of each side.
#   * train:     only the bytes around positions where the two sides differ
#                (up to 5 bytes of context before each difference and up to
#                5 bytes after the last one).
# Each row has the form [label, '', "b0 b1 ... "] with label '1' for the
# en side and '2' for the unknown side.
train_byte_rows_str = []   # not filled in this cell; kept so the name stays defined
train_byte_rows_list = []  # not filled in this cell; kept so the name stays defined
train_ltb = []
train = []
for l1, l2 in tqdm(zip(en_train, unknown_train)):
    # Only the overlapping prefix of the two sentences is compared.
    overlap = min(len(l1), len(l2))
    new_l1_list = [char2byte[c] for c in l1[:overlap]]
    new_l2_list = [char2byte[c] for c in l2[:overlap]]

    # Skip pairs without any differing position. The old guard compared the
    # two strings below, but the second one always starts with a space, so it
    # was always true and such pairs produced a meaningless window.
    if new_l1_list == new_l2_list:
        continue

    new_l1_str = "".join(f"{b} " for b in new_l1_list)
    # The leading space on the second row is kept from the original format.
    new_l2_str = " " + "".join(f"{b} " for b in new_l2_list)
    train_ltb.append(['1', '', new_l1_str])
    train_ltb.append(['2', '', new_l2_str])

    temp1 = []
    temp2 = []
    curr = 0  # first index not yet copied into the window
    for i, (byte1, byte2) in enumerate(zip(new_l1_list, new_l2_list)):
        if byte1 != byte2:
            # Up to 5 bytes of left context, never re-copying earlier bytes.
            start = max(curr, i - 5)
            temp1.extend(new_l1_list[start:i + 1])
            temp2.extend(new_l2_list[start:i + 1])
            curr = i + 1
    # Up to 5 bytes of right context after the last difference.
    temp1.extend(new_l1_list[curr:curr + 5])
    temp2.extend(new_l2_list[curr:curr + 5])

    train.append(['1', '', "".join(f"{b} " for b in temp1)])
    train.append(['2', '', "".join(f"{b} " for b in temp2)])

900000it [02:48, 5349.44it/s]


### train.csv

In [12]:
# Write the training rows to ./data/train.csv (no header row).
if os.path.exists("./data/train.csv"):
    os.remove("./data/train.csv")
with open('./data/train.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    # tqdm wraps the iterable so progress is still reported row by row.
    writer.writerows(tqdm(train))

100%|██████████| 1800000/1800000 [00:06<00:00, 274509.99it/s]


In [13]:
# Build the byte-level rows for the validation split (en_val / unknown_val).
#
# For every sentence pair two kinds of rows are produced:
#   * test_ltb: the full byte sequence of each side.
#   * test:     only the bytes around positions where the two sides differ
#               (up to 5 bytes of context before each difference and up to
#               5 bytes after the last one).
# Each row has the form [label, '', "b0 b1 ... "] with label '1' for the
# en side and '2' for the unknown side.
#
# NOTE(review): text_loader.get_char2byte, as visible in this file, does not
# scan the validation sentences, so a character that occurs only there would
# raise KeyError below - confirm this cannot happen for the data in use.
test_byte_rows_str = []   # not filled in this cell; kept so the name stays defined
test_byte_rows_list = []  # not filled in this cell; kept so the name stays defined
test_ltb = []
test = []
for l1, l2 in tqdm(zip(en_val, unknown_val)):
    # Only the overlapping prefix of the two sentences is compared.
    overlap = min(len(l1), len(l2))
    new_l1_list = [char2byte[c] for c in l1[:overlap]]
    new_l2_list = [char2byte[c] for c in l2[:overlap]]

    # Skip pairs without any differing position. The old guard compared the
    # two strings below, but the second one always starts with a space, so it
    # was always true and such pairs produced a meaningless window.
    if new_l1_list == new_l2_list:
        continue

    new_l1_str = "".join(f"{b} " for b in new_l1_list)
    # The leading space on the second row is kept from the original format.
    new_l2_str = " " + "".join(f"{b} " for b in new_l2_list)
    test_ltb.append(['1', '', new_l1_str])
    test_ltb.append(['2', '', new_l2_str])

    temp1 = []
    temp2 = []
    curr = 0  # first index not yet copied into the window
    for i, (byte1, byte2) in enumerate(zip(new_l1_list, new_l2_list)):
        if byte1 != byte2:
            # Up to 5 bytes of left context, never re-copying earlier bytes.
            start = max(curr, i - 5)
            temp1.extend(new_l1_list[start:i + 1])
            temp2.extend(new_l2_list[start:i + 1])
            curr = i + 1
    # Up to 5 bytes of right context after the last difference.
    temp1.extend(new_l1_list[curr:curr + 5])
    temp2.extend(new_l2_list[curr:curr + 5])

    test.append(['1', '', "".join(f"{b} " for b in temp1)])
    test.append(['2', '', "".join(f"{b} " for b in temp2)])
        # test_byte_row_json = json.dumps({'id': idx, 'en': new_l1_list, 'unknown': new_l2_list})


100000it [00:18, 5516.75it/s]


### test.csv

In [14]:
# Write the rows collected in `test` to ./data/test.csv (no header row).
if os.path.exists("./data/test.csv"):
    os.remove("./data/test.csv")
with open('./data/test.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    # tqdm wraps the iterable so progress is still reported row by row.
    writer.writerows(tqdm(test))

100%|██████████| 200000/200000 [00:00<00:00, 286519.27it/s]


### Finished preprocessing dataset