# Import Libraries

In [1]:
# Import built-in Python libs
import pickle
from pathlib import Path
from dataclasses import dataclass
from typing import List

# Import data science libs
import numpy as np
import pandas as pd

# Import Weights & Biases (experiment tracking)
import wandb

# Import deep learning libs
import torch
import pytorch_lightning as pl

# Import data preprocessing libs
from tokenizers import Tokenizer, pre_tokenizers, decoders
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.normalizers import NFKC
from tokenizers.processors import TemplateProcessing
from torch.utils.data import Dataset, DataLoader

---

# Split Raw Dataset into Source and Target Datasets

In [5]:
# Dataset root: <parent of the current working directory>/dataset/ASPEC-JC
root_dir = Path.cwd().parent
data_dir = root_dir.joinpath("dataset", "ASPEC-JC")

def extract_jp_ch_datasets(txt_arr):
    """Split raw ``|||``-delimited lines into Chinese and Japanese sentence arrays.

    Each line is split on ``|||``. The first field is discarded; of the
    remaining fields, the first is taken as the Japanese sentence and the
    second as the Chinese sentence. Surrounding whitespace is stripped from
    every field. Blank lines are skipped.

    Args:
        txt_arr: Iterable of raw text lines (``str``).

    Returns:
        A tuple ``(ch, jp)`` of 1-D NumPy string arrays in input order.
        Note the order: Chinese first, Japanese second.

    Raises:
        ValueError: If a non-blank line has fewer than two fields after the
            leading one.
    """
    jp_sentences = []
    ch_sentences = []

    for line_no, text in enumerate(txt_arr, start=1):
        if not text.strip():
            # A blank (e.g. trailing) line carries no sentence pair.
            continue

        fields = [field.strip() for field in text.split("|||")[1:]]
        if len(fields) < 2:
            raise ValueError(
                f"Line {line_no} is malformed: expected at least 3 '|||'-separated "
                f"fields, got {len(fields) + 1}."
            )

        jp_sentences.append(fields[0])
        ch_sentences.append(fields[1])

    # Two independent 1-D arrays: works for empty input and does not pad every
    # sentence to the width of the longest sentence of the other language.
    return np.array(ch_sentences, dtype=str), np.array(jp_sentences, dtype=str)


def split_raw_text(txt_path):
    """Split one raw parallel-text file into ``ch.txt`` and ``jp.txt``.

    The raw file is parsed with ``extract_jp_ch_datasets`` and the two
    resulting sentence arrays are written, one sentence per line, next to the
    raw file (in ``txt_path.parent``).

    Args:
        txt_path: Path (``str`` or ``pathlib.Path``) of the raw text file.
    """
    txt_path = Path(txt_path)

    # Read explicitly as UTF-8 rather than relying on the platform default
    # encoding; the text is CJK and is read back as UTF-8 later on.
    with txt_path.open(encoding="utf8") as f:
        ch, jp = extract_jp_ch_datasets(f.readlines())

    # The input file is already closed here; write both outputs as UTF-8 too.
    np.savetxt(txt_path.parent / "ch.txt", ch, fmt="%s", encoding="utf8")
    np.savetxt(txt_path.parent / "jp.txt", jp, fmt="%s", encoding="utf8")


# Raw text file of each split, located in a per-split sub-directory of data_dir.
dev_txt = data_dir / "dev" / "dev.txt"
devtest_txt = data_dir / "devtest" / "devtest.txt"
test_txt = data_dir / "test" / "test.txt"
train_txt = data_dir / "train" / "train.txt"

# uncomment to run the split program
# (each call writes ch.txt and jp.txt into the directory of the given raw file)
# split_raw_text(dev_txt)
# split_raw_text(devtest_txt)
# split_raw_text(test_txt)
# split_raw_text(train_txt)

In [None]:
# Preview the first five lines of the train split (Chinese, then Japanese).
!head -n 5 {str(data_dir / "train/ch.txt")}
print()
!head -n 5 {str(data_dir / "train/jp.txt")}
print()
# Assigning from `!cmd` captures the command's output lines as a list,
# so `line` is printed as a list rather than a bare number.
line = !wc -l < {str(data_dir / "train/ch.txt")}
print(f"line count: {line}")

In [None]:
# Preview the first five lines of the dev split (Chinese, then Japanese).
!head -n 5 {str(data_dir / "dev/ch.txt")}
print()
!head -n 5 {str(data_dir / "dev/jp.txt")}
print()
# Assigning from `!cmd` captures the command's output lines as a list,
# so `line` is printed as a list rather than a bare number.
line = !wc -l < {str(data_dir / "dev/ch.txt")}
print(f"line count: {line}")

In [None]:
# Preview the first five lines of the test split (Chinese, then Japanese).
!head -n 5 {str(data_dir / "test/ch.txt")}
print()
!head -n 5 {str(data_dir / "test/jp.txt")}
print()
# Assigning from `!cmd` captures the command's output lines as a list,
# so `line` is printed as a list rather than a bare number.
line = !wc -l < {str(data_dir / "test/ch.txt")}
print(f"line count: {line}")

## Log Raw Data Artifacts

In [4]:
# Start the W&B run under which the raw text files are uploaded as artifacts.
run = wandb.init(
    project="phonetic-translation",
    entity="windsuzu",
    group="dataset",
    name="raw_data",
    job_type="data_upload",
)

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [5]:
# save raw txt file to artifact
#
# |-- train
#     |-- ch & jp
# |-- dev
#     |-- ch & jp
# |-- devtest
#     |-- ch & jp
# |-- test
#     |-- ch & jp

# Recompute the dataset location so this cell does not depend on the split cell.
root_dir = Path.cwd().parent
data_dir = root_dir.joinpath("dataset", "ASPEC-JC")
raw_data_types = ["train", "dev", "devtest", "test"]

# Stage one "raw_data" artifact per split, holding that split's ch.txt and jp.txt.
artifacts = {}
for data_type in raw_data_types:
    split_dir = data_dir / data_type
    split_artifact = wandb.Artifact(data_type, "raw_data")
    for file_name in ("ch.txt", "jp.txt"):
        split_artifact.add_file(split_dir / file_name, file_name)
    artifacts[data_type] = split_artifact

# Log only after every artifact has been assembled.
for artifact in artifacts.values():
    run.log_artifact(artifact)

---

# Google SentencePiece Tokenization

In [24]:
# Start the W&B run under which the tokenizers are trained and logged.
run = wandb.init(
    project="phonetic-translation",
    entity="windsuzu",
    group="tokenizer",
    name="sentence_piece",
    job_type="build_tokenizer",
)

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

In [None]:
# Download Raw Data
# Declare the latest "train" artifact as an input of this run, then fetch it;
# train_data_dir holds whatever download() returns (used below as a directory path).
train_data_art = run.use_artifact("train:latest")
train_data_dir = train_data_art.download()

In [16]:
def sentence_piece_tokenizer(unk_token="[UNK]", dropout: float=None):
    """Create an untrained BPE tokenizer configured in a SentencePiece-like way.

    The tokenizer applies NFKC normalization and uses Metaspace for both
    pre-tokenization and decoding, with "_" as the replacement character and a
    prefix space added.

    Args:
        unk_token: Token passed to the BPE model as its unknown token.
        dropout: Dropout value passed to the BPE model (``None`` by default).

    Returns:
        The configured, not yet trained ``Tokenizer``.
    """
    # The pre-tokenizer and the decoder must agree, so share one set of options.
    metaspace_options = {"replacement": "_", "add_prefix_space": True}

    bpe_model = BPE(dropout=dropout, unk_token=unk_token)
    sp_tokenizer = Tokenizer(bpe_model)
    sp_tokenizer.normalizer = NFKC()
    sp_tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(**metaspace_options)
    sp_tokenizer.decoder = decoders.Metaspace(**metaspace_options)
    return sp_tokenizer

def train_tokenizer(tokenizer, files, unk_token="[UNK]", vocab_size=30000, show_progress=True, min_frequency=1):
    """Train ``tokenizer`` in place with a BPE trainer on one or more text files.

    Args:
        tokenizer: Tokenizer whose ``train`` method is called.
        files: A single file path (``str`` or ``pathlib.Path``) or an iterable
            of such paths.
        unk_token: Unknown token; registered as the first special token,
            followed by "[BOS]", "[EOS]" and "[PAD]".
        vocab_size: Vocabulary size passed to the trainer.
        show_progress: Progress flag passed to the trainer.
        min_frequency: Minimum frequency passed to the trainer.
    """
    trainer = BpeTrainer(
        special_tokens=[unk_token, "[BOS]", "[EOS]", "[PAD]"],
        vocab_size=vocab_size,
        # This keyword was previously misspelled `show_prorgess`, so the
        # caller's value was never passed under the trainer's real option name.
        show_progress=show_progress,
        min_frequency=min_frequency,
    )

    # Accept a single path as well as a collection of paths.
    if isinstance(files, (str, Path)):
        files = [files]

    tokenizer.train([str(path) for path in files], trainer=trainer)


def build_tokenizer(tokenizer_save_path: "str | Path", train_text_path: "str | Path", min_frequency=1):
    """Load a saved tokenizer, or train, configure and save a new one.

    If ``tokenizer_save_path`` already exists, the tokenizer is loaded from it
    and no training happens; in that case ``min_frequency`` has no effect.
    Otherwise a new tokenizer is trained on ``train_text_path``, padding with
    "[PAD]" is enabled, the "[BOS] ... [EOS]" post-processor is installed, and
    the result is saved to ``tokenizer_save_path``.

    Args:
        tokenizer_save_path: Path of the tokenizer JSON file (``str`` or ``Path``).
        train_text_path: Path of the training text file (``str`` or ``Path``).
        min_frequency: Minimum frequency forwarded to ``train_tokenizer``.

    Returns:
        The loaded or newly trained ``Tokenizer``.

    Raises:
        AssertionError: If ``train_text_path`` does not exist.
    """
    # The parameters were annotated as `str` but used as `Path` objects, so a
    # real `str` failed on `.exists()`. Coercing makes both types work.
    tokenizer_save_path = Path(tokenizer_save_path)
    train_text_path = Path(train_text_path)

    assert train_text_path.exists(), "Training Raw Text does not exist."

    if tokenizer_save_path.exists():
        return Tokenizer.from_file(str(tokenizer_save_path))

    tokenizer = sentence_piece_tokenizer()

    # Train tokenizer
    train_tokenizer(tokenizer, str(train_text_path), min_frequency=min_frequency)

    # Enable Padding
    tokenizer.enable_padding(pad_id=tokenizer.token_to_id("[PAD]"), pad_token="[PAD]")

    # Encode => BOS + sentence + EOS
    set_post_processor(tokenizer)

    # Make sure the target directory exists before saving.
    tokenizer_save_path.parent.mkdir(parents=True, exist_ok=True)
    tokenizer.save(str(tokenizer_save_path))

    return tokenizer


def set_post_processor(tokenizer):
    """Install a post-processor that wraps a single sequence as "[BOS] ... [EOS]".

    The ids of "[BOS]" and "[EOS]" are looked up in ``tokenizer`` itself, so
    both tokens must already be known to it.

    Args:
        tokenizer: Tokenizer whose ``post_processor`` attribute is replaced.
    """
    boundary_tokens = ("[BOS]", "[EOS]")
    boundary_token_ids = [
        (token, tokenizer.token_to_id(token)) for token in boundary_tokens
    ]

    tokenizer.post_processor = TemplateProcessing(
        single="[BOS] $A [EOS]",
        special_tokens=boundary_token_ids,
    )

In [14]:
# Get Tokenizer.json File Path

root_dir = Path.cwd().parent
tokenizer_dir = root_dir / "tokenizer"
# Create the tokenizer directory if needed; no error when it already exists.
tokenizer_dir.mkdir(parents=True, exist_ok=True)

# NOTE: despite the `_dir` suffix, these two names are paths to JSON files.
ch_tokenizer_dir = tokenizer_dir / "tokenizer_sentencepiece_ch.json"
jp_tokenizer_dir = tokenizer_dir / "tokenizer_sentencepiece_jp.json"

In [None]:
# Build one tokenizer per language from the downloaded train split.
# build_tokenizer loads the JSON file instead of training when it already exists,
# so delete the file to retrain with a different min_frequency.
ch_tokenizer = build_tokenizer(ch_tokenizer_dir, Path(train_data_dir) / "ch.txt", min_frequency=2)
jp_tokenizer = build_tokenizer(jp_tokenizer_dir, Path(train_data_dir) / "jp.txt", min_frequency=2)

In [None]:
# Print the first ten (token, id) vocabulary entries and the vocabulary size
print([(key, val) for key, val in ch_tokenizer.get_vocab().items()][:10])
print(ch_tokenizer.get_vocab_size())
print()

# Encode and decode test on a sentence that mixes Chinese text,
# full-width Latin letters and an emoji
encoded = ch_tokenizer.encode("缅因州的波特兰拥有Ｒｉｖｅｒｓｉｄｅ循环使用设施（ＲＲＦ）😀")

print(encoded.ids)
print(encoded.tokens)
# Last expression of the cell: the decoded string is shown as the cell output.
ch_tokenizer.decode(encoded.ids)

In [None]:
# Print the first ten (token, id) vocabulary entries and the vocabulary size
print([(key, val) for key, val in jp_tokenizer.get_vocab().items()][:10])
print(jp_tokenizer.get_vocab_size())
print()

# Encode and decode test on a batch of two sentences
encoded = jp_tokenizer.encode_batch(["Ｃ＆Ｄ管理施設の高度化", "😀異業種ネットワークからの地域ブランド化"])

# Show ids, tokens, attention mask and the decoded text of each encoded sentence.
for i in range(2):
    print(encoded[i].ids)
    print(encoded[i].tokens)
    print(encoded[i].attention_mask)
    print(jp_tokenizer.decode(encoded[i].ids))
    print()

## Log Tokenizer Artifact

In [None]:
# Bundle both tokenizer JSON files into a single "tokenizer" artifact.
# The metadata values are hard-coded: 30000 is the default vocab_size of
# train_tokenizer and 2 is the min_frequency passed to build_tokenizer above;
# keep them in sync by hand if those change.
artifact = wandb.Artifact("sentencepiece", 
                          type="tokenizer",
                          metadata={"vocab": 30000, 
                                    "method": "SentencePiece",
                                    "min_frequency": 2})

# Inside the artifact the files are stored as ch_tokenizer.json / jp_tokenizer.json.
artifact.add_file(ch_tokenizer_dir, "ch_tokenizer.json")
artifact.add_file(jp_tokenizer_dir, "jp_tokenizer.json")
run.log_artifact(artifact)

---

# Preparing Lightning DataModule

In [24]:
# Start the W&B run under which the Lightning DataModule is built and logged.
run = wandb.init(
    project="phonetic-translation",
    entity="windsuzu",
    group="dataset",
    name="sentence_piece",
    job_type="build_pl_data_module",
)

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

In [None]:
# Declare the train / dev / test raw-data artifacts as inputs of this run and
# download each of them.
train_data_art = run.use_artifact("train:latest")
train_data_dir = train_data_art.download()

dev_data_art = run.use_artifact("dev:latest")
dev_data_dir = dev_data_art.download()

test_data_art = run.use_artifact("test:latest")
test_data_dir = test_data_art.download()

# NOTE: `data_dir` was a Path in earlier cells; from here on it is a dict that
# maps a split name to that split's download location.
data_dir = {"train": train_data_dir, 
            "dev": dev_data_dir,
            "test": test_data_dir,}

# Sanity check: show the first two Chinese lines of every downloaded split.
!head -n 2 {str(Path(train_data_dir) / "ch.txt")}
print()
!head -n 2 {str(Path(dev_data_dir) / "ch.txt")}
print()
!head -n 2 {str(Path(test_data_dir) / "ch.txt")}
print()

In [26]:
# Fetch the logged tokenizer artifact and point at the two JSON files inside it.
sentencepiece_tokenizer_art = run.use_artifact("sentencepiece:latest")
sentencepiece_tokenizer_dir = sentencepiece_tokenizer_art.download()
# NOTE: despite the `_dir` suffix, these two names are paths to JSON files.
ch_tokenizer_dir = Path(sentencepiece_tokenizer_dir) / "ch_tokenizer.json"
jp_tokenizer_dir = Path(sentencepiece_tokenizer_dir) / "jp_tokenizer.json"

In [27]:
class SentencePieceDataModule(pl.LightningDataModule):
    """Lightning DataModule that serves tokenized parallel sentence batches.

    Each split directory must contain ``ch.txt`` (source) and ``jp.txt``
    (target) with one sentence per line. Raw text is kept in memory and is
    tokenized per batch inside the collate function.

    Args:
        data_dir: Mapping from split name ("train", "dev", "test") to the
            directory holding that split's ``ch.txt`` and ``jp.txt``.
        src_tokenizer_dir: Path of the source tokenizer JSON file.
        trg_tokenizer_dir: Path of the target tokenizer JSON file.
        batch_size: Batch size used by every DataLoader.
        num_workers: Number of DataLoader worker processes.
        pin_memory: ``pin_memory`` flag passed to every DataLoader.
    """

    def __init__(
        self,
        data_dir,
        src_tokenizer_dir,
        trg_tokenizer_dir,
        batch_size=128,
        num_workers=8,
        pin_memory=True,
    ):
        super().__init__()
        self.data_dir = data_dir
        self.src_tokenizer_dir = src_tokenizer_dir
        self.trg_tokenizer_dir = trg_tokenizer_dir
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.pin_memory = pin_memory

    def setup(self, stage=None):
        """Load the tokenizers and the datasets needed for ``stage``.

        Args:
            stage: "fit" loads train and dev, "validate" loads dev, "test"
                loads test, and ``None`` loads all three splits. Any other
                value loads only the tokenizers.
        """
        self.src_tokenizer = self._load_tokenizer(self.src_tokenizer_dir)
        self.trg_tokenizer = self._load_tokenizer(self.trg_tokenizer_dir)

        # Previously only the exact strings "fit" and "test" loaded data, so
        # setup() / setup(None) and setup("validate") left the datasets unset
        # and the dataloaders failed with AttributeError.
        if stage in (None, "fit"):
            self.train_set = self._data_preprocess(self.data_dir["train"])

        if stage in (None, "fit", "validate"):
            self.val_set = self._data_preprocess(self.data_dir["dev"])

        if stage in (None, "test"):
            self.test_set = self._data_preprocess(self.data_dir["test"])

    def train_dataloader(self):
        """Return the shuffled DataLoader over the train split."""
        return self._make_dataloader(self.train_set, shuffle=True)

    def val_dataloader(self):
        """Return the unshuffled DataLoader over the dev split."""
        return self._make_dataloader(self.val_set, shuffle=False)

    def test_dataloader(self):
        """Return the unshuffled DataLoader over the test split."""
        return self._make_dataloader(self.test_set, shuffle=False)

    def _make_dataloader(self, dataset, shuffle):
        """Build a DataLoader with the settings shared by all three splits."""
        return DataLoader(
            dataset,
            self.batch_size,
            shuffle=shuffle,
            num_workers=self.num_workers,
            pin_memory=self.pin_memory,
            collate_fn=self._data_batching_fn,
        )

    def _read_data_array(self, data_dir):
        """Read a UTF-8 text file and return its lines (line endings kept)."""
        with open(data_dir, encoding="utf8") as f:
            arr = f.readlines()
        return arr

    def _load_tokenizer(self, tokenizer_dir):
        """Load a tokenizer from the JSON file at ``tokenizer_dir``."""
        return Tokenizer.from_file(str(tokenizer_dir))

    def _data_preprocess(self, data_dir):
        """Load one split as an array of (source, target) sentence pairs.

        Args:
            data_dir: Directory containing ``ch.txt`` and ``jp.txt``.

        Returns:
            NumPy string array of shape (num_sentences, 2); column 0 holds the
            source (ch) line and column 1 the target (jp) line.

        Raises:
            ValueError: If the two files have different numbers of lines.
        """
        src_txt = self._read_data_array(Path(data_dir) / "ch.txt")
        trg_txt = self._read_data_array(Path(data_dir) / "jp.txt")

        # zip() would silently drop the surplus lines and hide a misaligned
        # parallel corpus, so fail loudly instead.
        if len(src_txt) != len(trg_txt):
            raise ValueError(
                f"Parallel files in {data_dir} are misaligned: "
                f"ch.txt has {len(src_txt)} lines, jp.txt has {len(trg_txt)} lines."
            )

        parallel_txt = np.array(list(zip(src_txt, trg_txt)))
        return parallel_txt

    def _data_batching_fn(self, data_batch):
        """Collate raw sentence pairs into two tuples of tokenizer encodings.

        Args:
            data_batch: Sequence of (source, target) sentence pairs.

        Returns:
            Tuple ``(src_batch, trg_batch)``; each is a tuple of encodings.
            Pairs stay aligned and are ordered by descending non-padded
            source length.
        """
        data_batch = np.array(data_batch)  # shape=(batch_size, 2=src+trg)

        # Convert to plain Python lists of str before handing the text to the
        # tokenizers, rather than passing NumPy string arrays.
        src_texts = data_batch[:, 0].tolist()  # len=batch_size
        trg_texts = data_batch[:, 1].tolist()  # len=batch_size

        # Each batch is encoded by its own tokenizer.
        src_batch = self.src_tokenizer.encode_batch(src_texts)
        trg_batch = self.trg_tokenizer.encode_batch(trg_texts)

        # Sort the pairs by the non-padded length of the source sentence
        # (sum of its attention mask) in descending order. This ordering is
        # what `nn.utils.rnn.pack_padded_sequence()` expects, which lets an
        # RNN skip the padding tokens during training.
        # https://meetonfriday.com/posts/4d6a906a
        src_batch, trg_batch = zip(
            *sorted(
                zip(src_batch, trg_batch),
                key=lambda x: sum(x[0].attention_mask),
                reverse=True,
            )
        )

        return src_batch, trg_batch

In [28]:
# Build the DataModule from the downloaded splits and tokenizer files (batch size 128).
dm = SentencePieceDataModule(
    data_dir=data_dir,
    src_tokenizer_dir=ch_tokenizer_dir,
    trg_tokenizer_dir=jp_tokenizer_dir,
    batch_size=128,
)

## Log Dataset Artifact

In [29]:
# Pickle the DataModule to <root>/data_module/sentence_piece.pkl and read it
# straight back as a round-trip check.
# NOTE: pickle stores the class by reference, so loading this file elsewhere
# requires SentencePieceDataModule to be defined under the same module name.
# NOTE: the pickled object also carries the download locations held in data_dir
# and the tokenizer paths as they are on this machine.
root_dir = Path.cwd().parent
data_module_dir = root_dir / "data_module"
data_module_dir.mkdir(parents=True, exist_ok=True)

with (data_module_dir / "sentence_piece.pkl").open('wb') as f:
    pickle.dump(dm, f, pickle.HIGHEST_PROTOCOL)
    
with (data_module_dir / "sentence_piece.pkl").open('rb') as f:
    dm = pickle.load(f)

In [30]:
# Log the pickled DataModule as a "data_module" artifact.
artifact = wandb.Artifact("sentencepiece_dm", 
                          type="data_module",
                          metadata={"tokenizer": "SentencePiece",
                                    "Dataset": "train, val, test"})

# The local sentence_piece.pkl is stored in the artifact as sentencepiece_dm.pkl.
artifact.add_file(data_module_dir / "sentence_piece.pkl", "sentencepiece_dm.pkl")
run.log_artifact(artifact)

<wandb.sdk.wandb_artifacts.Artifact at 0x7ff5847d5f40>

## Test our DataModule

In [31]:
# Download the logged DataModule artifact and unpickle it.
sentencepiece_dm_art = run.use_artifact("sentencepiece_dm:latest")
sentencepiece_dm_dir = sentencepiece_dm_art.download()

# SECURITY: pickle.load can execute arbitrary code while loading; only unpickle
# artifacts from a project you trust.
with open(Path(sentencepiece_dm_dir) / "sentencepiece_dm.pkl", 'rb') as f:
    dm = pickle.load(f)

In [32]:
# Load train + dev ("fit") and test ("test") so that all three dataloaders below work.
dm.setup("fit")
dm.setup("test")

In [None]:
# Inspect only the first training batch; batches are sorted by descending
# non-padded source length in the collate function.
for src_batch, trg_batch in dm.train_dataloader():
    print(src_batch[0].tokens) # longest sentence = no padding
    print(trg_batch[0].tokens)
    
    print(src_batch[-1].tokens) # shortest sentence = most padding
    print(trg_batch[-1].tokens)
    break

In [None]:
# Inspect only the first validation batch (first and last pair of the batch).
for src_batch, trg_batch in dm.val_dataloader():
    print(src_batch[0].tokens)
    print(trg_batch[0].tokens)
    
    print(src_batch[-1].tokens)
    print(trg_batch[-1].tokens)
    break

In [None]:
# Inspect only the first test batch (first and last pair of the batch).
for src_batch, trg_batch in dm.test_dataloader():
    print(src_batch[0].tokens)
    print(trg_batch[0].tokens)
    
    print(src_batch[-1].tokens)
    print(trg_batch[-1].tokens)
    break

In [36]:
# Close the W&B run started for the DataModule section.
run.finish()

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…