# Load all CSV files and convert them to a Hugging Face Dataset

In [1]:
from transformers import AutoTokenizer
from datasets import Dataset
import pandas as pd
from glob import glob

# One seed for the whole cell so the train/test split is reproducible.
SEED = 3407

# glob() returns paths in a filesystem-dependent order; sorting fixes the row
# order of the concatenated frame, which the seeded split below depends on.
data_files = sorted(glob("../data/scb-mt-en-th-2020-cleaned/*.csv"))
if not data_files:
    # pd.concat would otherwise fail with the opaque "No objects to concatenate".
    raise FileNotFoundError(
        "No CSV files found in ../data/scb-mt-en-th-2020-cleaned/"
    )

# Read every CSV into one frame and rename the text columns to bare language
# codes. rename() returns a new frame rather than mutating in place.
df = pd.concat(
    (pd.read_csv(filename) for filename in data_files),
    ignore_index=True,
).rename(columns={'en_text': 'en', 'th_text': 'th'})

# One record dict per row, stored under a single "translation" column.
data = {"translation": df.to_dict(orient="records")}
dataset = Dataset.from_dict(data)

# train_test_split shuffles by default, so a separate shuffle() is redundant;
# passing the seed here is what makes the 80/20 split repeatable across runs.
dataset = dataset.train_test_split(test_size=0.2, seed=SEED)

In [2]:
# Show the split summary (split names, features, row counts).
dataset

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 801401
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 200351
    })
})

In [3]:
# Model identifier passed to AutoTokenizer.from_pretrained below.
checkpoint = "google/mt5-small"
# Load the tokenizer for that checkpoint; preprocess_function uses it to
# tokenize both the source and the target text.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)



In [4]:
# Keys into each "translation" record (the CSV text columns renamed to 'en'/'th'
# at load time). preprocess_function tokenizes the source_lang text as the input
# and passes the target_lang text as text_target.
source_lang = "th"
target_lang = "en"
# Passed to the tokenizer as max_length, alongside truncation=True, in
# preprocess_function.
max_length = 64

def preprocess_function(examples):
    """Tokenize one batch of translation pairs.

    Args:
        examples: Batch mapping whose "translation" entry is a sequence of
            per-example mappings indexed by language code.

    Returns:
        The tokenizer's output for the batch: the ``source_lang`` texts are
        tokenized as inputs and the ``target_lang`` texts are passed as
        ``text_target``, with truncation enabled at ``max_length``.
    """
    pairs = examples["translation"]
    source_texts = [pair[source_lang] for pair in pairs]
    target_texts = [pair[target_lang] for pair in pairs]
    return tokenizer(
        source_texts,
        text_target=target_texts,
        max_length=max_length,
        truncation=True,
    )


# Tokenize both splits. batched=True hands preprocess_function a batch of
# examples per call; num_proc=8 spreads the work over 8 worker processes.
tokenized_sentence = dataset.map(
    preprocess_function, batched=True, num_proc=8)

# Display the result to confirm which columns the tokenizer added.
tokenized_sentence

Map (num_proc=8):   0%|          | 0/801401 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/200351 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['translation', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 801401
    })
    test: Dataset({
        features: ['translation', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 200351
    })
})

In [5]:
# Persist the tokenized DatasetDict to disk so it can be reloaded later
# (e.g. with datasets.load_from_disk) without re-running tokenization.
tokenized_sentence.save_to_disk("../data/scb-mt-hf-dataset-tokenized")

Saving the dataset (0/2 shards):   0%|          | 0/801401 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/200351 [00:00<?, ? examples/s]