# Dataset



In [2]:
from datasets import load_dataset

LANG = "be"

dataset = load_dataset("wikimedia/wikipedia", f"20231101.{LANG}", cache_dir=".checkpoints/data")
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'url', 'title', 'text'],
        num_rows: 236165
    })
})

In [3]:
# Since we dont need any of the other columns, we just keep the text column
dataset = [ data["text"] for data in dataset["train"] ]

# Byte-Level Tokenization

Represents a Byte-level BPE as introduced by OpenAI with their GPT-2 model

In [None]:
# from tokenizers import ByteLevelBPETokenizer
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
VOCAB_SIZE = 50_000
tokenizer.train_from_iterator(
    dataset,
    vocab_size=VOCAB_SIZE,
    min_frequency=2, 
    special_tokens=[
        "<s>",
        "<pad>",
        "</s>",
        "<unk>",
        "<mask>",
    ])

tokenizer.enable_truncation(max_length=512)
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)

Since the tokenization is done at a byte-level, when using UTF-8 encoded characters, the tokens will not remain human readable.
Therefore we need to use a byte-level decoder to combine bytes back into UTF-8 encoded characters so that they are human readable again.

In [None]:
from tokenizers.decoders import ByteLevel

sentence = "Аԥсуаа жәытә-натә аахыс Пицунда амшын аԥшаҳәа Амзара ҳәа иашьҭан"
decoder = ByteLevel()

sentence = tokenizer.encode(sentence)

print("[Tokens]", list(map(lambda x: x.encode("utf-8").hex(), sentence.tokens)))
print("[Decode]", decoder.decode(sentence.tokens))

[Tokens] ['3c733e', 'c390c4b2', 'c394', 'c2a5', 'c391c4a3c391c4a5c390c2b0', 'c390c2b0', 'c4a0c390c2b6', 'c393c4bb', 'c391c4adc391c4a4', 'c393c4bb', '2d', 'c390c2bdc390c2b0c391c4a4', 'c393c4bb', 'c4a0c390c2b0', 'c390c2b0c391c4a7', 'c391c4ad', 'c391c4a3', 'c4a0c390c581', 'c390c2b8c391c4a8', 'c391c4a5c390c2bdc390c2b4c390c2b0', 'c4a0c390c2b0c390c2bc', 'c391c4aac391c4adc390c2bd', 'c4a0c390c2b0', 'c394', 'c2a5', 'c391c4aac390c2b0', 'c392', 'c2b3', 'c393c4bb', 'c390c2b0', 'c4a0c390c4b2c390c2bc', 'c390c2b7c390c2b0c391c4a2c390c2b0', 'c4a0c392', 'c2b3', 'c393c4bb', 'c390c2b0', 'c4a0c390c2b8', 'c390c2b0c391c4aa', 'c391c4ae', 'c392', 'c583', 'c390c2b0c390c2bd', '3c2f733e']
[Decode] <s>Аԥсуаа жәытә-натә аахыс Пицунда амшын аԥшаҳәа Амзара ҳәа иашьҭан</s>


In [None]:
tokenizer.save_model(f".checkpoints/{LANG}-tokenizer")

['.checkpoints/be-tokenizer/vocab.json',
 '.checkpoints/be-tokenizer/merges.txt']

# Model

We use a RoBERTa

In [8]:
from transformers import RobertaTokenizerFast

tokenizer = RobertaTokenizerFast.from_pretrained(f".checkpoints/{LANG}-tokenizer", max_len=512)

In [9]:
from transformers import RobertaForMaskedLM
from transformers import RobertaConfig

model = RobertaForMaskedLM(
    RobertaConfig(
        vocab_size=VOCAB_SIZE,
        max_position_embeddings=514,
        num_attention_heads=12,
        num_hidden_layers=6,
        type_vocab_size=1,
    ))

In [10]:
model.num_parameters()

81966416

In [11]:
train_dataset = tokenizer(dataset, add_special_tokens=True, truncation=True, max_length=512)["input_ids"]

In [12]:
from transformers.trainer import Trainer, TrainingArguments
from transformers.data.data_collator import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer)

training_args = TrainingArguments(
    output_dir=f".checkpoints/{LANG}-roberta",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

In [None]:
trainer.train()

  0%|          | 0/59042 [00:00<?, ?it/s]

RuntimeError: MPS backend out of memory (MPS allocated: 9.08 GB, other allocations: 8.70 GB, max allowed: 18.13 GB). Tried to allocate 390.62 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [None]:
trainer.save_model(f".checkpoints/{LANG}-roberta")