In [3]:
# Collect the training corpus: every .txt file below the working directory.
from pathlib import Path

# Directories this notebook writes to. Later cells save `merges.txt` and model
# checkpoints there, so a plain recursive glob would feed those artifacts back
# into tokenizer training when the notebook is re-run.
ARTIFACT_DIRS = ("esperanto-bert", "models")


def find_corpus_files(root=".", exclude_dirs=ARTIFACT_DIRS):
    """Return the sorted paths (as strings) of all .txt files under `root`.

    Args:
        root: Directory to search recursively.
        exclude_dirs: Directory names to skip; any file that has one of these
            names among its parent directories (relative to `root`) is ignored.

    Returns:
        A sorted list of path strings, so the result does not depend on the
        order in which the filesystem happens to list files.
    """
    root = Path(root)
    excluded = set(exclude_dirs)
    return sorted(
        str(candidate)
        for candidate in root.glob("**/*.txt")
        if candidate.is_file()
        and not excluded.intersection(candidate.relative_to(root).parts[:-1])
    )


paths = find_corpus_files(".")

In [4]:
# Initialize a byte-level BPE tokenizer. It is constructed without vocabulary or
# merges files here; those are produced by the training call in a later cell.
from tokenizers import ByteLevelBPETokenizer
tokenizer = ByteLevelBPETokenizer()

In [7]:
# Customize training, display wall time and cpu time results
%%time

tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])

CPU times: user 4min 46s, sys: 8.96 s, total: 4min 55s
Wall time: 1min 9s


In [10]:
# save_model() writes vocab.json / merges.txt into the given directory; create
# the directory up front so the cell also works on a fresh checkout.
Path("esperanto-bert").mkdir(parents=True, exist_ok=True)
tokenizer.save_model("esperanto-bert")

['esperanto-bert/vocab.json', 'esperanto-bert/merges.txt']

In [12]:
# Rebuild the tokenizer from the vocabulary and merges files saved on disk.
from tokenizers.processors import BertProcessing

vocab_file = "esperanto-bert/vocab.json"
merges_file = "esperanto-bert/merges.txt"
tokenizer = ByteLevelBPETokenizer(vocab_file, merges_file)

In [13]:
# Wrap every encoded sequence as <s> ... </s>: BertProcessing takes the
# (token, id) pair for the closing token first, then the opening token.
closing_pair = ("</s>", tokenizer.token_to_id("</s>"))
opening_pair = ("<s>", tokenizer.token_to_id("<s>"))
tokenizer._tokenizer.post_processor = BertProcessing(closing_pair, opening_pair)

# Cap encoded sequences at 512 tokens.
tokenizer.enable_truncation(max_length=512)

In [14]:
# Sanity check: encode a short sentence and display the resulting Encoding object.
tokenizer.encode("Mi estas Julien.")

Encoding(num_tokens=7, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [15]:
# Show the token strings for the same sentence; the output includes the
# <s> / </s> tokens added around the sequence.
tokenizer.encode("Mi estas Julien.").tokens

['<s>', 'Mi', 'Ġestas', 'ĠJuli', 'en', '.', '</s>']

Now we train our language model using `run_language_modeling.py` from transformers. Note that we should leave `model_name_or_path` set to `None` in order to train from scratch. We are going to train it on the masked language modeling task, in keeping with BERT's pre-training objective: we randomly mask tokens in the dataset and train the model to predict how to fill them in.

In [None]:
# the following two blocks are for other training tasks.

# import Dataset, so as to implement a simple subclass of it, to load data from our text files:
from torch.utils.data import Dataset

In [None]:
import torch


class EsperantoDataset(Dataset):
    """Dataset of token-id sequences, one per line of the text files in ./data/.

    The class name is a valid Python identifier (a hyphen is not allowed in
    a class name).

    Args:
        evaluate: When True, read the files matching ``*-eval.txt``;
            otherwise read the files matching ``*-train.txt``.
    """

    def __init__(self, evaluate: bool = False):
        tokenizer = ByteLevelBPETokenizer(
            'esperanto-bert/vocab.json',
            'esperanto-bert/merges.txt'
        )

        # Wrap each sequence as <s> ... </s> and cap it at 512 tokens.
        tokenizer._tokenizer.post_processor = BertProcessing(
            ("</s>", tokenizer.token_to_id("</s>")),
            ("<s>", tokenizer.token_to_id("<s>")),
        )
        tokenizer.enable_truncation(max_length=512)

        # One list of token ids per input line, across all matching files.
        self.examples = []

        pattern = '*-eval.txt' if evaluate else '*-train.txt'
        # Sorted so examples are loaded in the same order on every run.
        for src_file in sorted(Path('./data/').glob(pattern)):
            print('on fire now', src_file)
            lines = src_file.read_text(encoding='utf-8').splitlines()
            self.examples += [x.ids for x in tokenizer.encode_batch(lines)]

    def __len__(self):
        """Return the number of encoded lines."""
        return len(self.examples)

    def __getitem__(self, i):
        """Return the token ids of example `i` as a tensor."""
        return torch.tensor(self.examples[i])



In [20]:
from transformers import RobertaConfig
from transformers import RobertaForMaskedLM
from transformers import RobertaTokenizerFast

# Hyperparameters for a small RoBERTa-style model; vocab_size matches the
# vocabulary size requested when the tokenizer was trained.
roberta_settings = dict(
    vocab_size=52_000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)
config = RobertaConfig(**roberta_settings)

# Freshly initialised masked-LM model built from the config alone.
model = RobertaForMaskedLM(config=config)

# Display the parameter count as the cell output.
model.num_parameters()


83504416

In [21]:
# Load the tokenizer files saved in ./esperanto-bert as a RobertaTokenizerFast.
# This rebinds `tokenizer`, which earlier cells used for a ByteLevelBPETokenizer.
# NOTE(review): `max_len` is presumably the maximum sequence length — confirm the
# installed transformers version still accepts this keyword.
tokenizer = RobertaTokenizerFast.from_pretrained("./esperanto-bert", max_len=512)

In [None]:
# initialize our trainer

from torch.utils.data import ConcatDataset
from transformers import (
    DataCollatorForLanguageModeling,
    LineByLineTextDataset,
    Trainer,
    TrainingArguments,
)

# Maximum number of tokens kept per line when building training examples.
BLOCK_SIZE = 128

# `dataset` and `data_collator` were passed to Trainer without being defined in
# any visible cell, so this cell raised NameError on a fresh kernel.
# NOTE(review): the dataset is built from `paths`, the same files the tokenizer
# was trained on, one example per line — confirm this is the intended corpus.
dataset = ConcatDataset([
    LineByLineTextDataset(tokenizer=tokenizer, file_path=path, block_size=BLOCK_SIZE)
    for path in paths
])

# Masked language modeling: the collator randomly masks tokens in each batch
# and the model is trained to predict them.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

training_args = TrainingArguments(
    output_dir="./models/esperanto-bert-v1",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_gpu_train_batch_size=64,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)