In [1]:
# in this notebook we'll only get one of the files (the Oscar one) for the sake of simplicity and performance
!wget -c https://cdn-datasets.huggingface.co/EsperBERTo/data/oscar.eo.txt

--2020-07-23 07:52:10--  https://cdn-datasets.huggingface.co/EsperBERTo/data/oscar.eo.txt
Resolving cdn-datasets.huggingface.co (cdn-datasets.huggingface.co)... 99.86.181.14, 99.86.181.103, 99.86.181.72, ...
Connecting to cdn-datasets.huggingface.co (cdn-datasets.huggingface.co)|99.86.181.14|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 312733741 (298M) [text/plain]
Saving to: ‘oscar.eo.txt’


2020-07-23 07:52:20 (30.1 MB/s) - ‘oscar.eo.txt’ saved [312733741/312733741]



## Train Tokenizer BBPE

In [2]:
%%time 
from pathlib import Path

from tokenizers import ByteLevelBPETokenizer

paths = [str(x) for x in Path(".").glob("**/*.txt")]

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])

CPU times: user 22min 34s, sys: 2min 52s, total: 25min 26s
Wall time: 41.5 s


In [3]:
!mkdir EsperBERTo
tokenizer.save_model("EsperBERTo")

['EsperBERTo/vocab.json', 'EsperBERTo/merges.txt']

In [4]:
# Check that we have a GPU
!nvidia-smi

Thu Jul 23 07:53:29 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.33.01    Driver Version: 440.33.01    CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla V100-SXM2...  On   | 00000000:06:00.0 Off |                    0 |
| N/A   47C    P0   157W / 163W |  30531MiB / 32510MiB |    100%      Default |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-SXM2...  On   | 00000000:07:00.0 Off |                    0 |
| N/A   35C    P0    50W / 163W |   1647MiB / 32510MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   2  Tesla V100-SXM2...  On   | 00000000:0A:00.0 Off |                    0 |
| N/A   

## Roberta

In [5]:
from transformers import (RobertaConfig, RobertaTokenizerFast, RobertaForMaskedLM)



config = RobertaConfig(
    vocab_size=52_000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

tokenizer = RobertaTokenizerFast.from_pretrained("./EsperBERTo", max_len=512)


In [7]:
model = RobertaForMaskedLM(config=config)

In [8]:
model.num_parameters()
# => 84 million parameters

84095008

In [9]:
%%time
from transformers import LineByLineTextDataset

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="./oscar.eo.txt",
    block_size=128,
)

CPU times: user 5min 38s, sys: 29 s, total: 6min 7s
Wall time: 52 s


In [10]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [12]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="4,5,6,7"

In [13]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./EsperBERTo",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_gpu_train_batch_size=64,
    save_steps=10_000,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
    prediction_loss_only=True,
)

In [None]:
%%time
trainer.train()

Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.


HBox(children=(IntProgress(value=0, description='Epoch', max=1, style=ProgressStyle(description_width='initial…

HBox(children=(IntProgress(value=0, description='Iteration', max=1904, style=ProgressStyle(description_width='…