As the model is BERT-like, we’ll train it on a task of *Masked language modeling*, i.e. the predict how to fill arbitrary tokens that we randomly mask in the dataset. This is taken care of by the example script.

In [1]:
%run -r params.py

In [2]:
from transformers import RobertaConfig
config = RobertaConfig(
    vocab_size=4096, #52_000,
    max_position_embeddings=400, #514,
    num_attention_heads=6, #12,
    num_hidden_layers=3, #6,
    type_vocab_size=1,
)

In [3]:
#Now let's re-create our tokenizer in transformers
from transformers import RobertaTokenizerFast
tokenizer = RobertaTokenizerFast.from_pretrained(tokenizer_dir, max_len=512)

In [4]:
!pip3 install datasets -qqq
from datasets import load_dataset
from transformers import DataCollatorForLanguageModeling
paths = ['../data/04-19-2021-train-sparql.txt']
# using load_dataset to lazy load data
dataset = load_dataset("text", data_files=paths) #text defines the type

Using custom data configuration default-930acb72d43c1a20
Reusing dataset text (/home/shyaz/.cache/huggingface/datasets/text/default-930acb72d43c1a20/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5)


In [5]:
def encode(lines): return tokenizer(lines['text'], add_special_tokens=True, truncation=True, max_length=400)

In [6]:
dataset.set_transform(encode)
dataset = dataset['train']

In [7]:
#This is just a small helper that will help us batch different samples of the dataset together into an object that PyTorch knows how to perform backprop on.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15) #mlm=MaskedLM

In [8]:
from transformers import RobertaConfig

config = RobertaConfig(
    vocab_size=tokenizer.vocab_size,
    max_position_embeddings=400,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

In [9]:
# Check that PyTorch sees cuda
import torch
torch.cuda.is_available()

True

In [10]:
from transformers import RobertaForMaskedLM
model = RobertaForMaskedLM(config=config)

In [11]:
model.num_parameters()

45359054

In [12]:
from transformers import Trainer, TrainingArguments 

In [13]:
!mkdir {model_roberta_mlm} -p
training_args = TrainingArguments(
    output_dir=model_roberta_mlm,
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=10,
    save_steps=100,
    save_total_limit=2,
    prediction_loss_only=True,
    remove_unused_columns=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

In [14]:
%%time
trainer.train()

Step,Training Loss


CPU times: user 7.28 s, sys: 324 ms, total: 7.6 s
Wall time: 7.59 s


TrainOutput(global_step=41, training_loss=4.726131904415968, metrics={'train_runtime': 7.5641, 'train_samples_per_second': 5.42, 'total_flos': 9454641215760.0, 'epoch': 1.0})

#### 🎉 Save final model (+ tokenizer + config) to disk

In [15]:
trainer.save_model(model_roberta_mlm)