As the model is BERT-like, we’ll train it on the task of *masked language modeling*, i.e. predicting how to fill in arbitrary tokens that we randomly mask in the dataset. This is taken care of by the example script.

In [1]:
# Execute params.py so that the shared notebook parameters are available here.
# NOTE(review): `tokenizer_dir` and `model_roberta_mlm` are used below but never
# defined in this notebook, so they presumably come from params.py — confirm.
%run -r params.py

In [2]:
# Re-create our tokenizer as a fast `transformers` tokenizer.
from transformers import RobertaTokenizerFast

# `model_max_length` is the documented name for the tokenizer's length limit;
# the older `max_len` spelling is deprecated.
tokenizer = RobertaTokenizerFast.from_pretrained(tokenizer_dir, model_max_length=512)

In [3]:
from transformers import RobertaConfig

# Model architecture. The vocabulary size follows the tokenizer loaded above;
# the trailing comments keep the reference values noted next to each setting.
config = RobertaConfig(
    type_vocab_size=1,
    num_hidden_layers=6,              # 6
    num_attention_heads=12,           # 12
    max_position_embeddings=400,      # 514
    vocab_size=tokenizer.vocab_size,  # 52_000
)

In [4]:
!pip3 install datasets -qqq
from datasets import load_dataset
from transformers import DataCollatorForLanguageModeling
paths = ['../data/04-19-2021-train-sparql.txt', '../data/04-19-2021-test-sparql.txt']
# using load_dataset to lazy load data
dataset = load_dataset("text", data_files=paths) #text defines the type

Using custom data configuration default-bff6b200e91eadf5
Reusing dataset text (/home/shyaz/.cache/huggingface/datasets/text/default-bff6b200e91eadf5/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5)


In [5]:
def encode(lines):
    """Tokenize a batch of raw text lines for the model.

    Args:
        lines: batch mapping whose 'text' entry holds the raw strings.

    Returns:
        The tokenizer output for the batch, with special tokens added and each
        sequence truncated so that it fits the model's position-embedding table.
    """
    # RoBERTa numbers positions starting at pad_token_id + 1, so a model with
    # `max_position_embeddings` slots can only take sequences of
    # max_position_embeddings - pad_token_id - 1 tokens (400 - 1 - 1 = 398 with
    # the config used in this notebook). Truncating to the full 400 would index
    # past the end of the position-embedding table on long inputs.
    max_tokens = config.max_position_embeddings - config.pad_token_id - 1
    return tokenizer(lines['text'], add_special_tokens=True, truncation=True, max_length=max_tokens)

In [6]:
# Inspect the loaded splits, their columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 558
    })
})

In [7]:
# Register `encode` as the transform on the DatasetDict, then keep only its 'train' split.
# NOTE(review): this rebinds `dataset` from the DatasetDict to a single split, so
# re-running this cell without re-running the load cell will likely fail — confirm.
dataset.set_transform(encode)
dataset = dataset['train']

In [8]:
# Small helper that batches individual samples of the dataset together into an
# object that PyTorch knows how to perform backprop on.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,  # mlm = MaskedLM
    mlm_probability=0.15,
)

In [9]:
from transformers import RobertaConfig

# NOTE(review): this repeats the RobertaConfig cell earlier in the notebook with
# identical values; keep the two in sync or drop one of them.
config = RobertaConfig(
    type_vocab_size=1,
    num_hidden_layers=6,
    num_attention_heads=12,
    max_position_embeddings=400,
    vocab_size=tokenizer.vocab_size,
)

In [10]:
# Check that PyTorch sees CUDA; the cell displays the boolean result.
import torch
torch.cuda.is_available()

True

In [11]:
from transformers import RobertaForMaskedLM

# Build the masked-LM model from `config` alone; no checkpoint is loaded here.
model = RobertaForMaskedLM(config)

In [12]:
# Display the model's parameter count as a quick sanity check on the config.
model.num_parameters()

45359054

In [13]:
from transformers import Trainer, TrainingArguments 

In [20]:
import os

# Create the output directory from Python instead of shelling out to
# `mkdir ... -p`: this is portable (BSD/macOS `mkdir` does not accept options
# after the path) and safe for paths containing spaces.
os.makedirs(model_roberta_mlm, exist_ok=True)

training_args = TrainingArguments(
    output_dir=model_roberta_mlm,
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=10,
    save_steps=100,       # checkpoint interval, in training steps
    save_total_limit=2,   # cap on the number of checkpoints kept on disk
    prediction_loss_only=True,
    # NOTE(review): presumably required because the dataset's only stored column
    # is 'text' and tokenization happens in the `encode` transform — confirm.
    remove_unused_columns=False
)

# Wire the model, the MLM collator and the training split together.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

In [21]:
%%time
# Run the training loop; `%%time` reports this cell's CPU and wall-clock time.
trainer.train()

Step,Training Loss


CPU times: user 54.7 s, sys: 5.08 s, total: 59.7 s
Wall time: 59.5 s


TrainOutput(global_step=280, training_loss=3.1985850742885042, metrics={'train_runtime': 59.5195, 'train_samples_per_second': 4.704, 'total_flos': 19944013171368.0, 'epoch': 5.0})

#### 🎉 Save final model (+ tokenizer + config) to disk

In [22]:
# Write the trained model (weights + config) to the output directory.
trainer.save_model(model_roberta_mlm)
# The Trainer above was created without a `tokenizer` argument, so depending on
# the transformers version `save_model` may not write the tokenizer files.
# Save them explicitly so the directory holds model + tokenizer + config.
tokenizer.save_pretrained(model_roberta_mlm);

## View Results on Tensorboard

In [17]:
# Load the TensorBoard notebook extension (needed for the `%tensorboard` magic below)
%load_ext tensorboard

In [18]:
# Launch TensorBoard inline, reading event files from ./runs and binding to localhost.
# NOTE(review): assumes the Trainer wrote its logs under ./runs — confirm `logging_dir`.
%tensorboard --logdir runs  --host localhost 

In [19]:
# %tensorboard dev upload --logdir runs 