In [None]:
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
    AutoModelForMaskedLM,
)


# Checkpoint to fine-tune; the tokenizer is loaded from the same identifier.
# Alternatives that can be swapped in (they apply to both names below):
#   "bert-base-uncased"
#   "roberta-base"
checkpoint = "albert/albert-base-v2"
tokenizer_checkpoint = checkpoint

# Hub dataset identifier; its "en" configuration is requested when loading.
dataset_name = "xu-song/cc100-samples"

# "train[:100%]" is a slice expression over the train split; at 100% it is the
# whole split, and a smaller percentage gives a quicker experiment.
dataset = load_dataset(dataset_name, name="en", split="train[:100%]")

tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

def tokenize_function(example):
    """Run the module-level ``tokenizer`` over the ``"text"`` entry of ``example``.

    The notebook passes this to ``dataset.map(..., batched=True)``, so
    ``example["text"]`` is whatever ``map`` supplies for that column.

    Truncation is switched on; no explicit ``max_length`` is given, so the
    limit is left to the tokenizer's own configuration.

    Returns:
        The tokenizer's output, unchanged.
    """
    texts = example["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

# Tokenize every row in batches. `map` adds the token columns next to the
# original "text" column (both appear in the printed features below).
dataset = dataset.map(tokenize_function, batched=True)

# Hold out 20% of the rows for evaluation. The seed is fixed so the same rows
# land in train/test after every kernel restart; without it the evaluation
# losses reported by separate runs are measured on different data.
SPLIT_SEED = 42
dataset = dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)

print(dataset)
# print(dataset["train"][0])

# Collator for the masked-language-modelling objective (mlm=True), configured
# with a masking probability of 0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)



Map: 100%|██████████| 10000/10000 [00:00<00:00, 51284.77 examples/s]


DatasetDict({
    train: Dataset({
        features: ['text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 8000
    })
    test: Dataset({
        features: ['text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2000
    })
})


In [9]:

# Load the masked-LM head on top of the configured checkpoint.
model = AutoModelForMaskedLM.from_pretrained(checkpoint)
# No explicit `model.cuda()` here: Trainer moves the model to the device
# selected by TrainingArguments (a GPU when one is available), and the
# unconditional call raised on machines without CUDA.

training_args = TrainingArguments(
    output_dir="mase-trainer",  # checkpoints and trainer state are written here
    report_to="none",           # no experiment-tracking integrations
    num_train_epochs=3,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
    # `processing_class` replaces the deprecated `tokenizer` keyword, which
    # emitted a deprecation warning on this constructor call.
    # NOTE(review): requires a transformers release that accepts
    # `processing_class`; confirm the pinned version.
    processing_class=tokenizer,
)

# Baseline: loss of the pretrained checkpoint on the held-out split, measured
# before any fine-tuning step.
eval_results = trainer.evaluate()
print(f"Evaluation loss: {eval_results['eval_loss']}")

trainer.train()

# Same metric after fine-tuning, for comparison with the baseline above.
eval_results = trainer.evaluate()
print(f"Evaluation loss: {eval_results['eval_loss']}")

Some weights of the model checkpoint at albert/albert-base-v2 were not used when initializing AlbertForMaskedLM: ['albert.pooler.bias', 'albert.pooler.weight']
- This IS expected if you are initializing AlbertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  trainer = Trainer(


Evaluation loss: 8.586492538452148


Step,Training Loss
500,3.1109
1000,2.9765
1500,2.696
2000,2.6389
2500,2.4322
3000,2.3444


Evaluation loss: 2.5266470909118652
