In [1]:
pip install transformers datasets torch

Note: you may need to restart the kernel to use updated packages.


In [11]:
import warnings
warnings.filterwarnings('ignore')

In [12]:
import torch
from transformers import RobertaTokenizer, DistilBertForMaskedLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import load_dataset

In [13]:
dataset = load_dataset("eli5_category", split='train[:100]')

In [14]:
dataset[0]

{'q_id': '5lchat',
 'title': "Why there was a 'leap second' added to the end of 2016?",
 'selftext': '',
 'category': 'Other',
 'subreddit': 'explainlikeimfive',
 'answers': {'a_id': ['dbuoyxl', 'dbur7gi', 'dbuotht'],
  'text': ['the rotation of the earth is not a constant. in fact the rotation of the earth is slowing down, which means that a full day is getting slightly longer. without leap seconds our clocks would slowly drift ever so slightly out of sync with the actual day. we could deal with this by redefining how how long 1 second is, making it slightly longer so that one day is still exactly 24*60*60 seconds. but in practice that is really inconvenient for a lot of our technology which relies on very precise timing. its easier to just move us ahead one second every couple of years or so.',
   "The Earth's rotation is not regular. It varies a bit, so sometimes we add a second. We do this to ensure that noon is always going to be sometime around mid-day. If we did not add leap sec

In [15]:
tokenizer = RobertaTokenizer.from_pretrained("distilroberta-base")

def tokenize_function(examples):
    return tokenizer(examples["title"], padding="max_length", truncation=True, max_length=128)
tokenized_datasets = dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [17]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15
)

In [18]:
model =  DistilBertForMaskedLM.from_pretrained("distilroberta-base")

You are using a model of type roberta to instantiate a model of type distilbert. This is not supported for all configurations of models and can yield errors.
Some weights of DistilBertForMaskedLM were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.word_embeddings.weight', 'transformer.layer.0.attention.k_lin.bias', 'transformer.layer.0.attention.k_lin.weight', 'transformer.layer.0.attention.out_lin.bias', 'transformer.layer.0.attention.out_lin.weight', 'transformer.layer.0.attention.q_lin.bias', 'transformer.layer.0.attention.q_lin.weight', 'transformer.layer.0.attention.v_lin.bias', 'transformer.layer.0.attention.v_lin.weight', 'transformer.layer.0.ffn.lin1.bias', 'transformer.layer.0.ffn.lin1.weight', 'transformer.layer.0.ffn.lin2.bias', 'transformer.layer.0.ffn.lin2.weight', 'transformer.layer.0.output_layer_norm.bias', 'transf

In [19]:
pip install transformers[torch]




In [20]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

In [21]:
# Create trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    eval_dataset=tokenized_datasets,
    data_collator=data_collator,
)

In [22]:
# Train model
trainer.train()

  0%|          | 0/21 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 10.614102363586426, 'eval_runtime': 13.8808, 'eval_samples_per_second': 7.204, 'eval_steps_per_second': 0.504, 'epoch': 1.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 10.461077690124512, 'eval_runtime': 14.1902, 'eval_samples_per_second': 7.047, 'eval_steps_per_second': 0.493, 'epoch': 2.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 10.382346153259277, 'eval_runtime': 15.1862, 'eval_samples_per_second': 6.585, 'eval_steps_per_second': 0.461, 'epoch': 3.0}
{'train_runtime': 153.1832, 'train_samples_per_second': 1.958, 'train_steps_per_second': 0.137, 'train_loss': 10.606701078869047, 'epoch': 3.0}


TrainOutput(global_step=21, training_loss=10.606701078869047, metrics={'train_runtime': 153.1832, 'train_samples_per_second': 1.958, 'train_steps_per_second': 0.137, 'train_loss': 10.606701078869047, 'epoch': 3.0})

In [23]:
# Evaluate model
trainer.evaluate()

  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 10.434917449951172,
 'eval_runtime': 13.395,
 'eval_samples_per_second': 7.465,
 'eval_steps_per_second': 0.523,
 'epoch': 3.0}

In [27]:
def predict_masked_text(text):
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits

    # Get the predictions for the masked token
    mask_token_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]
    predicted_token_id = logits[0, mask_token_index].argmax(axis=-1)
    predicted_token = tokenizer.decode(predicted_token_id)

    # Replace the mask token with the predicted token
    filled_text = text.replace(tokenizer.mask_token, predicted_token)
    return filled_text

# Example usage
input_text = "Good morning <mask>."
output_text = predict_masked_text(input_text)
print(output_text)

Good morning  flavors.
