In [5]:
# Data preprocessing for trainer using Hugging Face Datasets and Transformers
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

raw_datasets = load_dataset("glue", "mrpc")
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(examples):
    return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

In [7]:
# Set up training arguments
from transformers import TrainingArguments
training_args = TrainingArguments(
    output_dir="../data/models/mrpc-model",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

In [8]:
# Set up the model
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Set up the trainer
# This will take a long time without GPU acceleration
from transformers import Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    processing_class=tokenizer,
    data_collator=data_collator,
)
trainer.train()

[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:[34m[1mwandb[0m: Enter your choice:[34m[1mwandb[0m: Enter your choice:[34m[1mwandb[0m: Enter your choice:[34m[1mwandb[0m: Enter your choice:[34m[1mwandb[0m: Enter your choice:[34m[1mwandb[0m: Enter your choice:[34m[1mwandb[0m: Enter your choice:[34m[1mwandb[0m: Enter your choice:


KeyboardInterrupt: 

In [None]:
# Eval
# predicitons is a namedtuple with three fields: predictions, label_ids, metrics
# predicitons.predictions is a numpy array of shape (num_examples, num_labels), logits
# predicitons.label_ids is a numpy array of shape (num_examples,), true labels
# predicitons.metrics is a dict of metrics, e.g., accuracy, f1, etc.

predictions = trainer.predict(tokenized_datasets["validation"])
print(predictions.predictions.shape, predictions.label_ids.shape)

In [None]:
# Transform logits to predicted labels
import numpy as np
preds = np.argmax(predictions.predictions, axis=-1)

In [None]:
import evaluate
metric = evaluate.load("glue", "mrpc")
metric.compute(predictions=preds, references=predictions.label_ids)

In [None]:
# Define our eval metric
def compute_metrics(eval_preds):
    metric = evaluate.load("glue", "mrpc")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [None]:
# Set up the trainer with the compute_metrics function
training_args = TrainingArguments(
    output_dir="../data/models/mrpc-model",
    eval_strategy="epoch")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer.train()