# Fine-tuning

## load datasets

In [1]:
from datasets import load_dataset

raw_datasets = load_dataset("glue", "mrpc")
raw_datasets


DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

## Preprocessing (tokenization + padding)

In [2]:
from transformers import AutoTokenizer, DataCollatorWithPadding

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def tokenize_function(examples):
    return tokenizer(
        examples["sentence1"],
        examples["sentence2"],
        truncation=True)
tokenized_datasets = raw_datasets.map(
    tokenize_function,
    batched=True,
)
tokenized_datasets = tokenized_datasets.remove_columns(
    ["sentence1", "sentence2", "idx"]
)
tokenized_datasets = tokenized_datasets.rename_column(
    "label", "labels"
)


## Load the model

In [3]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Load the metrics

In [4]:
import evaluate

metric = evaluate.load("glue", "mrpc")

In [5]:
import numpy as np

def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

## Baseline results

In [6]:
# Test the current model on the validation set
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments("test-trainer", eval_strategy="epoch")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=raw_datasets["train"],
    eval_dataset=raw_datasets["validation"],
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    processing_class=tokenizer,
)

In [7]:
trainer.evaluate(tokenized_datasets["test"])



{'eval_loss': 0.6545491814613342,
 'eval_model_preparation_time': 0.0044,
 'eval_accuracy': 0.664927536231884,
 'eval_f1': 0.7987465181058496,
 'eval_runtime': 11.4425,
 'eval_samples_per_second': 150.754,
 'eval_steps_per_second': 18.877}

In [8]:
# Test the model on few examples of the test set
from transformers import pipeline
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)
examples = raw_datasets["test"][:10]

for sentence1, sentence2, lable in zip(
    examples["sentence1"], examples["sentence2"], examples["label"]
):
    print(f"Sentence 1: {sentence1}")
    print(f"Sentence 2: {sentence2}")
    print(f"Label: {lable}")
    print("Prediction:", classifier(f"{sentence1} [SEP] {sentence2}"))
    print()

Device set to use mps:0


Sentence 1: PCCW 's chief operating officer , Mike Butcher , and Alex Arena , the chief financial officer , will report directly to Mr So .
Sentence 2: Current Chief Operating Officer Mike Butcher and Group Chief Financial Officer Alex Arena will report to So .
Label: 1
Prediction: [{'label': 'LABEL_1', 'score': 0.5593006610870361}]

Sentence 1: The world 's two largest automakers said their U.S. sales declined more than predicted last month as a late summer sales frenzy caused more of an industry backlash than expected .
Sentence 2: Domestic sales at both GM and No. 2 Ford Motor Co. declined more than predicted as a late summer sales frenzy prompted a larger-than-expected industry backlash .
Label: 1
Prediction: [{'label': 'LABEL_1', 'score': 0.566108763217926}]

Sentence 1: According to the federal Centers for Disease Control and Prevention ( news - web sites ) , there were 19 reported cases of measles in the United States in 2002 .
Sentence 2: The Centers for Disease Control and Pre

## Fine-tuning

In [9]:
from transformers import Trainer, TrainingArguments

In [10]:
training_args = TrainingArguments("test-trainer", eval_strategy="epoch")

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)


In [11]:
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.384006,0.821078,0.877311
2,0.529100,0.42466,0.862745,0.902098
3,0.295300,0.653003,0.852941,0.896194




TrainOutput(global_step=1377, training_loss=0.3377254655072026, metrics={'train_runtime': 296.5983, 'train_samples_per_second': 37.101, 'train_steps_per_second': 4.643, 'total_flos': 405114969714960.0, 'train_loss': 0.3377254655072026, 'epoch': 3.0})

In [12]:

trainer.evaluate(tokenized_datasets["test"])




{'eval_loss': 0.6925812363624573,
 'eval_accuracy': 0.8504347826086956,
 'eval_f1': 0.8924103419516264,
 'eval_runtime': 9.7401,
 'eval_samples_per_second': 177.104,
 'eval_steps_per_second': 22.176,
 'epoch': 3.0}

In [13]:
# Test the model on few examples of the test set
from transformers import pipeline
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)
examples = raw_datasets["test"][:10]

for sentence1, sentence2, lable in zip(
    examples["sentence1"], examples["sentence2"], examples["label"]
):
    print(f"Sentence 1: {sentence1}")
    print(f"Sentence 2: {sentence2}")
    print(f"Label: {lable}")
    print("Prediction:", classifier(f"{sentence1} [SEP] {sentence2}"))
    print()

Device set to use mps:0


Sentence 1: PCCW 's chief operating officer , Mike Butcher , and Alex Arena , the chief financial officer , will report directly to Mr So .
Sentence 2: Current Chief Operating Officer Mike Butcher and Group Chief Financial Officer Alex Arena will report to So .
Label: 1
Prediction: [{'label': 'LABEL_0', 'score': 0.9873570203781128}]

Sentence 1: The world 's two largest automakers said their U.S. sales declined more than predicted last month as a late summer sales frenzy caused more of an industry backlash than expected .
Sentence 2: Domestic sales at both GM and No. 2 Ford Motor Co. declined more than predicted as a late summer sales frenzy prompted a larger-than-expected industry backlash .
Label: 1
Prediction: [{'label': 'LABEL_0', 'score': 0.9931199550628662}]

Sentence 1: According to the federal Centers for Disease Control and Prevention ( news - web sites ) , there were 19 reported cases of measles in the United States in 2002 .
Sentence 2: The Centers for Disease Control and Pr

In [14]:
classifier.save_pretrained("mrpc-bert-base-uncased")