In [7]:
import torch
from datasets import load_dataset
from transformers import BertForSequenceClassification, BertTokenizer, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score

In [8]:
import torch
from datasets import load_dataset
from transformers import BertForSequenceClassification, BertTokenizer, Trainer, TrainingArguments
from datasets import load_metric
from sklearn.metrics import f1_score, classification_report
import numpy as np

model_name = 'textattack/bert-base-uncased-MNLI'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=3)  # MNLI has 3 labels: entailment, neutral, contradiction



In [9]:
from datasets import load_dataset

mnli = load_dataset('glue', 'mnli')

def preprocess(examples):
    return tokenizer(examples['premise'], examples['hypothesis'], truncation=True, padding='max_length', max_length=128)

encoded_dataset = mnli.map(preprocess, batched=True)

# Set the format for PyTorch
encoded_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

In [10]:
from transformers import Trainer, TrainingArguments
import numpy as np
from datasets import load_metric

# Define the metric and trainer
accuracy_metric = load_metric("accuracy")

def compute_metrics(p):
    predictions, labels = p
    preds = np.argmax(predictions, axis=1)
    acc = accuracy_metric.compute(predictions=preds, references=labels)
    f1_weighted = f1_score(labels, preds, average='weighted')
    f1_per_class = f1_score(labels, preds, average=None)
    return {'accuracy': acc['accuracy'], 'f1_weighted': f1_weighted, 'f1_per_class': f1_per_class}

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    per_device_eval_batch_size=16,   # batch size for evaluation
    do_train=False,                  # don't perform training
    do_eval=True,                    # perform evaluation
    logging_dir='./logs',            # directory for storing logs
)

trainer = Trainer(
    model=model,                     # the pre-trained model
    args=training_args,              # training arguments
    eval_dataset=encoded_dataset['validation_matched'], # evaluation dataset
    compute_metrics=compute_metrics  # evaluation metric function
)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [11]:
eval_result = trainer.evaluate()

  0%|          | 0/614 [00:00<?, ?it/s]

Trainer is attempting to log a value of "[0.04289277 0.3900935  0.24649659]" of type <class 'numpy.ndarray'> for key "eval/f1_per_class" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


In [12]:
acc = eval_result['eval_accuracy'] * 100
f1_weighted = eval_result['eval_f1_weighted'] * 100
f1_per_class = eval_result['eval_f1_per_class'] * 100

print(f"Accuracy: {acc:.1f}%")
print(f"Weighted F1 Score: {f1_weighted:.1f}%")
print(f"F1 Score for each class: {f1_per_class}")

Accuracy: 27.8%
Weighted F1 Score: 22.0%
F1 Score for each class: [ 4.28927681 39.00934951 24.64965864]
