# Appendix: Introduction to using HF transformers

This appendix introduces the basics of using the Hugging Face transformers package, for the example task of fine-tuning an encoder-only transformer model, Electra (released by Google in 2020) to perform medical natural language inference.


References:

https://huggingface.co/docs/transformers/en/training

https://github.com/gregdurrett/fp-dataset-artifacts

- many portions of the code below were adapted from Greg Durrett's NLP course package.


In [20]:
import os
import json
from datetime import datetime
import numpy as np
import torch
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import Trainer, EvalPrediction, TrainingArguments
import datasets
import warnings
warnings.filterwarnings("ignore")


## Set paramaters

TrainingArguments
- num_train_epochs — Total number of training epochs to perform
- per_device_train_batch — try to make this as large as you can without getting CUDA out-of-memory errors
- eval_steps — Interval between two evaluations. Should be an integer as number of update steps, or a float in range [0,1) as ratio of total training steps.
- logging_steps — Number of update steps between two logs
- save_steps — Number of updates steps before two checkpoint saves.
- report_to — Platforms to report the results and logs to, such as "azure_ml", "clearml", "codecarbon", "comet_ml", "dagshub", "dvclive", "flyte", "mlflow", "neptune", "tensorboard", and "wandb".


In [2]:
# Pre-trained transformer model to use
model_id = 'google/electra-base-discriminator'

In [4]:
# Set parameters
from dataclasses import dataclass
@dataclass
class Args:
    model_id : str = model_id
    output_dir : str = os.path.join('models', model_id)
    eval_dir : str = os.path.join(output_dir,
                                  'logs',
                                  datetime.now().strftime("%Y%m%d-%H%M"))
    dataset : str = 'snli'
    max_length : int = 128 
    max_train_samples: int = -1  # limit number of examples to train on
    max_dev_samples: int = -1    # limit number of examples to validate with
    max_test_samples: int = -1   # limit number of examples to test on

In [5]:
# Parameters for this run
args = Args(dataset = dict(train='mednli/mli_train_v1.jsonl',
                           dev='mednli/mli_dev_v1.jsonl',
                           test='mednli/mli_test_v1.jsonl'))

In [6]:
# Parameters for the Trainer
training_args = TrainingArguments(
    output_dir=args.output_dir,
    do_train=True,
    do_eval=True,
    num_train_epochs=8.0,
    per_device_train_batch_size=32,
    evaluation_strategy='steps',
    save_steps= 0.2,          # checkpoint interval
    logging_steps = 0.1,      # logging interval
    eval_steps = 0.1,         # evaluation interval
    report_to="tensorboard",
)

## Load the model and tokenizer

Select to load model from HF hub, or previously checkpoint-saved folder
This should either be a HuggingFace model ID (see https://huggingface.co/models)
or a path to a saved model checkpoint (a folder containing
config.json and model.save_tensors)

Auto Classes automatically retrieve the relevant model architecture with the right model fine-tuning head.

The AutoTokenizer.from_pretrained() class method will be instantiated with one of the tokenizer classes for preparing inputs for a model. The library contains tokenizers for all the models.

In [7]:
# Select the model architecture
model_class = AutoModelForSequenceClassification

# Where to load model from
from_checkpoint = args.model_id     # load pre-trained from HF hub
#from_checkpoint = args.output_dir   # load from local folder
#from_checkpoint = os.path.join(args.output_dir, "checkpoint-6740")

In [8]:
model = model_class.from_pretrained(from_checkpoint, num_labels=3)

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
tokenizer = AutoTokenizer.from_pretrained(args.model_id, use_fast=True)

In [10]:
print(f"CUDA: {torch.cuda.memory_allocated()/1e9:.2f} GB")
# !nvidia-smi

CUDA: 0.00 GB


## Load dataset

NLI models need to have the output label count specified
(label 0 is "entailed", 1 is "neutral", and 2 is "contradiction")

If not default SNLI, need to format the json dataset appropriately,
with each line containing one example as follows:
{"premise": "Two women are embracing.", 
 "hypothesis": "The sisters are hugging.",
 "label": 1}

In [11]:
nli_labels = {'entailment': 0, 'neutral': 1, 'contradiction': 2}
if args.dataset == 'snli':
    dataset = datasets.load_dataset('snli')
else:
    dataset = datasets.DatasetDict()
    def prepare_label(ex):
        """To pre-process mednli dataset examples"""
        lab = ex['label']
        ex['label'] = nli_labels[lab] if lab in nli_labels else -1
        return ex
    
    for split in args.dataset.keys():
        print('Loading', split, ':', args.dataset[split], '...')
        
        # By default, "json" loader places all examples in the "train" split
        dataset[split] = datasets\
            .load_dataset('json', data_files=args.dataset[split])['train']\
            .rename_columns({'gold_label': 'label',
                             'sentence1': 'premise',
                             'sentence2': 'hypothesis'})\
            .select_columns(['label', 'premise', 'hypothesis'])\
            .map(prepare_label)                

Loading train : mednli/mli_train_v1.jsonl ...
Loading dev : mednli/mli_dev_v1.jsonl ...
Loading test : mednli/mli_test_v1.jsonl ...


In [12]:
# remove NLI examples with no label
dataset = dataset.filter(lambda ex: int(ex['label']) in nli_labels.values())
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'premise', 'hypothesis'],
        num_rows: 11232
    })
    dev: Dataset({
        features: ['label', 'premise', 'hypothesis'],
        num_rows: 1395
    })
    test: Dataset({
        features: ['label', 'premise', 'hypothesis'],
        num_rows: 1422
    })
})

## Prepare data

In [13]:
def prepare_dataset(examples, tokenizer=tokenizer, max_length=args.max_length):
    """Preprocess an NLI dataset, tokenizing premises and hypotheses"""
    max_length = max_length if max_length else tokenizer.model_max_length
    tokenized_examples = tokenizer(examples['premise'],
                                   examples['hypothesis'],
                                   truncation=True,
                                   max_length=max_length,
                                   padding='max_length')
    tokenized_examples['label'] = examples['label']
    return tokenized_examples

In [14]:
# Prepare train and dev set splits
if training_args.do_train:
    train_dataset = dataset['train']
    if args.max_train_samples > 0:
        train_dataset = train_dataset.select(range(args.max_train_samples))
    train_dataset_tokenized = \
        train_dataset.map(prepare_dataset,
                          batched=True,
                          remove_columns=train_dataset.column_names)
    dev_dataset = dataset['dev']
    if args.max_dev_samples > 0:
        dev_dataset = dev_dataset.select(range(args.max_dev_samples))
    dev_dataset_tokenized = \
        dev_dataset.map(prepare_dataset,
                        batched=True,
                        remove_columns=dev_dataset.column_names)
else:
    train_dataset = None
    train_dataset_tokenized = None
    dev_dataset = None
    dev_dataset_tokenized = None

In [15]:
# Prepare test set
if training_args.do_eval:
    test_dataset = dataset['test']
    if args.max_test_samples > 0:
        test_dataset = test_dataset.select(range(args.max_test_samples))
    test_dataset_tokenized = \
        test_dataset.map(prepare_dataset,
                         batched=True,
                         remove_columns=test_dataset.column_names)
else:
    test_dataset = None
    test_dataset_tokenized = None    

Map: 100%|██████████| 1422/1422 [00:00<00:00, 14380.02 examples/s]


## Train model

Trainer is a complete training and evaluation loop implemented in the Transformers library. You only need to pass it the necessary pieces for training: model, tokenizer, dataset, evaluation function, and hyperparameters.

If you want to use custom evaluation metrics, provide your own compute_metrics() function

If you want to customize the way the loss is computed, you should subclass Trainer and override the "compute_loss" method (see https://huggingface.co/transformers/_modules/transformers/trainer.html#Trainer.compute_loss).

You can also add training hooks using Trainer.add_callback: See https://huggingface.co/transformers/main_classes/trainer.html#transformers.Trainer.add_callback and https://huggingface.co/transformers/main_classes/callback.html#transformers.TrainerCallback


In [16]:
# If you want to use custom metrics, define your own "compute_metrics" function.
def compute_metrics(eval_prediction: EvalPrediction):
    """computes sentence-classification accuracy"""
    return {'accuracy': (np.argmax(eval_prediction.predictions, axis=1) ==
                         eval_prediction.label_ids).astype(np.float32).mean().item()}

In [17]:
# If you want to change how predictions are computed, you should
# subclass Trainer and override the "prediction_step" method
# (see https://huggingface.co/transformers/_modules/transformers/trainer.html#Trainer.prediction_step).
# If you do this your custom prediction_step should probably start by
# calling super().prediction_step and modifying the values that it returns
trainer = Trainer(model=model,
                  args=training_args,
                  train_dataset=train_dataset_tokenized,
                  eval_dataset=dev_dataset_tokenized,
                  tokenizer=tokenizer,
                  compute_metrics=compute_metrics)

In [18]:
if training_args.do_train:    
    print('Training the model...')
    trainer.train()
    trainer.save_model(args.output_dir)

Training the model...


 10%|█         | 281/2808 [02:47<25:40,  1.64it/s]

{'loss': 0.7606, 'grad_norm': 5.441277027130127, 'learning_rate': 4.4996438746438744e-05, 'epoch': 0.8}


                                                  
 10%|█         | 281/2808 [02:57<25:40,  1.64it/s]

{'eval_loss': 0.5451989769935608, 'eval_accuracy': 0.7820788621902466, 'eval_runtime': 10.0562, 'eval_samples_per_second': 138.72, 'eval_steps_per_second': 17.402, 'epoch': 0.8}


 20%|██        | 562/2808 [05:43<22:38,  1.65it/s]  

{'loss': 0.5005, 'grad_norm': 5.723459720611572, 'learning_rate': 3.999287749287749e-05, 'epoch': 1.6}


                                                  
 20%|██        | 562/2808 [05:52<22:38,  1.65it/s]Checkpoint destination directory models/google/electra-base-discriminator/checkpoint-562 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'eval_loss': 0.5031224489212036, 'eval_accuracy': 0.8057347536087036, 'eval_runtime': 9.6876, 'eval_samples_per_second': 143.998, 'eval_steps_per_second': 18.064, 'epoch': 1.6}


 30%|███       | 843/2808 [08:40<19:45,  1.66it/s]  

{'loss': 0.3853, 'grad_norm': 4.627773284912109, 'learning_rate': 3.498931623931624e-05, 'epoch': 2.4}


                                                  
 30%|███       | 843/2808 [08:50<19:45,  1.66it/s]

{'eval_loss': 0.5096697807312012, 'eval_accuracy': 0.8179211616516113, 'eval_runtime': 10.0252, 'eval_samples_per_second': 139.15, 'eval_steps_per_second': 17.456, 'epoch': 2.4}


 40%|████      | 1124/2808 [11:36<16:45,  1.68it/s] 

{'loss': 0.3047, 'grad_norm': 7.183603286743164, 'learning_rate': 2.9985754985754986e-05, 'epoch': 3.2}


                                                   
 40%|████      | 1124/2808 [11:46<16:45,  1.68it/s]Checkpoint destination directory models/google/electra-base-discriminator/checkpoint-1124 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'eval_loss': 0.5592992305755615, 'eval_accuracy': 0.8272401690483093, 'eval_runtime': 10.0634, 'eval_samples_per_second': 138.622, 'eval_steps_per_second': 17.39, 'epoch': 3.2}


 50%|█████     | 1405/2808 [14:35<14:01,  1.67it/s]  

{'loss': 0.2257, 'grad_norm': 7.553471088409424, 'learning_rate': 2.4982193732193735e-05, 'epoch': 4.0}


                                                   
 50%|█████     | 1405/2808 [14:45<14:01,  1.67it/s]

{'eval_loss': 0.5295200943946838, 'eval_accuracy': 0.8250896334648132, 'eval_runtime': 10.0387, 'eval_samples_per_second': 138.962, 'eval_steps_per_second': 17.432, 'epoch': 4.0}


 60%|██████    | 1686/2808 [17:32<11:12,  1.67it/s]  

{'loss': 0.1495, 'grad_norm': 1.6455395221710205, 'learning_rate': 1.997863247863248e-05, 'epoch': 4.8}


                                                   
 60%|██████    | 1686/2808 [17:42<11:12,  1.67it/s]Checkpoint destination directory models/google/electra-base-discriminator/checkpoint-1686 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'eval_loss': 0.6505078673362732, 'eval_accuracy': 0.832974910736084, 'eval_runtime': 10.056, 'eval_samples_per_second': 138.723, 'eval_steps_per_second': 17.403, 'epoch': 4.8}


 70%|███████   | 1967/2808 [20:31<08:28,  1.66it/s]  

{'loss': 0.114, 'grad_norm': 7.634776592254639, 'learning_rate': 1.4975071225071227e-05, 'epoch': 5.6}


                                                   
 70%|███████   | 1967/2808 [20:41<08:28,  1.66it/s]

{'eval_loss': 0.7827601432800293, 'eval_accuracy': 0.832974910736084, 'eval_runtime': 10.0688, 'eval_samples_per_second': 138.546, 'eval_steps_per_second': 17.38, 'epoch': 5.6}


 80%|████████  | 2248/2808 [23:25<05:31,  1.69it/s]

{'loss': 0.0954, 'grad_norm': 1.790868878364563, 'learning_rate': 9.971509971509972e-06, 'epoch': 6.4}


                                                   
 80%|████████  | 2248/2808 [23:35<05:31,  1.69it/s]Checkpoint destination directory models/google/electra-base-discriminator/checkpoint-2248 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'eval_loss': 0.8643640279769897, 'eval_accuracy': 0.832974910736084, 'eval_runtime': 9.8849, 'eval_samples_per_second': 141.125, 'eval_steps_per_second': 17.704, 'epoch': 6.4}


 90%|█████████ | 2529/2808 [26:21<02:45,  1.68it/s]

{'loss': 0.0653, 'grad_norm': 0.3601096272468567, 'learning_rate': 4.9679487179487175e-06, 'epoch': 7.21}


                                                   
 90%|█████████ | 2529/2808 [26:31<02:45,  1.68it/s]

{'eval_loss': 0.8337998390197754, 'eval_accuracy': 0.8365591168403625, 'eval_runtime': 9.8705, 'eval_samples_per_second': 141.331, 'eval_steps_per_second': 17.73, 'epoch': 7.21}


100%|██████████| 2808/2808 [29:13<00:00,  1.60it/s]


{'train_runtime': 1753.89, 'train_samples_per_second': 51.232, 'train_steps_per_second': 1.601, 'train_loss': 0.26511958046176837, 'epoch': 8.0}


## Evaluate model

accuracy_score()
- Accuracy: fraction of correct predictions

classification_report()
- Precision: number of true positives divided by the number of true positives plus the number of false positives
- Recall: number of true positives divided by the number of true positives plus the number of false negatives 
- F1-score: harmonic mean of precision and recall
- Macro average: averages the unweighted mean per label
- Weighted average: averages the support-weighted mean per label

confusion matrix()
- $C_{ij}$ is equal to the number of observations known to be in group $i$ and predicted to be in group $j$

In [21]:
if training_args.do_eval:
    print('Evaluating results:')
    predictions, _, results = trainer.predict(test_dataset=test_dataset_tokenized)
    print(results)

    os.makedirs(args.eval_dir, exist_ok=True)
    with open(os.path.join(args.eval_dir, 'metrics.json'),
              encoding='utf-8', mode='w') as f:
        json.dump(results, f)

    labels= {0: "entailed", 1: "neutral", 2: "contradiction"}
    with open(os.path.join(args.eval_dir, 'predictions.jsonl'),
              encoding='utf-8', mode='w') as f:
        y_true = list()
        for i, example in enumerate(test_dataset):
            ex = dict(example)
            ex['predicted_scores'] = predictions[i].tolist()
            ex['predicted_label'] = int(predictions[i].argmax())
            y_true.append(labels[ex['predicted_label']])
            f.write(json.dumps(ex))
            f.write('\n')

    y_pred = [labels[label] for label in test_dataset['label']]
    
    # Calculate accuracy
    accuracy = accuracy_score(y_true=y_true, y_pred=y_pred)
    print(f'Accuracy: {accuracy:.3f}')
                
    class_report = classification_report(y_true=y_true, y_pred=y_pred)                
    print('\nClassification Report:')
    print(class_report)

    # Generate confusion matrix                                                       
    conf_matrix = confusion_matrix(y_true=y_true, y_pred=y_pred)
    print('\nConfusion Matrix:')
    print(conf_matrix)



Evaluating results:


100%|██████████| 178/178 [00:10<00:00, 17.21it/s]


{'test_loss': 0.982203483581543, 'test_accuracy': 0.8087201118469238, 'test_runtime': 10.4103, 'test_samples_per_second': 136.596, 'test_steps_per_second': 17.098}
Accuracy: 0.809

Classification Report:
               precision    recall  f1-score   support

contradiction       0.87      0.89      0.88       462
     entailed       0.78      0.77      0.78       481
      neutral       0.77      0.76      0.77       479

     accuracy                           0.81      1422
    macro avg       0.81      0.81      0.81      1422
 weighted avg       0.81      0.81      0.81      1422


Confusion Matrix:
[[412  23  27]
 [ 28 372  81]
 [ 34  79 366]]
