# BERT Fine-Tuning Tutorial with PyTorch

By Chris McCormick and Nick Ryan

*Revised on 12/13/19 to use the new [transformers](https://github.com/huggingface/transformers) interface.*




In [2]:
import torch
from transformers import BertTokenizer
from datasets import Dataset
import pathlib as pl
import os
os.chdir('/new-stg/home/banghua/Amazon-Rating-Prediction')
from load_dataset import load_dataset
from tqdm import tqdm
import pickle

if torch.cuda.is_available():
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name())
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

  from .autonotebook import tqdm as notebook_tqdm


There are 2 GPU(s) available.
We will use the GPU: NVIDIA RTX A6000


# Preparing Dataset

In [3]:
current_path = pl.Path.cwd()

train_path = current_path / 'dataset' / 'train.json.gz'
val_path = current_path / 'dataset' / 'val.json.gz'
test_path = current_path / 'dataset' / 'test.json.gz'

train_dataset = load_dataset(train_path)
val_dataset = load_dataset(val_path)
test_dataset = load_dataset(test_path)

Loading dataset from /new-stg/home/banghua/Amazon-Rating-Prediction/dataset/train.json.gz...
Loading dataset from /new-stg/home/banghua/Amazon-Rating-Prediction/dataset/val.json.gz...
Loading dataset from /new-stg/home/banghua/Amazon-Rating-Prediction/dataset/test.json.gz...


In [4]:
def get_review_text(entry):
    try:
        return entry['reviewText']
    except KeyError:
        return ''
    
def get_review_rating(entry):
    try:
        return entry['overall']
    except KeyError:
        return -1

# Get indices of entries with empty reviewText
empty_train_indices = [i for i, entry in enumerate(train_dataset) if get_review_text(entry) == '']
empty_val_indices = [i for i, entry in enumerate(val_dataset) if get_review_text(entry) == '']
empty_test_indices = [i for i, entry in enumerate(test_dataset) if get_review_text(entry) == '']

print('Number of empty reviewText in train dataset:', len(empty_train_indices))
print('Number of empty reviewText in val dataset:', len(empty_val_indices))
print('Number of empty reviewText in test dataset:', len(empty_test_indices))

# Filter out entries with empty reviewText or overall rating
train_dataset = [entry for i, entry in enumerate(train_dataset) if i not in empty_train_indices]
val_dataset = [entry for i, entry in enumerate(val_dataset) if i not in empty_val_indices]
test_dataset = [entry for i, entry in enumerate(test_dataset) if i not in empty_test_indices]

print('Number of entries in train dataset:', len(train_dataset))
print('Number of entries in val dataset:', len(val_dataset))
print('Number of entries in test dataset:', len(test_dataset))

Number of empty reviewText in train dataset: 2331
Number of empty reviewText in val dataset: 702
Number of empty reviewText in test dataset: 763
Number of entries in train dataset: 3923158
Number of entries in val dataset: 1307795
Number of entries in test dataset: 1307734


In [32]:
sentences_train = [get_review_text(entry) for entry in train_dataset]
sentences_val = [get_review_text(entry) for entry in val_dataset]
sentences_test = [get_review_text(entry) for entry in test_dataset]

print('Number of training sentences: {:,}'.format(len(sentences_train)))
print('Number of validation sentences: {:,}'.format(len(sentences_val)))
print('Number of testing sentences: {:,}'.format(len(sentences_test)))

Number of training sentences: 3,925,489
Number of validation sentences: 1,308,497
Number of testing sentences: 1,308,497


In [8]:
# Load the BERT tokenizer.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('LiYuan/amazon-review-sentiment-analysis')
# 17,280 rows of training set 4,320 rows of dev set. test set: 2,400 rows.

def get_encoded_sentences(sentences, file_path):
    if os.path.exists(file_path):
        print('Loading input_ids from {}'.format(file_path))
        encoded_sentences = torch.load(file_path)
        print('Loaded input_ids.')
    else:
        print('Encoding sentences...')
        encoded_sentences = tokenizer(sentences, add_special_tokens=True, padding=True, truncation=True, max_length=512, return_tensors='pt')
        print('Saving input_ids to {}'.format(file_path))
        torch.save(encoded_sentences, file_path)
        print('Saved input_ids.')
    return encoded_sentences

# Correct the path by expanding the tilde to the user's home directory
file_path_train = current_path / 'dataset' / 'input_ids' / 'input_ids_train.pickle'
file_path_valid = current_path / 'dataset' / 'input_ids' / 'input_ids_valid.pickle'
file_path_test = current_path / 'dataset' / 'input_ids' / 'input_ids_test.pickle'

encoded_sentences_train = get_encoded_sentences(sentences_train, file_path_train)
encoded_sentences_valid = get_encoded_sentences(sentences_val, file_path_valid)
encoded_sentences_test = get_encoded_sentences(sentences_test, file_path_test)

Loading BERT tokenizer...
Loading input_ids from /new-stg/home/banghua/Amazon-Rating-Prediction/dataset/input_ids/input_ids_train.pickle
Loaded input_ids.
Loading input_ids from /new-stg/home/banghua/Amazon-Rating-Prediction/dataset/input_ids/input_ids_valid.pickle
Loaded input_ids.
Loading input_ids from /new-stg/home/banghua/Amazon-Rating-Prediction/dataset/input_ids/input_ids_test.pickle
Loaded input_ids.


In [6]:
def build_dataset_dict(dataset, encoded_sentences):
    dataset_dict = {
        'orig_idx': [i for i in range(len(dataset))], # original index
        'input_ids': encoded_sentences['input_ids'],
        'attention_masks': encoded_sentences['attention_mask'],
        'labels': [entry['overall'] for entry in dataset]
    }
    return dataset_dict

def convert_to_list_from_big_dict(big_dict, big_dict_orig_idx, dataset):
    list_of_dict = []
    for i in range(len(big_dict['input_ids'])):
        list_of_dict.append({
            'orig_idx': big_dict_orig_idx[i],
            'input_ids': big_dict['input_ids'][i],
            'attention_masks': big_dict['attention_mask'][i],
            'labels': int(dataset[big_dict_orig_idx[i]]['overall'] - 1)
        })
    return list_of_dict

# Downsampling the dataset
import random
random.seed(42)

dataset_train_downsampled_orig_idx = random.sample(range(len(train_dataset)), 30000)
dataset_valid_downsampled_orig_idx = random.sample(range(len(val_dataset)), 10000)
dataset_test_downsampled_orig_idx = random.sample(range(len(test_dataset)), 10000)

encoded_sentences_train_downsampled = {key: value[dataset_train_downsampled_orig_idx] for key, value in encoded_sentences_train.items()}
encoded_sentences_valid_downsampled = {key: value[dataset_valid_downsampled_orig_idx] for key, value in encoded_sentences_valid.items()}
encoded_sentences_test_downsampled = {key: value[dataset_test_downsampled_orig_idx] for key, value in encoded_sentences_test.items()}

# Convert encoded_sentences_train_downsampled to list of entries
encoded_sentences_train_downsampled_list = convert_to_list_from_big_dict(encoded_sentences_train_downsampled, dataset_train_downsampled_orig_idx, train_dataset)
encoded_sentences_valid_downsampled_list = convert_to_list_from_big_dict(encoded_sentences_valid_downsampled, dataset_valid_downsampled_orig_idx, val_dataset)
encoded_sentences_test_downsampled_list = convert_to_list_from_big_dict(encoded_sentences_test_downsampled, dataset_test_downsampled_orig_idx, test_dataset)

# Build dataset
dataset_train_downsampled = Dataset.from_list(encoded_sentences_train_downsampled_list)
dataset_valid_downsampled = Dataset.from_list(encoded_sentences_valid_downsampled_list)
dataset_test_downsampled = Dataset.from_list(encoded_sentences_test_downsampled_list)

In [7]:
path_downsampled = current_path / 'downsampled_dataset'
# Save the dataset
dataset_train_downsampled.save_to_disk(path_downsampled / 'train')
dataset_valid_downsampled.save_to_disk(path_downsampled / 'valid')
dataset_test_downsampled.save_to_disk(path_downsampled / 'test')

Saving the dataset (1/1 shards): 100%|████████████████████████████████████████████| 30000/30000 [00:01<00:00, 16903.41 examples/s]
Saving the dataset (1/1 shards): 100%|████████████████████████████████████████████| 10000/10000 [00:00<00:00, 16295.33 examples/s]
Saving the dataset (1/1 shards): 100%|████████████████████████████████████████████| 10000/10000 [00:00<00:00, 16310.30 examples/s]


# Load Dataset

In [3]:
path_downsampled = current_path / 'downsampled_dataset'

# Load the dataset
dataset_train_downsampled = Dataset.load_from_disk(path_downsampled / 'train')
dataset_valid_downsampled = Dataset.load_from_disk(path_downsampled / 'valid')
dataset_test_downsampled = Dataset.load_from_disk(path_downsampled / 'test')

In [7]:
dataset_test_downsampled

Dataset({
    features: ['orig_idx', 'input_ids', 'attention_masks', 'labels'],
    num_rows: 10000
})

In [8]:
dataset_test_downsampled
samples_per_class = [0] * 5
for entry in dataset_train_downsampled["labels"]:
    samples_per_class[entry] += 1
samples_per_class

[2920, 1716, 2398, 3951, 19015]

# BERT Model

In [10]:
from transformers import AutoModelForSequenceClassification, AdamW

model = AutoModelForSequenceClassification.from_pretrained("LiYuan/amazon-review-sentiment-analysis", num_labels = 5)
model.cuda()

2023-12-02 14:16:41.609652: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-12-02 14:16:41.661668: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-02 14:16:41.661700: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-02 14:16:41.662872: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-02 14:16:41.670436: I tensorflow/core/platform/cpu_feature_guar

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(105879, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

## 4.2. Optimizer & Learning Rate Scheduler

In [19]:
metric_name = "accuracy"
print(metric_name)
model_name = "AmazonBERT"
print(model_name)

from transformers import Trainer, TrainingArguments
from balanced_loss import Loss

focal_loss = Loss(
    loss_type="focal_loss",
    samples_per_class=samples_per_class,
    class_balanced=True
)

batch_size = 32

args = TrainingArguments(
    f"{model_name}-finetuned",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=2,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    logging_steps=100,
)

from datasets import load_metric
import numpy as np

actual_task = "mnli"
metric = load_metric('glue', actual_task)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

accuracy
AmazonBERT


In [20]:
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels").to(torch.int64)
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # compute custom loss (suppose one has 3 labels with different weights)
        loss_fct = focal_loss(logits, labels)
        # loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss_fct, outputs) if return_outputs else loss_fct
    

trainer = CustomTrainer(
    model,
    args,
    train_dataset=dataset_test_downsampled,
    eval_dataset=dataset_valid_downsampled,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [21]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.309521,0.7521
2,No log,0.317563,0.7561


TrainOutput(global_step=314, training_loss=0.21982853883390974, metrics={'train_runtime': 390.0533, 'train_samples_per_second': 51.275, 'train_steps_per_second': 0.805, 'total_flos': 5262362849280000.0, 'train_loss': 0.21982853883390974, 'epoch': 2.0})

In [26]:
trainer.evaluate(dataset_train_downsampled)

{'eval_loss': 0.32204750180244446,
 'eval_accuracy': 0.7510666666666667,
 'eval_runtime': 134.8201,
 'eval_samples_per_second': 222.519,
 'eval_steps_per_second': 3.479,
 'epoch': 2.0}

In [27]:
trainer.evaluate(dataset_valid_downsampled)

{'eval_loss': 0.31756341457366943,
 'eval_accuracy': 0.7561,
 'eval_runtime': 43.9036,
 'eval_samples_per_second': 227.772,
 'eval_steps_per_second': 3.576,
 'epoch': 2.0}

In [24]:
trainer.evaluate(dataset_test_downsampled)

{'eval_loss': 0.17688967287540436,
 'eval_accuracy': 0.8633,
 'eval_runtime': 43.3153,
 'eval_samples_per_second': 230.865,
 'eval_steps_per_second': 3.625,
 'epoch': 2.0}