The aim of this notebook is to fine tune a BERT pretrained model for text classification.

- **Model:** https://huggingface.co/bert-base-uncased
- **Dataset:** https://huggingface.co/datasets/ag_news

**Guide**: https://huggingface.co/docs/transformers/traininghttps://huggingface.co/docs/transformers/training

**Authors**

    - Tom Axberg (taxberg@kth.se)
    - Antonio Nieto (antonio.nieto@datatonic.com)

# Environment setup

!pip install transformers datasets numpy torch tensorflow ipywidgets

# Imports

In [2]:
import numpy as np
import random
import torch
import tensorflow as tf
from datasets import load_dataset, load_metric
from transformers.file_utils import is_tf_available, is_torch_available
from transformers import BertTokenizerFast, TFAutoModelForSequenceClassification, Trainer, DataCollatorWithPadding

import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


# Custom functions

In [3]:
def set_seed(seed: int):
    """
    Helper function for reproducible behavior to set the seed in ``random``, ``numpy``, ``torch`` and/or ``tf`` (if
    installed).

    Args:
        seed (:obj:`int`): The seed to set.
    """
    random.seed(seed)
    np.random.seed(seed)
    if is_torch_available():
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        # ^^ safe to call this function even if cuda is not available
    if is_tf_available():
        import tensorflow as tf

        tf.random.set_seed(seed)

set_seed(1)

# Parameters

In [4]:
dataset_name = 'ag_news'
num_targets = 4 
model_name = "bert-large-uncased"
max_length = 512

# Dataset

In [5]:
# Manually specify the number of unique targets
train_dataset = load_dataset(dataset_name, split="train[10%:]")
val_dataset = load_dataset(dataset_name, split="train[:10%]")
test_dataset = load_dataset(dataset_name, split="test")

Using custom data configuration default
Reusing dataset ag_news (/Users/tomaxberg/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548)
Using custom data configuration default
Reusing dataset ag_news (/Users/tomaxberg/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548)
Using custom data configuration default
Reusing dataset ag_news (/Users/tomaxberg/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548)


In [6]:
train_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 108000
})

In [7]:
train_dataset[0]

{'text': 'RocketInfo Partners with Canadian Press, Helps Nascar RocketInfo Partners with Canadian Press, Helps Nascar\\\\Rocketinfo Inc., news search engine announced yesterday that it has formed a key reseller alliance with the Canadian Press (CP), one of the top-rated multimedia news agencies in the world. CP plans to expand their media monitoring services by offering clients access to the ...',
 'label': 3}

# Tokenizer

In [8]:
# load the tokenizer (convert our text to sequence of tokens)
tokenizer = BertTokenizerFast.from_pretrained(model_name, do_lower_case=True)

In [9]:
# tokenize the dataset, truncate when passed 'max_length' and pad with 0's when less than 'max_length'
train_tokenized = train_dataset.map(lambda x: tokenizer(x['text'], truncation=True, padding='max_length'), batched=True)
val_tokenized = val_dataset.map(lambda x: tokenizer(x['text'], truncation=True, padding='max_length'), batched=True)
test_tokenized = test_dataset.map(lambda x: tokenizer(x['text'], truncation=True, padding='max_length'), batched=True)

Loading cached processed dataset at /Users/tomaxberg/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548/cache-dec63c70e6e50560.arrow
Loading cached processed dataset at /Users/tomaxberg/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548/cache-f71c81a33e474491.arrow
Loading cached processed dataset at /Users/tomaxberg/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548/cache-f97efe5561747d9d.arrow


In [10]:
train_tokenized

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 108000
})

# Prepare dataset

In [11]:
# get data in standard tf.data.Dataset and remove 'text' label as it is not longer needed
tf_train_dataset = train_tokenized.remove_columns(['text']).with_format('tensorflow')
tf_val_dataset = val_tokenized.remove_columns(['text']).with_format('tensorflow')
tf_test_dataset = test_tokenized.remove_columns(['text']).with_format('tensorflow')

# convert to tensors
train_features = {x: tf_train_dataset[x] for x in tokenizer.model_input_names}
train_tf_dataset = tf.data.Dataset.from_tensor_slices((train_features, tf_train_dataset["label"]))
train_tf_dataset = train_tf_dataset.shuffle(len(tf_train_dataset)).batch(8)

val_features = {x: tf_val_dataset[x] for x in tokenizer.model_input_names}
val_tf_dataset = tf.data.Dataset.from_tensor_slices((val_features, tf_val_dataset["label"]))
val_tf_dataset = val_tf_dataset.shuffle(len(val_tf_dataset)).batch(8)

test_features = {x: tf_test_dataset[x] for x in tokenizer.model_input_names}
test_tf_dataset = tf.data.Dataset.from_tensor_slices((test_features, tf_test_dataset["label"]))
test_tf_dataset = test_tf_dataset.shuffle(len(test_tf_dataset)).batch(8)

Metal device set to: AMD Radeon Pro 560

systemMemory: 16.00 GB
maxCacheSize: 2.00 GB



2022-05-15 13:24:49.799448: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-05-15 13:24:49.802553: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-05-15 13:24:49.803392: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [11]:
test_features['input_ids'][3426]

<tf.Tensor: shape=(512,), dtype=int64, numpy=
array([  101,  8130,  2115,  3819,  7473,  2072,  4671,  7473,  2126,
        2067,  1999,  2238,  1045,  4081,  2017,  2907,  2125,  2006,
        9343,  1037,  2047,  7473,  2127,  3001,  2007,  7473,  2072,
        4671, 12057,  1012,  1996,  2047,  2974,  2038,  1996,  4022,
        2000, 12099,  5335,  2836,  2138,  2009, 20736,  1996, 26202,
        2100,  2214,   102,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,   

# Train the model

In [21]:
# load the model (pre-trained weights)
model = TFAutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_targets)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=tf.metrics.SparseCategoricalAccuracy()
)

model.fit(train_tf_dataset, validation_data=val_tf_dataset, epochs=50)

Epoch 1/50


2022-04-22 13:40:19.051140: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Error: Canceled future for execute_request message before replies were done

In [17]:
model.save_pretrained(f"../models/{model_name}-trained")

# Load trained model

In [12]:
model = TFAutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_targets)
model.load_weights(f"../models/{model_name}-trained/tf_model.h5")

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  335141888 
                                                                 
 dropout_73 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  4100      
                                                                 
Total params: 335,145,988
Trainable params: 335,145,988
Non-trainable params: 0
_________________________________________________________________


## Test the model

In [27]:
input_text = "Fears for T N pension after talks Unions representing workers at Turner Newall say they are 'disappointed' after talks with stricken parent firm Federal Mogul."
# input_text = test_tf_dataset
input_text_tokenized = tokenizer.encode(input_text,
                                        truncation=True,
                                        padding=True,
                                        return_tensors="tf")
prediction = model(input_text_tokenized)
prediction_logits = prediction[0]
prediction_probs = tf.nn.softmax(prediction_logits,axis=1).numpy()
print(f'The predicted label is: {np.argmax(prediction_probs)}')
test_tf_dataset

The predicted label is: 2


<BatchDataset shapes: ({input_ids: (None, 512), token_type_ids: (None, 512), attention_mask: (None, 512)}, (None,)), types: ({input_ids: tf.int64, token_type_ids: tf.int64, attention_mask: tf.int64}, tf.int64)>

In [26]:
from transformers import TrainingArguments

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
training_args = TrainingArguments("test-trainer")
trainer = Trainer(
    model,
    training_args,
    train_dataset=train_dataset,
    eval_dataset=val_tf_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


AttributeError: 'TFBertForSequenceClassification' object has no attribute 'to'

In [27]:
from tensorflow.python.framework.convert_to_constants import convert_variables_to_constants_v2_as_graph


def get_flops(model):
    concrete = tf.function(lambda inputs: model(inputs))
    concrete_func = concrete.get_concrete_function(
        [tf.TensorSpec([1, *inputs.shape[1:]]) for inputs in model.inputs])
    frozen_func, graph_def = convert_variables_to_constants_v2_as_graph(
        concrete_func)
    with tf.Graph().as_default() as graph:
        tf.graph_util.import_graph_def(graph_def, name='')
        run_meta = tf.compat.v1.RunMetadata()
        opts = tf.compat.v1.profiler.ProfileOptionBuilder.float_operation()
        flops = tf.compat.v1.profiler.profile(
            graph=graph, run_meta=run_meta, cmd="op", options=opts)
        return flops.total_float_ops


print(f"GFLOPS: {get_flops(model) / 10 ** 9:.06} ")


TypeError: 'NoneType' object is not iterable

In [5]:
flops, params = profile(model, inputs, verbose=False)
flops 

NameError: name 'profile' is not defined

### Roc and stuff

In [17]:
%pip install sklearn seaborn

[31mERROR: Could not find a version that satisfies the requirement sklearn (from versions: none)[0m
[31mERROR: No matching distribution found for sklearn[0m
Note: you may need to restart the kernel to use updated packages.


In [14]:
from tqdm import tqdm
from tqdm import tqdm_notebook
from sklearn.metrics import auc
from sklearn.metrics import classification_report
import seaborn as sns
import matplotlib.pyplot as plt

mpl.rcParams['figure.figsize'] = (12, 10)
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']

def plot_loss(history):
# Use a log scale to show the wide range of values.
    plt.semilogy(history.epoch,  history.history['loss'],
               color='red', label='Train Loss')
    plt.semilogy(history.epoch,  history.history['val_loss'],
          color='green', label='Val Loss',
          linestyle="--")
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
  
    plt.legend()
    
    
def plot_metrics(history):
    metrics =  ['loss', 'auc', 'precision', 'recall']
    for n, metric in enumerate(metrics):
        name = metric.replace("_"," ").capitalize()
        plt.subplot(2,2,n+1)
        plt.plot(history.epoch,  history.history[metric], color=colors[0], label='Train')
        plt.plot(history.epoch, history.history['val_'+metric],
                 color=colors[0], linestyle="--", label='Val')
        plt.xlabel('Epoch')
        plt.ylabel(name)
        if metric == 'loss':
            plt.ylim([0, plt.ylim()[1]])
        elif metric == 'auc':
            plt.ylim([0.8,1])
        else:
            plt.ylim([0,1])

        plt.legend()

ModuleNotFoundError: No module named 'sklearn'

In [None]:
def plot_cm(y_true, y_pred, title):
    ''''
    input y_true-Ground Truth Labels
          y_pred-Predicted Value of Model
          title-What Title to give to the confusion matrix
    
    Draws a Confusion Matrix for better understanding of how the model is working
    
    return None
    
    '''
    
    figsize=(10,10)
    cm = confusion_matrix(y_true, y_pred, labels=np.unique(y_true))
    cm_sum = np.sum(cm, axis=1, keepdims=True)
    cm_perc = cm / cm_sum.astype(float) * 100
    annot = np.empty_like(cm).astype(str)
    nrows, ncols = cm.shape
    for i in range(nrows):
        for j in range(ncols):
            c = cm[i, j]
            p = cm_perc[i, j]
            if i == j:
                s = cm_sum[i]
                annot[i, j] = '%.1f%%\n%d/%d' % (p, c, s)
            elif c == 0:
                annot[i, j] = ''
            else:
                annot[i, j] = '%.1f%%\n%d' % (p, c)
    cm = pd.DataFrame(cm, index=np.unique(y_true), columns=np.unique(y_true))
    cm.index.name = 'Actual'
    cm.columns.name = 'Predicted'
    fig, ax = plt.subplots(figsize=figsize)
    plt.title(title)
    sns.heatmap(cm, cmap= "YlGnBu", annot=annot, fmt='', ax=ax)

def roc_curve_plot(fpr,tpr,roc_auc):
    plt.figure()
    lw = 2
    plt.plot(fpr, tpr, color='darkorange',
             lw=lw, label='ROC curve (area = %0.2f)' %roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()

In [18]:
y_predict=model.predict(valid, verbose=1)
y_predict[ y_predict> 0.5] = 1
y_predict[y_predict <= 0.5] = 0
plot_cm(y_valid, y_predict, 'Distil BERT Performance-Confusion Matrix')

NameError: name 'valid' is not defined

In [None]:
y_predict_prob=model.predict(valid, verbose=1)
fpr, tpr, _ = roc_curve(y_valid,y_predict_prob)
roc_auc = auc(fpr, tpr)
roc_curve_plot(fpr,tpr,roc_auc)

In [13]:
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=tf.metrics.SparseCategoricalAccuracy()
)
result = model.evaluate(test_tf_dataset)
print("test loss, test acc:", result)

2022-05-15 13:25:15.971702: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.




In [None]:
result = model.evaluate(test_tf_dataset)

RuntimeError: You must compile your model before training/testing. Use `model.compile(optimizer, loss)`.

In [2]:
results = [0.25786691904067993, 0.9277631044387817]

In [4]:
print("test loss, test acc:", results)

test loss, test acc: [0.25786691904067993, 0.9277631044387817]


In [18]:
results = model(test_tf_dataset)

ValueError: Exception encountered when calling layer "tf_bert_for_sequence_classification" (type TFBertForSequenceClassification).

Data of type <class 'tensorflow.python.data.ops.dataset_ops.BatchDataset'> is not allowed only (<class 'tensorflow.python.framework.ops.Tensor'>, <class 'bool'>, <class 'int'>, <class 'transformers.utils.generic.ModelOutput'>, <class 'tuple'>, <class 'list'>, <class 'dict'>, <class 'numpy.ndarray'>, <class 'tensorflow.python.keras.engine.keras_tensor.KerasTensor'>) is accepted for input_ids.

Call arguments received:
  • input_ids=<BatchDataset element_spec=({'input_ids': TensorSpec(shape=(None, 512), dtype=tf.int64, name=None), 'token_type_ids': TensorSpec(shape=(None, 512), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(None, 512), dtype=tf.int64, name=None)}, TensorSpec(shape=(None,), dtype=tf.int64, name=None))>
  • attention_mask=None
  • token_type_ids=None
  • position_ids=None
  • head_mask=None
  • inputs_embeds=None
  • output_attentions=None
  • output_hidden_states=None
  • return_dict=None
  • labels=None
  • training=False

In [14]:
# from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
from sklearn.metrics import roc_auc_score

X, y = test_features['input_ids'].numpy(), results
# X, y = load_iris(return_X_y=True)
clf = LogisticRegression(solver="liblinear").fit(X, y)
roc_auc_score(y, clf.predict_proba(X), multi_class='ovr')

test_dataset['label'] 

ModuleNotFoundError: No module named 'sklearn'

In [30]:
input_text = "Fears for T N pension after talks Unions representing workers at Turner Newall say they are 'disappointed' after talks with stricken parent firm Federal Mogul."
# input_text = test_tf_dataset
input_text_tokenized = tokenizer.encode(input_text,
                                        truncation=True,
                                        padding=True,
                                        return_tensors="tf")

In [56]:
input_text_tokenized

<tf.Tensor: shape=(1, 32), dtype=int32, numpy=
array([[  101, 10069,  2005,  1056,  1050, 11550,  2044,  7566,  9209,
         5052,  3667,  2012,  6769,  2047,  8095,  2360,  2027,  2024,
         1005,  9364,  1005,  2044,  7566,  2007, 16654,  6687,  3813,
         2976,  9587, 24848,  1012,   102]], dtype=int32)>

In [57]:
test_features['input_ids'][1]

<tf.Tensor: shape=(512,), dtype=int64, numpy=
array([  101,  1996,  2679,  2003,  2006,  1024,  2117,  2797,  2136,
        4520,  4888,  3058,  2005,  2529,  2686, 28968,  1006,  2686,
        1012,  4012,  1007,  2686,  1012,  4012,  1011,  4361,  1010,
        2710,  1011,  1011,  1037,  2117,  1032,  2136,  1997,  7596,
       22862,  6637,  2005,  1996,  1001,  4029,  1025,  2184,  2454,
        2019, 22740,  1060,  3396,  1010,  1037,  5049,  2005,  1032,
        9139,  6787,  4942,  2953, 16313,  2389,  2686,  3462,  1010,
        2038,  3985,  2623,  1996,  2034,  1032,  4888,  3058,  2005,
        2049, 15371,  7596,  1012,   102,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,   

In [58]:
model(test_features['input_ids'][1])

ValueError: Exception encountered when calling layer "bert" (type TFBertMainLayer).

not enough values to unpack (expected 2, got 1)

Call arguments received:
  • input_ids=tf.Tensor(shape=(512,), dtype=int64)
  • attention_mask=None
  • token_type_ids=None
  • position_ids=None
  • head_mask=None
  • inputs_embeds=None
  • encoder_hidden_states=None
  • encoder_attention_mask=None
  • past_key_values=None
  • use_cache=None
  • output_attentions=False
  • output_hidden_states=False
  • return_dict=True
  • training=False

In [59]:
model(input_text_tokenized)[0]

<tf.Tensor: shape=(1, 4), dtype=float32, numpy=
array([[ 0.74708337, -1.6014335 ,  2.7002594 , -2.2938297 ]],
      dtype=float32)>

In [39]:
results = model.predict(test_tokenized['input_ids'])

2022-04-28 14:05:14.745709: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


In [1]:
tokenized_text = tokenizer.encode(req['instances'][i]['text'], truncation=True, padding=True, return_tensors="tf")
            class_names[np.argmax(tf.nn.softmax(model(tokenized_text)[0], axis=1).numpy())]
            result.append(class_names[np.argmax(tf.nn.softmax(model(tokenized_text)[0], axis=1).numpy())])

NameError: name 'results' is not defined

In [None]:
X, y = test_features['input_ids'].numpy(), test_dataset['label'] 
# X, y = load_iris(return_X_y=True)
clf = LogisticRegression(solver="liblinear").fit(X, y)
roc_auc_score(y, clf.predict_proba(X), multi_class='ovr')