The aim of this notebook is to fine tune a BERT pretrained model for text classification.

- **Model:** https://huggingface.co/bert-base-uncased
- **Dataset:** https://huggingface.co/datasets/ag_news

**Guide**: https://huggingface.co/docs/transformers/traininghttps://huggingface.co/docs/transformers/training

**Authors**

    - Tom Axberg (taxberg@kth.se)
    - Antonio Nieto (antonio.nieto@datatonic.com)

# Environment setup

!pip install transformers datasets numpy torch tensorflow ipywidgets

# Imports

In [1]:
import numpy as np
import random
import torch
import tensorflow as tf
from datasets import load_dataset
from transformers.file_utils import is_tf_available, is_torch_available
from transformers import RobertaTokenizerFast, TFAutoModelForSequenceClassification #OBS

import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


# Custom functions

In [2]:
def set_seed(seed: int):
    """
    Helper function for reproducible behavior to set the seed in ``random``, ``numpy``, ``torch`` and/or ``tf`` (if
    installed).

    Args:
        seed (:obj:`int`): The seed to set.
    """
    random.seed(seed)
    np.random.seed(seed)
    if is_torch_available():
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        # ^^ safe to call this function even if cuda is not available
    if is_tf_available():
        import tensorflow as tf

        tf.random.set_seed(seed)

set_seed(1)

# Parameters

In [3]:
dataset_name = 'ag_news'
num_targets = 4 
model_name = "roberta-base"
max_length = 512


# Dataset

In [4]:
# Manually specify the number of unique targets
train_dataset = load_dataset(dataset_name, split="train[10%:]")
val_dataset = load_dataset(dataset_name, split="train[:10%]")
test_dataset = load_dataset(dataset_name, split="test")

Using custom data configuration default
Reusing dataset ag_news (/Users/tomaxberg/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548)
Using custom data configuration default
Reusing dataset ag_news (/Users/tomaxberg/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548)
Using custom data configuration default
Reusing dataset ag_news (/Users/tomaxberg/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548)


In [5]:
train_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 108000
})

In [14]:
train_dataset[0]

{'text': 'RocketInfo Partners with Canadian Press, Helps Nascar RocketInfo Partners with Canadian Press, Helps Nascar\\\\Rocketinfo Inc., news search engine announced yesterday that it has formed a key reseller alliance with the Canadian Press (CP), one of the top-rated multimedia news agencies in the world. CP plans to expand their media monitoring services by offering clients access to the ...',
 'label': 3}

# Tokenizer

In [8]:
# load the tokenizer (convert our text to sequence of tokens)
tokenizer = RobertaTokenizerFast.from_pretrained(model_name, do_lower_case=True)

In [9]:
# tokenize the dataset, truncate when passed 'max_length' and pad with 0's when less than 'max_length'
train_tokenized = train_dataset.map(lambda x: tokenizer(x['text'], truncation=True, padding='max_length'), batched=True)
val_tokenized = val_dataset.map(lambda x: tokenizer(x['text'], truncation=True, padding='max_length'), batched=True)
test_tokenized = test_dataset.map(lambda x: tokenizer(x['text'], truncation=True, padding='max_length'), batched=True)

Loading cached processed dataset at /Users/tomaxberg/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548/cache-05d0c53da221e9aa.arrow
Loading cached processed dataset at /Users/tomaxberg/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548/cache-911e8fa04cd1023a.arrow
Loading cached processed dataset at /Users/tomaxberg/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548/cache-f343d3eb7887bcbe.arrow


In [10]:
train_tokenized

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 108000
})

# Prepare dataset

In [11]:
# get data in standard tf.data.Dataset and remove 'text' label as it is not longer needed
tf_train_dataset = train_tokenized.remove_columns(['text']).with_format('tensorflow')
tf_val_dataset = val_tokenized.remove_columns(['text']).with_format('tensorflow')
tf_test_dataset = test_tokenized.remove_columns(['text']).with_format('tensorflow')

# convert to tensors
train_features = {x: tf_train_dataset[x] for x in tokenizer.model_input_names}
train_tf_dataset = tf.data.Dataset.from_tensor_slices((train_features, tf_train_dataset["label"]))
train_tf_dataset = train_tf_dataset.shuffle(len(tf_train_dataset)).batch(8)

val_features = {x: tf_val_dataset[x] for x in tokenizer.model_input_names}
val_tf_dataset = tf.data.Dataset.from_tensor_slices((val_features, tf_val_dataset["label"]))
val_tf_dataset = val_tf_dataset.shuffle(len(val_tf_dataset)).batch(8)

test_features = {x: tf_test_dataset[x] for x in tokenizer.model_input_names}
test_tf_dataset = tf.data.Dataset.from_tensor_slices((test_features, tf_test_dataset["label"]))
test_tf_dataset = test_tf_dataset.shuffle(len(test_tf_dataset)).batch(8)

Metal device set to: AMD Radeon Pro 560

systemMemory: 16.00 GB
maxCacheSize: 2.00 GB



2022-05-01 20:59:03.911377: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-05-01 20:59:03.911973: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-05-01 20:59:03.912249: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


# Train the model

In [15]:
# load the model (pre-trained weights)
model = TFAutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_targets)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=tf.metrics.SparseCategoricalAccuracy()
)

model.fit(train_tf_dataset, validation_data=val_tf_dataset, epochs=3)

Epoch 1/3


2022-02-15 07:45:04.366719: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 2/3


<keras.callbacks.History at 0x7fa379429a90>

In [17]:
model.save_pretrained(f"../models/{model_name}-trained")

# Load trained model

In [15]:
model = TFAutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_targets)
model.load_weights(f"../models/{model_name}-trained/tf_model.h5")

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Test the model

In [16]:
input_text = "Fears for T N pension after talks Unions representing workers at Turner Newall say they are 'disappointed' after talks with stricken parent firm Federal Mogul."
input_text_tokenized = tokenizer.encode(input_text,
                                        truncation=True,
                                        padding=True,
                                        return_tensors="tf")
prediction = model(input_text_tokenized)
prediction_logits = prediction[0]
prediction_probs = tf.nn.softmax(prediction_logits,axis=1).numpy()
print(f'The predicted label is: {np.argmax(prediction_probs)}')

The predicted label is: 0


In [29]:
model.save_pretrained(f"../models/{model_name}-trained")

In [13]:
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=tf.metrics.SparseCategoricalAccuracy()
)
result = model.evaluate(test_tf_dataset)
print("test loss, test acc:", result)

2022-05-01 20:59:22.217630: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


test loss, test acc: [1.386419415473938, 0.2499999850988388]
