The aim of this notebook is to fine tune a distilBERT pretrained model for text classification.

- **Model:** https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english
- **Dataset:** https://huggingface.co/datasets/ag_news

**Guide**: https://huggingface.co/docs/transformers/traininghttps://huggingface.co/docs/transformers/training

**Authors**

    - Tom Axberg (taxberg@kth.se)
    - Antonio Nieto (antonio.nieto@datatonic.com)

# Environment setup

In [2]:
%pip install transformers datasets numpy torch tensorflow ipywidgets

Collecting datasets
  Downloading datasets-2.0.0-py3-none-any.whl (325 kB)
[K     |████████████████████████████████| 325 kB 13.7 MB/s eta 0:00:01
Collecting torch
  Downloading torch-1.11.0-cp37-none-macosx_10_9_x86_64.whl (129.9 MB)
[K     |████████████████████████████████| 129.9 MB 56.6 MB/s eta 0:00:01
Collecting ipywidgets
  Downloading ipywidgets-7.7.0-py2.py3-none-any.whl (123 kB)
[K     |████████████████████████████████| 123 kB 53.3 MB/s eta 0:00:01
Collecting fsspec[http]>=2021.05.0
  Downloading fsspec-2022.3.0-py3-none-any.whl (136 kB)
[K     |████████████████████████████████| 136 kB 58.7 MB/s eta 0:00:01
[?25hCollecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-macosx_10_9_x86_64.whl (570 kB)
[K     |████████████████████████████████| 570 kB 51.2 MB/s eta 0:00:01
[?25hCollecting pyarrow>=5.0.0
  Downloading pyarrow-7.0.0-cp37-cp37m-macosx_10_13_x86_64.whl (20.2 MB)
[K     |████████████████████████████████| 20.2 MB 16.5 MB/s eta 0:00:01     |██████████████████████

# Imports

In [6]:
import numpy as np
import random
import torch
import tensorflow as tf
from datasets import load_dataset
from transformers.file_utils import is_tf_available, is_torch_available
from transformers import DistilBertTokenizerFast, TFAutoModelForSequenceClassification

import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


# Custom functions

In [7]:
def set_seed(seed: int):
    """
    Helper function for reproducible behavior to set the seed in ``random``, ``numpy``, ``torch`` and/or ``tf`` (if
    installed).

    Args:
        seed (:obj:`int`): The seed to set.
    """
    random.seed(seed)
    np.random.seed(seed)
    if is_torch_available():
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        # ^^ safe to call this function even if cuda is not available
    if is_tf_available():
        import tensorflow as tf

        tf.random.set_seed(seed)

set_seed(1)

# Parameters

In [8]:
dataset_name = 'ag_news'
num_targets = 4 
model_name = "distilbert-base-uncased"
max_length = 512

# Dataset

In [9]:
# Manually specify the number of unique targets
train_dataset = load_dataset(dataset_name, split="train[10%:]")
val_dataset = load_dataset(dataset_name, split="train[:10%]")
test_dataset = load_dataset(dataset_name, split="test")

Using custom data configuration default
Reusing dataset ag_news (/Users/tomaxberg/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548)
Using custom data configuration default
Reusing dataset ag_news (/Users/tomaxberg/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548)
Using custom data configuration default
Reusing dataset ag_news (/Users/tomaxberg/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548)


In [10]:
train_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 108000
})

In [11]:
train_dataset[0]

{'text': 'RocketInfo Partners with Canadian Press, Helps Nascar RocketInfo Partners with Canadian Press, Helps Nascar\\\\Rocketinfo Inc., news search engine announced yesterday that it has formed a key reseller alliance with the Canadian Press (CP), one of the top-rated multimedia news agencies in the world. CP plans to expand their media monitoring services by offering clients access to the ...',
 'label': 3}

# Tokenizer

In [12]:
# load the tokenizer (convert our text to sequence of tokens)
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name, do_lower_case=True)

In [13]:
# tokenize the dataset, truncate when passed 'max_length' and pad with 0's when less than 'max_length'
train_tokenized = train_dataset.map(lambda x: tokenizer(x['text'], truncation=True, padding='max_length'), batched=True)
val_tokenized = val_dataset.map(lambda x: tokenizer(x['text'], truncation=True, padding='max_length'), batched=True)
test_tokenized = test_dataset.map(lambda x: tokenizer(x['text'], truncation=True, padding='max_length'), batched=True)

Loading cached processed dataset at /Users/tomaxberg/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548/cache-b7a2597aa4e1db63.arrow
Loading cached processed dataset at /Users/tomaxberg/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548/cache-b7b942247f78c386.arrow
Loading cached processed dataset at /Users/tomaxberg/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548/cache-a8da77fd8a03c41a.arrow


In [14]:
train_tokenized

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 108000
})

# Prepare dataset

In [15]:
# get data in standard tf.data.Dataset and remove 'text' label as it is not longer needed
tf_train_dataset = train_tokenized.remove_columns(['text']).with_format('tensorflow')
tf_val_dataset = val_tokenized.remove_columns(['text']).with_format('tensorflow')
tf_test_dataset = test_tokenized.remove_columns(['text']).with_format('tensorflow')

# convert to tensors
train_features = {x: tf_train_dataset[x] for x in tokenizer.model_input_names}
train_tf_dataset = tf.data.Dataset.from_tensor_slices((train_features, tf_train_dataset["label"]))
train_tf_dataset = train_tf_dataset.shuffle(len(tf_train_dataset)).batch(8)

val_features = {x: tf_val_dataset[x] for x in tokenizer.model_input_names}
val_tf_dataset = tf.data.Dataset.from_tensor_slices((val_features, tf_val_dataset["label"]))
val_tf_dataset = val_tf_dataset.shuffle(len(val_tf_dataset)).batch(8)

test_features = {x: tf_test_dataset[x] for x in tokenizer.model_input_names}
test_tf_dataset = tf.data.Dataset.from_tensor_slices((test_features, tf_test_dataset["label"]))
test_tf_dataset = test_tf_dataset.shuffle(len(test_tf_dataset)).batch(8)

2022-05-01 20:28:15.941187: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-05-01 20:28:15.947911: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-05-01 20:28:15.949346: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Metal device set to: AMD Radeon Pro 560

systemMemory: 16.00 GB
maxCacheSize: 2.00 GB



# Train the model

In [16]:
# load the model (pre-trained weights)
model = TFAutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_targets)

2022-05-01 20:28:20.794806: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.
Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_projector', 'vocab_transform', 'activation_13', 'vocab_layer_norm']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint 

In [17]:
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=tf.metrics.SparseCategoricalAccuracy()
)
model.fit(train_tf_dataset, validation_data=val_tf_dataset, epochs=3)

Epoch 1/3


2022-04-14 15:26:33.068825: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Error: Canceled future for execute_request message before replies were done

In [15]:
model.save_pretrained(f"../models/{model_name}-trained")

# Load trained model

In [18]:
model = TFAutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_targets)
model.load_weights(f"../models/{model_name}-trained/tf_model.h5")

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_projector', 'vocab_transform', 'activation_13', 'vocab_layer_norm']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier', 'dropout_39', 'classifier']
You should probably TRAIN this model on a down-stream task to be able to use i

In [5]:
# model.summary()

tf.config.list_logical_devices()

Metal device set to: AMD Radeon Pro 560

systemMemory: 16.00 GB
maxCacheSize: 2.00 GB



2022-04-14 10:05:56.029307: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-04-14 10:05:56.031735: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-04-14 10:05:56.032490: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


[LogicalDevice(name='/device:CPU:0', device_type='CPU'),
 LogicalDevice(name='/device:GPU:0', device_type='GPU')]

## Test the model

In [22]:
input_text = "Up until Python 3.6 a script called pyvenv was also included as a wrapper around venv, but this has been deprecated. It will be completely removed in Python 3.8. The exact same functionality is available when using venv, and any existing documentation should be updated. "
input_text_tokenized = tokenizer.encode(input_text,
                                        truncation=True,
                                        padding=True,
                                        return_tensors="tf")
prediction = model(input_text_tokenized)
prediction_logits = prediction[0]
prediction_probs = tf.nn.softmax(prediction_logits,axis=1).numpy()
print(f'The predicted label is: {np.argmax(prediction_probs)}')

The predicted label is: 3


In [29]:
model.save_pretrained(f"../models/{model_name}-trained")

In [19]:
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=tf.metrics.SparseCategoricalAccuracy()
)
result = model.evaluate(test_tf_dataset)
print("test loss, test acc:", result)

2022-05-01 20:28:53.678520: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


test loss, test acc: [0.21650908887386322, 0.9347367882728577]
