# Finetuning Bert

In [1]:
!pip install datasets



In [2]:
import tensorflow as tf
from transformers import TFAutoModel, AutoTokenizer
from datasets import load_dataset

In [3]:
model = TFAutoModel.from_pretrained("bert-base-uncased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT

In [4]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")



In [5]:
inputs = tokenizer(['Hello world', 'Hi how are you'], padding=True, truncation=True,
                  return_tensors='tf')
inputs

{'input_ids': <tf.Tensor: shape=(2, 6), dtype=int32, numpy=
array([[ 101, 7592, 2088,  102,    0,    0],
       [ 101, 7632, 2129, 2024, 2017,  102]], dtype=int32)>, 'token_type_ids': <tf.Tensor: shape=(2, 6), dtype=int32, numpy=
array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(2, 6), dtype=int32, numpy=
array([[1, 1, 1, 1, 0, 0],
       [1, 1, 1, 1, 1, 1]], dtype=int32)>}

In [6]:
output = model(inputs)
output

TFBaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=<tf.Tensor: shape=(2, 6, 768), dtype=float32, numpy=
array([[[-0.16888303,  0.13606322, -0.13940069, ..., -0.62511253,
          0.05217256,  0.36714554],
        [-0.36327457,  0.14121902,  0.8799883 , ...,  0.10432974,
          0.28875703,  0.3726792 ],
        [-0.69859457, -0.69879764,  0.06450184, ..., -0.22103673,
          0.00986813, -0.59397924],
        [ 0.8309828 ,  0.12366725, -0.15119071, ...,  0.10309596,
         -0.67792666, -0.26285237],
        [-0.40266633, -0.01928249,  0.5732501 , ..., -0.20656863,
          0.02338587,  0.201263  ],
        [-0.62284064, -0.27453512,  0.18117598, ..., -0.12944904,
         -0.03839107, -0.05733179]],

       [[ 0.09286551, -0.02636355, -0.12239316, ..., -0.2106356 ,
          0.17386383,  0.17250958],
        [ 0.40742028, -0.05930961,  0.55234647, ..., -0.6790565 ,
          0.65557384, -0.29456487],
        [-0.21155296, -0.6858637 , -0.46280813, ...,  0.15278494

In [7]:
emotions = load_dataset("gretelai/symptom_to_diagnosis")

In [8]:
emotions

DatasetDict({
    train: Dataset({
        features: ['output_text', 'input_text'],
        num_rows: 853
    })
    test: Dataset({
        features: ['output_text', 'input_text'],
        num_rows: 212
    })
})

In [9]:
# Modify the Dataset dict to be in the right format.

# Switch columns.

In [10]:
print(type(emotions['train']))

<class 'datasets.arrow_dataset.Dataset'>


In [11]:
print(type(emotions))

<class 'datasets.dataset_dict.DatasetDict'>


In [12]:
# I assume each dataset is like a list of dictionaries.
emotions['train'][0]

{'output_text': 'cervical spondylosis',
 'input_text': "I've been having a lot of pain in my neck and back. I've also been having trouble with my balance and coordination. I've been coughing a lot and my limbs feel weak."}

# Manipulating dataset

In [13]:
from datasets import load_dataset
from datasets import Dataset, DatasetDict

In [14]:
# Obtaining training dataset.

train_text = emotions['train']['input_text']
train_labels = emotions['train']['output_text']

unique_labels = list(set(train_labels))
label_to_id = {label: idx for idx, label in enumerate(unique_labels)}
numerical_labels = [label_to_id[label] for label in train_labels]


train_dict = {"text": train_text, "label": numerical_labels, "label_text": train_labels}
train_dataset = Dataset.from_dict(train_dict)
print(train_dataset)
print(train_dataset[0])

Dataset({
    features: ['text', 'label', 'label_text'],
    num_rows: 853
})
{'text': "I've been having a lot of pain in my neck and back. I've also been having trouble with my balance and coordination. I've been coughing a lot and my limbs feel weak.", 'label': 7, 'label_text': 'cervical spondylosis'}


In [15]:
# Obtaining testing dataset.

test_text = emotions['test']['input_text']
test_labels = emotions['test']['output_text']

unique_labels = list(set(test_labels))
label_to_id = {label: idx for idx, label in enumerate(unique_labels)}
test_numerical_labels = [label_to_id[label] for label in test_labels]


test_dict = {"text": test_text, "label": test_numerical_labels, "label_text": test_labels}
test_dataset = Dataset.from_dict(test_dict)
print(test_dataset)
print(test_dataset[0])

Dataset({
    features: ['text', 'label', 'label_text'],
    num_rows: 212
})
{'text': "I have a burning sensation in my stomach that comes and goes. It's worse when I eat and when I lie down. I also have heartburn and indigestion.", 'label': 5, 'label_text': 'peptic ulcer disease'}


In [16]:
# Combine the datasets into a DatasetDict
emotions = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})

# Print the DatasetDict to verify
print(emotions)

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 853
    })
    test: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 212
    })
})


# Finetuning continued

In [17]:
def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True)

In [18]:
emotions_encoded = emotions.map(tokenize, batched=True, batch_size=None)

Map:   0%|          | 0/853 [00:00<?, ? examples/s]

Map:   0%|          | 0/212 [00:00<?, ? examples/s]

In [19]:
emotions_encoded

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label_text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 853
    })
    test: Dataset({
        features: ['text', 'label', 'label_text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 212
    })
})

In [20]:
# setting 'input_ids', 'attention_mask', 'token_type_ids', and 'label'
# to the tensorflow format. Now if you access this dataset you will get these
# columns in `tf.Tensor` format

emotions_encoded.set_format('tf',
                            columns=['input_ids', 'attention_mask', 'token_type_ids', 'label'])
emotions_encoded

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label_text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 853
    })
    test: Dataset({
        features: ['text', 'label', 'label_text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 212
    })
})

In [21]:
# setting BATCH_SIZE to 64.
BATCH_SIZE = 64

def order(inp):
    '''
    This function will group all the inputs of BERT
    into a single dictionary and then output it with
    labels.
    '''
    data = list(inp.values())
    return {
        'input_ids': data[1],
        'attention_mask': data[2],
        'token_type_ids': data[3]
    }, data[0]

# converting train split of `emotions_encoded` to tensorflow format
train_dataset = tf.data.Dataset.from_tensor_slices(emotions_encoded['train'][:])
train_dataset

<_TensorSliceDataset element_spec={'label': TensorSpec(shape=(), dtype=tf.int64, name=None), 'input_ids': TensorSpec(shape=(76,), dtype=tf.int64, name=None), 'token_type_ids': TensorSpec(shape=(76,), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(76,), dtype=tf.int64, name=None)}>

In [22]:
# set batch_size and shuffle
train_dataset = train_dataset.batch(BATCH_SIZE).shuffle(1000)
train_dataset

<_ShuffleDataset element_spec={'label': TensorSpec(shape=(None,), dtype=tf.int64, name=None), 'input_ids': TensorSpec(shape=(None, 76), dtype=tf.int64, name=None), 'token_type_ids': TensorSpec(shape=(None, 76), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(None, 76), dtype=tf.int64, name=None)}>

In [23]:
# map the `order` function
train_dataset = train_dataset.map(order, num_parallel_calls=tf.data.AUTOTUNE)
train_dataset

<_ParallelMapDataset element_spec=({'input_ids': TensorSpec(shape=(None, 76), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(None, 76), dtype=tf.int64, name=None), 'token_type_ids': TensorSpec(shape=(None, 76), dtype=tf.int64, name=None)}, TensorSpec(shape=(None,), dtype=tf.int64, name=None))>

In [24]:
# ... doing the same for test set ...
test_dataset = tf.data.Dataset.from_tensor_slices(emotions_encoded['test'][:])
test_dataset = test_dataset.batch(BATCH_SIZE)
test_dataset = test_dataset.map(order, num_parallel_calls=tf.data.AUTOTUNE)

In [25]:
inp, out = next(iter(train_dataset)) # a batch from train_dataset
print(inp, '\n\n', out)

{'input_ids': <tf.Tensor: shape=(64, 76), dtype=int64, numpy=
array([[ 101, 1045, 1005, ...,    0,    0,    0],
       [ 101, 1045, 2031, ...,    0,    0,    0],
       [ 101, 1045, 1005, ...,    0,    0,    0],
       ...,
       [ 101, 1045, 2031, ...,    0,    0,    0],
       [ 101, 1045, 2031, ...,    0,    0,    0],
       [ 101, 1045, 2411, ...,    0,    0,    0]])>, 'attention_mask': <tf.Tensor: shape=(64, 76), dtype=int64, numpy=
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])>, 'token_type_ids': <tf.Tensor: shape=(64, 76), dtype=int64, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])>} 

 tf.Tensor(
[10  3 13  0  0 16 12 12 13  5 13 11 17 21  0  4 19 17  7  4  6

In [26]:
class BERTForClassification(tf.keras.Model):

    def __init__(self, bert_model, num_classes):
        super().__init__()
        self.bert = bert_model
        self.fc = tf.keras.layers.Dense(num_classes, activation='softmax')

    def call(self, inputs):
        x = self.bert(inputs)[1]
        return self.fc(x)

In [27]:
classifier = BERTForClassification(model, num_classes=22)

classifier.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy']
)

In [28]:
history = classifier.fit(
    train_dataset,
    epochs=20
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [29]:
classifier.evaluate(test_dataset)



[0.2796798348426819, 0.9433962106704712]