#Deep Learning for Natural Language Processing
##Chapter 10  Applications of Transformers: Hands-on with BERT

BERT in TensorFlow Hub: https://tfhub.dev/google/collections/bert/1

In [78]:
import tensorflow as tf
#!pip install keras-bert
from keras_bert import gen_batch_inputs, get_base_dict
from tensorflow import keras
from keras_bert import get_model, compile_model
import numpy as np

In [43]:
!pip install transformers
from transformers import TFBertModel, BertTokenizer
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

Collecting transformers
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.17.3-py3-none-any.whl (295 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m33.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m36.6 MB/s[0m eta [36m0:00:00[0m
Insta

###Listing 10.1 A dedicated Keras layer for BERT models

In [7]:
class BertLayer(tf.keras.layers.Layer):

    def __init__(
        self,
        n_fine_tune_layers=12,
        bert_path=
        "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1",
        **kwargs
    ):
        self.n_fine_tune_layers = n_fine_tune_layers
        self.trainable = True
        self.output_size = 768
        self.bert_path = bert_path

        super(BertLayer, self).__init__(**kwargs)


    def build(self, input_shape):
        self.bert = hub.Module(
            self.bert_path,
            trainable=self.trainable,
            name=f"{self.name}_module"
        )
        trainable_vars = self.bert.variables
        trainable_vars = [var for var in trainable_vars if not "/cls/" in var.name]
        trainable_vars = trainable_vars[-self.n_fine_tune_layers :]

        for var in trainable_vars:
            self._trainable_weights.append(var)

        for var in self.bert.variables:
            if var not in self._trainable_weights:
                self._non_trainable_weights.append(var)

        super(BertLayer, self).build(input_shape)

    def call(self, inputs):
        inputs = [K.cast(x, dtype="int32") for x in inputs]

        input_ids, input_mask, segment_ids = inputs

        bert_inputs = dict(
            input_ids=input_ids, input_mask=input_mask,
            segment_ids=segment_ids
        )
        result = self.bert(inputs=bert_inputs, signature="tokens",
        as_dict=True)[
            "sequence_output"
        ]
        return result

    def compute_output_shape(self, input_shape):
        return (input_shape[0], self.output_size)

###Listing 10.2 Processing input data for BERT

In [8]:
def readSentencePairs(fn):
    with open(fn) as f:
        lines = f.readlines()

    pairs=zip(lines, lines[1:])
    paired_sentences=[[a.rstrip().split(),b.rstrip().split()]
    for (a,b) in pairs]

    tokenD = get_base_dict()

    for pairs in paired_sentences:
        for token in pairs[0] + pairs[1]:
            if token not in tokenD:
                tokenD[token] = len(tokenD)
    tokenL = list(tokenD.keys())
    return (paired_sentences,tokenD,tokenL)

###Listing 10.3 Generating batch data for BERT

In [11]:
# use !pip install keras-bert
def BertGenerator(paired_sentences, tokenD, tokenL):
    while True:
        yield gen_batch_inputs(
            paired_sentences,
            tokenD,
            tokenL,
            seq_len=200,
            mask_rate=0.3,
            swap_sentence_rate=0.5,
        )

###Listing 10.4 Training a proprietary BERT model on data

In [15]:
def buildBertModel(paired_sentences,tokenD,tokenL, model_path):
    model = get_model(
        token_num=len(tokenD),
        head_num=5,
        transformer_num=12,
        embed_dim=256,
        feed_forward_dim=100,
        seq_len=200,
        pos_num=200,
        dropout_rate=0.05
    )
    compile_model(model)

    model.fit_generator(
        generator=BertGenerator(paired_sentences,tokenD,tokenL),
        steps_per_epoch=100,
        epochs=10
    )
    model.save(model_path)

In [68]:
# buildBertModel2 is modified by ChatGPT from buildBertModel
def buildBertModel2(paired_sentences, tokenD, tokenL, model_path):
    # Load a pre-trained BERT model and tokenizer
    model_name = "bert-base-uncased"
    tokenizer = BertTokenizer.from_pretrained(model_name)
    bert_model = TFBertModel.from_pretrained(model_name)

    # Define your custom layers for fine-tuning
    input_ids = Input(shape=(None,), dtype=tf.int32, name="input_ids")
    outputs = bert_model(input_ids)
    pooled_output = outputs[1]  # Use the pooled output for classification
    dense_layer = Dense(2, activation="softmax")(pooled_output)  # Example: Binary classification

    # Create the custom model
    custom_model = Model(inputs=input_ids, outputs=dense_layer)

    # Compile the model
    optimizer = Adam(learning_rate=1e-5)
    custom_model.compile(optimizer=optimizer, loss="categorical_crossentropy", metrics=["accuracy"])

    # Tokenize your input data (paired_sentences) and prepare it for training
    input_data = tokenizer(paired_sentences, padding=True, truncation=True, return_tensors="tf")
    labels = tokenL  # Assuming tokenL contains labels for classification

    # Train your model
    custom_model.fit(input_data, labels, epochs=10, batch_size=32)  # Adjust batch_size as needed

    # Save the model
    custom_model.save(model_path)

# Example usage:
# buildBertModel(paired_sentences, tokenD, tokenL, "./bert_model")

In [31]:
!pwd

/content


In [80]:
#upload The Cask of Amontillado.txt as sample
sentences="The Cask of Amontillado.txt"
(paired_sentences,tokenD,tokenL)=readSentencePairs(sentences)

print(paired_sentences)
print(tokenD)
print(tokenL)
print(len(tokenL))

[[['"The', 'Cask', 'of', 'Amontillado"', 'is', 'a', 'short', 'story', 'by', 'Edgar', 'Allan', 'Poe.', 'It', 'tells', 'the', 'tale', 'of', 'Montresor,', 'who', 'seeks', 'revenge', 'against', 'Fortunato', 'for', 'some', 'unknown', 'offense.', 'Montresor', 'is', 'a', 'cunning', 'character', 'who', 'lures', 'Fortunato', 'into', 'the', 'catacombs', 'of', 'his', 'family', 'estate.'], []], [[], ['Fortunato', 'is', 'known', 'for', 'his', 'expertise', 'in', 'wine.', 'He', 'takes', 'pride', 'in', 'his', 'connoisseurship', 'and', 'is', 'always', 'on', 'the', 'lookout', 'for', 'rare', 'and', 'valuable', 'wines.', 'This', 'weakness', 'becomes', 'his', 'downfall', 'when', 'Montresor', 'uses', "Fortunato's", 'passion', 'for', 'wine', 'to', 'trap', 'him.']], [['Fortunato', 'is', 'known', 'for', 'his', 'expertise', 'in', 'wine.', 'He', 'takes', 'pride', 'in', 'his', 'connoisseurship', 'and', 'is', 'always', 'on', 'the', 'lookout', 'for', 'rare', 'and', 'valuable', 'wines.', 'This', 'weakness', 'becomes

In [86]:
#upload The Cask of Amontillado.txt as sample
sentences1="The Cask of Amontillado1.txt"
(paired_sentences1,tokenD,tokenL)=readSentencePairs(sentences1)

print(paired_sentences1)
print(tokenD)
print(tokenL)
print(len(tokenL))

[[['He', 'had', 'a', 'weak', 'point', '—', 'this', 'Fortunato', '—', 'although', 'in', 'other', 'regards', 'he', 'was', 'a', 'man', 'to', 'be', 'respected', 'and', 'even', 'feared.', 'He', 'prided', 'himself', 'on', 'his', 'connoisseurship', 'in', 'wine.', 'Few', 'Italians', 'have', 'the', 'true', 'virtuoso', 'spirit.', 'For', 'the', 'most', 'part', 'their', 'enthusiasm', 'is', 'adopted', 'to'], ['suit', 'the', 'time', 'and', 'opportunity—to', 'practise', 'imposture', 'upon', 'the', 'British', 'and', 'Austrian.']], [['suit', 'the', 'time', 'and', 'opportunity—to', 'practise', 'imposture', 'upon', 'the', 'British', 'and', 'Austrian.'], ['millionaires']]]
{'': 0, '[UNK]': 1, '[CLS]': 2, '[SEP]': 3, '[MASK]': 4, 'He': 5, 'had': 6, 'a': 7, 'weak': 8, 'point': 9, '—': 10, 'this': 11, 'Fortunato': 12, 'although': 13, 'in': 14, 'other': 15, 'regards': 16, 'he': 17, 'was': 18, 'man': 19, 'to': 20, 'be': 21, 'respected': 22, 'and': 23, 'even': 24, 'feared.': 25, 'prided': 26, 'himself': 27, 'on

In [89]:
model_path="./bert.model"
paired_sentences1 = paired_sentences1[0]
buildBertModel2(paired_sentences1,tokenD,tokenL,model_path)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

ValueError: ignored

In [81]:
print(type(paired_sentences))
print(type(paired_sentences[1]))
print(type(paired_sentences[1][1]))
print(type(paired_sentences[1][1][1]))

<class 'list'>
<class 'list'>
<class 'list'>
<class 'str'>


In [82]:
paired_sentences = paired_sentences[0]

In [83]:
# Assuming tokenL is a list of tokens
# Define labels for sentence classification
# In this example, each sentence is assigned a label, and 'O' represents 'Other' or 'Not Classified'
labels = ['O']  # Initialize labels with 'Other' for the first token
current_label = 'O'  # Current label

for token in tokenL[1:]:  # Start from the second token since the first token is ''
    if token in ['[CLS]', '[SEP]']:
        current_label = 'O'  # Reset label for new sentence
    else:
        labels.append(current_label)  # Assign the current label to the token

# Now labels contains a label for each token in your text, including 'O' for tokens not belonging to a sentence

In [85]:
model_path="./bert.model"
labels = np.array(tokenL)
buildBertModel2(paired_sentences,tokenD,tokenL,model_path)  #tokenL -> labels

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

ValueError: ignored