BERT training practice: using Keras and TensorFlow for text classification instead of the Transformers library

Author: Jerry Zou

To install the complete TensorFlow files: \
pip install -q tensorflow==2.3.0  \
git clone --depth 1 -b v2.3.0 https://github.com/tensorflow/models.git  \
pip install -Uqr models/official/requirements.txt 

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib as plt
from tensorflow import keras
import tensorflow_hub as hub
import tensorflow_text as text
from transformers import AutoTokenizer, TFBertModel
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import categorical_crossentropy
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity

# If needed: 
# import kagglehub
# # Download latest version of encoder
# path = kagglehub.model_download("tensorflow/bert/tensorFlow2/en-uncased-l-12-h-768-a-12")
# print("Path to model files:", path)

# # Download latest version of preprocessor
# path = kagglehub.model_download("tensorflow/bert/tensorFlow2/en-uncased-preprocess")
# print("Path to model files:", path)

In [None]:
# Load the Quora practice training CSV into a DataFrame.
# The columns used further down are "question_text" (input) and "target" (label).
trainingCsvPath = "/Users/Jerry/Desktop/KerasPracticeQuoraTrain.csv"
dataFrame = pd.read_csv(trainingCsvPath)
# EXAMPLE (per-label summary): dataFrame.groupby("target").describe()

In [None]:
# Hold out 20% of the rows for testing.
# Stratifying on "target" keeps the label proportions the same in both splits,
# which is important for imbalanced datasets.
# random_state pins the shuffle so the split (and the metrics reported on it)
# is reproducible from one run to the next.
xTraining, xTesting, yTraining, yTesting = train_test_split(
    dataFrame["question_text"],
    dataFrame["target"],
    test_size=0.2,
    stratify=dataFrame["target"],
    random_state=42,
)

#EXAMPLE: yTraining.value_counts()

In [None]:
# Both BERT pieces are loaded as Keras layers from the local kagglehub cache.
bertPreprocessName = '/Users/Jerry/.cache/kagglehub/models/tensorflow/bert/tensorFlow2/en-uncased-preprocess/3'
bertEncoderName = "/Users/Jerry/.cache/kagglehub/models/tensorflow/bert/tensorFlow2/en-uncased-l-12-h-768-a-12/4"

# Preprocessing layer: its output is what gets passed to the encoder.
bertPreprocess = hub.KerasLayer(bertPreprocessName)
# Encoder layer: returns a dict of outputs; this file reads "pooled_output".
bertEncoder = hub.KerasLayer(bertEncoderName)

In [None]:
def getSentenceEmbedding(sentences):
    """Return the BERT "pooled_output" embeddings for the given sentences.

    The sentences are run through the module-level ``bertPreprocess`` layer,
    the result is passed to ``bertEncoder``, and only the encoder's
    ``"pooled_output"`` entry is returned.
    """
    encoderOutputs = bertEncoder(bertPreprocess(sentences))
    return encoderOutputs["pooled_output"]

#EXAMPLE: getSentenceEmbedding(["los angeles is a wonderful city", "it is not quite good."])
#EXAMPLE:
# e = getSentenceEmbedding([
#     "banana",
#     "grapes",
#     "mango",
#     "jeff bezos",
#     "elon musk",
#     "bill gates"
# ])
# cosine_similarity([e[4]],[e[3]])

In [None]:
# Functional-API classifier:
#   raw string -> BERT preprocessing -> BERT encoder -> dropout -> 1 sigmoid unit
inputLayer = tf.keras.layers.Input(shape=(), dtype=tf.string, name="text")
preprocessedText = bertPreprocess(inputLayer)
outputs = bertEncoder(preprocessedText)

# Classification head applied to the encoder's "pooled_output".
dropoutLayer = tf.keras.layers.Dropout(0.1, name="dropout")
outputLayer = tf.keras.layers.Dense(1, activation="sigmoid", name="output")
layer = outputLayer(dropoutLayer(outputs["pooled_output"]))

model = tf.keras.Model(inputs=[inputLayer], outputs=[layer])
model.summary()

In [None]:
# Single sigmoid output, so the model is compiled with binary cross-entropy.
# NOTE(review): accuracy is the only metric tracked; on an imbalanced target it
# can look good while the minority class is missed -- worth confirming.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

# TRAINING: five epochs over the training split.
model.fit(x=xTraining, y=yTraining, epochs=5)

In [None]:
# Report loss and accuracy on the held-out test split.
model.evaluate(x=xTesting, y=yTesting)

In [None]:
# RESULT:
# Pass raw sentences to the trained model. Each prediction is the sigmoid output
# of the final layer; a value over 0.5 (50%) is considered a match (target = 1).
review = [
    "content content sentence",
    "content sentence content",
]
model.predict(review)

---

BELOW IS FAILED CODE THAT I'M KEEPING FOR FUTURE REFERENCE

In [None]:
# import numpy as np
# import pandas as pd
# import matplotlib as plt
# import sys
# sys.path.append("models")
# from official.nlp.data import classifier_data_lib
# from official.nlp.bert import tokenization
# from official.nlp import optimization
# from sklearn.model_selection import train_test_split
# from transformers import AutoTokenizer, TFBertModel
# import tensorflow as tf
# import tensorflow_hub as hub
# from tensorflow import keras
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Activation, Dense
# from tensorflow.keras.optimizers import Adam
# from tensorflow.keras.metrics import categorical_crossentropy

In [None]:
# model = 'bert-base-uncased'
# tokenizer = AutoTokenizer.from_pretrained(model)

# labelList = [0, 1]
# maxSeqLength = 128

# # Define the mapping function to tokenize text and convert label
# def toFeatureMap(text, label):
#     encoding = tokenizer(text.numpy().decode('utf-8'), truncation=True, padding='max_length', max_length=maxSeqLength, return_tensors='tf')
    
#     # Extract element from the batched tensors
#     inputID = encoding['input_ids'][0]
#     attentionMask = encoding['attention_mask'][0]
#     tokenTypeID = encoding['token_type_ids'][0] if 'token_type_ids' in encoding else tf.zeros_like(input_ids)
    
#     labelID = tf.convert_to_tensor(label, dtype=tf.int32)
#     return {'input_ids': inputID, 'attention_mask': attentionMask, 'token_type_ids': tokenTypeID}, labelID

# # Wrap the function with tf.py_function to make it compatible with TensorFlow's graph execution
# def toFeatureMapWrapper(text, label):
#     return tf.py_function(
#         toFeatureMap, 
#         inp=[text, label], 
#         Tout=(
#             {
#                 'input_ids': tf.int32,
#                 'attention_mask': tf.int32,
#                 'token_type_ids': tf.int32
#             },
#             tf.int32
#         )
#     )

# #------

# dataFrame = pd.read_csv("/Users/Jerry/Desktop/KerasPracticeQuoraTrain.csv")
# dataFrame.tail(20)
# # qid - unique question identifier
# # question_text - Quora question text
# # target - a question labeled "insincere" has a value of 1, otherwise 0

# #dataFrame.target.plot(kind="hist", title="Target distribution")



# #Split into training and testing datasets
# trainingData, remaining = train_test_split(dataFrame, train_size=0.005, stratify=dataFrame["target"])
# validationData, _ = train_test_split(remaining, random_state=45, train_size=0.0005, stratify=remaining["target"])
# #trainingData.shape, validationData.shape

# #------

# # Creating tensorflow dataset to create python iterables. These datasets have two values, so use "for x, y in ..." during iteration.
# trainData = tf.data.Dataset.from_tensor_slices((trainingData["question_text"].values, trainingData["target"].values))
# validData = tf.data.Dataset.from_tensor_slices((validationData["question_text"].values, validationData["target"].values))

# #maps the data to prepare for training
# trainData = trainData.map(toFeatureMapWrapper, num_parallel_calls=tf.data.experimental.AUTOTUNE)
# validData = validData.map(toFeatureMapWrapper, num_parallel_calls=tf.data.experimental.AUTOTUNE)

# for question, targetValue in trainData.take(1):
#     print(question)
#     print(targetValue)

# # -------

# # Preprocessing
# trainBatchSize = 32
# trainData = trainData.batch(trainBatchSize).prefetch(tf.data.experimental.AUTOTUNE)
# validData = validData.batch(trainBatchSize).prefetch(tf.data.experimental.AUTOTUNE)

# modelName = 'bert-base-uncased'
# tokenizer = AutoTokenizer.from_pretrained(modelName)
# model = TFBertModel.from_pretrained(modelName)

# class BertClassifier(tf.keras.Model):
#     def __init__(self, modelName):
#         super(BertClassifier, self).__init__()
#         self.bert = modelName
#         self.dense = tf.keras.layers.Dense(2, activation='softmax')  # Assuming binary classification
    
#     def call(self, inputs):
#         inputIDs, attentionMask = inputs
#         outputs = self.bert(inputIDs, attention_mask=attentionMask)
#         pooled_output = outputs.pooler_output
#         return self.dense(pooled_output)
    
# model = BertClassifier(modelName)

#-----

# # Preprocessing
# trainBatchSize = 32
# trainData = trainData.batch(trainBatchSize).prefetch(tf.data.experimental.AUTOTUNE)
# validData = validData.batch(trainBatchSize).prefetch(tf.data.experimental.AUTOTUNE)

# modelName = 'bert-base-uncased'
# tokenizer = AutoTokenizer.from_pretrained(modelName)
# model = TFBertModel.from_pretrained(modelName)

# class BertClassifier(tf.keras.Model):
#     def __init__(self, modelName):
#         super(BertClassifier, self).__init__()
#         self.bert = modelName
#         self.dense = tf.keras.layers.Dense(2, activation='softmax')  # Assuming binary classification
    
#     def call(self, inputs):
#         inputIDs, attentionMask = inputs
#         outputs = self.bert(inputIDs, attention_mask=attentionMask)
#         pooled_output = outputs.pooler_output
#         return self.dense(pooled_output)
    
# model = BertClassifier(modelName)


# #------


# text = "Hi How are you doing?"
# # Displays each individual word
# #tokenizer.tokenize(text)

# # Displays each token's input ID
# #tokenizer.encode(text)

# #displays input ID, type ID, and attention mask
# tokenizer(text)

# #-----
# encodings = tokenizer(text, return_tensors="tf", truncation=True, padding="max_length", max_length=128)
# input_ids = encodings['input_ids']
# attention_mask = encodings['attention_mask']
# print(input_ids)

In [None]:
# import tensorflow as tf
# from transformers import AutoTokenizer, TFBertModel
# import pandas as pd
# from sklearn.model_selection import train_test_split

# # Define the tokenizer
# model_name = 'bert-base-uncased'
# tokenizer = AutoTokenizer.from_pretrained(model_name)

# labelList = [0, 1]  # Example label list
# maxSeqLength = 128  # Example maximum sequence length

# # Define the mapping function to tokenize text and convert label
# def toFeatureMap(text, label):
#     # Tokenize the text
#     encoding = tokenizer(
#         text.numpy().decode('utf-8'),
#         truncation=True,
#         padding='max_length',
#         max_length=maxSeqLength,
#         return_tensors='tf'
#     )
    
#     # Extract the first (and only) element from the batched tensors
#     input_ids = tf.convert_to_tensor(encoding['input_ids'][0], dtype=tf.int32)
#     attention_mask = tf.convert_to_tensor(encoding['attention_mask'][0], dtype=tf.int32)
#     token_type_ids = tf.convert_to_tensor(encoding['token_type_ids'][0] if 'token_type_ids' in encoding else tf.zeros_like(input_ids), dtype=tf.int32)
    
#     # Convert the label to its corresponding ID
#     label_id = tf.convert_to_tensor(label, dtype=tf.int32)
    
#     return input_ids, attention_mask, token_type_ids, label_id

# # Wrap the function with tf.py_function to make it compatible with TensorFlow's graph execution
# def toFeatureMapWrapper(text, label):
#     input_ids, attention_mask, token_type_ids, label_id = tf.py_function(
#         toFeatureMap, 
#         inp=[text, label], 
#         Tout=[tf.int32, tf.int32, tf.int32, tf.int32]
#     )
#     input_ids.set_shape([maxSeqLength])
#     attention_mask.set_shape([maxSeqLength])
#     token_type_ids.set_shape([maxSeqLength])
#     label_id.set_shape([])
#     return {
#         'input_ids': input_ids,
#         'attention_mask': attention_mask,
#         'token_type_ids': token_type_ids
#     }, label_id

# # Load the CSV file
# dataFrame = pd.read_csv("/Users/Jerry/Desktop/KerasPracticeQuoraTrain.csv")
# print(dataFrame.tail(5))

# # Split the data into training and validation sets
# trainingData, remaining = train_test_split(dataFrame, train_size=0.005, stratify=dataFrame["target"])
# validationData, _ = train_test_split(remaining, random_state=45, train_size=0.0005, stratify=remaining["target"])

# # Create TensorFlow datasets from the training and validation data
# trainData = tf.data.Dataset.from_tensor_slices((trainingData["question_text"].values, trainingData["target"].values))
# validData = tf.data.Dataset.from_tensor_slices((validationData["question_text"].values, validationData["target"].values))

# # Map the dataset using the mapping function
# trainData = trainData.map(toFeatureMapWrapper, num_parallel_calls=tf.data.experimental.AUTOTUNE)
# validData = validData.map(toFeatureMapWrapper, num_parallel_calls=tf.data.experimental.AUTOTUNE)
# print(trainData)
# print(validData)

# # Batch and prefetch the datasets
# trainBatchSize = 32
# trainData = trainData.batch(trainBatchSize).prefetch(tf.data.experimental.AUTOTUNE)
# validData = validData.batch(trainBatchSize).prefetch(tf.data.experimental.AUTOTUNE)

# # Define the model
# class BertClassifier(tf.keras.Model):
#     def __init__(self, modelName):
#         super(BertClassifier, self).__init__()
#         self.bert = TFBertModel.from_pretrained(modelName)  # Load BERT model here
#         self.dense = tf.keras.layers.Dense(2, activation='softmax')  # Assuming binary classification

#     def call(self, inputs):
#         input_ids = inputs['input_ids']
#         attention_mask = inputs['attention_mask']
#         token_type_ids = inputs['token_type_ids']
#         outputs = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
#         pooled_output = outputs.pooler_output
#         return self.dense(pooled_output)

# # Initialize the model
# model = BertClassifier(model_name)

# # Compile the model
# model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
#               loss=tf.keras.losses.SparseCategoricalCrossentropy(),
#               metrics=['accuracy'])

# # Train the model
# model.fit(trainData, validation_data=validData, epochs=3)