In [113]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

In [114]:
import pandas as pd
import numpy as np
import tf_keras
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix 

In [115]:
# Load the SMS spam dataset directly from the tutorial repository on GitHub.
SPAM_CSV_URL = "https://raw.githubusercontent.com/codebasics/deep-learning-keras-tf-tutorial/refs/heads/master/47_BERT_text_classification/spam.csv"
df = pd.read_csv(SPAM_CSV_URL)
df.shape

(5572, 2)

In [116]:
# Per-class summary of the messages (count, unique, most frequent text) to check class balance.
df.groupby('Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


In [117]:
# Keep only the rows labelled as spam.
is_spam = df['Category'].eq('spam')
df_spam = df.loc[is_spam]
df_spam.shape

(747, 2)

In [118]:
# Keep only the rows labelled as ham (non-spam).
is_ham = df['Category'].eq('ham')
df_ham = df.loc[is_ham]
df_ham.shape

(4825, 2)

In [119]:
# Downsample the majority (ham) class to the number of spam rows so that both
# classes end up the same size.
# A fixed random_state makes the drawn subset identical on every
# Restart & Run All; without it the balanced dataset (and every metric computed
# from it) changes between runs.
df_ham_downsampled = df_ham.sample(n=df_spam.shape[0], random_state=42)
df_ham_downsampled.shape

(747, 2)

In [120]:
# Stack the downsampled ham rows on top of the spam rows to form the balanced dataset.
# The original row index of each message is kept (no ignore_index).
df_balanced = pd.concat([df_ham_downsampled, df_spam])
df_balanced.shape

(1494, 2)

In [121]:
# Sanity check: both classes should now have the same number of rows.
df_balanced['Category'].value_counts()

Category
ham     747
spam    747
Name: count, dtype: int64

In [122]:
# Binary target column: 1 for spam, 0 for everything else.
# A vectorised comparison replaces the row-by-row apply/lambda; the values are
# the same, and the explicit int64 cast matches the integer labels the lambda
# produced.
df_balanced['spam'] = (df_balanced['Category'] == 'spam').astype('int64')
df_balanced.sample(5)

Unnamed: 0,Category,Message,spam
1133,ham,Good morning princess! How are you?,0
4056,ham,When is school starting. Where will you stay. ...,0
3548,spam,Rock yr chik. Get 100's of filthy films &XXX p...,1
2879,spam,Hey Boys. Want hot XXX pics sent direct 2 ur p...,1
3978,spam,Great NEW Offer - DOUBLE Mins & DOUBLE Txt on ...,1


In [123]:
# Split messages and labels into train and test sets.
# - stratify keeps the spam/ham ratio the same in both splits.
# - random_state pins the shuffle so the split (and the reported metrics) are
#   reproducible across kernel restarts.
# No test_size is passed, so train_test_split's default split proportion applies.
X_train, X_test, y_train, y_test = train_test_split(
    df_balanced['Message'],
    df_balanced['spam'],
    stratify=df_balanced['spam'],
    random_state=42,
)

In [124]:
# Preprocessing layer: turns raw strings into the dict of encoder inputs
# (input_word_ids, input_mask, input_type_ids).
bert_preprocess = hub.KerasLayer(
    "https://kaggle.com/models/tensorflow/bert/TensorFlow2/en-uncased-preprocess/3")
# Encoder layer: consumes the preprocessed dict; its output dict includes
# 'pooled_output', which the rest of the notebook uses as the sentence embedding.
bert_encoder = hub.KerasLayer(
    "https://www.kaggle.com/models/tensorflow/bert/TensorFlow2/en-uncased-l-12-h-768-a-12/4")

In [125]:
def get_sentence_embeding(sentences):
    """Return the BERT pooled-output embedding for each input sentence.

    ``sentences`` is passed through ``bert_preprocess`` and the result is fed
    to ``bert_encoder``; the encoder's ``'pooled_output'`` entry is returned.
    """
    encoder_inputs = bert_preprocess(sentences)
    encoder_outputs = bert_encoder(encoder_inputs)
    return encoder_outputs['pooled_output']


# Smoke-test the helper on two sample sentences; the displayed result has one
# embedding row per sentence.
get_sentence_embeding([
    "500$ discount. hurry up",
    "Bhavin, are you up for a volleybal game tomorrow?"]
)

<tf.Tensor: shape=(2, 768), dtype=float32, numpy=
array([[-0.84351677, -0.5132727 , -0.88845724, ..., -0.74748874,
        -0.75314736,  0.91964495],
       [-0.8720836 , -0.5054398 , -0.9444668 , ..., -0.8584751 ,
        -0.7174536 ,  0.8808298 ]], dtype=float32)>

In [126]:
# Embed three fruit names (indices 0-2) and three person names (indices 3-5)
# so their embeddings can be compared with cosine similarity.
e = get_sentence_embeding([
    "banana",
    "grapes",
    "mango",
    "jeff bezos",
    "elon musk",
    "bill gates"
]
)

In [127]:
from sklearn.metrics.pairwise import cosine_similarity
# Compare "banana" (e[0]) with "bill gates" (e[5]). Each embedding is wrapped in
# a list because cosine_similarity expects 2-D inputs.
# Note: even this unrelated pair scores ~0.96, so raw pooled-output similarity
# is not a reliable measure of semantic closeness here.
cosine_similarity([e[0]], [e[5]])

array([[0.95718384]], dtype=float32)

In [128]:
# Input: one raw string per example.
text_input = tf_keras.layers.Input(shape=(), dtype=tf.string, name="text")

# BERT pipeline: raw text -> preprocessing layer -> encoder outputs.
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)

# Classification head on the pooled sentence embedding:
# dropout followed by a single sigmoid unit.
dropout_layer = tf_keras.layers.Dropout(0.1, name="dropout")
output_layer = tf_keras.layers.Dense(1, activation='sigmoid', name="output")
l = output_layer(dropout_layer(outputs['pooled_output']))

model = tf_keras.Model(inputs=text_input, outputs=[l])

model.summary()



Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 text (InputLayer)           [(None,)]                    0         []                            
                                                                                                  
 keras_layer_20 (KerasLayer  {'input_type_ids': (None,    0         ['text[0][0]']                
 )                           128),                                                                
                              'input_mask': (None, 128)                                           
                             , 'input_word_ids': (None,                                           
                              128)}                                                               
                                                                                            

In [132]:
# Metrics tracked during training and evaluation, in the order they are reported.
METRICS = [
    tf_keras.metrics.BinaryAccuracy(name='accuracy'),
    tf_keras.metrics.Precision(name='precision'),
    tf_keras.metrics.Recall(name='recall'),
]

# Adam optimizer with default settings and binary cross-entropy loss.
optimizer = tf_keras.optimizers.legacy.Adam()
loss_fn = tf_keras.losses.BinaryCrossentropy()
model.compile(optimizer=optimizer, loss=loss_fn, metrics=METRICS)


In [133]:
# Training pipeline: slices -> shuffle -> batch -> prefetch.
# Keras ignores fit()'s `shuffle` argument for tf.data.Dataset inputs, so
# without an explicit shuffle every epoch would see identical batches in
# identical order. The buffer covers the whole training set for a full shuffle,
# and the seed keeps the order reproducible.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((X_train.values, y_train.values))
    .shuffle(buffer_size=len(X_train), seed=42, reshuffle_each_iteration=True)
    .batch(16)
    .prefetch(tf.data.AUTOTUNE)
)

# Evaluation pipeline: left unshuffled.
test_dataset = (
    tf.data.Dataset.from_tensor_slices((X_test.values, y_test.values))
    .batch(16)
    .prefetch(tf.data.AUTOTUNE)
)

# NOTE(review): the held-out test split is also used as validation data here,
# so the validation metrics logged during training and any later evaluation on
# test_dataset come from the same examples. That is fine for monitoring only;
# carve out a separate validation split before using these metrics for model
# selection or early stopping.
history = model.fit(
    train_dataset,
    validation_data=test_dataset,
    epochs=10,
    verbose=1,
)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [131]:
# Evaluate on the test split. The returned list is the loss followed by the
# compiled metrics in order: [loss, accuracy, precision, recall].
# NOTE(review): this cell's recorded execution count (131) is lower than the
# compile/fit cells' (132, 133), so the stored output may predate the latest
# training run. Re-run this cell after fitting.
model.evaluate(test_dataset)



[0.2736600637435913,
 0.9010695219039917,
 0.8947368264198303,
 0.9090909361839294]