In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tf_keras
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

In [None]:
# Location of the raw spam/ham CSV, kept in one place so it is easy to swap.
SPAM_CSV_URL = "https://raw.githubusercontent.com/codebasics/deep-learning-keras-tf-tutorial/refs/heads/master/47_BERT_text_classification/spam.csv"

df = pd.read_csv(SPAM_CSV_URL)
df.shape

In [None]:
# Summary statistics of every column, computed separately per 'Category' value.
rows_by_category = df.groupby('Category')
rows_by_category.describe()

In [None]:
# Keep only the rows whose 'Category' is exactly 'spam'.
is_spam_row = df['Category'] == 'spam'
df_spam = df.loc[is_spam_row]
df_spam.shape

In [None]:
# Keep only the rows whose 'Category' is exactly 'ham'.
is_ham_row = df['Category'] == 'ham'
df_ham = df.loc[is_ham_row]
df_ham.shape

In [None]:
# Draw as many ham rows as there are spam rows so the two classes end up the
# same size. random_state pins the draw: without it every kernel restart
# samples a different subset and the results below cannot be reproduced.
df_ham_downsampled = df_ham.sample(n=df_spam.shape[0], random_state=42)
df_ham_downsampled.shape

In [None]:
# Stack the downsampled ham rows on top of all spam rows (original row
# labels are kept, since concat is called with its default index handling).
balanced_parts = [df_ham_downsampled, df_spam]
df_balanced = pd.concat(balanced_parts)
df_balanced.shape

In [None]:
# Row count per 'Category' value in the combined frame.
balanced_categories = df_balanced['Category']
balanced_categories.value_counts()

In [None]:
# Numeric target column: 1 where 'Category' equals 'spam', 0 for anything else.
is_spam_label = df_balanced['Category'] == 'spam'
df_balanced['spam'] = is_spam_label.astype('int64')
df_balanced.sample(5)

In [None]:
# Split messages/labels into train and test parts, stratified on the label so
# both parts keep the same spam/ham proportion. random_state fixes the
# shuffle; without it every run produces a different split and therefore
# different metrics.
X_train, X_test, y_train, y_test = train_test_split(
    df_balanced['Message'],
    df_balanced['spam'],
    stratify=df_balanced['spam'],
    random_state=42,
)

In [None]:
# Model handles for the two hub layers used throughout the notebook.
BERT_PREPROCESS_URL = "https://kaggle.com/models/tensorflow/bert/TensorFlow2/en-uncased-preprocess/3"
BERT_ENCODER_URL = "https://www.kaggle.com/models/tensorflow/bert/TensorFlow2/en-uncased-l-12-h-768-a-12/4"

bert_preprocess = hub.KerasLayer(BERT_PREPROCESS_URL)
bert_encoder = hub.KerasLayer(BERT_ENCODER_URL)

In [None]:
def get_sentence_embeding(sentences):
    """Return the encoder's pooled output for a batch of sentences.

    Args:
        sentences: A batch of strings (for example a list of ``str``). A
            single bare ``str`` is also accepted and is handled as a batch
            containing just that one sentence.

    Returns:
        The ``'pooled_output'`` entry of the mapping that ``bert_encoder``
        returns for the preprocessed batch.
    """
    # Wrap a lone string so it is passed on as a one-element batch instead of
    # as a bare scalar; callers no longer have to remember the list brackets.
    if isinstance(sentences, str):
        sentences = [sentences]
    preprocessed_text = bert_preprocess(sentences)
    return bert_encoder(preprocessed_text)['pooled_output']


# Quick smoke check of the embedding helper on two example messages.
sample_messages = [
    "500$ discount. hurry up",
    "Bhavin, are you up for a volleybal game tomorrow?",
]
get_sentence_embeding(sample_messages)

In [None]:
# Six short phrases to embed; `e` holds one embedding per phrase, in this order.
phrases_to_embed = [
    "banana",
    "grapes",
    "mango",
    "jeff bezos",
    "elon musk",
    "bill gates",
]
e = get_sentence_embeding(phrases_to_embed)

In [None]:
# Cosine similarity between the embeddings stored at index 0 and index 5 of `e`.
# (cosine_similarity is imported with the other sklearn imports in the
# notebook's import cell rather than here, so all dependencies are listed in one place.)
cosine_similarity([e[0]], [e[5]])

In [None]:
# The model takes one scalar string per example (shape=()) named "text".
text_input = tf_keras.layers.Input(name="text", dtype=tf.string, shape=())

# Run the raw text through the preprocessing layer, then the encoder.
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)

# Classification head on the encoder's pooled output: dropout followed by a
# single sigmoid unit.
dropout_layer = tf_keras.layers.Dropout(rate=0.1, name="dropout")
output_layer = tf_keras.layers.Dense(units=1, activation='sigmoid', name="output")
l = output_layer(dropout_layer(outputs['pooled_output']))

model = tf_keras.Model(inputs=text_input, outputs=[l])

model.summary()



In [None]:
# Metrics reported during fit/evaluate, each registered under a short name.
METRICS = [
    metric_cls(name=metric_name)
    for metric_cls, metric_name in (
        (tf_keras.metrics.BinaryAccuracy, 'accuracy'),
        (tf_keras.metrics.Precision, 'precision'),
        (tf_keras.metrics.Recall, 'recall'),
    )
]

# Adam with its default settings, binary cross-entropy as the loss.
adam_optimizer = tf_keras.optimizers.legacy.Adam()
bce_loss = tf_keras.losses.BinaryCrossentropy()

model.compile(optimizer=adam_optimizer, loss=bce_loss, metrics=METRICS)


In [None]:
BATCH_SIZE = 16
EPOCHS = 10


def _batched_dataset(messages, labels):
    """Build a batched, prefetched tf.data pipeline from two objects exposing `.values`."""
    pairs = tf.data.Dataset.from_tensor_slices((messages.values, labels.values))
    return pairs.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)


train_dataset = _batched_dataset(X_train, y_train)
test_dataset = _batched_dataset(X_test, y_test)

# The held-out split doubles as the validation data reported after each epoch.
history = model.fit(
    train_dataset,
    epochs=EPOCHS,
    validation_data=test_dataset,
    verbose=1,
)


In [None]:
# Loss and the compiled metrics on the held-out dataset.
test_scores = model.evaluate(test_dataset)
test_scores