In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

In [None]:
#Import the dataset (Dataset is taken from kaggle)
import pandas as pd

# Read the spam dataset from a path relative to the working directory,
# then preview it (head() shows the first five rows by default).
dataset_path = "spam.csv"
df = pd.read_csv(dataset_path)
df.head()

In [None]:
# Summary statistics of the remaining columns, computed per 'Category' value.
rows_by_category = df.groupby('Category')
rows_by_category.describe()

In [None]:
# Number of rows for each distinct 'Category' value.
category_column = df['Category']
category_column.value_counts()

In [None]:
# Roughly 15% of the messages are spam and 85% are ham, i.e. the classes are
# imbalanced; ham is undersampled further down to even them out.
# Start by isolating the spam rows and showing their (rows, columns) shape.
is_spam = df['Category'] == 'spam'
df_spam = df.loc[is_spam]
df_spam.shape

In [None]:
# Isolate the ham rows and show their (rows, columns) shape.
is_ham = df['Category'] == 'ham'
df_ham = df.loc[is_ham]
df_ham.shape

In [None]:
# Undersample ham to exactly as many rows as there are spam rows.
# The fixed random_state makes the draw -- and therefore the balanced dataset,
# the train/test split and every reported metric -- repeatable across a
# Restart & Run All instead of changing on each execution.
df_ham_downsampled = df_ham.sample(n=len(df_spam), random_state=42)
df_ham_downsampled.shape

In [None]:
# Stack the downsampled ham rows on top of the spam rows to obtain a
# class-balanced frame, then show its (rows, columns) shape.
balanced_parts = [df_ham_downsampled, df_spam]
df_balanced = pd.concat(balanced_parts)
df_balanced.shape

In [None]:
# Check the per-class row counts after balancing.
balanced_categories = df_balanced['Category']
balanced_categories.value_counts()

In [None]:
# Binary target column: 1 where 'Category' equals 'spam', 0 for anything else.
is_spam_row = df_balanced['Category'] == 'spam'
df_balanced['spam'] = is_spam_row.astype('int64')
# Show five randomly chosen rows as a spot check.
df_balanced.sample(5)

In [None]:
#Split it into training and test data set

"""
df_balanced['Message'],  # Features (input data) - likely text messages
    df_balanced['spam'],     # Target labels (output) - whether a message is spam (1) or not (0)
    stratify=df_balanced['spam']  # Ensures balanced class distribution in train & test sets
"""
from sklearn.model_selection import train_test_split

# Split messages (features) and the 0/1 spam label (target) into train and
# test sets using train_test_split's default split sizes. Stratifying on the
# label keeps the spam/ham ratio the same in both sets, and the fixed
# random_state makes the split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    df_balanced['Message'],
    df_balanced['spam'],
    stratify=df_balanced['spam'],
    random_state=42,
)

In [None]:
# Peek at the first four training messages.
X_train.iloc[:4]

In [None]:
# Load the BERT preprocessing and encoder models from TensorFlow Hub as Keras
# layers; they are used below to embed a few sample statements and to build
# the classifier.
preprocess_url = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
encoder_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"

bert_preprocess = hub.KerasLayer(preprocess_url)
bert_encoder = hub.KerasLayer(encoder_url)

In [None]:
def get_sentence_embeding(sentences):
    """Return the BERT 'pooled_output' embedding for a batch of sentences.

    The sentences are first passed through the module-level ``bert_preprocess``
    layer; that result is fed to ``bert_encoder``, and the value stored under
    the 'pooled_output' key of the encoder's output is returned.
    """
    encoder_inputs = bert_preprocess(sentences)
    encoder_outputs = bert_encoder(encoder_inputs)
    return encoder_outputs['pooled_output']

# Try the helper on two sample messages and display the resulting embeddings.
sample_sentences = [
    "500$ discount. hurry up",
    "Bhavin, are you up for a volleybal game tomorrow?",
]
get_sentence_embeding(sample_sentences)

In [None]:
#Get embeding vectors for few sample words. Compare them using cosine similarity
"""
Values near 1 mean the two embeddings are similar; values near 0 mean they are very different.
For example, comparing "banana" with "grapes" below gives about 0.99 similarity, as both are fruits.
"""
# Embed three fruit names followed by three people's names in one batch.
# The cells below index `e` by position, so e[i] is treated as the embedding
# of the i-th phrase in this list.
sample_phrases = ["banana", "grapes", "mango", "jeff bezos", "elon musk", "bill gates"]
e = get_sentence_embeding(sample_phrases)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
# Similarity between "banana" (index 0) and "grapes" (index 1).
first_vec, second_vec = e[0], e[1]
cosine_similarity([first_vec], [second_vec])

In [None]:
# Similarity between "banana" (index 0) and "jeff bezos" (index 3).
fruit_vec, person_vec = e[0], e[3]
cosine_similarity([fruit_vec], [person_vec])

In [None]:
# Similarity between "jeff bezos" (index 3) and "elon musk" (index 4).
person_a_vec, person_b_vec = e[3], e[4]
cosine_similarity([person_a_vec], [person_b_vec])

In [None]:
"""
Build Model
There are two types of models you can build in tensorflow.

(1) Sequential (2) Functional

So far we have been building Sequential models, where layers are stacked one after another and the neurons of adjacent layers are interconnected.

In the Functional API every layer is called like a function, and its output is passed as the input to the next layer.

In the model below, shape=() means each input example is a single scalar string (the batch dimension is implicit).
"""

In [None]:
# --- BERT backbone ---------------------------------------------------------
# The input is declared with an empty per-example shape and string dtype,
# then routed through the preprocessing layer and the encoder.
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)

# --- Classification head ---------------------------------------------------
# Of everything the encoder returns, only the 'pooled_output' entry is fed
# into the head: dropout followed by a single sigmoid unit.
pooled_output = outputs['pooled_output']
dropout_layer = tf.keras.layers.Dropout(0.1, name="dropout")
output_layer = tf.keras.layers.Dense(1, activation='sigmoid', name="output")
l = output_layer(dropout_layer(pooled_output))

# Tie the text input to the sigmoid output to form the final model.
model = tf.keras.Model(inputs=[text_input], outputs=[l])

In [None]:
# Print a layer-by-layer overview of the model assembled above.
model.summary()

In [None]:
# Number of training examples.
X_train.shape[0]

In [None]:
# Metrics tracked alongside the loss: binary accuracy, precision and recall.
keras_metrics = tf.keras.metrics
METRICS = [
    keras_metrics.BinaryAccuracy(name='accuracy'),
    keras_metrics.Precision(name='precision'),
    keras_metrics.Recall(name='recall'),
]

# Binary cross-entropy pairs with the model's single sigmoid output unit.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=METRICS,
)

In [None]:
# Fit the classifier on the training messages and labels for 10 epochs.
model.fit(x=X_train, y=y_train, epochs=10)

In [None]:
# Compute the loss and the compiled metrics on the held-out test set.
model.evaluate(x=X_test, y=y_test)

In [None]:
# predict() gives a 2-D array; flatten it into a 1-D array of scores so it
# lines up with y_test.
y_predicted = model.predict(X_test).flatten()

In [None]:
"""
Values greater than 0.5 are set to 1; all other values are set to 0.
"""
import numpy as np

# Turn scores into hard labels: strictly above 0.5 becomes 1, the rest 0.
above_threshold = y_predicted > 0.5
y_predicted = above_threshold.astype(int)
y_predicted

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# Confusion matrix of true labels against predicted labels.
cm = confusion_matrix(y_true=y_test, y_pred=y_predicted)
cm

In [None]:
from matplotlib import pyplot as plt
import seaborn as sn
# Draw the confusion matrix as an annotated heatmap with integer cell labels,
# labelling the x-axis 'Predicted' and the y-axis 'Truth'.
heatmap_ax = sn.heatmap(cm, annot=True, fmt='d')
heatmap_ax.set_xlabel('Predicted')
heatmap_ax.set_ylabel('Truth')

In [None]:
# Text report comparing the true test labels with the predicted labels.
report_text = classification_report(y_test, y_predicted)
print(report_text)

In [None]:
# Hand-picked sample messages used to sanity-check the trained classifier.
reviews = [
    'Enter a chance to win $5000, hurry up, offer valid until march 31, 2021',

    # This message is written as two adjacent literals that Python joins into
    # one string: a single-quoted literal cannot span a raw line break, which
    # previously made this cell a SyntaxError.
    # NOTE(review): 'Â£' looks like a mis-decoded '£'; left unchanged in case
    # the training CSV contains the same characters -- confirm.
    ('You are awarded a SiPix Digital Camera! call 09061221061 from landline. Delivery within 28days. '
     'T Cs Box177. M221BP. 2yr warranty. 150ppm. 16 . p pÂ£3.99'),

    'it to 80488. Your 500 free text messages are valid until 31 December 2005.',

    'Hey Sam, Are you coming for a cricket game tomorrow',

    "Why don't you wait 'til at least wednesday to see if you get your ."
]

# Displays one model output row per message in `reviews`.
model.predict(reviews)