# BERT tutorial: Classify spam vs. not spam
---

In [None]:
!pip -q install tensorflow_text tensorflow tensorflow_hub

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

# Import the dataset

In [None]:
# Location of the e-mail CSV inside the Kaggle input directory.
EMAILS_CSV = '/kaggle/input/spam-email-big/emails.csv'

df = pd.read_csv(EMAILS_CSV)
df.head()  # preview the first five rows (the default for head())

# Under-sampling Data


In [None]:
# Number of rows per label in the raw data (the cells below treat 0 as ham
# and 1 as spam).
df['spam'].value_counts()

In [None]:
# Keep only the rows labelled 0 (ham) and show how many there are.
df_ham = df.loc[df['spam'].eq(0)]
df_ham.shape

In [None]:
# Keep only the rows labelled 1 (spam) and show how many there are.
df_spam = df.loc[df['spam'].eq(1)]
df_spam.shape

In [None]:
# Under-sample: draw (without replacement) as many ham rows as there are spam
# rows. The fixed random_state makes the draw identical on every
# Restart & Run All; without it each run selects different rows.
df_ham_down = df_ham.sample(n=len(df_spam), random_state=142)
df_ham_down.shape

In [None]:
# Sanity check: the down-sampled ham frame was built from label-0 rows only,
# so a single label is expected here.
df_ham_down['spam'].value_counts()

In [None]:
# Stack the down-sampled ham rows on top of all spam rows (row-wise concat)
# to obtain the under-sampled dataset.
df_balanced = pd.concat(objs=[df_ham_down, df_spam], axis=0)
df_balanced.shape

In [None]:
# Label counts of the under-sampled dataset; ham was drawn to match the
# number of spam rows.
df_balanced['spam'].value_counts()

# Over-Sampling Data

In [None]:
# Over-sample: draw spam rows *with replacement* until there are as many as
# ham rows, then stack them on top of all ham rows. The fixed random_state
# makes the draw identical on every Restart & Run All.
#
# NOTE(review): because of the sampling with replacement, a spam e-mail can
# occur several times in this frame. Splitting this frame into train/test
# afterwards can place copies of the same e-mail on both sides of the split;
# for evaluation, resample only the training rows.
df_spam_over = df_spam.sample(n=len(df_ham), replace=True, random_state=142)
df_test_over = pd.concat([df_spam_over, df_ham], axis=0)

df_test_over.shape

In [None]:
# Label counts of the over-sampled dataset (bracket access, like the other
# count cells in this notebook).
df_test_over['spam'].value_counts()

In [None]:
# Disabled cell (original note: "cell for spam-email-big").
# If enabled, the line below would write a 0/1 `spam` column on `df_balanced`,
# derived from a text `Category` column of `df` ('spam' -> 1, anything else -> 0).
# NOTE(review): the cells above already read a numeric `spam` column from `df`,
# so this looks like a leftover for a dataset with a `Category` column --
# confirm and delete if unused.
# df_balanced['spam'] = df['Category'].apply(lambda x: 1 if x == 'spam' else 0)

# Split it into train and test

In [None]:
from sklearn.model_selection import train_test_split

# Split the ORIGINAL e-mails, not the over-sampled frame. Over-sampling with
# replacement *before* splitting duplicates spam rows, and copies of the same
# e-mail then end up in both the training and the test set, which inflates
# every evaluation metric (data leakage).
X = df.drop('spam', axis='columns')
y = df['spam']

train_df, test_df = train_test_split(
    df, test_size=0.2, random_state=142, stratify=y
)

# Re-balance the TRAINING rows only: draw spam rows with replacement until
# they match the number of ham rows, then shuffle so the two classes are
# interleaved rather than stacked one after the other.
train_ham = train_df[train_df['spam'] == 0]
train_spam_over = train_df[train_df['spam'] == 1].sample(
    n=len(train_ham), replace=True, random_state=142
)
train_over = pd.concat([train_spam_over, train_ham], axis=0).sample(
    frac=1, random_state=142
)

X_train = train_over.drop('spam', axis='columns')
y_train = train_over['spam']

# The test rows are left untouched: they contain no duplicates and keep the
# class ratio of the original data (the split is stratified on the label).
X_test = test_df.drop('spam', axis='columns')
y_test = test_df['spam']

In [None]:
# Peek at the first five training rows to confirm only feature columns remain.
X_train.head(n=5)

# Now let's import the BERT model and get embedding vectors for a few sample statements

In [None]:
# TF-Hub handles: the preprocessing model that matches the uncased English
# BERT vocabulary, and a small BERT encoder (L-4, H-512, A-8 per its handle).
BERT_PREPROCESS_URL = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
BERT_ENCODER_URL = "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1"

# Wrap both as Keras layers so they can be wired into a functional model.
bert_preprocess = hub.KerasLayer(BERT_PREPROCESS_URL)
bert_encoder = hub.KerasLayer(BERT_ENCODER_URL)

# Build Model

There are two types of models you can build in TensorFlow:

1: Sequential , 2: Functional

So far we have built sequential models, but below we will build a functional model. More information on these two is here:

https://becominghuman.ai/sequential-vs-functional-model-in-keras-20684f766057

In [None]:
# --- BERT part: raw string -> preprocessed tensors -> encoder outputs ---
# Each example is one scalar string (shape=()).
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)

# --- Classification head on top of the encoder's pooled output ---
pooled = outputs['pooled_output']
dropped = tf.keras.layers.Dropout(0.1, name='dropout')(pooled)
spam_probability = tf.keras.layers.Dense(
    1, activation='sigmoid', name='output'
)(dropped)

# Functional model: one text input, one sigmoid output.
model = tf.keras.Model(inputs=[text_input], outputs=[spam_probability])

In [None]:
# Print the layer-by-layer summary of the classifier built above.
model.summary()

In [None]:
# Number of training examples (rows of X_train).
X_train.shape[0]

In [None]:
# Metrics reported during training and evaluation: accuracy plus precision
# and recall.
METRICS = [
    tf.keras.metrics.BinaryAccuracy(name='accuracy'),
    tf.keras.metrics.Precision(name='precision'),
    tf.keras.metrics.Recall(name='recall'),
]

# The model has a single sigmoid output, so it is trained with binary
# cross-entropy; the optimizer is Adam, selected by name.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=METRICS,
)


# Train the model

In [None]:
# Train for 10 passes over the training data; the returned History object is
# the cell's output.
model.fit(x=X_train, y=y_train, epochs=10)

In [None]:
# Loss followed by the compiled metrics, computed on the held-out test rows.
model.evaluate(x=X_test, y=y_test)

In [None]:
# Predicted probabilities for the test rows, collapsed from the model's
# (n, 1) output into a flat 1-D array.
y_predicted = model.predict(X_test).flatten()

In [None]:
# Turn probabilities into hard labels: strictly above 0.5 -> 1, otherwise 0.
y_predicted = (y_predicted > 0.5).astype(int)
y_predicted

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# Confusion matrix of true test labels vs. thresholded predictions
# (rows: true label, columns: predicted label).
cm = confusion_matrix(y_true=y_test, y_pred=y_predicted)
cm

In [None]:
import seaborn as sns

# Draw the confusion matrix as an annotated heatmap on an explicit Axes;
# fmt='d' prints the cell counts as integers.
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='d', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Truth')

In [None]:
# Per-class precision / recall / F1 as a text table (printed, because the
# report is a pre-formatted string).
report = classification_report(y_true=y_test, y_pred=y_predicted)
print(report)

# Inference

In [None]:
# Hand-written example messages to score with the trained model.
# NOTE(review): the variable is called `reviews`, but the entries are short
# messages, not reviews.
reviews = [
    'I plane to give on this month end.',
    'Wah lucky man... Then can save money... Hee...',
    'Finished class where are you.',
    'HI BABE IM AT HOME NOW WANNA DO SOMETHING? XX',
    'K..k:)where are you?how did you performed?',
    'U can call me now...'
]

# NOTE(review): this reuses the name `y_predicted`, which earlier held the
# test-set predictions. After this cell runs, re-running the confusion-matrix
# or classification-report cells would compare `y_test` against these six
# predictions instead.
y_predicted = model.predict(reviews)
y_predicted

In [None]:
# Hard labels for the example messages: strictly above 0.5 -> 1, otherwise 0.
y_predicted = (y_predicted > 0.5).astype(int)
y_predicted

In [None]:
# Persist the trained model to an .h5 file in the current working directory.
# NOTE(review): the model contains `hub.KerasLayer` layers; reloading an .h5
# file presumably requires `custom_objects={'KerasLayer': hub.KerasLayer}` and
# `tensorflow_text` to be imported first -- confirm, or consider saving in the
# SavedModel format instead.
model.save('model_spam_email.h5')