In [26]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

In [27]:
import pandas as pd

# Load the raw SMS dataset, give the two columns of interest readable names,
# and keep only those two columns.
df = (
    pd.read_csv("spam.csv")
    .rename(columns={'v1': 'Category', 'v2': 'Message'})
    .loc[:, ['Category', 'Message']]
)

# Preview the first five rows.
df.head(5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [28]:
# Per-class summary of the remaining column(s): count, unique, top and freq
# for ham and for spam.
df.groupby('Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,653,Please call our customer service representativ...,4


In [29]:
# Number of messages per class; shows how imbalanced the raw data is.
df['Category'].value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [5]:
# Keep only the rows labelled as spam.
df_spam = df.loc[df['Category'].eq('spam')]

# (rows, columns) of the spam subset.
df_spam.shape

(747, 2)

In [30]:
# Display the spam subset for a visual check.
df_spam

Unnamed: 0,Category,Message
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
5,spam,FreeMsg Hey there darling it's been 3 week's n...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...
11,spam,"SIX chances to win CASH! From 100 to 20,000 po..."
...,...,...
5537,spam,Want explicit SEX in 30 secs? Ring 02073162414...
5540,spam,ASKED 3MOBILE IF 0870 CHATLINES INCLU IN FREE ...
5547,spam,Had your contract mobile 11 Mnths? Latest Moto...
5566,spam,REMINDER FROM O2: To get 2.50 pounds free call...


In [31]:
# Keep only the rows labelled as ham (non-spam).
df_ham = df.loc[df['Category'].eq('ham')]

# (rows, columns) of the ham subset.
df_ham.shape

(4825, 2)

In [32]:
# Downsample ham to exactly as many rows as there are spam rows so that both
# classes are equally represented in the balanced dataset.
# random_state pins the random draw: without it a different subset of ham
# messages is selected on every run, so the trained model and all reported
# metrics would not be reproducible.
df_ham_downsampled = df_ham.sample(n=df_spam.shape[0], random_state=42)

# (rows, columns) of the downsampled ham subset; the row count should match df_spam.
df_ham_downsampled.shape

(747, 2)

In [33]:
# Display the downsampled ham subset for a visual check.
df_ham_downsampled

Unnamed: 0,Category,Message
3735,ham,Hows the street where the end of library walk is?
3336,ham,I AM AT THE GAS STATION. GO THERE.
2706,ham,S now only i took tablets . Reaction morning o...
2905,ham,"Helloooo... Wake up..! \Sweet\"" \""morning\"" \""..."
1097,ham,Don't fret. I'll buy the ovulation test strips...
...,...,...
2564,ham,"Under the sea, there lays a rock. In the rock,..."
4445,ham,Merry christmas to u too annie!
2483,ham,Mm have some kanji dont eat anything heavy ok
1684,ham,Do you want bold 2 or bb torch


In [34]:
# Stack the downsampled ham rows on top of the spam rows to form the balanced
# dataset. The original row labels are kept (the index is not reset).
df_balanced = pd.concat(
    objs=[df_ham_downsampled, df_spam],
    axis=0,
)

# Display the combined frame.
df_balanced

Unnamed: 0,Category,Message
3735,ham,Hows the street where the end of library walk is?
3336,ham,I AM AT THE GAS STATION. GO THERE.
2706,ham,S now only i took tablets . Reaction morning o...
2905,ham,"Helloooo... Wake up..! \Sweet\"" \""morning\"" \""..."
1097,ham,Don't fret. I'll buy the ovulation test strips...
...,...,...
5537,spam,Want explicit SEX in 30 secs? Ring 02073162414...
5540,spam,ASKED 3MOBILE IF 0870 CHATLINES INCLU IN FREE ...
5547,spam,Had your contract mobile 11 Mnths? Latest Moto...
5566,spam,REMINDER FROM O2: To get 2.50 pounds free call...


In [35]:
# Confirm that both classes now have the same number of rows.
df_balanced['Category'].value_counts()

ham     747
spam    747
Name: Category, dtype: int64

In [36]:
# Numeric target column: 1 for spam, 0 for ham.
# The label is derived from an explicit comparison instead of taking the second
# column of pd.get_dummies: that approach silently depends on the alphabetical
# order of the category names and yields booleans (not 0/1 integers) on
# pandas >= 2.0. The uint8 cast keeps the dtype of the original column.
df_balanced['spam'] = (df_balanced['Category'] == 'spam').astype('uint8')

# Show the resulting label column.
df_balanced['spam']

3735    0
3336    0
2706    0
2905    0
1097    0
       ..
5537    1
5540    1
5547    1
5566    1
5567    1
Name: spam, Length: 1494, dtype: uint8

In [37]:
# Check that the numeric label is balanced as well (same count of 0s and 1s).
df_balanced['spam'].value_counts()

0    747
1    747
Name: spam, dtype: int64

In [38]:
# Spot-check five randomly chosen rows; no random_state is passed, so the rows
# shown differ from run to run.
df_balanced.sample(5)

Unnamed: 0,Category,Message,spam
4951,spam,"Welcome to Select, an O2 service with added be...",1
3005,ham,Also hi wesley how've you been,0
104,ham,Umma my life and vava umma love you lot dear,0
2790,spam,U�۪ve Bin Awarded �50 to Play 4 Instant Cash. ...,1
5127,ham,Cuz ibored. And don wanna study,0


In [39]:
# Split into training and testing data sets.

from sklearn.model_selection import train_test_split

# Features are the raw message strings, the target is the 0/1 spam label.
# stratify keeps the ham/spam ratio the same in both splits.
# random_state pins the shuffle: without it the train/test membership changes
# on every run, so the evaluation numbers further down cannot be reproduced.
# No test_size is passed, so scikit-learn's default split proportion applies.
X_train, X_test, y_train, y_test = train_test_split(
    df_balanced['Message'],
    df_balanced['spam'],
    stratify=df_balanced['spam'],
    random_state=42,
)

In [40]:
# Pretrained building blocks fetched from TensorFlow Hub, wrapped as Keras layers.
# Preprocessing model matching the uncased English BERT encoder below.
bert_preprocess = hub.KerasLayer(
    handle="https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
)
# BERT encoder (L-12, H-768, A-12 variant, per the model handle).
bert_encoder = hub.KerasLayer(
    handle="https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"
)

In [41]:

# Number of training examples.
X_train.shape[0]

1120

In [42]:
# BERT part: a scalar string input per example is run through the BERT
# preprocessor and then through the BERT encoder.
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)

# Classification head: dropout on the encoder's pooled output, followed by a
# single sigmoid unit that emits the spam score.
dropout_layer = tf.keras.layers.Dropout(0.1, name="dropout")
output_layer = tf.keras.layers.Dense(1, activation='sigmoid', name="output")
l = output_layer(dropout_layer(outputs['pooled_output']))

# Wire the text input to the sigmoid output to obtain the final model.
model = tf.keras.Model(inputs=[text_input], outputs=[l])

In [43]:
# Print the layer-by-layer overview of the model (output shapes, parameter
# counts and layer connections).
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 keras_layer_2 (KerasLayer)     {'input_word_ids':   0           ['text[0][0]']                   
                                (None, 128),                                                      
                                 'input_type_ids':                                                
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128)}                                                    

In [44]:
# Configure training: Adam optimizer, binary cross-entropy loss (matches the
# single sigmoid output), and accuracy as the reported metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [45]:
# Train on the training split for 10 epochs. No batch_size or validation data
# is passed, so Keras defaults apply and only training metrics are reported.
model.fit(X_train, y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1f2228a7c40>

In [46]:
# Score the held-out test messages; each prediction is the model's sigmoid output.
y_predicted = model.predict(X_test)



In [47]:
# Inspect the raw prediction scores (one single-element row per test message).
y_predicted

array([[0.05868604],
       [0.65388286],
       [0.21606986],
       [0.03101232],
       [0.50422823],
       [0.08578812],
       [0.35097006],
       [0.9230536 ],
       [0.8949697 ],
       [0.63761413],
       [0.5422714 ],
       [0.88396186],
       [0.08615966],
       [0.05482065],
       [0.88687986],
       [0.67826784],
       [0.33982772],
       [0.06838435],
       [0.06165968],
       [0.10513464],
       [0.8271612 ],
       [0.8564848 ],
       [0.14047295],
       [0.9025954 ],
       [0.24800824],
       [0.8364605 ],
       [0.23710665],
       [0.6809351 ],
       [0.9638962 ],
       [0.02973141],
       [0.07326338],
       [0.03301296],
       [0.8049297 ],
       [0.1192528 ],
       [0.948433  ],
       [0.839236  ],
       [0.18805559],
       [0.8680349 ],
       [0.06964193],
       [0.1125729 ],
       [0.15147771],
       [0.94322115],
       [0.6416068 ],
       [0.00906616],
       [0.5061192 ],
       [0.6426238 ],
       [0.92071825],
       [0.056

In [48]:
# Collapse the column of single-element rows into a flat 1-D array of scores.
y_predicted = y_predicted.flatten()

In [49]:
import numpy as np

# Turn scores into hard class labels: strictly above 0.5 becomes 1 (spam),
# everything else becomes 0 (ham).
y_predicted = (y_predicted > 0.5).astype(int)

# Show the resulting 0/1 labels.
y_predicted

array([0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1,
       0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1,
       0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1,
       0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,

In [50]:
from sklearn.metrics import classification_report, confusion_matrix

# Confusion matrix of the true test labels against the thresholded predictions
# (rows are true classes, columns are predicted classes).
cm = confusion_matrix(y_true=y_test, y_pred=y_predicted)
cm

array([[167,  20],
       [ 15, 172]], dtype=int64)

In [51]:
# Hand-written example messages for a quick manual check of the trained model.
reviews = [
    'Enter a chance to win $5000, hurry up, offer valid until march 31, 2021',
    'You are awarded a SiPix Digital Camera! call 09061221061 from landline. Delivery within 28days. T Cs Box177. M221BP. 2yr warranty. 150ppm. 16 . p pÂ£3.99',
    'it to 80488. Your 500 free text messages are valid until 31 December 2005.',
    'Hey Sam, Are you coming for a cricket game tomorrow',
    "Why don't you wait 'til at least wednesday to see if you get your ."
]
# One sigmoid score per message; the target column is 1 for spam, so values
# closer to 1 mean the model considers the message more likely to be spam.
t=model.predict(reviews)
print(t)

[[0.72859466]
 [0.8171603 ]
 [0.7633054 ]
 [0.15988478]
 [0.08176429]]
