In [24]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import pandas as pd

In [25]:
# Load the labelled SMS dataset (columns shown below: Category, Message)
df = pd.read_csv(filepath_or_buffer='dataset/spam.csv')
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [26]:
# Remove exact duplicate rows, keeping the first occurrence of each
df = df.drop_duplicates(keep='first')

In [27]:
# Per-class summary of the remaining messages
df.groupby(by='Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4516,4516,"Go until jurong point, crazy.. Available only ...",1
spam,641,641,Free entry in 2 a wkly comp to win FA Cup fina...,1


The classes are imbalanced (far more ham than spam), so we balance them by downsampling the ham class to the number of spam messages.

In [28]:
# Class distribution before balancing
df.Category.value_counts()

Category
ham     4516
spam     641
Name: count, dtype: int64

In [29]:
# Separate the two classes so the larger one can be downsampled
is_spam_row = df['Category'] == 'spam'
is_ham_row = df['Category'] == 'ham'
df_spam = df[is_spam_row]
df_ham = df[is_ham_row]
(df_ham.shape, df_spam.shape)

((4516, 2), (641, 2))

In [30]:
# Downsample ham to exactly the number of spam rows.
# A fixed random_state makes the sampled subset (and therefore every
# downstream metric) reproducible across Restart & Run All.
df_ham_downsampled = df_ham.sample(n=df_spam.shape[0], random_state=42)
df_ham_downsampled.shape

(641, 2)

In [31]:
# Stack spam and the downsampled ham into one balanced frame
balanced_parts = [df_spam, df_ham_downsampled]
df_balanced = pd.concat(balanced_parts, axis=0)
df_balanced.shape

(1282, 2)

In [32]:
# Confirm both classes now have the same number of rows
df_balanced.Category.value_counts()

Category
spam    641
ham     641
Name: count, dtype: int64

In [33]:
# Binary target: 1 for spam, 0 for everything else.
# Derived from df_balanced's own Category column so the labels never rely on
# implicit index alignment with the unbalanced `df`.
df_balanced['spam'] = (df_balanced['Category'] == 'spam').astype('int64')
df_balanced.sample(4)

Unnamed: 0,Category,Message,spam
3808,ham,Pls dont restrict her from eating anythin she ...,0
89,ham,"Ela kano.,il download, come wen ur free..",0
1000,ham,"Aight will do, thanks again for comin out",0
3460,spam,Not heard from U4 a while. Call me now am here...,1


Train/test split

In [34]:
from sklearn.model_selection import train_test_split

# Features are the raw message strings; target is the 0/1 spam flag
X = df_balanced['Message']
y = df_balanced['spam']

# stratify=y keeps the spam/ham ratio equal in both partitions.
# test_size=0.25 is the library default, written out explicitly;
# random_state pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

In [35]:
# Sanity check: features and labels must have the same number of rows
(X_train.shape, y_train.shape)

((961,), (961,))

In [36]:
# Handles for the BERT text preprocessor and the BERT encoder
preprocess_url = (
    "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
)
encoder_url = (
    "https://www.kaggle.com/models/tensorflow/bert/TensorFlow2/en-uncased-l-12-h-768-a-12/4"
)

In [37]:
# Wrap the two hub models as Keras layers so they can be used in a Functional model
bert_preprocess = hub.KerasLayer(handle=preprocess_url)
bert_encoder = hub.KerasLayer(handle=encoder_url)

Build the model with the Keras Functional API, which is slightly different from the Sequential API

In [38]:
# BERT layers: a scalar string per example -> preprocessed tensors -> encoder outputs
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
preprocess_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocess_text)

# Classification head on top of the encoder's 'pooled_output':
# dropout for regularisation, then a single sigmoid unit for the spam score
pooled_output = outputs['pooled_output']
dropout_output = tf.keras.layers.Dropout(0.1, name='dropout')(pooled_output)
spam_score = tf.keras.layers.Dense(1, activation='sigmoid', name='output')(dropout_output)

# Construct the final model
model = tf.keras.Model(inputs=[text_input], outputs=[spam_score])

# `metrics` is passed as a list, which is the documented form; a bare string
# is not accepted by every Keras version.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [39]:
# Print the layer-by-layer structure of the model built above
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 text (InputLayer)           [(None,)]                    0         []                            
                                                                                                  
 keras_layer_2 (KerasLayer)  {'input_type_ids': (None,    0         ['text[0][0]']                
                             128),                                                                
                              'input_mask': (None, 128)                                           
                             , 'input_word_ids': (None,                                           
                              128)}                                                               
                                                                                            

In [40]:
# Train on the training split for 10 passes over the data
model.fit(
    x=X_train,
    y=y_train,
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7fee3cf56cd0>

In [41]:
# Score the held-out split; returns [loss, accuracy] as compiled above
model.evaluate(x=X_test, y=y_test)



[0.3148680329322815, 0.9158878326416016]

The predicted spam probabilities for a few sample messages are shown below (values closer to 1 mean more likely spam)

In [42]:
# Hand-written sample messages to score with the trained model.
# The model ends in a single sigmoid unit, so predict() yields one value in
# [0, 1] per message; higher values mean the message looks more like spam.
reviews = [
    'Enter a chance to win $5000, hurry up, offer valid until march 31, 2021',
    'You are awarded a SiPix Digital Camera! call 09061221061 from landline. Delivery within 28days. T Cs Box177. M221BP. 2yr warranty. 150ppm. 16 . p pÂ£3.99',
    'it to 80488. Your 500 free text messages are valid until 31 December 2005.',
    'Hey Sam, Are you coming for a cricket game tomorrow',
    "Why don't you wait 'til at least wednesday to see if you get your ."
]
model.predict(reviews)



array([[0.7309525 ],
       [0.7911377 ],
       [0.7239688 ],
       [0.24910018],
       [0.17388599]], dtype=float32)

In [43]:
# Persist the trained model to disk
model.save(filepath='email_models/dropouts')

INFO:tensorflow:Assets written to: email_models/dropouts/assets


INFO:tensorflow:Assets written to: email_models/dropouts/assets
