In [1]:
import pandas as  pd
import numpy as np

In [2]:
import tensorflow as tf

In [4]:
df = pd.read_csv("spam.csv",  encoding='ISO-8859-1')

In [10]:
df = df[['v1','v2']]

In [12]:
df.rename({'v1': 'category', 'v2': 'message'}, axis=1, inplace=True)

In [13]:
df.head()

Unnamed: 0,category,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [14]:
df.shape

(5572, 2)

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   category  5572 non-null   object
 1   message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [16]:
df.describe()

Unnamed: 0,category,message
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [19]:
# encode spam with 1, ham with 0

In [17]:
df["category"] = [1 if category == "spam" else 0 for category in df["category"]]

In [18]:
df.head()

Unnamed: 0,category,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [20]:
emails = df.iloc[:, 1].values

In [22]:
labels = df.iloc[:, 0].values

In [None]:
# Tokenize using word2vec

In [216]:
import gensim
gensim_model = gensim.models.Word2Vec(
    window = 10,
    min_count=2,
    workers=2
)

In [None]:
# split email by space

In [217]:
emails_parsed = [email.split(" ") for email in emails]

In [218]:
gensim_model.build_vocab(emails_parsed, progress_per=1000)

In [219]:
gensim_model.train(emails_parsed, total_examples=gensim_model.corpus_count, epochs=gensim_model.epochs)

(326302, 434805)

In [None]:
# tokenize each email

In [240]:
vectors = [[gensim_model.wv[word] for word in email if word in gensim_model.wv] for email in emails_parsed]

In [None]:
# find email length that is tokenized with max length

In [243]:
max_length = max(len(seq) for seq in vectors)

In [245]:
print(max_length)
print(len(vectors[0]))
print(len(vectors[1]))

154
14
5


In [None]:
# convert each email vector to max_length vector length by padding with 0's

In [246]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
padded_sequences = pad_sequences(vectors, maxlen=max_length, padding='post', dtype='float32')

In [247]:
print(len(padded_sequences[0]))
print(len(padded_sequences[1]))

154
154


In [248]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, SimpleRNN, Flatten, Dense, Input

In [249]:
model = Sequential()

In [None]:
# No need to add embedding layer since vectors are already proper tokenized vectors

In [250]:
model.add(Input(shape=(max_length, gensim_model.vector_size))),
model.add(Bidirectional(SimpleRNN(64, return_sequences=True)))
model.add(Bidirectional(SimpleRNN(64), merge_mode="concat"))
model.add(Flatten())
model.add(Dense(24, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [251]:
model.compile(optimizer='adam', loss='binary_crossentropy')

In [140]:
model.summary()

In [252]:
model.fit(padded_sequences, labels, epochs=25)

Epoch 1/25
[1m175/175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 146ms/step - loss: 0.4021
Epoch 2/25
[1m175/175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 139ms/step - loss: 0.3708
Epoch 3/25
[1m175/175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 139ms/step - loss: 0.3579
Epoch 4/25
[1m175/175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 139ms/step - loss: 0.3893
Epoch 5/25
[1m175/175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 141ms/step - loss: 0.3423
Epoch 6/25
[1m175/175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 137ms/step - loss: 0.3597
Epoch 7/25
[1m175/175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 135ms/step - loss: 0.3264
Epoch 8/25
[1m175/175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 139ms/step - loss: 0.3110
Epoch 9/25
[1m175/175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 127ms/step - loss: 0.3532
Epoch 10/25
[1m175/175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[

<keras.src.callbacks.history.History at 0x7b79021700a0>

In [268]:
text = "You won Free iphone in lottery"
words = text.split()
vector = [gensim_model.wv[word] for word in words if word in gensim_model.wv]

In [271]:
len(vector)

4

In [274]:
print(len(vector))
len(padded_output[0])

4


154

In [262]:
# 1st argument should be 2d array

In [273]:
padded_output = pad_sequences([vector], maxlen=max_length, padding='post', dtype='float32')

In [275]:
prob = model.predict(padded_output)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step


In [266]:
# since spam is 1 and ham is 0 in labels, output probability is probability of spam

In [276]:
prob[0][0]

0.028969891