In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from tensorflow.keras.utils import set_random_seed



In [26]:
# Read the spam/ham email CSV into a DataFrame, then show it as the cell output.
data = pd.read_csv(filepath_or_buffer='datasets/Spam Email.csv')
data

Unnamed: 0,ID,Mail,Text,Label
0,1,ham,Subject: christmas tree farm pictures\r\n,0
1,2,ham,"Subject: vastar resources , inc .\r\ngary , pr...",0
2,3,ham,Subject: calpine daily gas nomination\r\n- cal...,0
3,4,ham,Subject: re : issue\r\nfyi - see note below - ...,0
4,5,ham,Subject: meter 7268 nov allocation\r\nfyi .\r\...,0
...,...,...,...,...
5166,5167,spam,Subject: our pro - forma invoice attached\r\nd...,1
5167,5168,spam,Subject: str _ rndlen ( 2 - 4 ) } { extra _ ti...,1
5168,5169,spam,Subject: check me out !\r\n61 bb\r\nhey derm\r...,1
5169,5170,spam,Subject: hot jobs\r\nglobal marketing specialt...,1


In [27]:
# Drop the columns that are not used for modelling; later cells only read
# 'Text' and 'Label'.
# errors="ignore" keeps this cell idempotent: `data` is rebound from itself, so
# without it a second execution raises KeyError because "ID"/"Mail" are already gone.
data = data.drop(columns=["ID", "Mail"], errors="ignore")

In [28]:
# Display the current contents of `data` as the cell output.
data

Unnamed: 0,Text,Label
0,Subject: christmas tree farm pictures\r\n,0
1,"Subject: vastar resources , inc .\r\ngary , pr...",0
2,Subject: calpine daily gas nomination\r\n- cal...,0
3,Subject: re : issue\r\nfyi - see note below - ...,0
4,Subject: meter 7268 nov allocation\r\nfyi .\r\...,0
...,...,...
5166,Subject: our pro - forma invoice attached\r\nd...,1
5167,Subject: str _ rndlen ( 2 - 4 ) } { extra _ ti...,1
5168,Subject: check me out !\r\n61 bb\r\nhey derm\r...,1
5169,Subject: hot jobs\r\nglobal marketing specialt...,1


In [29]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the split repeatable.
# The raw 'Text' column is passed through unchanged - no text cleaning happens in this cell.
# NOTE(review): the split is not stratified on 'Label' - confirm that is intended.
x_train, x_test, y_train, y_test = train_test_split(
    data['Text'],
    data['Label'],
    test_size=0.2,
    random_state=42,
)


In [30]:

# Turn the raw email text into fixed-length integer sequences.
maxlen = 100  # every padded sequence has exactly this many token ids

# The vocabulary is learned from the training texts only; the test texts are
# encoded with that same fitted tokenizer.
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(x_train)

# Text -> list of token ids, for train and test in the same way.
x_train_seq, x_test_seq = [
    tokenizer.texts_to_sequences(texts) for texts in (x_train, x_test)
]

# Pad / cut every sequence to `maxlen` ids.
x_train_pad, x_test_pad = [
    pad_sequences(seqs, maxlen=maxlen) for seqs in (x_train_seq, x_test_seq)
]


array([[   0,    0,    0, ...,  144, 7394, 1583],
       [ 190,   18, 1003, ...,   62,    1,  196],
       [1808,   26,    2, ..., 6687, 7401, 8387],
       ...,
       [   0,    0,    0, ...,   30,    1, 1870],
       [   0,    0,    0, ...,  610,    2, 5323],
       [   1,   16,  880, ...,  744,    1,  278]])

In [33]:

# Define the RNN model: embedding -> SimpleRNN -> single sigmoid unit.

# Seed the Python, NumPy and backend RNGs before any layer is created so weight
# initialisation and batch shuffling repeat across "Restart & Run All"
# (the train/test split is already seeded with 42).
set_random_seed(42)

model = Sequential()
# input_dim comes from the tokenizer so the embedding table always matches the
# vocabulary cap used to build the sequences (10000 in this notebook), instead of
# repeating the literal in two cells.
# NOTE(review): if the Tokenizer is ever created with num_words=None, switch this
# to len(tokenizer.word_index) + 1.
# The deprecated `input_length` argument is dropped; the sequence length is
# inferred from the padded arrays when the model is first called.
model.add(Embedding(input_dim=tokenizer.num_words, output_dim=32))
model.add(SimpleRNN(units=32))
# One sigmoid output in [0, 1], compared against the 0/1 'Label' column.
model.add(Dense(units=1, activation='sigmoid'))


In [35]:

# Configure training: Adam optimizer, binary cross-entropy loss, accuracy as the
# reported metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)


In [36]:

# Fit on the padded training sequences for 5 epochs with batches of 128,
# holding back 20% of the training data for validation. The returned object
# is kept in `history`.
history = model.fit(
    x=x_train_pad,
    y=y_train,
    batch_size=128,
    epochs=5,
    validation_split=0.2,
)


Epoch 1/5
26/26 ━━━━━━━━━━━━━━━━━━━━ 24s 221ms/step - accuracy: 0.6632 - loss: 0.6130 - val_accuracy: 0.7138 - val_loss: 0.5274
Epoch 2/5
26/26 ━━━━━━━━━━━━━━━━━━━━ 4s 138ms/step - accuracy: 0.7978 - loss: 0.4559 - val_accuracy: 0.9118 - val_loss: 0.3169
Epoch 3/5
26/26 ━━━━━━━━━━━━━━━━━━━━ 5s 138ms/step - accuracy: 0.9509 - loss: 0.2077 - val_accuracy: 0.9336 - val_loss: 0.1945
Epoch 4/5
26/26 ━━━━━━━━━━━━━━━━━━━━ 5s 134ms/step - accuracy: 0.9809 - loss: 0.0908 - val_accuracy: 0.9336 - val_loss: 0.1736
Epoch 5/5
26/26 ━━━━━━━━━━━━━━━━━━━━ 6s 150ms/step - accuracy: 0.9887 - loss: 0.0610 - val_accuracy: 0.9372 - val_loss: 0.1559


In [37]:

# Score the trained model on the held-out test sequences and report its accuracy.
test_loss, test_acc = model.evaluate(x=x_test_pad, y=y_test)
print(f'Test accuracy: {test_acc}')

33/33 ━━━━━━━━━━━━━━━━━━━━ 1s 25ms/step - accuracy: 0.9397 - loss: 0.1532
Test accuracy: 0.939130425453186


In [40]:
# Take the test example at position 1 as a batch of one; slicing with 1:2
# keeps the leading batch axis, giving a 2-D array with a single row.
individual_sequence = x_test_pad[1:2]

# Model output for that single example
prediction = model.predict(individual_sequence)
print(prediction)

# True label of the same example, for comparison with the prediction
print(y_test.iloc[1])


1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 180ms/step
[[0.01968108]]
0
