In [11]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint

# Download NLTK stopwords
nltk.download('stopwords')
import os

# Load the dataset
data = pd.read_csv("spam_sms.csv", encoding='latin-1')


data = data.rename(columns={"v1": "label", "v2": "message"})
data = data[['label', 'message']]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
# Encode labels ('ham' -> 0, 'spam' -> 1)
data['label'] = data['label'].map({'ham': 0, 'spam': 1})

In [14]:
# Data preprocessing
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters
    text = ' '.join([word for word in text.split() if word not in stopwords.words('english')])  # Remove stopwords
    return text

data['message'] = data['message'].apply(preprocess_text)


In [15]:
# Tokenization and Padding
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['message'])
X_seq = tokenizer.texts_to_sequences(data['message'])
max_length = 100
X_pad = pad_sequences(X_seq, maxlen=max_length, padding='post')


In [16]:
# Labels
y = data['label'].values

# Split dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_pad, y, test_size=0.2, random_state=42)


In [17]:
# Define the model
embedding_dim = 100
vocab_size = len(tokenizer.word_index) + 1

model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length))
model.add(Bidirectional(LSTM(128)))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))




In [18]:
# Compile the model
optimizer = Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])


In [19]:
# Train the model
early_stop = EarlyStopping(monitor='val_loss', patience=3)
checkpoint = ModelCheckpoint('best_sms_model.keras', monitor='val_loss', save_best_only=True)
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test),
                    callbacks=[early_stop, checkpoint])


Epoch 1/10
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 22ms/step - accuracy: 0.8754 - loss: 0.3242 - val_accuracy: 0.9821 - val_loss: 0.0630
Epoch 2/10
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 16ms/step - accuracy: 0.9868 - loss: 0.0453 - val_accuracy: 0.9812 - val_loss: 0.0575
Epoch 3/10
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 16ms/step - accuracy: 0.9968 - loss: 0.0160 - val_accuracy: 0.9830 - val_loss: 0.0603
Epoch 4/10
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.9992 - loss: 0.0044 - val_accuracy: 0.9767 - val_loss: 0.0747
Epoch 5/10
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step - accuracy: 0.9996 - loss: 0.0025 - val_accuracy: 0.9785 - val_loss: 0.0845


In [21]:
# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss}, Test Accuracy: {accuracy}")



[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.9828 - loss: 0.0592
Test Loss: 0.0844985619187355, Test Accuracy: 0.9784753322601318
