In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

In [3]:
# Load the raw SMS dataset from disk and preview its first rows.
csv_path = 'D:/Datasets/spam.csv'
data = pd.read_csv(csv_path)
preview = data.head()
print(preview)

     v1                                                 v2  Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...         NaN   
1   ham                      Ok lar... Joking wif u oni...         NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...         NaN   
3   ham  U dun say so early hor... U c already then say...         NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...         NaN   

   Unnamed: 3  Unnamed: 4  
0         NaN         NaN  
1         NaN         NaN  
2         NaN         NaN  
3         NaN         NaN  
4         NaN         NaN  


In [5]:
# Give the raw columns descriptive names (a no-op if they are already renamed).
data = data.rename(columns={'v1': 'label', 'v2': 'message'})
# Encode the target as 0 (ham) / 1 (spam). The identity entries for 0 and 1
# keep this cell safe to re-run: without them, a second execution would send
# the already-encoded integers through the mapping and turn every label into NaN.
data['label'] = data['label'].map({'ham': 0, 'spam': 1, 0: 0, 1: 1})

In [7]:
# Pull the text column and the encoded target column out of the frame
# as plain arrays for the tokenizer and the train/test split.
messages, labels = (data[column].values for column in ('message', 'label'))

In [9]:
# Fit a tokenizer (num_words=5000) on every message, then turn the messages
# into integer sequences passed through pad_sequences with maxlen=100.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(messages)
X = pad_sequences(tokenizer.texts_to_sequences(messages), maxlen=100)

In [11]:
# Hold out 20% of the samples for evaluation; the fixed random_state makes
# the split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    labels,
    test_size=0.2,
    random_state=42,
)

In [13]:
# Binary classifier: token embedding -> single LSTM layer -> sigmoid output.
# `input_length` is intentionally not passed to Embedding: Keras 3 deprecates
# the argument and warns when it is supplied. The sequence length is taken
# from the data the model is first called on.
model = Sequential([
    Embedding(input_dim=5000, output_dim=128),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])



In [15]:
# Configure training: Adam optimizer, binary cross-entropy loss, and
# accuracy reported as the tracked metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [17]:
# Train for 5 epochs in batches of 64. The validation metrics reported per
# epoch are computed on the held-out test split (X_test, y_test).
model.fit(
    X_train,
    y_train,
    batch_size=64,
    epochs=5,
    validation_data=(X_test, y_test),
)

Epoch 1/5
70/70 ━━━━━━━━━━━━━━━━━━━━ 8s 81ms/step - accuracy: 0.8783 - loss: 0.3544 - val_accuracy: 0.9785 - val_loss: 0.0657
Epoch 2/5
70/70 ━━━━━━━━━━━━━━━━━━━━ 5s 76ms/step - accuracy: 0.9819 - loss: 0.0632 - val_accuracy: 0.9883 - val_loss: 0.0458
Epoch 3/5
70/70 ━━━━━━━━━━━━━━━━━━━━ 5s 74ms/step - accuracy: 0.9942 - loss: 0.0260 - val_accuracy: 0.9910 - val_loss: 0.0403
Epoch 4/5
70/70 ━━━━━━━━━━━━━━━━━━━━ 5s 69ms/step - accuracy: 0.9968 - loss: 0.0106 - val_accuracy: 0.9910 - val_loss: 0.0372
Epoch 5/5
70/70 ━━━━━━━━━━━━━━━━━━━━ 5s 70ms/step - accuracy: 0.9982 - loss: 0.0073 - val_accuracy: 0.9910 - val_loss: 0.0405


<keras.src.callbacks.history.History at 0x25a3b3929f0>

In [19]:
# Score the held-out set and turn the model outputs into boolean class
# predictions using a 0.5 cut-off.
y_pred = model.predict(X_test) > 0.5

35/35 ━━━━━━━━━━━━━━━━━━━━ 1s 17ms/step


In [21]:
# Compute the evaluation summaries first, then print them together.
report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print('Classification Report:')
print(report)
print("Accuracy:", accuracy)

Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       966
           1       0.99      0.94      0.97       149

    accuracy                           0.99      1115
   macro avg       0.99      0.97      0.98      1115
weighted avg       0.99      0.99      0.99      1115

Accuracy: 0.9910313901345291


In [23]:
# Persist the trained model to an HDF5 file in the working directory.
# NOTE(review): newer Keras releases treat .h5 as a legacy format; moving to
# a .keras file would also require updating the cell that loads the model.
model.save(filepath='sms_spam_detection_model.h5')



In [25]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [27]:
# Reload the saved model from disk, replacing the in-memory `model`.
model = tf.keras.models.load_model(filepath='sms_spam_detection_model.h5')



In [29]:
# Rebuild the tokenizer used at inference time by fitting it on `messages`.
# NOTE(review): this depends on `messages` still being in memory; the fitted
# tokenizer itself is not saved alongside the model — confirm that is intended.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(texts=messages)

In [31]:
def predict_sms(message, threshold=0.5):
    """Classify a single SMS message as spam or ham.

    The message is encoded with the notebook-level ``tokenizer``, passed
    through ``pad_sequences`` with ``maxlen=100``, and scored by the
    notebook-level ``model``.

    Args:
        message: Raw SMS text to classify.
        threshold: Score above which the message is labelled "Spam".
            Defaults to 0.5, the cut-off used when evaluating the model.

    Returns:
        A ``(label, confidence)`` tuple. ``label`` is "Spam" or "Ham".
        ``confidence`` is the model score when the label is "Spam" and
        one minus the model score when the label is "Ham".
    """
    # Preprocess the input message the same way the training data was encoded.
    sequence = tokenizer.texts_to_sequences([message])
    padded_sequence = pad_sequences(sequence, maxlen=100)

    # predict() yields a 2-D array holding a single score here; extract the
    # scalar once instead of relying on the truthiness of an array comparison.
    spam_probability = model.predict(padded_sequence)[0][0]

    if spam_probability > threshold:
        return "Spam", spam_probability
    return "Ham", 1 - spam_probability

In [33]:
# Try the classifier on a hand-written example and show the outcome.
new_sms = "You've been selected for a free iPhone giveaway! Claim your prize now before it's too late. Reply with your details."
label, confidence = predict_sms(new_sms)

for caption, value in (
    ("Message:", new_sms),
    ("Predicted Label:", label),
    ("Confidence:", confidence),
):
    print(caption, value)

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 235ms/step
Message: You've been selected for a free iPhone giveaway! Claim your prize now before it's too late. Reply with your details.
Predicted Label: Spam
Confidence: 0.99921465
