In [9]:
!pip install pandas numpy nltk scikit-learn tensorflow keras




In [10]:
import pandas as pd

# Read the CSV, keep only columns v1/v2 and give them descriptive names.
df = (
    pd.read_csv("spam.csv", encoding="latin-1")[['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'text'})
)

# Encode the target as integers: ham -> 0, spam -> 1.
# Any label value outside this mapping ends up as NaN.
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

# Preview the first rows, then show how many messages fall in each class.
print(df.head())
print(df['label'].value_counts())


   label                                               text
0      0  Go until jurong point, crazy.. Available only ...
1      0                      Ok lar... Joking wif u oni...
2      1  Free entry in 2 a wkly comp to win FA Cup fina...
3      0  U dun say so early hor... U c already then say...
4      0  Nah I don't think he goes to usf, he lives aro...
label
0    4825
1     747
Name: count, dtype: int64


In [16]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Stemmer used by clean_text; it is created independently of the download below.
stemmer = PorterStemmer()

# Fetch the NLTK stop-word corpus, then keep the English list as a set so
# the membership test in clean_text is a hash lookup.
nltk.download('stopwords')
stop_words = {word for word in stopwords.words('english')}

# Built once here instead of on every call: clean_text is applied to each
# row of the dataset, and neither object depends on the input text.
_DIGITS_RE = re.compile(r'\d+')
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Normalise one message into a space-separated string of stemmed tokens.

    Steps, in order: lowercase, delete runs of digits, delete every character
    in ``string.punctuation``, split on whitespace, drop tokens present in the
    module-level ``stop_words`` set, and stem the remainder with the
    module-level ``stemmer``.

    Args:
        text: Raw message. A non-string value (for example NaN coming from a
            blank CSV field) is treated as an empty message instead of
            raising ``AttributeError`` on ``.lower()``.

    Returns:
        The surviving stemmed tokens joined by single spaces; ``""`` when
        nothing survives or the input is not a string.

    NOTE(review): punctuation is removed before the stop-word filter, so a
    contraction such as "don't" becomes "dont" and is kept (the recorded
    output for row 4 shows "dont"). Confirm this is intended.
    """
    if not isinstance(text, str):
        return ""
    text = _DIGITS_RE.sub('', text.lower())  # Remove numbers
    text = text.translate(_PUNCT_TABLE)  # Remove punctuation
    # Remove stopwords & stem
    kept = [stemmer.stem(word) for word in text.split() if word not in stop_words]
    return " ".join(kept)

# Store the normalised form of every message in a new column and preview it.
# NOTE(review): the recorded output below also lists a `clean_tex` column that
# no visible cell creates -- presumably left over from an earlier run in the
# same kernel; restart and re-run to confirm it disappears.
df["clean_text"] = df['text'].map(clean_text)
print(df.head())


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vyshn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


   label                                               text  \
0      0  Go until jurong point, crazy.. Available only ...   
1      0                      Ok lar... Joking wif u oni...   
2      1  Free entry in 2 a wkly comp to win FA Cup fina...   
3      0  U dun say so early hor... U c already then say...   
4      0  Nah I don't think he goes to usf, he lives aro...   

                                           clean_tex  \
0  go jurong point crazi avail bugi n great world...   
1                              ok lar joke wif u oni   
2  free entri wkli comp win fa cup final tkt st m...   
3                u dun say earli hor u c alreadi say   
4          nah dont think goe usf live around though   

                                          clean_text  
0  go jurong point crazi avail bugi n great world...  
1                              ok lar joke wif u oni  
2  free entri wkli comp win fa cup final tkt st m...  
3                u dun say earli hor u c alreadi say  
4        

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Turn the cleaned messages into TF-IDF features (at most 5000 columns).
# NOTE(review): the vectorizer is fitted on the whole dataset here, i.e. before
# the train/test split done in a later cell, so the held-out rows contribute to
# the learned vocabulary and weights -- confirm this is acceptable.
vectorizer = TfidfVectorizer(max_features=5000)
tfidf_sparse = vectorizer.fit_transform(df["clean_text"])

# Dense feature matrix and target vector used by the classical models.
X = tfidf_sparse.toarray()
y = df["label"]

print("TF-IDF Shape:", X.shape)



TF-IDF Shape: (5572, 5000)


In [22]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Hold out 20% of the rows for evaluation; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a multinomial Naïve Bayes classifier on the training portion.
nb_model = MultinomialNB().fit(X_train, y_train)

# Predict the held-out rows.
y_pred_nb = nb_model.predict(X_test)

# Report overall accuracy followed by the per-class metrics.
nb_accuracy = accuracy_score(y_test, y_pred_nb)
print("Naïve Bayes Accuracy:", nb_accuracy)
print(classification_report(y_test, y_pred_nb))


Naïve Bayes Accuracy: 0.9713004484304932
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       965
           1       1.00      0.79      0.88       150

    accuracy                           0.97      1115
   macro avg       0.98      0.89      0.93      1115
weighted avg       0.97      0.97      0.97      1115



In [29]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees, trained on the same split as the Naïve Bayes
# model; random_state makes the forest construction repeatable.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train, y_train
)

# Predict the held-out rows and report accuracy plus per-class metrics.
y_pred_rf = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred_rf)

print("Random Forest Accuracy:", rf_accuracy)
print(classification_report(y_test, y_pred_rf))


Random Forest Accuracy: 0.9775784753363229
              precision    recall  f1-score   support

           0       0.97      1.00      0.99       965
           1       1.00      0.83      0.91       150

    accuracy                           0.98      1115
   macro avg       0.99      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [30]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Map each cleaned message to a list of integer word ids (num_words=5000).
# NOTE(review): the tokenizer is fitted on the full dataset before the split
# below, so held-out messages contribute to the vocabulary -- confirm.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(df['clean_text'])
token_sequences = tokenizer.texts_to_sequences(df['clean_text'])

# Give every sequence the same length (maxlen=100) so they stack into one array.
X_nn = pad_sequences(token_sequences, maxlen=100)

# Same test_size / random_state as the split used for the TF-IDF models;
# `y` and `train_test_split` come from the earlier cells.
X_train_nn, X_test_nn, y_train_nn, y_test_nn = train_test_split(
    X_nn, y, test_size=0.2, random_state=42
)


In [31]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Seed Python, NumPy and the backend RNGs so weight initialisation, dropout
# and batch shuffling are repeatable from one run of this cell to the next.
tf.keras.utils.set_random_seed(42)

# Define LSTM model: 128-dim embeddings over a 5000-id vocabulary, one LSTM
# layer with input and recurrent dropout, and a sigmoid unit for the binary label.
# The Embedding `input_length` argument is omitted: it is deprecated in
# Keras 3 and the sequence length is taken from the data on the first call.
lstm_model = Sequential([
    Embedding(input_dim=5000, output_dim=128),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid')
])

# Compile model
lstm_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train model
# NOTE(review): the test split doubles as validation data here, so the
# per-epoch val_* numbers and the final evaluation below use the same rows.
lstm_model.fit(X_train_nn, y_train_nn, epochs=5, batch_size=32, validation_data=(X_test_nn, y_test_nn))

# Evaluate LSTM (evaluate returns [loss, accuracy] for the compiled metrics)
_, lstm_acc = lstm_model.evaluate(X_test_nn, y_test_nn)
print("LSTM Accuracy:", lstm_acc)




Epoch 1/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 176ms/step - accuracy: 0.8948 - loss: 0.3019 - val_accuracy: 0.9821 - val_loss: 0.0620
Epoch 2/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 164ms/step - accuracy: 0.9873 - loss: 0.0444 - val_accuracy: 0.9865 - val_loss: 0.0518
Epoch 3/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 166ms/step - accuracy: 0.9956 - loss: 0.0169 - val_accuracy: 0.9857 - val_loss: 0.0575
Epoch 4/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 161ms/step - accuracy: 0.9992 - loss: 0.0059 - val_accuracy: 0.9830 - val_loss: 0.0669
Epoch 5/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 158ms/step - accuracy: 0.9995 - loss: 0.0035 - val_accuracy: 0.9758 - val_loss: 0.0785
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 39ms/step - accuracy: 0.9783 - loss: 0.0618
LSTM Accuracy: 0.9757847785949707


In [32]:
from tensorflow.keras.layers import GRU

# Seed Python, NumPy and the backend RNGs so weight initialisation, dropout
# and batch shuffling are repeatable from one run of this cell to the next.
tf.keras.utils.set_random_seed(42)

# Define GRU model: same layout as the LSTM network with the recurrent layer
# swapped for a GRU. Sequential, Embedding and Dense are imported in the
# LSTM cell.
# The Embedding `input_length` argument is omitted: it is deprecated in
# Keras 3 and the sequence length is taken from the data on the first call.
gru_model = Sequential([
    Embedding(input_dim=5000, output_dim=128),
    GRU(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid')
])

# Compile model
gru_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train model
# NOTE(review): the test split doubles as validation data here, so the
# per-epoch val_* numbers and the final evaluation below use the same rows.
gru_model.fit(X_train_nn, y_train_nn, epochs=5, batch_size=32, validation_data=(X_test_nn, y_test_nn))

# Evaluate GRU (evaluate returns [loss, accuracy] for the compiled metrics)
_, gru_acc = gru_model.evaluate(X_test_nn, y_test_nn)
print("GRU Accuracy:", gru_acc)


Epoch 1/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 178ms/step - accuracy: 0.8859 - loss: 0.3256 - val_accuracy: 0.9857 - val_loss: 0.0626
Epoch 2/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 166ms/step - accuracy: 0.9891 - loss: 0.0444 - val_accuracy: 0.9839 - val_loss: 0.0588
Epoch 3/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 155ms/step - accuracy: 0.9984 - loss: 0.0085 - val_accuracy: 0.9839 - val_loss: 0.0622
Epoch 4/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 159ms/step - accuracy: 0.9982 - loss: 0.0112 - val_accuracy: 0.9830 - val_loss: 0.0756
Epoch 5/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 151ms/step - accuracy: 0.9997 - loss: 0.0012 - val_accuracy: 0.9839 - val_loss: 0.0864
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 40ms/step - accuracy: 0.9875 - loss: 0.0585
GRU Accuracy: 0.9838564991950989


In [33]:
# Side-by-side test accuracy of the four models. The two scikit-learn scores
# are recomputed from the stored predictions; the two network scores come
# from the evaluate() calls in their training cells.
accuracy_summary = [
    ("Naïve Bayes Accuracy:", accuracy_score(y_test, y_pred_nb)),
    ("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf)),
    ("LSTM Accuracy:", lstm_acc),
    ("GRU Accuracy:", gru_acc),
]
for label, score in accuracy_summary:
    print(label, score)


Naïve Bayes Accuracy: 0.9713004484304932
Random Forest Accuracy: 0.9775784753363229
LSTM Accuracy: 0.9757847785949707
GRU Accuracy: 0.9838564991950989


In [36]:
import pickle

# Persist the Naïve Bayes classifier together with the TF-IDF vectorizer it
# was trained with; both are needed to score new text.
# NOTE(review): clean_text is not saved, so whatever loads these files must
# apply the same preprocessing before calling the vectorizer. Only unpickle
# these files from a trusted location.
artifacts = (
    ("spam_model.pkl", nb_model),
    ("tfidf_vectorizer.pkl", vectorizer),
)
for artifact_path, artifact in artifacts:
    with open(artifact_path, "wb") as artifact_file:
        pickle.dump(artifact, artifact_file)
