In [5]:
import re

import numpy as np
import pandas as pd
from keras.layers import Dense, Embedding, GlobalAveragePooling1D, Input
from keras.models import Sequential
from keras.utils import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer



In [6]:

# Load the raw dataset. Only the 'v1' (class name) and 'v2' (message text)
# columns are kept, renamed to 'label' and 'message'. The message text is still
# raw at this point; it is normalized by clean_text further down.
df = (
    pd.read_csv("spam.csv", encoding='latin1')[['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'message'})
)

# Convert labels to binary (ham -> 0, spam -> 1).
# Series.map turns every value that is absent from the mapping into NaN, so
# check for that explicitly instead of letting NaN labels flow into the
# stratified split and training steps.
label_codes = df['label'].map({'ham': 0, 'spam': 1})
unknown_labels = df.loc[label_codes.isna(), 'label'].unique()
if len(unknown_labels) > 0:
    raise ValueError(f"Unexpected label values in spam.csv: {unknown_labels.tolist()}")
df['label'] = label_codes.astype('int64')

# Clean the text
def clean_text(text):
    """Normalize a raw message before tokenization.

    Steps, in order: lowercase, remove ``http...`` URLs, remove every character
    that is neither a word character nor whitespace, remove digit runs, then
    collapse whitespace runs to a single space and trim both ends.

    Args:
        text: The raw message. Missing values (``None`` or a float NaN) are
            treated as an empty message; any other non-string value is
            converted with ``str()`` before cleaning.

    Returns:
        The cleaned string (possibly empty).
    """
    if not isinstance(text, str):
        # NaN is the only float that compares unequal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ""
        text = str(text)

    text = text.lower()
    text = re.sub(r"http\S+", "", text)   # URLs go first, while '/' and ':' still exist
    text = re.sub(r"[^\w\s]", "", text)   # punctuation and symbols
    text = re.sub(r"\d+", "", text)       # digits (\w above keeps them, so drop here)
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Normalize every message element-wise with clean_text (overwrites the column).
df['message'] = df['message'].map(clean_text)


In [7]:

# Tokenize: learn the word index from the cleaned messages; num_words caps the
# vocabulary used when encoding and unknown words map to the "<OOV>" token.
# NOTE(review): the vocabulary is fit on ALL messages, including the rows that
# are later held out as the test split. Fit on the training texts only if a
# strictly unseen-vocabulary evaluation is required.
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(df['message'])

# Encode each message as a list of word ids, then pad / truncate at the END so
# every row has exactly 100 ids.
sequences = tokenizer.texts_to_sequences(df['message'])
X = pad_sequences(sequences, maxlen=100, padding='post', truncating='post')
y = df['label'].values

# Train-test split: hold out 20% of the rows, stratified on the label.
# random_state pins the partition so the split (and the metrics computed on it)
# is the same every time the notebook is re-run from a fresh kernel.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


In [8]:

# Averaged-embedding classifier: embed each token id, average the embeddings
# over the sequence, then a small hidden layer and a sigmoid output that yields
# one probability per message.
#
# 10000 and 100 must match the tokenizer's num_words and the pad_sequences
# maxlen used to build the input matrix.
#
# The input shape is declared with an explicit Input layer instead of
# Embedding's `input_length` argument: Keras 3 deprecates that argument and
# ignores it, which leaves the model unbuilt when model.summary() is called.
model = Sequential([
    Input(shape=(100,), dtype="int32"),
    Embedding(input_dim=10000, output_dim=32),
    GlobalAveragePooling1D(),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid'),
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()




In [9]:
# Train for 10 epochs in mini-batches of 32. The held-out split is passed as
# validation data so val_loss / val_accuracy are reported after every epoch.
# NOTE(review): the same X_test / y_test is scored again by model.evaluate.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=10,
    validation_data=(X_test, y_test),
)


Epoch 1/10
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 9ms/step - accuracy: 0.8688 - loss: 0.4128 - val_accuracy: 0.8664 - val_loss: 0.3674
Epoch 2/10
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.8683 - loss: 0.3611 - val_accuracy: 0.8664 - val_loss: 0.3508
Epoch 3/10
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.8698 - loss: 0.3385 - val_accuracy: 0.8664 - val_loss: 0.3092
Epoch 4/10
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.8652 - loss: 0.2870 - val_accuracy: 0.8717 - val_loss: 0.2386
Epoch 5/10
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.9136 - loss: 0.1990 - val_accuracy: 0.9453 - val_loss: 0.1642
Epoch 6/10
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.9620 - loss: 0.1289 - val_accuracy: 0.9704 - val_loss: 0.1277
Epoch 7/10
[1m140/140[0m 

In [10]:
# Score the trained model on the held-out split. The result is unpacked as the
# loss followed by the single compiled metric (accuracy).
loss, accuracy = model.evaluate(x=X_test, y=y_test)
print(f"Accuracy: {accuracy}")


[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9880 - loss: 0.0783
Accuracy: 0.9847533702850342


In [18]:
def predict_message(msg, threshold=0.5):
    """Classify a single raw message as "spam" or "ham".

    The message goes through the same pipeline as the training data:
    clean_text, the fitted tokenizer, then padding/truncation to 100 ids.

    Args:
        msg: The raw message text.
        threshold: Cut-off on the model's sigmoid output; scores strictly
            above it are reported as spam. Defaults to 0.5.

    Returns:
        "spam" if the predicted score exceeds ``threshold``, otherwise "ham".
    """
    cleaned = clean_text(msg)
    seq = tokenizer.texts_to_sequences([cleaned])
    # Pad AND truncate on the same side as the training matrix. Without
    # truncating='post', pad_sequences uses its default ('pre') and keeps the
    # LAST 100 tokens of a long message instead of the first 100.
    padded = pad_sequences(seq, maxlen=100, padding='post', truncating='post')
    # predict returns one row per input with a single sigmoid output each.
    spam_score = model.predict(padded)[0][0]
    return "spam" if spam_score > threshold else "ham"

# Example: classify one hand-written message and print the predicted class.
sample_message = "Congratulations! You've won a free ticket to Bahamas!"
print(predict_message(sample_message))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
spam
