In [3]:
import pandas as pd
import numpy as np
import spacy
from tqdm.auto import tqdm

# Scikit-learn for splitting data
from sklearn.model_selection import train_test_split

# TensorFlow/Keras for building the LSTM model
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, SpatialDropout1D
from tensorflow.keras.callbacks import EarlyStopping



In [4]:

# Location of the raw spam/ham CSV. Kept in one place so the read and the
# error message below can never disagree about where the file is expected.
DATA_PATH = '/content/spam_ham_dataset.csv'

try:
    # Only the read itself is guarded, so an unrelated failure further down
    # (e.g. a missing column) is not misreported as a missing file.
    raw_df = pd.read_csv(DATA_PATH)
except FileNotFoundError:
    print(f"Error: '{DATA_PATH}' not found.")
    print("Please upload the dataset to that path (or update DATA_PATH) and re-run this cell.")
    # Re-raise instead of calling exit(): inside a notebook exit() shuts the
    # kernel down, whereas an exception just stops this cell / "Run All"
    # and keeps the traceback visible.
    raise

print("Dataset loaded successfully.")
print("Dataset shape:", raw_df.shape)
print("Columns:", raw_df.columns)

# Keep only the message text and its numeric label; the numeric column is
# renamed to 'label' because later cells read df['label'].
df = raw_df[['text', 'label_num']].rename(columns={'label_num': 'label'})
print("\nFirst 5 rows of the cleaned dataset:")
print(df.head())

Dataset loaded successfully.
Dataset shape: (5171, 4)
Columns: Index(['Unnamed: 0', 'label', 'text', 'label_num'], dtype='object')

First 5 rows of the cleaned dataset:
                                                text  label
0  Subject: enron methanol ; meter # : 988291\r\n...      0
1  Subject: hpl nom for january 9 , 2001\r\n( see...      0
2  Subject: neon retreat\r\nho ho ho , we ' re ar...      0
3  Subject: photoshop , windows , office . cheap ...      1
4  Subject: re : indian springs\r\nthis deal is t...      0


In [5]:
# Register the tqdm-backed `progress_apply` on pandas objects; the
# preprocessing step further down uses it to show a progress bar.
tqdm.pandas()

# Load the spaCy model, downloading it first when the initial load fails.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # The OSError is treated here as "model package not installed yet".
    print("Downloading spaCy model 'en_core_web_sm'...")
    from spacy.cli import download
    download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")
    print("spaCy model downloaded and loaded successfully.")
else:
    print("spaCy model loaded successfully.")


def preprocess_text(text):
    """
    Normalize one message with the module-level spaCy pipeline `nlp`.

    The text is lowercased and run through spaCy; stop words, punctuation
    and whitespace tokens are discarded, and the lemmas of the remaining
    tokens are joined with single spaces.

    Returns an empty string for any input that is not a `str`.
    """
    if isinstance(text, str):
        kept_lemmas = []
        for tok in nlp(text.lower()):
            # Skip tokens that carry no content for the classifier.
            if tok.is_stop or tok.is_punct or tok.is_space:
                continue
            kept_lemmas.append(tok.lemma_)
        return " ".join(kept_lemmas)

    # Non-string cells (e.g. NaN read from the CSV) become empty text.
    return ""

print("\nStarting text preprocessing with spaCy. This may take a few minutes...")
# progress_apply behaves like Series.apply but reports progress through tqdm;
# it is available because tqdm.pandas() was called earlier in this cell.
cleaned_texts = df['text'].progress_apply(preprocess_text)
df['processed_text'] = cleaned_texts
print("Text preprocessing complete.")
print("\nDataset with processed text:")
print(df.head())

spaCy model loaded successfully.

Starting text preprocessing with spaCy. This may take a few minutes...


  0%|          | 0/5171 [00:00<?, ?it/s]

Text preprocessing complete.

Dataset with processed text:
                                                text  label  \
0  Subject: enron methanol ; meter # : 988291\r\n...      0   
1  Subject: hpl nom for january 9 , 2001\r\n( see...      0   
2  Subject: neon retreat\r\nho ho ho , we ' re ar...      0   
3  Subject: photoshop , windows , office . cheap ...      1   
4  Subject: re : indian springs\r\nthis deal is t...      0   

                                      processed_text  
0  subject enron methanol meter 988291 follow not...  
1  subject hpl nom january 9 2001 attach file hpl...  
2  subject neon retreat ho ho ho wonderful time y...  
3  subject photoshop window office cheap main tre...  
4  subject indian spring deal book teco pvr reven...  


In [6]:
# --- 4. Prepare Data for the LSTM Model ---

# Model hyperparameters (also read by the model-building cell)
VOCAB_SIZE = 5000    # vocabulary cap handed to the tokenizer and the embedding
MAX_LENGTH = 150     # every sequence is padded / truncated to this many tokens
EMBEDDING_DIM = 64   # width of each learned word vector

# Build the word -> integer index from the cleaned corpus; '<OOV>' is the
# placeholder token for words outside the fitted vocabulary.
# NOTE(review): the tokenizer is fit on the full dataset, i.e. before the
# train/test split performed later, so the vocabulary also sees test text.
corpus = df['processed_text']
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token='<OOV>')
tokenizer.fit_on_texts(corpus)

# Encode each message as a list of integer word ids.
sequences = tokenizer.texts_to_sequences(corpus)

# Pad short messages and cut long ones at the end so all rows share MAX_LENGTH.
padded_sequences = pad_sequences(
    sequences,
    maxlen=MAX_LENGTH,
    padding='post',
    truncating='post',
)

In [7]:
import pickle

# Persist the fitted tokenizer so the Streamlit app can reuse the exact same
# word index at inference time (app.py reads this file back with pickle).
with open("tokenizer.pkl", "wb") as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)


In [8]:
# --- 5. Split Data into Training and Testing Sets ---
X, y = padded_sequences, df['label'].values

# Hold out 20% of the rows for testing. `stratify=y` asks for the same label
# proportions in both parts, and the fixed random_state makes the split
# repeatable.
split_parts = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test = split_parts

print(f"\nTraining data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")


Training data shape: (4136, 150)
Testing data shape: (1035, 150)


In [9]:
# --- 6. Build the LSTM Model ---
print("\nBuilding the LSTM model...")

# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation and
# dropout masks are repeatable between runs (GPU kernels may still introduce
# small nondeterminism).
tf.keras.utils.set_random_seed(42)

model = Sequential([
    # 0. Explicit input spec: one row of MAX_LENGTH token ids. This replaces the
    #    deprecated `input_length` argument of Embedding and lets the model be
    #    built immediately, so model.summary() shows real shapes before training.
    tf.keras.Input(shape=(MAX_LENGTH,)),

    # 1. Embedding Layer: turns integer word ids into dense vectors of size EMBEDDING_DIM.
    Embedding(input_dim=VOCAB_SIZE, output_dim=EMBEDDING_DIM),

    # 2. SpatialDropout1D: regularization that drops entire embedding channels.
    SpatialDropout1D(0.3),

    # 3. LSTM Layer: the core sequence model. `return_sequences` is left at its
    #    default, so only the final step's output is passed on.
    #    NOTE(review): a non-zero recurrent_dropout is documented to rule out the
    #    fused cuDNN kernel, which may explain the slow epochs - confirm before tuning.
    LSTM(128, dropout=0.3, recurrent_dropout=0.3),

    # 4. Dense Layer: a standard fully connected layer, followed by dropout.
    Dense(64, activation='relu'),
    Dropout(0.5),

    # 5. Output Layer: a single sigmoid unit for binary classification.
    Dense(1, activation='sigmoid')
])









Building the LSTM model...




In [10]:
# Compile the model: Adam optimizer, binary cross-entropy loss (matching the
# single sigmoid output), and accuracy reported during fit/evaluate.
compile_settings = {
    'optimizer': 'adam',
    'loss': 'binary_crossentropy',
    'metrics': ['accuracy'],
}
model.compile(**compile_settings)

In [11]:
# Print the layer-by-layer architecture of the compiled model.
model.summary()

In [12]:



# --- 7. Train the Model ---
print("\nTraining the model...")

# Stop once validation accuracy has gone 3 epochs without improving, and
# roll the model back to the weights of its best epoch.
early_stopping = EarlyStopping(monitor='val_accuracy', patience=3, restore_best_weights=True, verbose=1)

fit_options = dict(
    epochs=20,                    # upper bound; early stopping usually ends sooner
    batch_size=64,
    validation_split=0.1,         # 10% of the training data is held out for validation
    callbacks=[early_stopping],
    verbose=1,
)
history = model.fit(X_train, y_train, **fit_options)


# --- 8. Evaluate the Model ---
print("\nEvaluating the model on the test set...")
# evaluate() returns the loss followed by the compiled metrics (accuracy here).
test_metrics = model.evaluate(X_test, y_test, verbose=0)
loss, accuracy = test_metrics
print(f"Test Accuracy: {accuracy*100:.2f}%")
print(f"Test Loss: {loss:.4f}")


Training the model...
Epoch 1/20
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 517ms/step - accuracy: 0.6962 - loss: 0.6383 - val_accuracy: 0.7947 - val_loss: 0.5106
Epoch 2/20
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 516ms/step - accuracy: 0.8226 - loss: 0.4633 - val_accuracy: 0.8599 - val_loss: 0.3989
Epoch 3/20
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 512ms/step - accuracy: 0.8308 - loss: 0.4654 - val_accuracy: 0.8768 - val_loss: 0.3814
Epoch 4/20
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 542ms/step - accuracy: 0.8555 - loss: 0.4203 - val_accuracy: 0.8913 - val_loss: 0.3431
Epoch 5/20
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 507ms/step - accuracy: 0.8729 - loss: 0.3897 - val_accuracy: 0.8937 - val_loss: 0.3422
Epoch 6/20
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 538ms/step - accuracy: 0.8767 - loss: 0.3900 - val_accuracy: 0.8865 - val_loss: 0.3340

In [13]:
# Save the trained model to the working directory; app.py loads this file.
model.save("spam_detector_model.keras")


In [19]:
%%writefile app.py
import streamlit as st
import tensorflow as tf
import pickle
import re
from tensorflow.keras.preprocessing.sequence import pad_sequences

# ---------------- Load Model ----------------
MODEL_PATH = "/content/spam_detector_model.keras"
model = tf.keras.models.load_model(MODEL_PATH)

# Load tokenizer (make sure you saved it earlier as tokenizer.pkl)
with open("/content/tokenizer.pkl", "rb") as f:
    tokenizer = pickle.load(f)

# Set max length same as training
MAX_LENGTH = 100  # change to whatever you used in training

# ---------------- Preprocessing ----------------
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    return text.strip()

# ---------------- Prediction Function ----------------
def predict_spam(text):
    processed_text = preprocess_text(text)
    sequence = tokenizer.texts_to_sequences([processed_text])
    padded_sequence = pad_sequences(sequence, maxlen=MAX_LENGTH, padding="post", truncating="post")
    prediction = model.predict(padded_sequence, verbose=0)[0][0]
    return prediction

# ---------------- Streamlit UI ----------------
st.set_page_config(page_title="Spam Detector", page_icon="📩")

st.title("📩 Spam Detector")
st.write("Enter a message below to check if it's **SPAM** or **HAM**.")

user_input = st.text_area("Message:", height=150)
predict_clicked = st.button("Predict")

if predict_clicked and not user_input.strip():
    # Nothing but whitespace was entered: ask for input instead of scoring it.
    st.warning("⚠️ Please enter a message first.")
elif predict_clicked:
    score = predict_spam(user_input)
    # Scores above 0.5 are reported as spam, everything else as ham.
    is_spam = score > 0.5
    if is_spam:
        st.error(f"🚨 This looks like **SPAM** (score: {score:.4f})")
    else:
        st.success(f"✅ This looks like **HAM** (score: {score:.4f})")


Overwriting app.py


In [20]:
import os
from getpass import getpass

from pyngrok import ngrok

# Kill any existing ngrok processes
ngrok.kill()

# Never hardcode the ngrok authtoken in the notebook: read it from the
# NGROK_AUTHTOKEN environment variable, or prompt for it without echoing.
# NOTE: a token was previously committed in this cell; it should be revoked
# in the ngrok dashboard and replaced.
ngrok_auth_token = os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: ")
if not ngrok_auth_token:
    raise RuntimeError("An ngrok authtoken is required to open the tunnel.")
ngrok.set_auth_token(ngrok_auth_token)

# Set up the Ngrok tunnel to the Streamlit app
# The port number should be included in the 'addr' argument
public_url = ngrok.connect(addr='http://localhost:8501')
print(f"Streamlit app is live at: {public_url}")

Streamlit app is live at: NgrokTunnel: "https://c3272b39857d.ngrok-free.app" -> "http://localhost:8501"


In [21]:
# Start the Streamlit server for app.py in the background (trailing `&`);
# the ngrok tunnel opened above forwards to its port 8501.
!streamlit run app.py &


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.147.73.98:8501[0m
[0m
2025-09-22 16:08:41.440515: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1758557321.463542    8047 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1758557321.470645    8047 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1758557321.489181    8047 computation_placer.cc:177] computation placer already registe

In [17]:
!pip install pyngrok

Collecting pyngrok
  Downloading pyngrok-7.3.0-py3-none-any.whl.metadata (8.1 kB)
Downloading pyngrok-7.3.0-py3-none-any.whl (25 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.3.0


In [18]:
!pip install streamlit

Collecting streamlit
  Downloading streamlit-1.49.1-py3-none-any.whl.metadata (9.5 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.49.1-py3-none-any.whl (10.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m80.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m103.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pydeck, streamlit
Successfully installed pydeck-0.9.1 streamlit-1.49.1
