In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd
import numpy as np

# Load CSV
df = pd.read_csv("spam.csv", encoding="latin-1")  # adjust path if needed
df = df.rename(columns={df.columns[0]: "label", df.columns[1]: "text"})
df = df[["text", "label"]]

# Convert labels to 0/1
df["label"] = df["label"].astype(str).str.lower().apply(lambda x: 1 if "spam" in x else 0)

# Tokenizer
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(df["text"])
sequences = tokenizer.texts_to_sequences(df["text"])

# Pad
max_length = 20
padded = pad_sequences(sequences, maxlen=max_length, padding='post')

labels = np.array(df["label"])

# Model
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=16, input_length=max_length),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(padded, labels, epochs=5, verbose=2)

# Test
msg = ["Get free stuff now!"]
seq = tokenizer.texts_to_sequences(msg)
pad = pad_sequences(seq, maxlen=max_length, padding='post')
pred = model.predict(pad)[0][0]

print("Spam probability:", pred)
print("Spam" if pred > 0.5 else "Not Spam")


Epoch 1/5




175/175 - 2s - 12ms/step - accuracy: 0.9180 - loss: 0.2289
Epoch 2/5
175/175 - 1s - 6ms/step - accuracy: 0.9869 - loss: 0.0468
Epoch 3/5
175/175 - 1s - 7ms/step - accuracy: 0.9950 - loss: 0.0181
Epoch 4/5
175/175 - 1s - 3ms/step - accuracy: 0.9982 - loss: 0.0076
Epoch 5/5
175/175 - 1s - 3ms/step - accuracy: 0.9995 - loss: 0.0036
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step
Spam probability: 0.33414936
Not Spam
