In [5]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense

# 1. Download UCI dataset manually:
#    Go to https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection
#    Click “Download” → get the file “SMSSpamCollection”

# 2. Place it in your project, e.g. data/SMSSpamCollection

file_path = os.path.join("datasets", "SMSSpamCollection")

# 3. Load it
df = pd.read_csv(file_path, sep='\t', header=None, names=['label', 'text'])
print(df.head())
print(df['label'].value_counts())


  label                                               text
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...
label
ham     4825
spam     747
Name: count, dtype: int64


In [6]:

# 4. Encode labels
df['label_num'] = df['label'].map({'ham': 0, 'spam': 1})

# 5. Split
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['label_num'], test_size=0.2, random_state=42
)

# 6. Tokenize + pad
max_words = 5000
max_len = 100

tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding='post')

# 7. Build a simple Keras model
model = Sequential([
    Embedding(input_dim=max_words, output_dim=32, input_length=max_len),
    GlobalAveragePooling1D(),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.summary()

# 8. Train
history = model.fit(X_train_pad, y_train,
                    epochs=5,
                    validation_split=0.2,
                    batch_size=32,
                    verbose=2)

# 9. Evaluate
loss, acc = model.evaluate(X_test_pad, y_test)
print(f"Test accuracy: {acc:.2f}")




Epoch 1/5
112/112 - 2s - 14ms/step - accuracy: 0.8679 - loss: 0.3772 - val_accuracy: 0.8576 - val_loss: 0.3757
Epoch 2/5
112/112 - 0s - 3ms/step - accuracy: 0.8676 - loss: 0.3526 - val_accuracy: 0.8576 - val_loss: 0.3620
Epoch 3/5
112/112 - 0s - 4ms/step - accuracy: 0.8676 - loss: 0.3303 - val_accuracy: 0.8565 - val_loss: 0.3265
Epoch 4/5
112/112 - 0s - 3ms/step - accuracy: 0.8679 - loss: 0.2779 - val_accuracy: 0.8688 - val_loss: 0.2574
Epoch 5/5
112/112 - 0s - 3ms/step - accuracy: 0.9035 - loss: 0.2032 - val_accuracy: 0.9496 - val_loss: 0.1887
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9516 - loss: 0.1806
Test accuracy: 0.95


In [11]:
sample = ["Your free ringtone is waiting to be collected. Simply text the password 'MIX' to 85069 to verify. Get Usher and Britney."]
sample_seq = tokenizer.texts_to_sequences(sample)
sample_pad = pad_sequences(sample_seq, maxlen=max_len, padding="post")

pred = model.predict(sample_pad)[0][0]
print("Spam" if pred > 0.5 else "Ham", "Confidence:", pred)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
Spam Confidence: 0.6093801
