In [None]:
# import libraries
try:
  # %tensorflow_version only exists in Colab.
  !pip install tf-nightly
except Exception:
  pass
import tensorflow as tf
import pandas as pd
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
!pip install tensorflow-datasets
import tensorflow_datasets as tfds
import numpy as np
import matplotlib.pyplot as plt

print(tf.__version__)

In [None]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
!wget https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv

train_file_path = "train-data.tsv"
test_file_path = "valid-data.tsv"

In [None]:
# Read the data
# Both files are tab-separated and headerless; name the two columns explicitly.
tsv_options = dict(sep='\t', header=None, names=['label', 'message'])
train_data = pd.read_csv(train_file_path, **tsv_options)
test_data = pd.read_csv(test_file_path, **tsv_options)

In [None]:
# Explore the data: size of each split, then a preview of the first rows
for split_name, split_frame in (("Training", train_data), ("Testing", test_data)):
    print(f"{split_name} data shape: {split_frame.shape}")
print("\nSample of training data:")
print(train_data.head())

In [None]:
# Tally the two classes in the training split and report the spam share
label_is_ham = train_data['label'] == 'ham'
label_is_spam = train_data['label'] == 'spam'
ham_count = int(label_is_ham.sum())
spam_count = int(label_is_spam.sum())
labelled_total = ham_count + spam_count
print(f"\nHam messages: {ham_count}")
print(f"Spam messages: {spam_count}")
print(f"Percentage of spam messages: {spam_count / labelled_total * 100:.2f}%")

In [None]:
# Prepare the data
# Encode the string labels as integers: 'ham' -> 0, every other value -> 1
train_labels = (train_data['label'] != 'ham').astype(int).to_numpy()
test_labels = (test_data['label'] != 'ham').astype(int).to_numpy()

In [None]:
# Tokenize and pad the text
# --- text-preprocessing settings shared by the cells below ---
vocab_size = 10000       # upper bound on the number of words the tokenizer keeps
max_length = 100         # every sequence is padded/truncated to this many tokens
padding_type = 'post'    # pad at the end of the sequence
trunc_type = 'post'      # truncate at the end of the sequence
oov_tok = '<OOV>'        # placeholder token for out-of-vocabulary words

# The vocabulary is learned from the training messages only
tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)
tokenizer.fit_on_texts(train_data['message'])

In [None]:
# Get word index (the tokenizer's word -> integer id mapping) and report its size
word_index = tokenizer.word_index
unique_word_total = len(word_index)
print(f"\nNumber of unique words: {unique_word_total}")

In [None]:
# Convert text to sequences of integer token ids, using the same fitted
# tokenizer for both splits
train_sequences, test_sequences = (
    tokenizer.texts_to_sequences(frame['message'])
    for frame in (train_data, test_data)
)

In [None]:
# Pad sequences
# Identical padding/truncation settings for both splits
pad_options = dict(maxlen=max_length, padding=padding_type, truncating=trunc_type)
train_padded = pad_sequences(train_sequences, **pad_options)
test_padded = pad_sequences(test_sequences, **pad_options)

In [None]:
# Check the shapes of the padded arrays
shape_report = (
    f"\nTraining sequences shape: {train_padded.shape}",
    f"Testing sequences shape: {test_padded.shape}",
)
print(*shape_report, sep="\n")

In [None]:
# Build the model
embedding_dim = 16

model = tf.keras.Sequential([
    # Declare the input shape with an explicit Input instead of the deprecated
    # Embedding(input_length=...) argument; this also builds the model up
    # front so model.summary() below can report shapes and parameter counts.
    tf.keras.Input(shape=(max_length,)),
    tf.keras.layers.Embedding(vocab_size, embedding_dim),
    # Average the token embeddings over the sequence axis
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    # Single sigmoid unit: the output is read as the spam probability downstream
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.summary()

In [None]:
# Train the model
epochs = 30
history = model.fit(
    train_padded, train_labels,
    epochs=epochs,
    validation_data=(test_padded, test_labels),
    verbose=1,
    callbacks=[
        # Stop once val_loss has not improved for 3 epochs, and roll the model
        # back to the best epoch's weights; without restore_best_weights the
        # model would keep the weights from the last (worse) epoch.
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=3,
            restore_best_weights=True,
        )
    ]
)


In [None]:
# Plot training history: accuracy on the left, loss on the right
fig, (accuracy_ax, loss_ax) = plt.subplots(1, 2, figsize=(12, 5))

# (axes, history key, panel title, y-axis label, legend position)
panels = (
    (accuracy_ax, 'accuracy', 'Model Accuracy', 'Accuracy', 'lower right'),
    (loss_ax, 'loss', 'Model Loss', 'Loss', 'upper right'),
)
for ax, metric, panel_title, y_label, legend_loc in panels:
    ax.plot(history.history[metric])
    ax.plot(history.history[f'val_{metric}'])
    ax.set_title(panel_title)
    ax.set_ylabel(y_label)
    ax.set_xlabel('Epoch')
    ax.legend(['Train', 'Validation'], loc=legend_loc)

fig.tight_layout()
plt.show()

In [None]:
# Evaluate the model on test data
evaluation = model.evaluate(test_padded, test_labels)
loss, accuracy = evaluation  # loss first, then the compiled 'accuracy' metric
accuracy_percent = accuracy * 100
print(f"\nTest Accuracy: {accuracy_percent:.2f}%")

In [None]:
# Function to predict if a message is spam or ham
def predict_message(message):
    """
    Predicts if a message is spam or ham.

    The message goes through the same tokenizer and padding settings used for
    the training data before it is passed to the trained model.

    Args:
        message (str): The SMS message to classify

    Returns:
        list: [probability_of_spam, "spam" or "ham"], where the label is
        "spam" when the probability is strictly greater than 0.5.
    """
    # Tokenize the message (wrapped in a list: the tokenizer expects a batch)
    sequence = tokenizer.texts_to_sequences([message])
    padded = pad_sequences(sequence, maxlen=max_length, padding=padding_type, truncating=trunc_type)

    # Make prediction: one row per input, one sigmoid output per row.
    # verbose=0 keeps the progress bar out of the output when called in a loop.
    prediction = float(model.predict(padded, verbose=0)[0][0])

    # Return the result
    return [prediction, "spam" if prediction > 0.5 else "ham"]

In [None]:
# Test the function with some examples (two everyday texts, two spam-like ones)
test_messages = [
    "Hey, how are you doing?",
    "Congratulations! You've won a $1000 gift card. Call 555-123-4567 to claim now!",
    "Don't forget about our meeting tomorrow.",
    "URGENT: Your account has been compromised. Reply with your details to secure."
]

for message in test_messages:
    prediction = predict_message(message)
    spam_probability, verdict = prediction
    # One print per message: two report lines followed by a blank separator line
    print(
        f"Message: '{message}'",
        f"Prediction: {verdict} (probability: {spam_probability:.4f})",
        "",
        sep="\n",
    )


In [None]:
# Show confusion matrix
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [None]:
# Get predictions for test data
y_pred = model.predict(test_padded)
# Threshold the probabilities at 0.5 and flatten to a 1-D vector of 0/1 class ids
y_pred_classes = (y_pred.reshape(-1) > 0.5).astype(int)

In [None]:
# Create confusion matrix (true labels vs. thresholded predictions) and draw it
cm = confusion_matrix(y_true=test_labels, y_pred=y_pred_classes)
class_names = ['ham', 'spam']  # display order matches label ids 0 and 1
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
disp.plot(cmap=plt.cm.Blues)
disp.ax_.set_title('Confusion Matrix')
plt.show()

In [None]:
# Final visualization: Display some misclassified examples
# Positions in the test set where the predicted class differs from the true label
misclassified_indices = np.where(y_pred_classes != test_labels)[0]

if len(misclassified_indices) > 0:
    print("\nSome misclassified examples:")
    # Show at most five, drawn at random without repeats
    sample_size = min(5, len(misclassified_indices))
    for i in np.random.choice(misclassified_indices, sample_size, replace=False):
        message = test_data['message'].iloc[i]
        true_label = 'ham' if test_labels[i] == 0 else 'spam'
        pred_label = 'ham' if y_pred_classes[i] == 0 else 'spam'
        # y_pred holds the spam probability; flip it when the predicted class is ham
        confidence = y_pred[i][0] if pred_label == 'spam' else 1 - y_pred[i][0]

        # Truncate long messages for display. The conditional must sit outside
        # the string literal; inside it, the "if ... else ..." text is printed
        # verbatim instead of being evaluated.
        shown_message = f"{message[:100]}..." if len(message) > 100 else message
        print(f"Message: '{shown_message}'")
        print(f"True label: {true_label}")
        print(f"Predicted label: {pred_label} (confidence: {confidence:.4f})")
        print()
else:
    print("\nNo misclassified examples in the test set!")

In [None]:
# function to predict messages based on model
# (should return list containing prediction and label, ex. [0.008318834938108921, 'ham'])
def predict_message(pred_text):
  """Classify one SMS message as ham or spam with the trained model.

  Args:
    pred_text (str): The raw message text to classify.

  Returns:
    list: [spam_probability, label], where label is "spam" when the
    probability is strictly greater than 0.5 and "ham" otherwise.
  """
  # Apply the same tokenizer and padding settings used for the training data
  # (the message is wrapped in a list because the tokenizer expects a batch).
  sequence = tokenizer.texts_to_sequences([pred_text])
  padded = pad_sequences(sequence, maxlen=max_length, padding=padding_type, truncating=trunc_type)

  # One row per input, one sigmoid output per row; verbose=0 silences the
  # per-call progress bar.
  spam_probability = float(model.predict(padded, verbose=0)[0][0])
  label = "spam" if spam_probability > 0.5 else "ham"
  prediction = [spam_probability, label]

  return (prediction)

# Quick manual check: classify one sample message and print the returned value
pred_text = "how are you doing today?"

prediction = predict_message(pred_text)
print(prediction)

In [None]:
# Run this cell to test your function and model. Do not modify contents.
def test_predictions():
  """Check predict_message against seven fixed messages with known answers.

  Calls predict_message on each message and compares element [1] of the
  result with the expected label. Prints the success message only when
  every label matches; otherwise prints the "keep trying" message.
  """
  test_messages = ["how are you doing today",
                   "sale today! to stop texts call 98912460324",
                   "i dont want to go. can we try it a different day? available sat",
                   "our new mobile video service is live. just install on your phone to start watching.",
                   "you have won £1000 cash! call to claim your prize.",
                   "i'll bring it tomorrow. don't forget the milk.",
                   "wow, is your arm alright. that happened to me one time too"
                  ]

  # Expected label for each message above, in the same order
  test_answers = ["ham", "spam", "ham", "spam", "spam", "ham", "ham"]
  passed = True

  # Every message is evaluated; a single mismatch marks the whole run as failed
  for msg, ans in zip(test_messages, test_answers):
    prediction = predict_message(msg)
    if prediction[1] != ans:
      passed = False

  if passed:
    print("You passed the challenge. Great job!")
  else:
    print("You haven't passed yet. Keep trying.")

# Run the check immediately when this cell executes
test_predictions()
