In [1]:
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd

# --- 1. Generate Synthetic Data (Replace with your real data) ---
# Imagine each message has 3 features, e.g., (number of suspicious words, length of message, presence of link)
# labels: 0 for "not spam", 1 for "spam"
np.random.seed(42)
num_samples = 1000

# Features for 'not spam' (e.g., low suspicious words, medium length, no link)
not_spam_features = np.random.rand(num_samples // 2, 3) * [5, 50, 0.1]
not_spam_labels = np.zeros(num_samples // 2)

2025-06-07 18:31:10.288032: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [13]:
not_spam_labels

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

In [38]:
not_spam_features

array([[2.39837793e-01, 4.41024093e+01, 4.44393819e-02],
       [2.94471824e+00, 6.95939014e+00, 3.38784798e-04],
       [1.22645047e+00, 3.52293016e+01, 9.91945301e-03],
       ...,
       [6.90138045e-01, 3.36695790e+01, 5.96138062e-02],
       [1.47232049e+00, 6.82994172e-01, 7.76479154e-03],
       [1.95847388e+00, 7.69871756e+00, 3.76819606e-02]])

In [3]:
# Features for 'spam' (e.g., high suspicious words, varied length, often has link)
spam_features = np.random.rand(num_samples // 2, 3) * [15, 100, 1.0] + [5, 20, 0] # Shift values
spam_labels = np.ones(num_samples // 2)

X = np.vstack((not_spam_features, spam_features))
y = np.hstack((not_spam_labels, spam_labels))

In [4]:
# Shuffle the data
indices = np.arange(num_samples)
np.random.shuffle(indices)
X = X[indices]
y = y[indices]

In [5]:
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")
print(f"First 5 X values:\n{X[:5]}")
print(f"First 5 y values: {y[:5]}")

X shape: (1000, 3)
y shape: (1000,)
First 5 X values:
[[6.32577420e+00 8.98948828e+01 3.68347214e-01]
 [9.10589578e-01 3.94849254e+01 6.58707776e-02]
 [1.72158017e+01 4.80334753e+01 1.32179658e-01]
 [1.09927818e+01 5.26794706e+01 1.08110836e-02]
 [1.19751428e+01 1.04738793e+02 6.23940686e-02]]
First 5 y values: [1. 0. 1. 1. 1.]


In [6]:
# --- 2. Data Preprocessing ---
# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features (important for many neural networks)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [15]:
X_train

array([[9.12858772e+00, 9.09438865e+01, 2.71492003e-01],
       [4.13993340e-01, 3.01576055e+01, 2.45349110e-02],
       [1.72595797e+01, 1.13039842e+02, 9.53609213e-02],
       ...,
       [2.43715599e+00, 4.47276113e+01, 7.99855256e-02],
       [3.51242042e+00, 1.79745576e+01, 2.93591844e-02],
       [6.54263888e+00, 3.25955196e+01, 9.79151555e-01]])

In [14]:
X_train_scaled

array([[ 0.27091861,  1.42913762,  0.03834201],
       [-1.2163859 , -0.50353113, -0.80283951],
       [ 1.65862073,  2.13166725, -0.56159302],
       ...,
       [-0.87109628, -0.04028524, -0.61396447],
       [-0.68758279, -0.89088492, -0.78640715],
       [-0.17042073, -0.4260189 ,  2.44876131]])

In [7]:
# --- 3. Build the Classification Model ---
model_classification = keras.Sequential([
    keras.layers.Dense(32, activation='relu', input_shape=(X_train_scaled.shape[1],)), # Input layer + 1st hidden layer
    keras.layers.Dropout(0.2), # Dropout for regularization
    keras.layers.Dense(16, activation='relu'), # 2nd hidden layer
    keras.layers.Dense(1, activation='sigmoid') # Output layer for binary classification (sigmoid for probability)
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [8]:
# --- 4. Compile the Model ---
model_classification.compile(
    optimizer='adam', # Adam is a popular optimizer
    loss='binary_crossentropy', # Appropriate loss for binary classification
    metrics=['accuracy'] # Metric to monitor during training
)

model_classification.summary()

In [9]:
# --- 5. Train the Model ---
print("\n--- Training Classification Model ---")
history_classification = model_classification.fit(
    X_train_scaled, y_train,
    epochs=20, # Number of epochs
    batch_size=32, # Batch size
    validation_split=0.2 # Use part of training data for validation
)


--- Training Classification Model ---
Epoch 1/20
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.7197 - loss: 0.6261 - val_accuracy: 0.9563 - val_loss: 0.5321
Epoch 2/20
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9610 - loss: 0.5137 - val_accuracy: 0.9625 - val_loss: 0.4277
Epoch 3/20
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9652 - loss: 0.4162 - val_accuracy: 0.9688 - val_loss: 0.3256
Epoch 4/20
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9702 - loss: 0.3030 - val_accuracy: 0.9688 - val_loss: 0.2350
Epoch 5/20
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.9805 - loss: 0.2182 - val_accuracy: 0.9688 - val_loss: 0.1682
Epoch 6/20
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9788 - loss: 0.1496 - val_accuracy: 0.9750 - val_loss: 0.1247


In [10]:
# --- 6. Evaluate the Model ---
print("\n--- Evaluating Classification Model ---")
loss_classification, accuracy_classification = model_classification.evaluate(X_test_scaled, y_test)
print(f"Test Loss (Classification): {loss_classification:.4f}")
print(f"Test Accuracy (Classification): {accuracy_classification:.4f}")


--- Evaluating Classification Model ---
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 1.0000 - loss: 0.0082  
Test Loss (Classification): 0.0069
Test Accuracy (Classification): 1.0000


In [11]:
# --- 7. Make Predictions ---
print("\n--- Making Predictions (Classification) ---")
# Example new messages (scaled)
new_messages = np.array([
    [3, 30, 0],   # Likely not spam
    [18, 80, 0.9], # Likely spam
    [7, 60, 0.3]   # Borderline
])
new_messages_scaled = scaler.transform(new_messages)

predictions_classification = model_classification.predict(new_messages_scaled)
print("Raw predictions (probabilities):")
print(predictions_classification)

# Convert probabilities to classes (0 or 1)
predicted_classes = (predictions_classification > 0.5).astype(int)
print("\nPredicted classes:")
for i, pred in enumerate(predicted_classes):
    status = "Spam" if pred[0] == 1 else "Not Spam"
    print(f"Message {i+1}: {status} (Raw prediction: {predictions_classification[i,0]:.4f})")



--- Making Predictions (Classification) ---
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step
Raw predictions (probabilities):
[[0.00335276]
 [0.9999979 ]
 [0.97797424]]

Predicted classes:
Message 1: Not Spam (Raw prediction: 0.0034)
Message 2: Spam (Raw prediction: 1.0000)
Message 3: Spam (Raw prediction: 0.9780)
