In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import warnings
# Silence every warning for the rest of the kernel session.
# NOTE(review): this blanket filter also hides deprecation warnings raised by
# the libraries used below -- consider narrowing it to a specific category.
warnings.filterwarnings('ignore')



In [15]:
# Toy corpus of hand-written emails. Every entry is a (text, label) pair and
# the label is one of the two constants below (1 = spam, 0 = not spam).
SPAM, NOT_SPAM = 1, 0

data = [
    ("Congratulations! You won a free iPhone. Click here to claim.", SPAM),
    ("Hello John, are we still meeting tomorrow for lunch?", NOT_SPAM),
    ("Limited offer: Buy now and get 50% off on your purchase.", SPAM),
    ("Reminder: Your package will be delivered tomorrow.", NOT_SPAM),
    ("Earn money fast! Work from home and make $$$ easily!", SPAM),
    ("Hi Sarah, can you send me the report by Friday?", NOT_SPAM),
    ("Exclusive deal! Get a free gift card with every order.", SPAM),
    ("Your Amazon order has been shipped. Track your delivery here.", NOT_SPAM),
    ("Claim your reward points now. Limited time only!", SPAM),
    ("Meeting scheduled for 3 PM tomorrow. Please confirm.", NOT_SPAM),
    ("Congratulations on your promotion! Join us for a party.", NOT_SPAM),
    ("Free trial for premium membership. Sign up today!", SPAM),
    ("Your account password was successfully changed.", NOT_SPAM),
    ("Final notice: Your subscription is about to expire!", SPAM),
    ("Hello, here’s the document you requested.", NOT_SPAM),
    ("Win a trip to Paris! Enter the contest now!", SPAM),
    ("Your invoice for the last purchase is attached.", NOT_SPAM),
    ("Last chance to claim your exclusive offer. Act now!", SPAM),
    ("Don’t miss out on our holiday sale! Discounts up to 70%.", SPAM),
    ("Reminder: The team meeting starts in 10 minutes.", NOT_SPAM),
]

In [16]:
# Pull the (text, label) pairs apart into two parallel tuples
texts = tuple(text for text, _ in data)
labels = tuple(label for _, label in data)

In [17]:
# Turn each email into a bag-of-words count vector. The vocabulary is learned
# from `texts`, and `.toarray()` converts the sparse result to a dense array.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(texts).toarray()

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is its replacement. Fall back to the old method
# only when the new one is missing so this cell runs on old and new releases.
if hasattr(vectorizer, "get_feature_names_out"):
    # The new method returns an ndarray; convert it so `feature_names` stays a
    # plain list of str, matching what `get_feature_names()` used to return.
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()

In [18]:
# Hold out roughly a third of the emails for evaluation; the fixed seed keeps
# the partition the same on every run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    labels,
    test_size=0.33,
    random_state=42,
)

In [19]:
# Build a depth-limited decision tree (Gini impurity, fixed seed) and fit it
# on the training split. The bare `fit` call is left as the final expression
# so the cell still displays the fitted estimator.
tree_params = {"criterion": "gini", "max_depth": 3, "random_state": 42}
clf = DecisionTreeClassifier(**tree_params)
clf.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=3, random_state=42)

In [20]:
# Predict a label (1 = spam, 0 = not spam) for every email in the held-out split
y_pred = clf.predict(X_test)

In [21]:
# Evaluate the model: fraction of held-out emails whose predicted label
# matches the true label, printed with two decimal places
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.71


In [22]:
# Dump the fitted tree as indented text rules, with each split labelled by
# its vocabulary term instead of a column index
tree_rules = export_text(clf, feature_names=feature_names)
# A single print with a newline separator emits the heading and the rules
# exactly as two consecutive prints would.
print("\nDecision Tree Rules:\n", tree_rules, sep="\n")


Decision Tree Rules:

|--- get <= 0.50
|   |--- to <= 0.50
|   |   |--- easily <= 0.50
|   |   |   |--- class: 0
|   |   |--- easily >  0.50
|   |   |   |--- class: 1
|   |--- to >  0.50
|   |   |--- class: 1
|--- get >  0.50
|   |--- class: 1



In [24]:
# Classify two unseen emails with the already-fitted vectorizer and tree
new_emails = [
    "Win Big a free vacation to Maldives by signing up today!",
    "Hi Mark, please review the attached document."
]

# `transform` (not `fit_transform`) reuses the vocabulary learned earlier, so
# the columns line up with the features the tree was trained on.
new_counts = vectorizer.transform(new_emails)
new_features = new_counts.toarray()
predictions = clf.predict(new_features)

# Report each email next to its predicted class
for idx, email in enumerate(new_emails):
    pred = predictions[idx]
    print(f"Email: \"{email}\" -> {'Spam' if pred == 1 else 'Not Spam'}")

Email: "Win Big a free vacation to Maldives by signing up today!" -> Spam
Email: "Hi Mark, please review the attached document." -> Not Spam
