In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score



In [None]:
# Toy corpus of (email text, label) pairs used for both training and testing.
# The labels are named so each row reads without a trailing comment.
SPAM, NOT_SPAM = 1, 0

data = [
    ("Congratulations! You won a free iPhone. Click here to claim.", SPAM),
    ("Hello John, are we still meeting tomorrow for lunch?", NOT_SPAM),
    ("Limited offer: Buy now and get 50% off on your purchase.", SPAM),
    ("Reminder: Your package will be delivered tomorrow.", NOT_SPAM),
    ("Earn money fast! Work from home and make $$$ easily!", SPAM),
    ("Hi Sarah, can you send me the report by Friday?", NOT_SPAM),
]



In [None]:
# Transpose the (text, label) pairs into two parallel tuples:
# `texts` holds every email body, `labels` the matching 1/0 class.
texts, labels = (tuple(column) for column in zip(*data))



In [None]:
# Convert text data into numerical features using CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(texts).toarray()  # Convert to feature matrix
feature_names = vectorizer.get_feature_names_out()


In [None]:
# Split data into training and testing sets.
# stratify=labels keeps the spam / not-spam proportions the same in both
# subsets. With a corpus this small, an unstratified shuffle can leave the
# training set short of (or entirely missing) one class depending on the seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    labels,
    test_size=0.33,
    random_state=42,  # fixed seed so the split is reproducible
    stratify=labels,
)


In [None]:
# Build a shallow Gini-impurity decision tree and fit it on the training rows.
clf = DecisionTreeClassifier(
    random_state=42,   # fixed seed for reproducible tie-breaking
    max_depth=3,       # cap the tree depth
    criterion="gini",
)
clf.fit(X_train, y_train)


In [None]:
# Predict a 1/0 label for every held-out email.
y_pred = clf.predict(X=X_test)


In [None]:
# Score the held-out predictions: fraction of test emails labelled correctly.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")


In [None]:
# Visualize the decision tree rules as indented text.
# `feature_names` comes from get_feature_names_out(), which returns a NumPy
# array. scikit-learn releases before 1.3 truth-test this argument inside
# export_text and raise "truth value of an array is ambiguous" on arrays,
# so hand it a plain list, which every release accepts.
tree_rules = export_text(clf, feature_names=list(feature_names))
print("\nDecision Tree Rules:\n")
print(tree_rules)


In [None]:
# Classify two unseen emails with the fitted vectorizer and tree.
new_emails = [
    "Win a free vacation by signing up today!",
    "Hi Mark, please review the attached document.",
]

# Reuse the already-fitted vocabulary (transform, not fit_transform) so the
# columns line up with what the classifier was trained on.
new_features = vectorizer.transform(new_emails).toarray()
predictions = clf.predict(new_features)

for email, pred in zip(new_emails, predictions):
    verdict = "Spam" if pred == 1 else "Not Spam"
    print(f"Email: \"{email}\" -> {verdict}")
