In [13]:
import os
import pandas as pd  
from sklearn.feature_extraction.text import TfidfTransformer, HashingVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

In [14]:
# Step 1: Load the dataset 
def load_emails_from_folder(folder_path):
    emails = []
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r', encoding='latin1') as file:
            emails.append(file.read())
    return emails

In [15]:
# Adjust these paths to point to your dataset directories
spam_path = "./spamassassin-public-corpus/spam"
ham_path = "./spamassassin-public-corpus/ham"

In [16]:
# Step 2: Load emails and assign labels
spam_emails = load_emails_from_folder(spam_path)
ham_emails = load_emails_from_folder(ham_path)

In [17]:
# Step 3: Create lists for email contents and labels
email_contents = spam_emails + ham_emails
email_labels = [0] * len(spam_emails) + [1] * len(ham_emails)

In [18]:
# Step 4: Perform train-test split (80%-20%)
X_train, X_test, y_train, y_test = train_test_split(
    email_contents, email_labels, test_size=0.2, random_state=42)

In [19]:
# Step 5: Train a DecisionTreeClassifier with HashingVectorizer and TfidfTransformer
vectorizer = HashingVectorizer(n_features=2**12)  # Use a large number of features for sparse text
tfidf_transformer = TfidfTransformer()
classifier = DecisionTreeClassifier()

In [20]:
# Transform data
X_train_vec = tfidf_transformer.fit_transform(vectorizer.transform(X_train))
X_test_vec = tfidf_transformer.transform(vectorizer.transform(X_test))

In [21]:
# Train the classifier
classifier.fit(X_train_vec, y_train)

In [22]:
# Step 6: Evaluate the model
y_pred = classifier.predict(X_test_vec)

In [23]:
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

In [24]:
# Generate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)


In [43]:
# Print results
print("Accuracy of our model:", accuracy)
print("The Confusion Matrix is:\n", conf_matrix)

Accuracy of our model: 0.9773809523809524
The Confusion Matrix is:
 [[298   6]
 [ 13 523]]


[Read my article on Medium](https://medium.com/@umairm142/introduction-ee2512a061b6)

[Check out my GitHub repository](https://github.com/umairulmulk/Spam-Email-Detection)
