In [13]:
import os
import nltk
import random
from nltk.corpus import stopwords
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

# Custom function to download only if necessary and suppress output
def silent_nltk_download(package_name):
    """Quietly download an NLTK data package unless it is already installed.

    NLTK files its data under category folders (``tokenizers/punkt``,
    ``corpora/stopwords``, ...). Checking only ``tokenizers/`` never finds
    corpora such as ``stopwords`` and so re-triggers a download on every run;
    instead, each known category is probed before falling back to a download.

    Args:
        package_name: NLTK package identifier, e.g. ``'punkt'`` or ``'stopwords'``.
    """
    resource_categories = (
        'tokenizers', 'corpora', 'taggers', 'chunkers', 'stemmers',
        'sentiment', 'grammars', 'models', 'help', 'misc',
    )
    for category in resource_categories:
        try:
            nltk.data.find(f'{category}/{package_name}')
        except LookupError:
            continue  # not in this category; try the next one
        return  # already installed, nothing to download

    nltk.download(package_name, quiet=True)

# Ensure necessary NLTK packages are available
silent_nltk_download('punkt')
# Newer NLTK releases make word_tokenize load 'punkt_tab' instead of the
# pickled 'punkt' model; request both so tokenizing works on either version.
silent_nltk_download('punkt_tab')
silent_nltk_download('stopwords')

In [14]:
def load_emails(folder_path):
    """Recursively read every ``.txt`` file under ``folder_path`` as a labelled email.

    A file is labelled ``'spam'`` when one of the directory names on its path
    is exactly ``spam`` (case-insensitive); otherwise it is labelled ``'ham'``.
    Matching whole path components avoids mislabelling everything when some
    unrelated directory merely contains the substring, e.g. ``antispam/ham``.

    Args:
        folder_path: Root directory to walk.

    Returns:
        A list of ``(email_text, label)`` tuples. Directories and files are
        visited in sorted order so the result is reproducible across runs.
        A missing or empty folder yields an empty list.
    """
    emails = []
    for root, dirs, files in os.walk(folder_path):
        dirs.sort()  # in-place sort steers os.walk into a deterministic order
        path_parts = [part.lower() for part in os.path.normpath(root).split(os.sep)]
        label = 'spam' if 'spam' in path_parts else 'ham'
        for file in sorted(files):
            if not file.endswith(".txt"):
                continue
            # errors='ignore' drops undecodable bytes instead of aborting the load
            with open(os.path.join(root, file), 'r', encoding='utf-8', errors='ignore') as email_file:
                emails.append((email_file.read(), label))
    return emails

In [15]:
def extract_features(text):
    """Convert raw email text into a bag-of-words feature dictionary.

    The text is tokenized with ``word_tokenize``; purely alphabetic tokens are
    lower-cased and English stop words are removed.

    Args:
        text: The email body as a string.

    Returns:
        A dict mapping each remaining word to the number of times it occurs.
    """
    # Build the stop-word set once per call: evaluating stopwords.words()
    # inside the filter reloads the list and does a linear scan for every token.
    stop_words = set(stopwords.words('english'))
    tokens = [
        lowered
        for lowered in (word.lower() for word in word_tokenize(text) if word.isalpha())
        if lowered not in stop_words
    ]
    return dict(FreqDist(tokens))

In [16]:
# Load and shuffle the data
# Dataset root; override with the ENRON_DATA_DIR environment variable instead
# of editing this absolute path.
# NOTE(review): assumes the root holds sibling 'spam' and 'ham' folders - confirm.
DATA_DIR = os.environ.get('ENRON_DATA_DIR', r'C:\Phishing Detection\enron1')
RANDOM_SEED = 42  # fixed seed so the shuffle (and the train/test split) is reproducible

print("Loading and shuffling email data...")
# Load both classes: reading only the 'spam' folder gives every sample the
# same label, which makes the classifier and its accuracy meaningless.
email_data = (
    load_emails(os.path.join(DATA_DIR, 'spam'))
    + load_emails(os.path.join(DATA_DIR, 'ham'))
)
if len({label for _, label in email_data}) < 2:
    print(f"WARNING: expected both spam and ham emails under {DATA_DIR!r}; "
          "check the data directory before trusting the results.")
random.seed(RANDOM_SEED)
random.shuffle(email_data)

Loading and shuffling email data...


In [None]:
# Turn each (text, label) pair into a (feature-dict, label) pair
featuresets = [(extract_features(body), tag) for body, tag in email_data]

# Hold out the last 20% of the (already shuffled) examples for testing
train_size = int(len(featuresets) * 0.8)
train_set = featuresets[:train_size]
test_set = featuresets[train_size:]

# Fit NLTK's Naive Bayes model on the training portion
classifier = NaiveBayesClassifier.train(train_set)

# Report held-out accuracy, then the 15 features that best separate the labels
print(f'Accuracy: {accuracy(classifier, test_set) * 100:.2f}%')
classifier.show_most_informative_features(15)

In [None]:
# Example of classifying a new email
def classify_email(email_text):
    """Return the label the module-level ``classifier`` predicts for ``email_text``.

    The text goes through ``extract_features``, the same feature extraction
    used to build the training data.
    """
    return classifier.classify(extract_features(email_text))

In [None]:

# Smoke-test the trained classifier on a hand-written sample message
new_email = "You have won a free lottery! Claim your prize now."
print(f'The email is classified as: {classify_email(new_email)}')