In [61]:
import os
import nltk
import random
from nltk.data import find
from nltk.corpus import stopwords
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

def check_and_download(resource, category):
    """Download an NLTK resource only when it is not already installed.

    Args:
        resource: NLTK package identifier, e.g. ``'punkt'`` or ``'stopwords'``.
        category: Sub-directory of ``nltk_data`` that holds the package,
            e.g. ``'tokenizers'`` or ``'corpora'``.

    Returns:
        None. ``nltk.download(resource)`` is called as a side effect when the
        resource cannot be located under either probed name.
    """
    # A package can be present as an unpacked directory, as a ``.zip``
    # archive, or both. Probing only the ``.zip`` name misses an unpacked-only
    # install and triggers a needless download on every run.
    for candidate in (f'{category}/{resource}', f'{category}/{resource}.zip'):
        try:
            find(candidate)
        except LookupError:
            continue
        return  # already installed; nothing to do
    nltk.download(resource)

# Make sure the tokenizer models and the stopword list are installed,
# fetching each one only if it is missing
check_and_download(resource='punkt', category='tokenizers')
check_and_download(resource='stopwords', category='corpora')

# Now you can proceed with the rest of your code



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\19294\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [62]:
def load_emails(folder_path):
    """Read every ``.txt`` file under ``folder_path`` into ``(text, label)`` pairs.

    An email is labelled ``'spam'`` when the directory that contains it, taken
    relative to ``folder_path``, has ``'spam'`` somewhere in its path; every
    other email is labelled ``'ham'``.

    Args:
        folder_path: Root directory of the email corpus; searched recursively.

    Returns:
        A list with one ``(email_text, label)`` tuple per ``.txt`` file, in
        sorted directory/file order. Bytes that are not valid UTF-8 are
        dropped while reading.
    """
    emails = []
    for root, dirs, files in os.walk(folder_path):
        # Sorting in place fixes the traversal order, which os.walk otherwise
        # leaves up to the file system.
        dirs.sort()
        # Only the part of the path below folder_path may decide the label;
        # a 'spam' substring in the corpus location itself must not turn
        # every email into spam.
        relative_root = os.path.relpath(root, folder_path)
        label = 'spam' if 'spam' in relative_root else 'ham'
        for file in sorted(files):
            if file.endswith(".txt"):
                with open(os.path.join(root, file), 'r', encoding='utf-8', errors='ignore') as email_file:
                    emails.append((email_file.read(), label))
    return emails

# Load emails from the corpus directory. The location can be overridden with
# the EMAIL_DATA_DIR environment variable; the original path stays the default.
email_data = load_emails(os.environ.get('EMAIL_DATA_DIR', r'C:\Users\19294\Machine Learning PT'))
# Shuffle with a dedicated fixed-seed generator so the order of email_data
# (and anything derived from it, such as a train/test split) is the same on
# every run, without touching the global random state.
random.Random(42).shuffle(email_data)




In [None]:
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the English stopwords as a frozenset, reading the corpus only once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def extract_features(text):
    """Turn raw email text into a bag-of-words feature dictionary.

    The text is tokenized, purely alphabetic tokens are kept and lowercased,
    English stopwords are removed, and the remaining words are counted.

    Args:
        text: The email body as a string.

    Returns:
        dict mapping each remaining word to the number of times it occurs in
        ``text``. Empty when no token survives the filtering.
    """
    # Look the stopwords up once per call (cached across calls) instead of
    # re-reading the corpus for every single token; the set also makes each
    # membership test constant-time.
    stop_words = _english_stopwords()
    lowered_words = (word.lower() for word in word_tokenize(text) if word.isalpha())
    tokens = [word for word in lowered_words if word not in stop_words]
    return dict(FreqDist(tokens))

# Pair each email's bag-of-words features with its label
featuresets = [
    (extract_features(body), tag)
    for body, tag in email_data
]


In [None]:
# The first 80% of the feature sets are used for training, the rest for testing
train_size = int(0.8 * len(featuresets))
train_set = featuresets[:train_size]
test_set = featuresets[train_size:]


In [None]:
# Collect the training examples whose feature dict came out empty
empty_features = [features for features, _ in train_set if not features]
print(f"Number of empty feature sets in training data: {len(empty_features)}")


In [None]:
# Discard examples with an empty feature dict from both splits
train_set = [example for example in train_set if example[0]]
test_set = [example for example in test_set if example[0]]


In [None]:
# Fit NLTK's Naive Bayes classifier on the (features, label) training pairs
classifier = NaiveBayesClassifier.train(train_set)


In [38]:
# Report accuracy on the held-out test set as a percentage with two decimals
print(f'Accuracy: {accuracy(classifier, test_set) * 100:.2f}%')

# Show the 15 most informative features
classifier.show_most_informative_features(15)


Accuracy: 94.11%
Most Informative Features
               forwarded = 1                 ham : spam   =    146.8 : 1.0
                    pain = 1                spam : ham    =     94.9 : 1.0
            prescription = 1                spam : ham    =     90.0 : 1.0
                    spam = 1                spam : ham    =     73.4 : 1.0
                     sex = 1                spam : ham    =     65.2 : 1.0
                      ex = 1                spam : ham    =     63.6 : 1.0
                featured = 1                spam : ham    =     61.9 : 1.0
                creative = 1                spam : ham    =     60.3 : 1.0
                   super = 1                spam : ham    =     57.0 : 1.0
                       u = 3                spam : ham    =     55.3 : 1.0
            solicitation = 1                spam : ham    =     53.7 : 1.0
              trademarks = 1                spam : ham    =     53.7 : 1.0
              compliance = 1                spam : ham   

In [None]:
def classify_email(email_text):
    """Return the label the trained classifier predicts for ``email_text``.

    The raw text goes through the same ``extract_features`` step used to
    build the training data before it is handed to ``classifier``.
    """
    return classifier.classify(extract_features(email_text))

# Example usage: classify a single hand-written message and print the
# predicted label
new_email = "You have won a free lottery! Claim your prize now."
print(f'The email is classified as: {classify_email(new_email)}')
