In [26]:
import os
import email
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.cluster import KMeans
import string  
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import numpy as np
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import unicodedata
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
nltk.download('stopwords')
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_predict

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\austi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
folder_name = 'SpamAssassinMessages'

# Function to search for the folder recursively
def find_folder(start_dir, folder_name):
    for root, dirs, files in os.walk(start_dir):
        if folder_name in dirs:
            folder_path = os.path.join(root, folder_name)
            print(f"The path of '{folder_name}' is: {folder_path}")
        
# Start the search from the root directory (you can specify a different directory)
start_directory = '/'

find_folder(start_directory, folder_name)

The path of 'SpamAssassinMessages' is: /Users\austi\Downloads\SpamAssassinMessages


In [4]:
def extract_text_from_message(message):
    
    text = ""
    if message.is_multipart():
        for part in message.walk():
            content_type = part.get_content_type()
            content_disposition = str(part.get('Content-Disposition'))

            if 'attachment' not in content_disposition:
                payload = part.get_payload(decode = True)
                if payload:
                    if content_type == 'text/html':
                        soup = BeautifulSoup(payload, 'html.parser')
                        text += soup.get_text()
                    else:
                        text += payload.decode('utf-8', errors = 'ignore')
    else:
        payload = message.get_payload(decode = True)
        if payload:
            if message.get_content_type() == 'text/html':
                soup = BeautifulSoup(payload, 'html.parser')
                text = soup.get_text()
            else:
                text = payload.decode('utf-8', errors = 'ignore')

    text = text.lower()
    text = ''.join([char for char in text if char not in string.punctuation])
    
    stop_words = set(stopwords.words('english'))  
    words = text.split()  
    filtered_words = [word for word in words if word not in stop_words]  
    
    text = ' '.join(filtered_words)  
    
    return text


In [5]:
def extract_sender(message):
    sender = 'None'  

    if 'From' in message:
        sender = message['From']

    return sender

In [6]:
def extract_message_type(message):
    if message.is_multipart():
        return 'multipart'
    else:
        return message.get_content_type()

In [7]:
directory_path = r'/Users/austi/Downloads/SpamAssassinMessages/'

file_name = []
label = []
msgs = []
senders = [] 
message_types = []

for root, dirs, files in os.walk(directory_path):
    for file in files:
        file_path = os.path.join(root, file)
        
        if 'spam' in root:
            label.append(1)
        else:
            label.append(0)
        
        with open(file_path, 'r', encoding = 'latin-1') as file_handle:
            message = email.message_from_file(file_handle)
            text = extract_text_from_message(message)
            
            text = ''.join([c for c in text if not unicodedata.category(c).startswith('C')])
            
            msgs.append(text)
            
            sender = extract_sender(message)
            senders.append(sender)
            
            message_type = extract_message_type(message)
            message_types.append(message_type)
            
        file_name.append(file_path)

data = {'File Name': file_name, 
        'Label': label,
        'Message': msgs,
        'Sender': senders,
        'Message Type': message_types}

df = pd.DataFrame(data)

# this is where lots of the weird messages are
df.tail(50)

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Unnamed: 0,File Name,Label,Message,Sender,Message Type
9303,/Users/austi/Downloads/SpamAssassinMessages/sp...,1,lowest rates available term life insurance tak...,firstever001@now3.takemetothesavings.com,text/plain
9304,/Users/austi/Downloads/SpamAssassinMessages/sp...,1,ricardo olá trabalho divulgando empresas séria...,"""_Ricardo_"" <topseg321@bol.com.br>",text/html
9305,/Users/austi/Downloads/SpamAssassinMessages/sp...,1,fill short application unbelievable 41 mortgag...,"""Stephanie Orville"" <blain8488@runbox.com>",text/html
9306,/Users/austi/Downloads/SpamAssassinMessages/sp...,1,offer best bulk email prices internet bulk ema...,salesandleads2628@Flashmail.com,text/plain
9307,/Users/austi/Downloads/SpamAssassinMessages/sp...,1,giftcd offer newsletter confirmation august 8t...,"""GiftCD Alert"" <alert@giftcd.com>",text/plain
9308,/Users/austi/Downloads/SpamAssassinMessages/sp...,1,18 21 yr old chicks horny nasty amateurs takin...,"""Alex"" <alexa122_bwer@msg.com>",text/html
9309,/Users/austi/Downloads/SpamAssassinMessages/sp...,1,norton systemworks 2002software suite professi...,zarco9@hotmail.com,text/html
9310,/Users/austi/Downloads/SpamAssassinMessages/sp...,1,dear sir fetch name internet month group set o...,"""Ms.Huang (Sales Manager)"" <motorcycle@qinghec...",multipart
9311,/Users/austi/Downloads/SpamAssassinMessages/sp...,1,dental plans 695 per month dental plans 695 pe...,"""1st Dental Plans"" <eml7@hotmail.com>",text/html
9312,/Users/austi/Downloads/SpamAssassinMessages/sp...,1,result feedback form submitted reenspringhappy...,reenspring@happyman.com (),text/plain


In [8]:
message_type_counts = df['Message Type'].value_counts()

print(message_type_counts)

Message Type
text/plain                     7413
text/html                      1193
multipart                       743
multipart/alternative             3
text/plain charset=us-ascii       1
Name: count, dtype: int64


In [16]:
stop_words = list(ENGLISH_STOP_WORDS)

X_train, X_test, y_train, y_test = train_test_split(df['Message'], 
                                                    df['Label'],
                                                    test_size = 0.2,
                                                    random_state = 42)

alpha_values = [0.001, 0.01, 0.1, 0.5, 1.0, 1.5, 2.0]

for alpha in alpha_values:
    vectorizer = TfidfVectorizer(stop_words = stop_words)
    X_train_transformed = vectorizer.fit_transform(X_train)
    X_test_transformed = vectorizer.transform(X_test)
    
    nb_classifier = MultinomialNB(alpha = alpha)
    nb_classifier.fit(X_train_transformed, y_train)
    
    y_pred = nb_classifier.predict(X_test_transformed)
    
    report = classification_report(y_test, y_pred)
    print(f'Alpha = {alpha}\n{report}\n')
    print('--------------------------------------------------------')

Alpha = 0.001
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1391
           1       0.99      0.97      0.98       480

    accuracy                           0.99      1871
   macro avg       0.99      0.98      0.99      1871
weighted avg       0.99      0.99      0.99      1871


--------------------------------------------------------
Alpha = 0.01
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1391
           1       1.00      0.97      0.98       480

    accuracy                           0.99      1871
   macro avg       0.99      0.99      0.99      1871
weighted avg       0.99      0.99      0.99      1871


--------------------------------------------------------
Alpha = 0.1
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1391
           1       1.00      0.96      0.98       480

    accuracy               

In [20]:
stop_words = list(ENGLISH_STOP_WORDS)


df['Label'] = df['Label'].astype(int) 
X = df['Message']
y = df['Label']

alpha_values = [0.001, 0.01, 0.1, 0.5, 1.0, 1.5, 2.0]

for alpha in alpha_values:
    vectorizer = TfidfVectorizer(stop_words = stop_words)
    X_transformed = vectorizer.fit_transform(X)
    
    nb_classifier = MultinomialNB(alpha = alpha)
    
    scores = cross_val_score(nb_classifier,
                             X_transformed,
                             y,
                             cv = 5,
                             scoring = 'f1_macro')
    
    print(f'Alpha = {alpha}')
    print(f'Cross-Validation Scores: {scores}')
    print(f'Mean F1 Score: {np.mean(scores)}\n')
    print('--------------------------------------------------------')


Alpha = 0.001
Cross-Validation Scores: [0.99300393 0.997894   0.96745087 0.97716731 0.89931107]
Mean F1 Score: 0.9669654377891204

--------------------------------------------------------
Alpha = 0.01
Cross-Validation Scores: [0.99020551 0.997894   0.97413197 0.98299629 0.88240493]
Mean F1 Score: 0.9655265399857855

--------------------------------------------------------
Alpha = 0.1
Cross-Validation Scores: [0.99086146 0.99365581 0.9711737  0.97500889 0.89194338]
Mean F1 Score: 0.9645286481743668

--------------------------------------------------------
Alpha = 0.5
Cross-Validation Scores: [0.96819768 0.96445229 0.91042561 0.92449935 0.91974052]
Mean F1 Score: 0.9374630906732856

--------------------------------------------------------
Alpha = 1.0
Cross-Validation Scores: [0.91461188 0.90704984 0.84369031 0.84723684 0.89544849]
Mean F1 Score: 0.8816074704411208

--------------------------------------------------------
Alpha = 1.5
Cross-Validation Scores: [0.85988131 0.85706443 0.79577

In [27]:
stop_words = list(ENGLISH_STOP_WORDS)

df['Label'] = df['Label'].astype(int) 
X = df['Message']
y = df['Label']

alpha_values = [0.001, 0.01, 0.1, 0.5, 1.0, 1.5, 2.0]

for alpha in alpha_values:
    vectorizer = TfidfVectorizer(stop_words = stop_words)
    X_transformed = vectorizer.fit_transform(X)
    
    nb_classifier = MultinomialNB(alpha = alpha)
    
    scores = cross_val_score(nb_classifier, 
                             X_transformed,
                             y,
                             cv = 5,
                             scoring = 'f1_macro')
    
    y_pred = cross_val_predict(nb_classifier, 
                               X_transformed,
                               y,
                               cv = 5)
    
    accuracy = accuracy_score(y, y_pred)
    precision = precision_score(y, y_pred)
    recall = recall_score(y, y_pred)
    
    print(f'Alpha = {alpha}')
    print(f'Cross-Validation F1 Score: {np.mean(scores)}')
    print(f'Cross-Validation Accuracy: {accuracy}')
    print(f'Cross-Validation Precision: {precision}')
    print(f'Cross-Validation Recall: {recall}\n')
    print('--------------------------------------------------------')

Alpha = 0.001
Cross-Validation F1 Score: 0.9669654377891204
Cross-Validation Accuracy: 0.9739121137602909
Cross-Validation Precision: 0.9414174518639902
Cross-Validation Recall: 0.9578991246352647

--------------------------------------------------------
Alpha = 0.01
Cross-Validation F1 Score: 0.9655265399857855
Cross-Validation Accuracy: 0.9719875975622795
Cross-Validation Precision: 0.9251890171110226
Cross-Validation Recall: 0.9691538140892039

--------------------------------------------------------
Alpha = 0.1
Cross-Validation F1 Score: 0.9645286481743668
Cross-Validation Accuracy: 0.9717737624291671
Cross-Validation Precision: 0.9355365157078743
Cross-Validation Recall: 0.9558149228845352

--------------------------------------------------------
Alpha = 0.5
Cross-Validation F1 Score: 0.9374630906732856
Cross-Validation Accuracy: 0.9548807869132898
Cross-Validation Precision: 0.9800874210781932
Cross-Validation Recall: 0.8411838265944144

------------------------------------------