In [None]:
import os
import re
import pandas as pd
from collections import Counter
from email import policy
from email.parser import BytesParser

# Dataset locations: the directory holding the email folders, and the labels file
folder_path = "trec06p-cs280/data/"
labels_file = "trec06p-cs280/labels"

# Read the stop-word list from stop_words.txt (one entry per line of the file)
with open("stop_words.txt", "r") as f:
    stop_word = f.read().splitlines()
stop_words = list(stop_word)

# Character sets and patterns that clean_email strips from message text
punctuations = r"!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"
numbers = "0123456789"
html_tags = re.compile(r"<.*?>")  # shortest run of text enclosed in '<' ... '>'
esc_chars = re.compile(r"[a-z][a-z][0-9]+")  # two lowercase letters followed by digits

# Function to clean the email by removing useless information
def clean_email(message):
    """Reduce raw email text to a space-separated string of lowercase words.

    The text is lowercased, HTML tags are removed, and every character that
    is not an ASCII letter or whitespace is dropped. Words found in the
    module-level ``stop_words`` collection are then filtered out.

    Args:
        message: The email body as a string.

    Returns:
        The cleaned words joined by single spaces, or the placeholder
        ``"[No content after cleaning]"`` when nothing is left.
    """
    # Convert to lowercase
    message = message.lower()
    # Remove HTML tags first: the next step deletes '<' and '>' themselves
    message = html_tags.sub('', message)
    # Remove punctuation and digits in a single pass over the text
    message = message.translate(str.maketrans('', '', punctuations + numbers))
    # NOTE(review): esc_chars requires at least one digit, but all digits were
    # stripped on the line above, so this substitution cannot match anything.
    # Kept to leave the cleaned output unchanged; confirm the intended order.
    message = esc_chars.sub('', message)
    # Remove any remaining non-alphabetic, non-whitespace characters
    message = re.sub(r'[^a-zA-Z\s]', '', message)

    # Build a set once so each word is checked in O(1) instead of scanning
    # the whole stop-word list per word
    excluded = set(stop_words)
    message = " ".join(word for word in message.split() if word not in excluded)

    # If the cleaned message is empty, add a placeholder
    if not message:
        message = "[No content after cleaning]"

    return message


# Function to get message from parsed email
def get_message(parsed):
    """Extract the body text of a parsed email as a string.

    For a multipart email only the first ``text/plain`` part is used; every
    other part is ignored. For a non-multipart email the payload is used
    whatever its content type.

    Args:
        parsed: A parsed email message object (as produced by
            ``BytesParser(...).parse``).

    Returns:
        The decoded body text, ``"[Could not decode message]"`` if the
        payload could not be decoded, or ``"[No message content]"`` if the
        resulting text is empty or whitespace-only.
    """
    message = ""
    if parsed.is_multipart():
        # walk() also yields the multipart containers themselves; those are
        # skipped by the content-type check.
        for part in parsed.walk():
            if part.get_content_type() == 'text/plain':
                message = _decode_payload(part)
                break
    else:
        message = _decode_payload(parsed)

    # If message is empty after parsing
    if not message.strip():
        message = "[No message content]"

    return message


def _decode_payload(part):
    """Return the transfer-decoded payload of *part* as ISO-8859-1 text."""
    try:
        return part.get_payload(decode=True).decode('ISO-8859-1', errors='replace')
    except Exception:
        # Deliberately best-effort: one unreadable email (e.g. a payload of
        # None) must not abort processing. Unlike a bare ``except``, this
        # still lets KeyboardInterrupt and SystemExit propagate.
        return "[Could not decode message]"


# Load labels into a dataframe
labels_df = pd.read_csv(labels_file, sep=" ", names=["category", "file_path"])
# Keep the parent folder as well as the file name. A bare file name is not a
# unique key when different folders reuse the same file names, in which case a
# name-only lookup would return the label of the first folder's file.
# NOTE(review): assumes every label path ends in ".../<folder>/<file>" and that
# the folder/file components match the names on disk -- confirm against the
# labels file.
label_path_parts = labels_df["file_path"].str.split("/")
labels_df["folder"] = label_path_parts.str[-2]
labels_df["file_path"] = label_path_parts.str[-1]
label_lookup = dict(
    zip(zip(labels_df["folder"], labels_df["file_path"]), labels_df["category"])
)

main_columns = ["folder", "file", "email_message", "category"]

# Get all folders except hidden/system files (sorted so the row order of the
# output does not depend on the operating system's directory ordering)
folders = sorted(folder for folder in os.listdir(folder_path) if not folder.startswith('.'))

# Collect one row per email and build the dataframe once at the end;
# concatenating inside the loop would copy the whole frame for every email.
rows = []
for folder in folders:
    folder_dir = os.path.join(folder_path, folder)
    # Get all files in the folder, excluding hidden/system files like .DS_Store
    files = sorted(file for file in os.listdir(folder_dir) if not file.startswith('.'))

    for file in files:
        # Parse the email from its raw bytes
        with open(os.path.join(folder_dir, file), "rb") as e_mail:
            parsed_email = BytesParser(policy=policy.default).parse(e_mail)
        # Get message from parsed email
        message = get_message(parsed_email)
        # Clean email message (lowercase; strip HTML tags, punctuation, digits and stop words)
        message_no_stopwords = clean_email(message)
        # Get the category of this exact (folder, file) pair
        try:
            category_label = label_lookup[(folder, file)]
        except KeyError:
            raise KeyError(f"No label found for {folder}/{file} in {labels_file}") from None
        rows.append((folder, file, message_no_stopwords, category_label))

# Initialize the main dataframe storing the processed email data
main_df = pd.DataFrame(rows, columns=main_columns)

# Create the directory for the preprocessed files if it doesn't exist
os.makedirs('preprocessed_files', exist_ok=True)

# Save main_df as preprocessed_emails.csv
main_df.to_csv("preprocessed_files/preprocessed_emails.csv", index=False)

# Release the in-memory copies; the data is reloaded from the CSV when needed
del rows
main_df = pd.DataFrame(columns=main_columns)

# Read the preprocessed emails back from disk
df_main = pd.read_csv("preprocessed_files/preprocessed_emails.csv")

# Partition by numeric folder id: folders 0-70 form the training set,
# folders 71-126 the test set
folder_ids = df_main['folder'].astype(int)
train_df = df_main[folder_ids <= 70]
test_df = df_main[folder_ids > 70]

# Separate the training emails by category
# NOTE(review): these filters expect 'category' to hold 1 (spam) and 0 (ham);
# confirm that the labels file uses this encoding.
train_spam_df = train_df[train_df['category'] == 1]
train_ham_df = train_df[train_df['category'] == 0]

# Show the first rows of each dataframe
for heading, frame in (
    ("Main Dataset:", df_main),
    ("\nTraining Set (Spam):", train_spam_df),
    ("\nTraining Set (Ham):", train_ham_df),
    ("\nTest Set:", test_df),
):
    print(heading)
    print(frame.head())

# Count how often each word occurs across all training emails
word_counter = Counter(
    token
    for text in train_df['email_message'].astype(str)
    for token in text.split()
)

# All words ordered from most to least frequent
sorted_words = dict(word_counter.most_common())

# The 10000 most frequent words with their counts
top_10000_words = dict(word_counter.most_common(10000))

# The same words as a plain list for future use
top_10000_words_list = list(top_10000_words)

# Display the top 10000 words
print("\nTop 10,000 Words:")
print(top_10000_words)
