In [3]:
import csv
import mailbox
import os
import shutil

In [20]:
# Run this cell to define the function that will be used to extract the mbox data.

def parse_mbox(file_path, output_dir, csv_path):
    """Extract PDF attachments from an mbox file and index every message in a CSV.

    Each message in the mailbox produces one CSV row with the columns
    ``Email Index``, ``Subject``, ``Date`` and ``PDF Filename``. PDF
    attachments of multipart messages are written to ``output_dir`` as
    ``<email index>_<attachment file name>``. When a message carries several
    PDF attachments they are all saved, but only the last saved path is
    recorded in the CSV. Messages without a PDF get ``'No PDF'``.

    Args:
        file_path: Path of the mbox file to read. It must already exist.
        output_dir: Directory that receives the extracted PDFs; created
            (including parents) when missing.
        csv_path: Path of the CSV index to write. An existing file is
            overwritten.

    Returns:
        None. The results are the files written to ``output_dir`` and
        ``csv_path``.

    Raises:
        mailbox.NoSuchMailboxError: If ``file_path`` does not exist.
        OSError: If the output directory, a PDF or the CSV cannot be written.
    """
    # create=False: a mistyped path must fail loudly. The default (create=True)
    # would silently create an empty mailbox file and yield a header-only CSV.
    mbox = mailbox.mbox(file_path, create=False)
    try:
        # exist_ok avoids the race between an isdir() check and makedirs().
        os.makedirs(output_dir, exist_ok=True)

        # The context manager closes the CSV even if a message raises midway.
        with open(csv_path, mode='w', newline='', encoding='utf-8') as csv_file:
            writer = csv.writer(csv_file)
            writer.writerow(['Email Index', 'Subject', 'Date', 'PDF Filename'])

            for i_msg, message in enumerate(mbox):
                subject = message.get('subject', 'No Subject')
                date = message.get('date', 'No Date')
                pdf_filepath = None

                # Only multipart messages are scanned for attachments. The
                # message text itself is not needed for the CSV, so it is not
                # decoded (decoding arbitrary mail bodies as UTF-8 can fail).
                if message.is_multipart():
                    for part in message.walk():
                        content_disposition = part.get("Content-Disposition")
                        # Header values are case-insensitive ("Attachment").
                        if not content_disposition or \
                                "attachment" not in str(content_disposition).lower():
                            continue

                        filename = part.get_filename()
                        if not filename:
                            continue

                        # The file name comes from the email, i.e. untrusted
                        # input: keep only its last path component so that a
                        # name such as "../x.pdf" cannot escape output_dir.
                        safe_name = os.path.basename(filename.replace("\\", "/"))
                        if not safe_name.lower().endswith('.pdf'):
                            continue

                        payload = part.get_payload(decode=True)
                        if payload is None:
                            # No decodable content (e.g. a nested multipart).
                            continue

                        pdf_filepath = os.path.join(output_dir, f"{i_msg}_{safe_name}")
                        with open(pdf_filepath, "wb") as pdf_file:
                            pdf_file.write(payload)

                writer.writerow([i_msg, subject, date, pdf_filepath or 'No PDF'])
    finally:
        # Release the file handle held by the mailbox object.
        mbox.close()


In [21]:
# Set the input mbox path and the output locations below, then run this cell
# to extract the PDFs and build the CSV index.

mbox_file_path = 'AI Jobs.mbox'
parse_mbox(
    file_path=mbox_file_path,
    output_dir="./attachments",
    csv_path='results.csv',
)

In [19]:
# Clean-up cell: delete the outputs of a previous extraction so the next run
# starts from scratch. Safe to run when there is nothing to delete (first run,
# or the cell was already executed).
# WARNING: this permanently removes ./attachments and ./results.csv.
if os.path.isdir("./attachments/"):
    shutil.rmtree("./attachments/")
if os.path.exists("./results.csv"):
    os.remove("./results.csv")