In [24]:
import os
import shutil
import random
import email
from email.parser import Parser
from tqdm import tqdm


def flatten_directory(source_dir, dest_dir):
    """
    Flatten the structure of source_dir, placing copies of all files into dest_dir with 
    their names changed to reflect their original path (without file extensions). 
    Skips any files that are larger than 2KB. Shows progress using tqdm.
    """
    if not os.path.exists(dest_dir):
        os.makedirs(dest_dir)

    # Get all files to process and their total number for tqdm
    all_files = [(root, file) for root, _, files in os.walk(source_dir) for file in files]
    total_files = len(all_files)

    with tqdm(total=total_files, desc="Processing Files") as pbar:
        for root, file in all_files:
            file_path = os.path.join(root, file)

            # Check if file size is greater than 2KB
            if os.path.getsize(file_path) > 2048:  # 2KB = 2048 bytes
                pbar.update(1)
                continue  # Skip this file

            # Construct the new file path in the destination directory
            relative_path = os.path.relpath(root, source_dir)
            # Remove file extension and replace os.sep with "_"
            file_name_without_extension = os.path.splitext(file)[0]
            new_file_name = os.path.join(relative_path, file_name_without_extension).replace(os.sep, "_")

            # Ensure the new filename doesn't end with a "."
            new_file_name = new_file_name.rstrip(".")

            # Copy the file
            shutil.copy2(file_path, os.path.join(dest_dir, new_file_name))

            pbar.update(1)  # Update progress

    print("Flattening complete.")


def clean_email(email_text):
    msg = email.message_from_string(email_text)
    parsed = f"Subject: {msg['Subject']}\nFrom: {msg['From']}\nTo: {msg['To']}\nDate: {msg['Date']}\n{msg.get_payload()}"
    return parsed

def clear_directory(directory):
    """
    Clear all files in the given directory.
    """
    for filename in os.listdir(directory):
        file_path = os.path.join(directory, filename)
        if os.path.isfile(file_path) or os.path.islink(file_path):
            os.unlink(file_path)
        elif os.path.isdir(file_path):
            shutil.rmtree(file_path)

def sample_files(source_dir, dest_dir, n, exclude_dir=None):
    """
    Randomly select n files from source_dir, skipping files in exclude_dir if provided, 
    clean their email content, and save them to dest_dir.
    """
    if os.path.exists(dest_dir):
        clear_directory(dest_dir)
    else:
        os.makedirs(dest_dir)

    exclude_files = set(os.listdir(exclude_dir)) if exclude_dir and os.path.exists(exclude_dir) else set()

    files = [f for f in os.listdir(source_dir) if os.path.isfile(os.path.join(source_dir, f)) and f not in exclude_files]

    if len(files) < n:
        print("Warning: Not enough files in the directory to select", n, "files.")
        n = len(files)

    random.seed(1)

    selected_files = random.sample(files, n)

    for file in selected_files:
        with open(os.path.join(source_dir, file), 'r', encoding='utf-8') as f:
            content = f.read()
        
        cleaned_content = clean_email(content)

        with open(os.path.join(dest_dir, file), 'w', encoding='utf-8') as f:
            f.write(cleaned_content)

    print(f"{n} files have been processed and copied to {dest_dir}.")


def search_files(directory, word_list, destination_directory):
    """
    Searches for text files in the specified directory that contain any of the words in the given word list.
    The search is case-insensitive. Copies the matching files to a specified destination directory.

    :param directory: The directory to search in.
    :param word_list: A list of words to search for.
    :param destination_directory: The directory where matching files will be copied to.
    :return: A list of files that contain any of the words in the word list.
    """
    matching_files = []

    # Convert the word list to lowercase for case-insensitive comparison
    word_list = [word.lower() for word in word_list]

    # Create the destination directory if it doesn't exist
    os.makedirs(destination_directory, exist_ok=True)

    # Walk through all files and directories within the specified directory
    for root, dirs, files in os.walk(directory):
        for file in files:
            file_path = os.path.join(root, file)
            try:
                with open(file_path, 'r') as f:
                    file_content = f.read().lower()  # Convert content to lowercase
                    # Check if any of the words in the word list are in the file content
                    if any(word in file_content for word in word_list):
                        matching_files.append(file_path)
                        # Copy the file to the destination directory
                        shutil.copy(file_path, destination_directory)
            except Exception as e:
                print(f"Error reading file {file_path}: {e}")

    return matching_files


In [11]:
original_dir = "maildir"
flat_dir = "flattened_maildir"
sample_dir = "sample_maildir"


In [15]:

flatten_directory(original_dir, flat_dir)

Processing Files: 100%|██████████| 517401/517401 [00:51<00:00, 9991.28it/s] 

Flattening complete.





In [22]:
word_list = ["urgent", "emergency"]
urgent_files = search_files(flat_dir, word_list, "urgent_files")
len(urgent_files)


Error reading file flattened_maildir/taylor-m_all_documents_3452: 'utf-8' codec can't decode byte 0xff in position 384: invalid start byte
Error reading file flattened_maildir/taylor-m_notes_inbox_2425: 'utf-8' codec can't decode byte 0x9b in position 271: invalid start byte
Error reading file flattened_maildir/ybarbo-p_inbox_271: 'utf-8' codec can't decode byte 0xd8 in position 1041: invalid continuation byte
Error reading file flattened_maildir/shankman-j_deleted_items_510: 'utf-8' codec can't decode byte 0xbf in position 502: invalid start byte
Error reading file flattened_maildir/horton-s_discussion_threads_198: 'utf-8' codec can't decode byte 0xad in position 322: invalid start byte
Error reading file flattened_maildir/taylor-m_notes_inbox_1591: 'utf-8' codec can't decode byte 0xff in position 384: invalid start byte
Error reading file flattened_maildir/taylor-m_all_documents_7852: 'utf-8' codec can't decode byte 0x9b in position 272: invalid start byte
Error reading file flattene

1866

In [27]:

sample_files("urgent_files", sample_dir, 600, "sample_2")

['haedicke-m_sent_896', 'white-s_inbox_33', 'jones-t_all_documents_2454', 'jones-t_all_documents_2765', 'gay-r_all_documents_15', 'mann-k__sent_mail_340', 'beck-s_all_documents_3045', 'guzman-m_discussion_threads_1252', 'jones-t_sent_1521', 'shankman-j_deleted_items_687', 'kean-s_archiving_untitled_4783', 'dasovich-j_notes_inbox_1548', 'symes-k_sent_123', 'shankman-j_deleted_items_35', 'lucci-p_deleted_items_455', 'heard-m_deleted_items_165', 'salisbury-h_inbox_1036', 'jones-t_sent_112', 'mann-k_all_documents_343', 'jones-t_notes_inbox_2634', 'kean-s_attachments_1245', 'derrick-j_inbox_167', 'dean-c_all_documents_156', 'salisbury-h_inbox_897', 'salisbury-h_inbox_1048', 'kean-s_calendar_untitled_2209', 'dean-c_all_documents_224', 'williams-w3_bill_williams_iii_745', 'kean-s_archiving_untitled_4813', 'mccarty-d_deleted_items_33', 'jones-t_all_documents_5419', 'mann-k_all_documents_933', 'derrick-j_deleted_items_244', 'kean-s_all_documents_959', 'kean-s_all_documents_1745', 'badeer-r_all_