In [None]:
import os
from utils import parse_eml_file, save_parsed_email
from config import config

# Stage: parse every raw .eml file and pass the parsed result to save_parsed_email.
# Expected input layout: <DATA_DIR>/raw_emails/<folder>/<name>.eml
RAW_EMAIL_DIR = os.path.join(config.DATA_DIR, "raw_emails")
PARSED_EMAIL_DIR = os.path.join(config.DATA_DIR, "parsed_emails")

os.makedirs(PARSED_EMAIL_DIR, exist_ok=True)

# os.listdir returns entries in arbitrary order; sorting keeps runs reproducible.
for folder in sorted(os.listdir(RAW_EMAIL_DIR)):
    folder_path = os.path.join(RAW_EMAIL_DIR, folder)

    # Only sub-directories of raw_emails are scanned; stray files are ignored.
    if not os.path.isdir(folder_path):
        continue

    # All emails of one folder share a single output path named after the folder.
    # NOTE(review): whether save_parsed_email appends to or overwrites this file
    # is not visible here -- if it overwrites, only the last email of each folder
    # survives. Confirm against utils.save_parsed_email.
    output_path = os.path.join(PARSED_EMAIL_DIR, f"{folder}.txt")

    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".eml"):
            continue

        file_path = os.path.join(folder_path, filename)

        # Log and continue on failure so one bad message does not abort the
        # whole batch (same policy as the JSON extraction cell).
        try:
            email_data = parse_eml_file(file_path)

            # A falsy result means there is nothing to save for this file.
            if email_data:
                save_parsed_email(email_data, output_path)
        except Exception as e:
            print(f"[ERROR] Failed to process {file_path}: {e}")


In [None]:
import os
from config import config
from utils import extract_main_content

# Stage: run extract_main_content on every raw .eml file, writing one JSON file
# per email into <DATA_DIR>/json_emails.
RAW_EMAIL_DIR = os.path.join(config.DATA_DIR, "raw_emails")
JSON_DIR = os.path.join(config.DATA_DIR, "json_emails")

os.makedirs(JSON_DIR, exist_ok=True)

for folder in os.listdir(RAW_EMAIL_DIR):
    folder_path = os.path.join(RAW_EMAIL_DIR, folder)

    # Skip anything directly under raw_emails that is not a directory.
    if not os.path.isdir(folder_path):
        continue

    for filename in os.listdir(folder_path):
        # Only files with the .eml suffix are processed.
        if not filename.endswith(".eml"):
            continue

        eml_file = os.path.join(folder_path, filename)

        # The output name is built from the file's base name alone; the folder
        # name is not part of it.
        json_name = f"{os.path.splitext(filename)[0]}.json"
        output_path = os.path.join(JSON_DIR, json_name)

        # A failure on one email is reported and the loop moves on.
        try:
            extract_main_content(eml_file, output_path)
        except Exception as e:
            print(f"[ERROR] Failed to process {eml_file}: {e}")


In [None]:
import os
import json
from utils import extract_email_dialog
from config import config

# Stage: load each JSON email, extract its dialog with extract_email_dialog and
# write the result under the same file name into <DATA_DIR>/dialog.
JSON_DIR = os.path.join(config.DATA_DIR, "json_emails")
DIALOG_DIR = os.path.join(config.DATA_DIR, "dialog")
os.makedirs(DIALOG_DIR, exist_ok=True)

# os.listdir returns entries in arbitrary order; sorting keeps the log reproducible.
for filename in sorted(os.listdir(JSON_DIR)):
    if not filename.endswith(".json"):
        continue

    input_path = os.path.join(JSON_DIR, filename)
    output_path = os.path.join(DIALOG_DIR, filename)

    try:
        with open(input_path, "r", encoding="utf-8") as f:
            email_data = json.load(f)

        # If the loaded value is itself a string, try to parse it once more
        # as JSON before giving up.
        if isinstance(email_data, str):
            try:
                email_data = json.loads(email_data)
            except Exception as e:
                raise ValueError(f"文件 {filename} 内容为字符串，无法转换为 dict: {e}") from e

        # extract_email_dialog is only ever called with a dict.
        if not isinstance(email_data, dict):
            raise ValueError(f"文件 {filename} 格式错误，期望 dict，但得到 {type(email_data)}")

        dialog = extract_email_dialog(email_data)

        # Serialize BEFORE opening the output file: open(..., "w") truncates
        # immediately and json.dump streams chunk by chunk, so a serialization
        # error would otherwise leave an empty or half-written file behind.
        serialized = json.dumps(dialog, ensure_ascii=False, indent=2)

        with open(output_path, "w", encoding="utf-8") as f:
            f.write(serialized)

        print(f"提取完成: {filename} -> {output_path}")

    except Exception as e:
        # Best-effort batch: report the failure and continue with the next file.
        print(f"处理失败: {filename} 错误信息: {e}")


In [None]:
import os
import json
from utils import remove_ads_from_dialog
from config import config

# Stage: pass each extracted dialog through remove_ads_from_dialog and write the
# result under the same file name into <DATA_DIR>/clean_dialog.
DIALOG_DIR = os.path.join(config.DATA_DIR, "dialog")
CLEAN_DIALOG_DIR = os.path.join(config.DATA_DIR, "clean_dialog")
os.makedirs(CLEAN_DIALOG_DIR, exist_ok=True)

# Walk every JSON file in DIALOG_DIR. os.listdir order is arbitrary, so the
# listing is sorted to keep the log reproducible between runs.
for filename in sorted(os.listdir(DIALOG_DIR)):
    if not filename.endswith(".json"):
        continue

    input_path = os.path.join(DIALOG_DIR, filename)
    output_path = os.path.join(CLEAN_DIALOG_DIR, filename)

    try:
        with open(input_path, "r", encoding="utf-8") as f:
            dialog = json.load(f)

        cleaned_dialog = remove_ads_from_dialog(dialog)

        # Serialize BEFORE opening the output file: open(..., "w") truncates
        # immediately and json.dump streams chunk by chunk, so a serialization
        # error would otherwise leave an empty or half-written file behind.
        serialized = json.dumps(cleaned_dialog, ensure_ascii=False, indent=2)

        with open(output_path, "w", encoding="utf-8") as f:
            f.write(serialized)

        print(f"Cleaned: {filename} -> {output_path}")

    except Exception as e:
        # Best-effort batch: report the failure and continue with the next file.
        print(f"处理失败: {filename} 错误信息: {e}")


Cleaned: 2833.json -> /Users/apple/Projects/LLM-email/data/clean_dialog/2833.json
Cleaned: 729.json -> /Users/apple/Projects/LLM-email/data/clean_dialog/729.json
Cleaned: 683.json -> /Users/apple/Projects/LLM-email/data/clean_dialog/683.json
Cleaned: 379.json -> /Users/apple/Projects/LLM-email/data/clean_dialog/379.json
Cleaned: 2999.json -> /Users/apple/Projects/LLM-email/data/clean_dialog/2999.json
Cleaned: 2130.json -> /Users/apple/Projects/LLM-email/data/clean_dialog/2130.json
Cleaned: 2560.json -> /Users/apple/Projects/LLM-email/data/clean_dialog/2560.json
Cleaned: 2075.json -> /Users/apple/Projects/LLM-email/data/clean_dialog/2075.json
Cleaned: 2425.json -> /Users/apple/Projects/LLM-email/data/clean_dialog/2425.json
Cleaned: 1659.json -> /Users/apple/Projects/LLM-email/data/clean_dialog/1659.json
Cleaned: 2976.json -> /Users/apple/Projects/LLM-email/data/clean_dialog/2976.json
Cleaned: 396.json -> /Users/apple/Projects/LLM-email/data/clean_dialog/396.json
Cleaned: 3234.json -> /U