### 先获取原始邮件文件并且进行存储


In [2]:
import os
import chardet
from service import EmailReceiver, EmailSender
from config import config

def decode_email_content(payload: bytes) -> str:
    """
    Decode a raw message body into text.

    UTF-8 is attempted first. If the bytes are not valid UTF-8, chardet is
    asked to guess the encoding (UTF-8 is used again when it has no guess)
    and decoding is retried with that guess.

    Args:
        payload: Raw body bytes.

    Returns:
        The decoded text, or the placeholder "[UNABLE TO DECODE CONTENT]"
        when the retry fails as well.
    """
    try:
        text = payload.decode("utf-8")
    except UnicodeDecodeError:
        # Not valid UTF-8: fall back to whatever chardet detects.
        guess = chardet.detect(payload)
        encoding = guess.get("encoding")
        if not encoding:
            encoding = "utf-8"
        try:
            text = payload.decode(encoding)
        except Exception as e:
            print(f"Failed to decode with detected encoding {encoding}: {e}")
            text = "[UNABLE TO DECODE CONTENT]"
    return text

def save_email(email_message, email_id, save_dir):
    """
    Save one email's headers and first plain-text body to ``<save_dir>/<id>.txt``.

    The file holds Subject/From/To/Date lines, a blank line, and then the body
    of the first non-empty ``text/plain`` part (or the whole payload for a
    non-multipart message). If the target file already exists the email is
    skipped. Failures are reported with ``print`` rather than raised.

    Args:
        email_message: Parsed message exposing ``get``, ``is_multipart``,
            ``walk`` and ``get_payload`` (as ``email.message.Message`` does).
        email_id: Message ID; normally bytes as returned by IMAP, but str and
            other values are accepted and stringified.
        save_dir: Directory for the output file; created when missing.

    Returns:
        None.
    """
    # Resolve the ID once, outside the try block, so the error handler below
    # can report it without risking a second exception of its own.
    if isinstance(email_id, (bytes, bytearray)):
        email_id_str = bytes(email_id).decode(errors="replace")
    else:
        email_id_str = str(email_id)

    tmp_path = None
    try:
        filepath = os.path.join(save_dir, f"{email_id_str}.txt")

        # The file name is derived from the ID alone, so an existing file
        # means this email was already saved.
        if os.path.exists(filepath):
            print(f"Email ID {email_id_str} already saved, skipping.")
            return

        os.makedirs(save_dir, exist_ok=True)

        # Normalise path separators in the subject text written to the file.
        # str() guards against header objects that are not plain strings.
        subject = str(email_message.get("subject", "No Subject"))
        subject = subject.replace("/", "_").replace("\\", "_")

        # Decode the body before touching the disk, so a decoding failure
        # cannot leave a half-written file behind.
        body = ""
        if email_message.is_multipart():
            # Use the first non-empty text/plain part.
            for part in email_message.walk():
                if part.get_content_type() != "text/plain":
                    continue
                content = part.get_payload(decode=True)
                if content:
                    body = decode_email_content(content)
                    break
        else:
            content = email_message.get_payload(decode=True)
            if content:
                body = decode_email_content(content)

        document = (
            f"Subject: {subject}\n"
            f"From: {email_message.get('from')}\n"
            f"To: {email_message.get('to')}\n"
            f"Date: {email_message.get('date')}\n\n"
            f"{body}"
        )

        # Write to a temporary name and rename: a partially written file must
        # never sit at the final path, because later runs would treat it as
        # "already saved" and skip the email for good.
        tmp_path = filepath + ".tmp"
        with open(tmp_path, "w", encoding="utf-8") as f:
            f.write(document)
        os.replace(tmp_path, filepath)
        tmp_path = None

        print(f"Email ID {email_id_str} saved to {filepath}")
    except Exception as e:
        print(f"Error saving email ID {email_id_str}: {e}")
    finally:
        # Best-effort cleanup of a leftover temporary file after a failure.
        if tmp_path is not None and os.path.exists(tmp_path):
            try:
                os.remove(tmp_path)
            except OSError:
                pass


# Build the IMAP client from the connection settings held in the project config.
email_receiver = EmailReceiver(
    config.IMAP_SERVER,
    config.IMAP_PORT,
    config.EMAIL_USERNAME,
    config.EMAIL_PASSWORD,
)

# Echo the configured locations so the run log shows where data will land.
print("base dir:", config.BASE_DIR)
print("project root:", config.PROJECT_ROOT)
print("data dir:", config.DATA_DIR)

# Output folders under the data directory; only RAW_EMAIL_DIR is used below.
EMAIL_DIR = os.path.join(config.DATA_DIR, "emails")
RAW_EMAIL_DIR = os.path.join(config.DATA_DIR, "raw_emails")

try:
    email_receiver.connect()
    email_ids = email_receiver.fetch_all_emails()
    print(f"Found {len(email_ids)} emails.")

    for email_id in email_ids:
        email_message = email_receiver.fetch_email_by_id(email_id)
        # Skip IDs for which the receiver returned nothing usable.
        if not email_message:
            continue
        save_email(email_message, email_id, RAW_EMAIL_DIR)
finally:
    # Always attempt to close the session, whether or not the loop finished.
    email_receiver.disconnect()


base dir: C:\Projects\LLM-email\src\config
project root: C:\Projects\LLM-email
data dir: C:\Projects\LLM-email\data
Connected to IMAP server imap.bnu.edu.cn
Logging in as noibeijing@bnu.edu.cn...
Logging in with password: [REDACTED]
Found 3073 emails.
Email ID 1 already saved, skipping.
Email ID 2 already saved, skipping.
Email ID 3 already saved, skipping.
Email ID 4 already saved, skipping.
Email ID 5 already saved, skipping.
Email ID 6 already saved, skipping.
Email ID 7 already saved, skipping.
Email ID 8 already saved, skipping.
Email ID 9 already saved, skipping.
Email ID 10 already saved, skipping.
Email ID 11 already saved, skipping.
Email ID 12 already saved, skipping.
Email ID 13 already saved, skipping.
Email ID 14 already saved, skipping.
Email ID 15 already saved, skipping.
Email ID 16 already saved, skipping.
Email ID 17 already saved, skipping.
Email ID 18 already saved, skipping.
Email ID 19 already saved, skipping.
Email ID 20 already saved, skipping.
Email ID

abort: command: LOGOUT => unexpected response: b'7Lnu4/lnKjljJfkuqzkuIrl'

### 解析 raw 邮件文件，把杂乱的邮件内容解析成结构化数据，最后向量化存储


In [2]:
# 测试代码
# Example usage
from utils import parse_email
from config import config
import os

import json

# Smoke test: parse one saved raw email and dump the result as JSON.
DATA_DIR = config.DATA_DIR
test_input_file_path = os.path.join(DATA_DIR, "raw_emails", "1.txt")
test_output_file_path = os.path.join(DATA_DIR, "parsed_emails", "1.json")
parsed_email = parse_email(test_input_file_path)

# Nothing else in this notebook creates the output folder, so create it here;
# otherwise open(..., "w") fails on a fresh checkout.
os.makedirs(os.path.dirname(test_output_file_path), exist_ok=True)

# Save result
with open(test_output_file_path, "w", encoding="utf-8") as f:
    json.dump(parsed_email, f, ensure_ascii=False, indent=4)

# Uncomment to inspect the parsed result inline:
# print(json.dumps(parsed_email, ensure_ascii=False, indent=4))

subject: =?UTF-8?B?5qyi6L+O5L2_55So5YyX5Lqs5biI6IyD5aSn?=  =?UTF-8?B?5a2m5pWZ5biI55S15a2Q6YKu5Lu257O757uf?=From: <postmaster@bnu.edu.cn>
