### 先获取原始邮件文件并且进行存储


In [1]:
import os
import chardet
from service import EmailReceiver, EmailSender
from config import config

def decode_email_content(payload: bytes) -> str:
    """
    Decode raw email body bytes into text.

    UTF-8 is tried first. If that fails, the encoding reported by
    ``chardet.detect`` is used (falling back to UTF-8 when no encoding is
    reported).

    Args:
        payload: Raw body bytes.

    Returns:
        The decoded text, or the placeholder ``"[UNABLE TO DECODE CONTENT]"``
        when decoding with the detected encoding also fails.
    """
    try:
        text = payload.decode("utf-8")
    except UnicodeDecodeError:
        # Not valid UTF-8; fall through to charset detection below.
        pass
    else:
        return text

    guess = chardet.detect(payload)
    encoding = guess.get("encoding") or "utf-8"
    try:
        return payload.decode(encoding)
    except Exception as e:
        print(f"Failed to decode with detected encoding {encoding}: {e}")
        return "[UNABLE TO DECODE CONTENT]"

def _extract_body_payload(email_message):
    """Return the decoded body bytes to store for a message, or None if empty.

    For a multipart message this is the first ``text/plain`` part with a
    non-empty payload; for a non-multipart message it is the message payload
    itself, whatever its content type.
    """
    if not email_message.is_multipart():
        return email_message.get_payload(decode=True) or None

    for part in email_message.walk():
        if part.get_content_type() == "text/plain":
            content = part.get_payload(decode=True)
            if content:
                return content
    return None


def save_email(email_message, email_id, save_dir):
    """
    Save an email's headers and plain-text body to ``<save_dir>/<email_id>.txt``.

    The file is skipped if it already exists. Errors are reported with
    ``print`` and swallowed so that one bad message does not stop a batch.

    Args:
        email_message: Parsed message exposing ``get``, ``is_multipart``,
            ``walk`` and ``get_payload`` (e.g. ``email.message.Message``).
        email_id: Message ID, as bytes (the form this function originally
            expected from IMAP) or as a str.
        save_dir: Target directory; created if missing.

    Returns:
        None.
    """
    # Resolve the ID once, outside the try block, so the error handler below
    # can never fail while formatting its own message.
    if isinstance(email_id, (bytes, bytearray)):
        email_id_str = email_id.decode("utf-8", errors="replace")
    else:
        email_id_str = str(email_id)

    try:
        filepath = os.path.join(save_dir, f"{email_id_str}.txt")

        if os.path.exists(filepath):
            print(f"Email ID {email_id_str} already saved, skipping.")
            return

        os.makedirs(save_dir, exist_ok=True)

        # The subject is only written into the file (the filename comes from
        # the ID); slashes are still replaced as in the original output format.
        # str() guards against header values that are not plain strings.
        subject = str(email_message.get("subject", "No Subject"))
        subject = subject.replace("/", "_").replace("\\", "_")

        header_text = (
            f"Subject: {subject}\n"
            f"From: {email_message.get('from')}\n"
            f"To: {email_message.get('to')}\n"
            f"Date: {email_message.get('date')}\n\n"
        )

        # Build the full text before opening the file: if extraction or
        # decoding fails, no partial file is left behind to be mistaken for
        # an already-saved email on the next run.
        payload = _extract_body_payload(email_message)
        body_text = decode_email_content(payload) if payload else ""

        with open(filepath, "w", encoding="utf-8") as f:
            f.write(header_text + body_text)

        print(f"Email ID {email_id_str} saved to {filepath}")
    except Exception as e:
        print(f"Error saving email ID {email_id_str}: {e}")


# Build the IMAP client from the project configuration.
email_receiver = EmailReceiver(
    config.IMAP_SERVER,
    config.IMAP_PORT,
    config.EMAIL_USERNAME,
    config.EMAIL_PASSWORD,
)

print("base dir:", config.BASE_DIR)
print("project root:", config.PROJECT_ROOT)
print("data dir:", config.DATA_DIR)

EMAIL_DIR = os.path.join(config.DATA_DIR, "emails")
RAW_EMAIL_DIR = os.path.join(config.DATA_DIR, "raw_emails")

# Fetch every message and store it under RAW_EMAIL_DIR; the receiver is
# always disconnected, even if fetching or saving raises.
try:
    email_receiver.connect()
    email_ids = email_receiver.fetch_all_emails()
    print(f"Found {len(email_ids)} emails.")

    for email_id in email_ids:
        email_message = email_receiver.fetch_email_by_id(email_id)
        if not email_message:
            # Nothing usable came back for this ID; move on to the next one.
            continue
        save_email(email_message, email_id, RAW_EMAIL_DIR)
finally:
    email_receiver.disconnect()


[REDACTED: an API key was printed into this cell output — remove the print that emits it and rotate the key]
base dir: /Users/apple/Projects/LLM-email/src/config
project root: /Users/apple/Projects/LLM-email
data dir: /Users/apple/Projects/LLM-email/data
Connected to IMAP server imap.bnu.edu.cn
Logging in as noibeijing@bnu.edu.cn...
Logging in with password: [REDACTED — do not log credentials; rotate this password]
Found 3073 emails.
Email ID 1 saved to /Users/apple/Projects/LLM-email/data/raw_emails/1.txt
Email ID 2 saved to /Users/apple/Projects/LLM-email/data/raw_emails/2.txt
Email ID 3 saved to /Users/apple/Projects/LLM-email/data/raw_emails/3.txt
Email ID 4 saved to /Users/apple/Projects/LLM-email/data/raw_emails/4.txt
Email ID 5 saved to /Users/apple/Projects/LLM-email/data/raw_emails/5.txt
Email ID 6 saved to /Users/apple/Projects/LLM-email/data/raw_emails/6.txt
Email ID 7 saved to /Users/apple/Projects/LLM-email/data/raw_emails/7.txt
Email ID 8 saved to /Users

### 解析 raw 邮件文件，把杂乱的邮件内容解析成结构化数据，最后向量化存储


In [2]:
# 测试代码
# Example usage
from utils import parse_email
from config import config
import os
import json

DATA_DIR = config.DATA_DIR

# Smoke test: parse one raw email (ID 4) and inspect the structured result.
test_input_file_path = os.path.join(DATA_DIR, "raw_emails", "4.txt")
test_output_file_path = os.path.join(DATA_DIR, "parsed_emails", "4.json")
parsed_email = parse_email(test_input_file_path)

# Create the output directory first: nothing earlier in this notebook creates
# "parsed_emails", so open(..., "w") would fail on a fresh data directory.
os.makedirs(os.path.dirname(test_output_file_path), exist_ok=True)

# Save result
with open(test_output_file_path, "w", encoding="utf-8") as f:
    json.dump(parsed_email, f, ensure_ascii=False, indent=4)


print(json.dumps(parsed_email, ensure_ascii=False, indent=4))

{
    "subject": "=?UTF-8?q?=E9=BB=84=E9=99=A2=E9=95=BF=E5=A5=BD=EF=BC=81=E5=85=B3=E4=BA=8E?=\n\t=?UTF-8?q?=E5=8C=97=E4=BA=ACCSP=E3=80=81NOI=E7=9B=B8=E5=85=B3=E5=B7=A5?=\n\t=?UTF-8?q?=E4=BD=9C=E7=9A=84=E6=94=AF=E6=92=91=E5=8D=8F=E4=BD=9C=E3=80=82?=",
    "from": "=?utf-8?q?=E6=9D=8E=E5=93=B2?= <lizhe02@hetao101.com>",
    "to": "\"noibeijing@bnu.edu.cn\" <noibeijing@bnu.edu.cn>",
    "date": "2024-04-07T15:06:47+08:00",
    "body": "\\u9ec4\\u9662\\u957f\\u60a8\\u597d\\uff1a\n�� �� \\u9996\\u5148\\u795d\\u8d3a\\u60a8 \\u62c5\\u4efbCCF\\u6559\\u80b2\\u5de5\\u59d4\\u526f\\u4e3b\\u4efb\\uff01 \\u5192\\u6627\\u6253\\u6270\\uff01\\u6211\\u662f\\u6838\\u6843\\u7f16\\u7a0b \\u516c\\u5171\\u4e8b\\u52a1\\u8d1f\\u8d23\\u4eba \\u674e\\u54f2\\uff0c\\u4e5f\\u662f\\u54b1\\u4eecCCF\\u7684\\u9ad8\\u7ea7\\u4f1a\\u5458\\u3002\\u56e0\\u4e0d\\u77e5\\u9053\\u60a8\\u7684\\u8054\\u7cfb\\u65b9\\u5f0f\\uff0c\\u53ea\\u80fd\\u5148\\u7ed9\\u60a8\\u53d1\\u90ae\\u4ef6\\uff0c\\u5e0c\\u671b\\u80fd\\u591f\\u5f97\\u523

In [1]:
# 批量处理全部原始文件
from utils import parse_email
from config import config
import os
import json

DATA_DIR = config.DATA_DIR
RAW_EMAIL_DIR = os.path.join(DATA_DIR, "raw_emails")
PARSED_EMAIL_DIR = os.path.join(DATA_DIR, "parsed_emails")

# Create the output directory up front so the first write cannot fail on a
# fresh data directory.
os.makedirs(PARSED_EMAIL_DIR, exist_ok=True)

# Parse every raw email file and write the result as "<id>.json".
for filename in os.listdir(RAW_EMAIL_DIR):
    # Raw emails are stored as "<id>.txt"; skip anything else that may sit in
    # the directory (e.g. OS metadata files).
    if not filename.endswith(".txt"):
        continue
    input_file_path = os.path.join(RAW_EMAIL_DIR, filename)
    # Strip the extension so the output is "<id>.json" rather than "<id>.txt.json".
    stem = os.path.splitext(filename)[0]
    output_file_path = os.path.join(PARSED_EMAIL_DIR, f"{stem}.json")
    parsed_email = parse_email(input_file_path)
    # Save result
    with open(output_file_path, "w", encoding="utf-8") as f:
        json.dump(parsed_email, f, ensure_ascii=False, indent=4)



[REDACTED: an API key was printed into this cell output — remove the print that emits it and rotate the key]


In [6]:
# 提取所有原始格式的邮件成json格式的文件，保留原始的body内容
from utils import parse_email
from config import config
import os
import json

DATA_DIR = config.DATA_DIR
RAW_EMAIL_DIR = os.path.join(DATA_DIR, "raw_emails")
PARSED_EMAIL_DIR = os.path.join(DATA_DIR, "parsed_emails")

# Create the output directory up front so the first write cannot fail on a
# fresh data directory.
os.makedirs(PARSED_EMAIL_DIR, exist_ok=True)

# Convert every raw email file into "<id>.json", keeping the original body.
for filename in os.listdir(RAW_EMAIL_DIR):
    # Raw emails are stored as "<id>.txt"; skip anything else that may sit in
    # the directory (e.g. OS metadata files).
    if not filename.endswith(".txt"):
        continue
    input_file_path = os.path.join(RAW_EMAIL_DIR, filename)
    # Strip the extension
    filename = os.path.splitext(filename)[0]
    output_file_path = os.path.join(PARSED_EMAIL_DIR, f"{filename}.json")
    parsed_email = parse_email(input_file_path)
    # Save result
    with open(output_file_path, "w", encoding="utf-8") as f:
        json.dump(parsed_email, f, ensure_ascii=False, indent=4)

tmp_subject_str:  =?GBK?B?uPbIy7Gow_sru8bLp+S_?=
tmp_str:  =?GBK?B?uPbIy7Gow_sru8bLp+S_?=
subject:  个人报名+黄帅淇
tmp_subject_str:  =?utf-8?B?55So5oi35ZCNIGRlbmdrYWlnZe+8jOeDpuivtw==?= =?utf-8?B?5a6h5qC477yM6LCi6LCi?=
tmp_str:  =?utf-8?B?55So5oi35ZCNIGRlbmdrYWlnZe+8jOeDpuivtw==?= =?utf-8?B?5a6h5qC477yM6LCi6LCi?=
subject:  用户名 dengkaige，烦请审核，谢谢
tmp_subject_str:  =?utf-8?B?5bm06b6E6ZmQ5Yi25ZKo6K+i?=
tmp_str:  =?utf-8?B?5bm06b6E6ZmQ5Yi25ZKo6K+i?=
subject:  年龄限制咨询
tmp_subject_str:  =?utf-8?B?UmU6IOWbnuWkje+8mlJlOiDlkqjor6JOT0nmiqU=?= =?utf-8?B?5ZCN5LqL5a6c?=
tmp_str:  =?utf-8?B?UmU6IOWbnuWkje+8mlJlOiDlkqjor6JOT0nmiqU=?= =?utf-8?B?5ZCN5LqL5a6c?=
subject:  Re: 回复：Re: 咨询NOI报名事宜
tmp_subject_str:  =?gbk?B?udjT2kNQU7+8ytTWuLW8vczKpsnzusu1xM7KzOI=?=
tmp_str:  =?gbk?B?udjT2kNQU7+8ytTWuLW8vczKpsnzusu1xM7KzOI=?=
subject:  关于CPS考试指导教师审核的问题
tmp_subject_str:  =?GBK?B?sbG+qcrQt+HMqMf4tdrO5dCh0adDU1AtSlMyMDI0zca89rHto6ixsb6po6k=?=
tmp_str:  =?GBK?B?sbG+qcrQt+HMqMf4tdrO5dCh0adDU1AtSlMyMDI0zca89rHto6ixsb6po6k=?

In [4]:
# 继续提取body内容，解析成语义文本
# 批量处理全部原始文件
from utils import parse_email,handle_email_body
from config import config
import os
import json

DATA_DIR = config.DATA_DIR
RAW_EMAIL_DIR = os.path.join(DATA_DIR, "raw_emails")
PARSED_EMAIL_DIR = os.path.join(DATA_DIR, "parsed_emails")

# Second pass over the parsed emails: run each stored "body" through
# handle_email_body and store the result under "handled_body".
for filename in os.listdir(PARSED_EMAIL_DIR):
    input_file_path = os.path.join(PARSED_EMAIL_DIR, filename)

    # The output is "<stem>.json" in the same directory, so an input already
    # named "*.json" is rewritten in place.
    stem, _extension = os.path.splitext(filename)
    output_file_path = os.path.join(PARSED_EMAIL_DIR, f"{stem}.json")

    with open(input_file_path, "r", encoding="utf-8") as source:
        email_json = json.load(source)

    handled_body = handle_email_body(email_json['body'])
    print("handled_body", handled_body)
    email_json['handled_body'] = handled_body

    with open(output_file_path, "w", encoding="utf-8") as target:
        target.write(json.dumps(email_json, indent=4, ensure_ascii=False))



handled_body 黄院长您好：
ï¿½ï¿½ ï¿½ï¿½ 首先祝贺您 担任CCF教育工委副主任！ 冒昧打扰！我是核桃编程 公共事务负责人 李哲，也是咱们CCF的高级会员。因不知道您的联系方式，只能先给您发邮件，希望能够得到您的回复。
ï¿½ï¿½ ï¿½ï¿½ 核桃编程是面向青少年编程教育的线上头部企业，总部在北京苏州街。目前长期付费学员规模超过100万人，累计用户超过1500万人。其中图形化编程、Python、C++学员比例大体为6：3：1。这两年，公司和CCF合作紧密，我们是GESP的组织委员会委员，另外在Python、C++这两块投入很大，成立集训队，对标CSP、NOI进行学员的培养。总而言之，我们是希望在CCF的体系下，更多的做些贡献和公益性合作，特别是协助区域做一些提升规模和质量的事情。我们一直跟北师大部分院系有着长期紧密的合作，也是想借这个契机，跟您接触后拓展更多的合作点。
ï¿½ï¿½ ï¿½ ï¿½ 您看可否回个电话，或者添加个微信，咱们先建立个联系。具体情况可当面再跟您沟通。祝好！

ï¿½ï¿½ ï¿½ ï¿½ï¿½ 李哲：18610144332 （同微信）
handled_body 黄院长您好：
ï¿½ï¿½ ï¿½ï¿½ 首先祝贺您 担任CCF教育工委副主任！ 冒昧打扰！我是核桃编程 公共事务负责人 李哲，也是咱们CCF的高级会员。因不知道您的联系方式，只能先给您发邮件，希望能够得到您的回复。
ï¿½ï¿½ ï¿½ï¿½ 核桃编程是面向青少年编程教育的线上头部企业，总部在北京苏州街。目前长期付费学员规模超过100万人，累计用户超过1500万人。其中图形化编程、Python、C++学员比例大体为6：3：1。这两年，公司和CCF合作紧密，我们是GESP的组织委员会委员，另外在Python、C++这两块投入很大，成立集训队，对标CSP、NOI进行学员的培养。总而言之，我们是希望在CCF的体系下，更多的做些贡献和公益性合作，特别是协助区域做一些提升规模和质量的事情。我们一直跟北师大部分院系有着长期紧密的合作，也是想借这个契机，跟您接触后拓展更多的合作点。
ï¿½ï¿½ ï¿½ ï¿½ 您看可否回个电话，或者添加个微信，咱们先建立个联系。具体情况可当面再跟您沟通。祝好！

ï¿½ï¿½ ï¿½ ï¿½ï¿½ 李哲：18610144332 （同