### 先获取原始邮件文件并且进行存储


In [None]:
import os
from service import EmailReceiver, EmailSender
from config import config

# Build the IMAP receiver. Every connection setting is read from `config`,
# so those values must already be filled in correctly before this cell runs.
receiver_settings = dict(
    server=config.IMAP_SERVER,
    port=config.IMAP_PORT,
    username=config.EMAIL_USERNAME,
    password=config.EMAIL_PASSWORD,
    mailbox="INBOX",
)
email_receiver = EmailReceiver(**receiver_settings)

# Echo the resolved project paths so a misconfigured data directory is
# noticed before anything is written to disk.
print("Base dir:", config.BASE_DIR)
print("Project root:", config.PROJECT_ROOT)
print("Data dir:", config.DATA_DIR)

# Storage locations: raw messages are kept apart from other email data.
EMAIL_DIR, RAW_EMAIL_DIR = (
    os.path.join(config.DATA_DIR, subdir) for subdir in ("emails", "raw_emails")
)

try:
    email_receiver.connect()
    email_ids = email_receiver.fetch_all_email_ids()
    print(f"Found {len(email_ids)} emails.")

    # Ask the receiver to store every message and extract its attachments.
    # In the recorded run each message is written as "<id>.txt" and the
    # attachments land in the same directory as the messages.
    for email_id in email_ids:
        email_receiver.save_email(email_id, RAW_EMAIL_DIR, extract_attachments=True)
finally:
    # Always release the connection, even when fetching or saving fails.
    email_receiver.disconnect()


Base dir: /Users/apple/Projects/LLM-email/src/config
Project root: /Users/apple/Projects/LLM-email
Data dir: /Users/apple/Projects/LLM-email/data
Connected to IMAP server imap.bnu.edu.cn
Found 3229 emails.
Email ID 1 saved to /Users/apple/Projects/LLM-email/data/raw_emails/1.txt
Email ID 2 saved to /Users/apple/Projects/LLM-email/data/raw_emails/2.txt
Email ID 3 saved to /Users/apple/Projects/LLM-email/data/raw_emails/3.txt
Email ID 4 saved to /Users/apple/Projects/LLM-email/data/raw_emails/4.txt
Email ID 5 saved to /Users/apple/Projects/LLM-email/data/raw_emails/5.txt
Email ID 6 saved to /Users/apple/Projects/LLM-email/data/raw_emails/6.txt
Email ID 7 saved to /Users/apple/Projects/LLM-email/data/raw_emails/7.txt
Email ID 8 saved to /Users/apple/Projects/LLM-email/data/raw_emails/8.txt
Email ID 9 saved to /Users/apple/Projects/LLM-email/data/raw_emails/9.txt
Attachment 北京.xlsx saved to /Users/apple/Projects/LLM-email/data/raw_emails/北京.xlsx
Email ID 10 saved to /Users/apple/Projects/L

### 解析 raw 邮件文件，把杂乱的邮件内容解析成结构化数据，最后向量化存储


In [None]:
# 测试代码
# Example usage
from utils import parse_email
from config import config
import os
import json

# Smoke test: parse a single raw message, store the structured result as JSON
# and echo the same JSON so it can be inspected in the notebook.
DATA_DIR = config.DATA_DIR
test_input_file_path = os.path.join(DATA_DIR, "raw_emails", "4.txt")
test_output_file_path = os.path.join(DATA_DIR, "parsed_emails", "4.json")
parsed_email = parse_email(test_input_file_path)

# Serialize once and reuse the text for both the file and the printed preview.
parsed_email_json = json.dumps(parsed_email, ensure_ascii=False, indent=4)

# The output directory may not exist yet on a fresh checkout; create it so the
# write below does not fail with FileNotFoundError.
os.makedirs(os.path.dirname(test_output_file_path), exist_ok=True)
with open(test_output_file_path, "w", encoding="utf-8") as f:
    f.write(parsed_email_json)


print(parsed_email_json)

In [None]:
# 批量处理全部原始文件
from utils import parse_email
from config import config
import os
import json

# Batch step: parse every raw message in RAW_EMAIL_DIR and write the result
# to PARSED_EMAIL_DIR as "<message id>.json".
DATA_DIR = config.DATA_DIR
RAW_EMAIL_DIR = os.path.join(DATA_DIR, "raw_emails")
PARSED_EMAIL_DIR = os.path.join(DATA_DIR, "parsed_emails")

# Make sure the destination exists before the first write.
os.makedirs(PARSED_EMAIL_DIR, exist_ok=True)

for filename in os.listdir(RAW_EMAIL_DIR):
    stem, extension = os.path.splitext(filename)
    # Extracted attachments (e.g. .xlsx files) are stored next to the raw
    # messages; only the "<id>.txt" message files are emails to parse.
    if extension.lower() != ".txt":
        continue

    input_file_path = os.path.join(RAW_EMAIL_DIR, filename)
    # Name the output after the stem so "4.txt" becomes "4.json" rather than
    # "4.txt.json".
    output_file_path = os.path.join(PARSED_EMAIL_DIR, f"{stem}.json")
    parsed_email = parse_email(input_file_path)
    # Save result
    with open(output_file_path, "w", encoding="utf-8") as f:
        json.dump(parsed_email, f, ensure_ascii=False, indent=4)



In [None]:
# 将所有原始格式的邮件转换成json格式的文件，保留原始的body内容
from utils import parse_email
from config import config
import os
import json

# Convert every raw message into a JSON file named after the message, keeping
# the original body content as returned by parse_email.
DATA_DIR = config.DATA_DIR
RAW_EMAIL_DIR = os.path.join(DATA_DIR, "raw_emails")
PARSED_EMAIL_DIR = os.path.join(DATA_DIR, "parsed_emails")

# Make sure the destination exists before the first write.
os.makedirs(PARSED_EMAIL_DIR, exist_ok=True)

for filename in os.listdir(RAW_EMAIL_DIR):
    # Split off the extension without rebinding the loop variable.
    stem, extension = os.path.splitext(filename)
    # Extracted attachments (e.g. .xlsx files) share this directory with the
    # "<id>.txt" messages; skip them so they are neither parsed as emails nor
    # allowed to overwrite the JSON of a message with the same stem.
    if extension.lower() != ".txt":
        continue

    input_file_path = os.path.join(RAW_EMAIL_DIR, filename)
    output_file_path = os.path.join(PARSED_EMAIL_DIR, f"{stem}.json")
    parsed_email = parse_email(input_file_path)
    # Save result
    with open(output_file_path, "w", encoding="utf-8") as f:
        json.dump(parsed_email, f, ensure_ascii=False, indent=4)

In [None]:
import json
import os
from bs4 import BeautifulSoup

from config import config

# Directory layout, derived from the configured data directory.
DATA_DIR = config.DATA_DIR
# Raw message directory; not referenced by the processing code that follows.
RAW_EMAIL_DIR = os.path.join(DATA_DIR, "raw_emails")
# Parsed-email JSON files; the batch loop below reads and rewrites these.
PARSED_EMAIL_DIR = os.path.join(DATA_DIR, "parsed_emails")

# Email body post-processing: strip HTML markup, then resolve literal
# backslash escapes (such as "\u4f60" or "\n") that remain in the text.
def decode_unicode_escapes(text):
    """Resolve literal backslash escapes in *text* without harming real characters.

    Decoding the UTF-8 bytes of the text with ``unicode_escape`` treats every
    byte as Latin-1 and turns all non-ASCII characters (e.g. Chinese) into
    mojibake. Here non-Latin-1 characters are first rewritten as escapes
    themselves, so they survive the round trip unchanged.

    Args:
        text: Plain text that may contain escape sequences as literal
            characters.

    Returns:
        The text with escape sequences resolved. If the text contains a
        malformed escape (for example a trailing backslash) it is returned
        unchanged instead of raising.
    """
    try:
        decoded = text.encode("latin-1", "backslashreplace").decode("unicode_escape")
    except UnicodeDecodeError:
        # Malformed escape: keeping the text as-is beats losing the body.
        return text

    # Escaped surrogate pairs ("\ud83d\ude00") decode to two separate
    # surrogate code points, which cannot be written as UTF-8. A UTF-16 round
    # trip joins valid pairs and replaces unpaired surrogates with U+FFFD.
    return decoded.encode("utf-16", "surrogatepass").decode("utf-16", "replace")


def clean_and_extract_body(body):
    """Extract plain text from an email body that may contain HTML and escapes.

    Args:
        body: Raw body string; ``None`` or an empty string is treated as an
            empty body.

    Returns:
        The body text with HTML tags removed, text fragments separated by
        newlines, and literal escape sequences resolved.
    """
    if not body:
        return ""

    # Strip the HTML tags with BeautifulSoup.
    soup = BeautifulSoup(body, "html.parser")

    # separator="\n" keeps text from different elements on separate lines.
    text = soup.get_text(separator="\n", strip=True)

    return decode_unicode_escapes(text)

# Batch step: add a cleaned "handled_body" field to every parsed-email JSON
# file in PARSED_EMAIL_DIR. Each file is rewritten in place.
for filename in os.listdir(PARSED_EMAIL_DIR):
    # Only JSON files are processed.
    if not filename.endswith(".json"):
        continue

    input_file_path = os.path.join(PARSED_EMAIL_DIR, filename)
    # The result goes back to the same file it was read from.
    output_file_path = input_file_path

    try:
        with open(input_file_path, "r", encoding="utf-8") as f:
            email_json = json.load(f)

        # Derive the cleaned text from the original body.
        handled_body = clean_and_extract_body(email_json['body'])
        print("Handled Body: ", handled_body)

        email_json['handled_body'] = handled_body

        # Serialize and check encodability BEFORE opening the file for
        # writing: opening with "w" truncates it, so a failure after that
        # point would destroy the parsed email this step started from.
        serialized = json.dumps(email_json, indent=4, ensure_ascii=False)
        serialized.encode("utf-8")

        with open(output_file_path, "w", encoding="utf-8") as f:
            f.write(serialized)

    except Exception as e:
        # Best-effort batch: report the failing file and continue with the rest.
        print(f"Error processing file {filename}: {e}")


In [None]:
# # 继续处理handled_body部分，从标签语法中提取出原始文本进行存储
# from utils import parse_email,handle_email_body
# from config import config
# import os
# from bs4 import BeautifulSoup
# import json
# import re

# DATA_DIR = config.DATA_DIR
# RAW_EMAIL_DIR = os.path.join(DATA_DIR, "raw_emails")
# PARSED_EMAIL_DIR = os.path.join(DATA_DIR, "parsed_emails")


# def clean_text(text):
#     """ 处理转义字符 """
#     # 将转义字符转换为实际字符
#     return bytes(text, 'utf-8').decode('unicode_escape')

# def extract_text_from_html(html):
#     """ 从HTML中提取纯文本 """
#     # 使用BeautifulSoup解析HTML
#     soup = BeautifulSoup(html, "html.parser")
#     # 提取文本并清理多余的空白
#     text = soup.get_text(separator="\n", strip=True)
#     return text

# def process_email_data(email_data):
#     """ 处理邮件数据并提取纯文本 """
#     body = email_data["body"]
#     handled_body = email_data["handled_body"]

#     # 处理带HTML标签的邮件内容
#     if body:
#         body = clean_text(body)
#         email_data["body"] = extract_text_from_html(body)

#     # 处理handled_body，清理标签并提取纯文本
#     if handled_body:
#         handled_body = clean_text(handled_body)
#         email_data["handled_body"] = extract_text_from_html(handled_body)

#     return email_data

# for filename in os.listdir(PARSED_EMAIL_DIR):
#     input_file_path = os.path.join(PARSED_EMAIL_DIR, filename)
#     # 去掉扩展名
#     filename = os.path.splitext(filename)[0]
#     output_file_path = os.path.join(PARSED_EMAIL_DIR, f"{filename}.json")
#     with open(input_file_path, "r", encoding="utf-8") as f:
#         content = f.read()
#     read_json=json.loads(content)
#     tar_json=process_email_data(read_json)
#     with open(output_file_path, "w", encoding="utf-8") as f:
#         f.write(json.dumps(tar_json, indent=4, ensure_ascii=False))
    