In [3]:
import os
import re
import subprocess
import tempfile
from pathlib import Path

from bs4 import BeautifulSoup
from pymilvus import (
    connections,
    utility,
    FieldSchema,
    CollectionSchema,
    DataType,
    Collection,
)
from sentence_transformers import SentenceTransformer

In [4]:
# --- 1. Configuration ---

# Source document to process
SOURCE_DOC_FILE = "./yq2021-0602文娱产业正离文化越来越远.doc"

# Milvus connection settings (adjust to match your own deployment)
MILVUS_HOST = "192.168.16.138"
MILVUS_PORT = "19530"

# Milvus collection (table) settings
COLLECTION_NAME = "report_analysis"
ID_FIELD = "chunk_id"
VECTOR_FIELD = "vector"
TEXT_FIELD = "text_content"

# Embedding model settings
# EMBEDDING_DIM is used as the `dim` of the vector field in the collection schema.
# NOTE(review): presumably it must equal the output dimension of MODEL_NAME —
# update both together when switching models.
# NOTE(review): the source document is Chinese; confirm that this model is
# suitable for Chinese text before relying on search quality.
MODEL_NAME = 'all-mpnet-base-v2'
EMBEDDING_DIM = 768

In [None]:
# --- 2. 文档加载  ---

def find_libreoffice():
    """
    Locate a working LibreOffice executable.

    Each candidate is probed by running ``<candidate> --version``; the first
    one that exits with return code 0 wins.

    Returns:
        The candidate string (absolute path or bare command name) that worked,
        or None when no candidate could be executed successfully.
    """
    possible_paths = [
        "/usr/local/bin/libreoffice",
        "/Applications/LibreOffice.app/Contents/MacOS/soffice",
        "/opt/homebrew/bin/libreoffice",
        "soffice",
        # Also try the other common launcher name via PATH.
        "libreoffice",
    ]

    for path in possible_paths:
        try:
            result = subprocess.run([path, '--version'],
                                    capture_output=True, text=True, timeout=10)
        except (OSError, subprocess.TimeoutExpired):
            # OSError covers FileNotFoundError as well as PermissionError and
            # similar failures (e.g. a file that exists but is not executable),
            # so one unusable candidate never aborts the whole search.
            continue
        if result.returncode == 0:
            print(f"找到 LibreOffice: {path}")
            return path

    return None

In [None]:
def load_doc_as_text(doc_path):
    """
    Convert a .doc file to HTML with LibreOffice and return its plain text.

    The conversion output is written into a private temporary directory rather
    than next to the source document. This way an unrelated, pre-existing
    ``<name>.html`` beside the source is never deleted, and the generated HTML
    (plus any auxiliary files the converter may write) is removed even when
    reading or parsing fails.

    Args:
        doc_path: Path to the source document (relative or absolute).

    Returns:
        The text extracted from the generated HTML, or None when LibreOffice
        cannot be found, the source file does not exist, the conversion fails
        or times out, or any other error occurs while loading.
    """
    print(f"正在加载文档: {doc_path}")
    libreoffice_path = find_libreoffice()
    if not libreoffice_path:
        print("错误: 未找到 LibreOffice，无法转换 .doc 文件。")
        return None

    doc_path = os.path.abspath(doc_path)
    if not os.path.exists(doc_path):
        print(f"错误: 文件不存在 {doc_path}")
        return None

    # Expected output name: source base name with its last extension replaced.
    html_filename = os.path.basename(doc_path).rsplit('.', 1)[0] + '.html'

    try:
        with tempfile.TemporaryDirectory(prefix="doc2html_") as output_dir:
            html_path = os.path.join(output_dir, html_filename)
            cmd = [
                libreoffice_path,
                '--headless',
                '--convert-to', 'html',
                '--outdir', output_dir,
                doc_path
            ]

            print("正在使用 LibreOffice 转换 .doc 为 HTML...")
            result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)

            if result.returncode != 0:
                print(f"LibreOffice 转换失败: {result.stderr}")
                return None

            # A zero return code alone is not trusted: verify the file exists.
            if not os.path.exists(html_path):
                print(f"HTML 文件未生成: {html_path}")
                return None

            print(f"HTML 文件已生成: {html_path}")

            # Read raw bytes so BeautifulSoup can pick the encoding declared in
            # the HTML itself instead of assuming UTF-8.
            with open(html_path, 'rb') as f:
                html_content = f.read()

        # The temporary directory is already gone here; parsing only needs the
        # bytes held in memory.
        soup = BeautifulSoup(html_content, 'html.parser')

        # Return the plain-text content.
        return soup.get_text()

    except subprocess.TimeoutExpired:
        print("LibreOffice 转换超时")
        return None
    except Exception as e:
        # Best-effort loader: report the problem and signal failure with None.
        print(f"加载 .doc 文件时出错: {e}")
        return None

In [None]:
# --- 3. 策略A：结构化切割函数 ---
def split_text_structurally(text, min_chunk_len=50):
    """
    Split the report text into chunks along its document-specific headings.

    Processing steps:
      1. Drop everything from the first "related links" marker onwards.
      2. Remove double-quote characters and the 〖特别报告〗 marker.
      3. Drop the footer (everything from the hard-coded date onwards), strip
         control characters and collapse runs of blank lines.
      4. Split in front of each structural delimiter, keeping the delimiter at
         the start of its chunk.
      5. Strip each chunk and discard the short ones.

    Args:
        text: Plain text of the whole document.
        min_chunk_len: Chunks whose stripped length is not greater than this
            value are discarded. Defaults to 50.

    Returns:
        A list of stripped chunk strings, in document order.
    """
    print("开始进行结构化切割...")

    # 1. Remove the "related links" section and everything after it.
    text = re.split(r"〖相关链接：信息〗|〖相关链接：报告〗", text, maxsplit=1)[0]

    # 2. Remove double quotes and the 〖特别报告〗 marker.
    text = text.replace('"', "").strip()
    text = text.replace("〖特别报告〗", "").strip()

    # 3. Remove the footer (from the date onwards) and control characters.
    text = re.sub(r"2021年06月02日.*", "", text, flags=re.DOTALL).strip()
    # Tab (\x09) and newline (\x0a) are deliberately kept: deleting newlines
    # here would fuse all paragraphs and turn the blank-line normalisation
    # below into a no-op.
    text = re.sub(r"[\x00-\x08\x0b-\x1f\x7f-\x9f]", "", text)
    # Collapse blank-line runs left over from HTML text extraction.
    text = re.sub(r"\n\s*\n", "\n\n", text).strip()

    # 4. Structural delimiters. They are matched anywhere in the text, not
    #    only at the start of a line.
    delimiters = [
        "摘要：",
        "一，",
        "二，",
        "三，",
        "四，",
        "毋庸置疑，",
        "究其原因，",
        "更进一步分析，",
    ]
    alternation = "|".join(re.escape(d) for d in delimiters)

    # 5. Split with a zero-width lookahead so each delimiter stays attached to
    #    the chunk it introduces. The group is non-capturing; a capturing group
    #    would make re.split also emit every delimiter as its own element.
    chunks = re.split(f"(?=(?:{alternation}))", text)

    # 6. Strip and drop empty or too-short pieces.
    cleaned_chunks = []
    for chunk in chunks:
        stripped = chunk.strip()
        if stripped and len(stripped) > min_chunk_len:
            cleaned_chunks.append(stripped)

    print(f"切割完成，共得到 {len(cleaned_chunks)} 个文本块 (Chunks)。")
    return cleaned_chunks

In [None]:
# --- 4. Milvus 处理函数 ---
def setup_milvus_collection():
    """
    Connect to Milvus and (re)create the target collection.

    If a collection named COLLECTION_NAME already exists it is dropped first,
    so every call starts from an empty collection.

    Returns:
        The newly created Collection with three fields: an auto-generated
        INT64 primary key, a float vector of EMBEDDING_DIM dimensions, and a
        VARCHAR text field.
    """
    print(f"正在连接 Milvus ({MILVUS_HOST}:{MILVUS_PORT})...")
    connections.connect("default", host=MILVUS_HOST, port=MILVUS_PORT)

    # Start from a clean slate: an existing collection is removed, not reused.
    already_exists = utility.has_collection(COLLECTION_NAME)
    if already_exists:
        print(f"Collection '{COLLECTION_NAME}' 已存在，将删除重建。")
        utility.drop_collection(COLLECTION_NAME)

    primary_key_field = FieldSchema(
        name=ID_FIELD, dtype=DataType.INT64, is_primary=True, auto_id=True
    )
    embedding_field = FieldSchema(
        name=VECTOR_FIELD, dtype=DataType.FLOAT_VECTOR, dim=EMBEDDING_DIM
    )
    content_field = FieldSchema(
        name=TEXT_FIELD, dtype=DataType.VARCHAR, max_length=65535
    )
    schema = CollectionSchema(
        [primary_key_field, embedding_field, content_field],
        description="文娱产业报告分析",
    )

    new_collection = Collection(name=COLLECTION_NAME, schema=schema)
    print(f"Collection '{COLLECTION_NAME}' 创建成功。")
    return new_collection

def create_vector_index(collection):
    """
    Build the index on the vector field of the given collection.

    Args:
        collection: Object exposing ``create_index(field_name=..., index_params=...)``;
            the index is created on VECTOR_FIELD with an IVF_FLAT / L2
            configuration (nlist=128).
    """
    print("开始创建向量索引...")
    collection.create_index(
        field_name=VECTOR_FIELD,
        index_params=dict(
            metric_type="L2",
            index_type="IVF_FLAT",
            params={"nlist": 128},
        ),
    )
    print("向量索引创建成功。")


In [None]:
# --- 5. 主执行函数 ---
def main():
    """
    Run the full ingestion pipeline for SOURCE_DOC_FILE.

    Steps: load the document as plain text, split it into chunks, embed the
    chunks with MODEL_NAME, (re)create the Milvus collection, insert the
    vectors together with their text, build the vector index and load the
    collection for searching.

    The function returns early (after printing a message) when the document
    cannot be loaded, when no chunks are produced, or when the embedding
    dimension does not match EMBEDDING_DIM.
    """
    # 1. Load the document as plain text.
    full_text = load_doc_as_text(SOURCE_DOC_FILE)
    if not full_text:
        print(f"未能加载文件 {SOURCE_DOC_FILE}，退出。")
        return

    # 2. Split the text into chunks.
    chunks = split_text_structurally(full_text)
    if not chunks:
        print("未能从文档中切割出任何文本块，退出。")
        return

    # Preview the first few chunks.
    for i, chunk in enumerate(chunks[:3]):
        print(f"\n--- 预览 Chunk {i+1} (前50字) ---")
        print(chunk[:50] + "...")

    # 3. Load the embedding model.
    print(f"\n正在加载嵌入模型 '{MODEL_NAME}'...")
    model = SentenceTransformer(MODEL_NAME)

    # 4. Generate one vector per chunk.
    print("正在为所有文本块生成向量 (Embedding)...")
    embeddings = model.encode(chunks, show_progress_bar=True)

    # Validate before touching Milvus: setup_milvus_collection() drops any
    # existing collection, so a model/config mismatch must be caught first,
    # otherwise the old data is gone by the time the mismatch surfaces.
    actual_dim = len(embeddings[0])
    if actual_dim != EMBEDDING_DIM:
        print(f"错误: 模型输出维度 {actual_dim} 与 EMBEDDING_DIM={EMBEDDING_DIM} 不一致，退出。")
        return

    # 5. Connect to Milvus and (re)create the collection.
    collection = setup_milvus_collection()

    # 6. Column-ordered data; the auto-generated primary key is omitted.
    data_to_insert = [
        embeddings,     # VECTOR_FIELD
        chunks          # TEXT_FIELD
    ]

    # 7. Insert and flush.
    print(f"正在向 Milvus 插入 {len(chunks)} 条数据...")
    insert_result = collection.insert(data_to_insert)
    collection.flush()
    print(f"数据插入成功，主键(PKs): {insert_result.primary_keys}")

    # 8. Build the vector index.
    create_vector_index(collection)

    # 9. Load the collection into memory so it can be searched.
    print("正在加载 Collection 到内存以备搜索...")
    collection.load()

    print("\n--- 任务完成 ---")
    print(f"文档 '{SOURCE_DOC_FILE}' 已成功加载、切割、向量化，并存入 Milvus。")
    print(f"总计 {collection.num_entities} 条记录。")

# Entry point: run the whole pipeline when this code is executed directly.
if __name__ == "__main__":
    main()