In [1]:
import subprocess
import os
import re
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
from pymilvus import connections, utility, FieldSchema, CollectionSchema, DataType, Collection

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# --- 1. 配置 ---
SOURCE_DOC_FILE = "./yq2021-0602文娱产业正离文化越来越远.doc"

# Milvus 配置 (默认使用 Lite 本地模式)
USE_MILVUS_LITE = False 
# 如果您改为 False (使用 Docker)，请设置以下 IP
MILVUS_HOST = "192.168.16.138" 
MILVUS_PORT = "19530"

COLLECTION_NAME = "report_analysis"
ID_FIELD = "chunk_id"
VECTOR_FIELD = "vector"
TEXT_FIELD = "text_content"
MODEL_NAME = 'all-mpnet-base-v2'
EMBEDDING_DIM = 768

# 切割参数控制
MAX_CHUNK_LEN = 600  # 超过这个长度强制切分
MIN_CHUNK_LEN = 100  # 小于这个长度尝试合并

In [3]:
# --- 2. 文档加载模块 ---
def find_libreoffice():
    possible_paths = [
        r"C:\Program Files\LibreOffice\program\soffice.exe",
        r"C:\Program Files (x86)\LibreOffice\program\soffice.exe",
        r"D:\LibreOffice\program\soffice.exe",
        "soffice"
    ]
    for path in possible_paths:
        try:
            if path != "soffice" and not os.path.exists(path): continue
            result = subprocess.run([path, '--version'], capture_output=True, text=True, timeout=10)
            if result.returncode == 0: return path
        except: continue
    return None

In [4]:
def load_doc_as_text(doc_path):
    print("正在调用 LibreOffice 加载文档...")
    libreoffice_path = find_libreoffice()
    if not libreoffice_path: return None
    
    doc_path = os.path.abspath(doc_path)
    output_dir = os.path.dirname(doc_path)
    html_filename = os.path.basename(doc_path).rsplit('.', 1)[0] + '.html'
    html_path = os.path.join(output_dir, html_filename)
    
    if os.path.exists(html_path): os.remove(html_path)
    cmd = [libreoffice_path, '--headless', '--convert-to', 'html', '--outdir', output_dir, doc_path]
    subprocess.run(cmd, capture_output=True, text=True)
    
    if not os.path.exists(html_path): return None
    
    content = ""
    for enc in ['utf-8', 'gb18030', 'gbk']:
        try:
            with open(html_path, 'r', encoding=enc) as f:
                content = f.read()
                break
        except: continue
            
    soup = BeautifulSoup(content, 'html.parser')
    text = soup.get_text()
    try: os.remove(html_path)
    except: pass
    return text

In [5]:
# --- 3. 智能语义切割 (已修改：丢弃相关链接) ---
def split_text_smart(text, max_len=MAX_CHUNK_LEN, min_len=MIN_CHUNK_LEN):
    print(f"正在进行智能语义切割 (Max: {max_len}, Min: {min_len})...")
    
    # === [核心修改] 1. 移除 "相关链接" 及其后所有内容 ===
    # 只要遇到这两个标记中的任何一个，直接截断
    text = re.split(r"〖相关链接：信息〗|〖相关链接：报告〗", text, 1)[0]
    print("已移除 '相关链接' 部分。")
    
    # 2. 基础清理
    text = re.sub(r"\"", "", text)
    text = re.sub(r"[\x00-\x1f\x7f-\x9f]", "", text)
    text = re.sub(r"\n\s*\n", "\n", text).strip()
    
    # 3. 定义语义分隔符 (移除了已被删除的相关链接标记)
    # 强制在这些词前面切开
    semantic_markers = [
        "摘要：", 
        "一，", "二，", "三，", "四，", 
        "毋庸置疑，", "究其原因，", "更进一步分析，"
    ]
    pattern = f"(?=({'|'.join(re.escape(m) for m in semantic_markers)}))"
    
    # 初步切分
    raw_segments = re.split(pattern, text)
    raw_segments = [s.strip() for s in raw_segments if s.strip()]
    
    final_chunks = []
    current_chunk = ""

    # 4. 遍历处理长度
    for segment in raw_segments:
        if len(segment) > max_len:
            if current_chunk:
                final_chunks.append(current_chunk)
                current_chunk = ""
            sub_chunks = recursive_split_sentence(segment, max_len)
            final_chunks.extend(sub_chunks)
        else:
            is_semantic_start = any(segment.startswith(m) for m in semantic_markers)
            
            if current_chunk:
                # 如果长度允许，且不是新的重要标题，则合并
                if len(current_chunk) + len(segment) < max_len and not is_semantic_start:
                    current_chunk += "\n" + segment
                else:
                    # 必须断开。如果上一块太短，且不是强行分隔，尝试粘到这一块头上
                    if len(current_chunk) < min_len and not is_semantic_start:
                         segment = current_chunk + "\n" + segment
                         current_chunk = segment
                    else:
                        final_chunks.append(current_chunk)
                        current_chunk = segment
            else:
                current_chunk = segment
                
    if current_chunk:
        final_chunks.append(current_chunk)

    print(f"切割完成，共得到 {len(final_chunks)} 个语义完整的文本块。")
    return final_chunks

In [6]:
def recursive_split_sentence(text, max_len):
    """辅助函数：按句子拆分超长段落"""
    chunks = []
    current = ""
    # 按优先级拆分：换行 > 句号 > 其他标点
    splits = re.split(r'([\n。！？；])', text) 
    
    temp_sentence = ""
    for part in splits:
        temp_sentence += part
        # 只有当遇到标点符号，或者攒够了一定长度时才检查
        if len(temp_sentence) > max_len:
             # 单句过长，强制按字切
             chunks.append(temp_sentence[:max_len])
             temp_sentence = temp_sentence[max_len:]
        
        if part in ["\n", "。", "！", "？", "；"]: # 句子结束了
            if len(current) + len(temp_sentence) > max_len:
                if current: chunks.append(current)
                current = temp_sentence
            else:
                current += temp_sentence
            temp_sentence = ""
            
    # 处理剩余
    if temp_sentence: current += temp_sentence
    if current: chunks.append(current)
    
    return chunks

In [7]:
# --- 4. 主程序 ---
def main():
    full_text = load_doc_as_text(SOURCE_DOC_FILE)
    if not full_text: return

    # 执行切割
    chunks = split_text_smart(full_text)
    
    # 预览检查
    print("\n" + "="*30)
    print("      切分效果预览      ")
    print("="*30)
    for i, chunk in enumerate(chunks):
        # 打印前几条，以及特定的标题块，方便检查
        markers = ["一，", "二，", "三，", "四，", "现代与传统"]
        is_marker_chunk = any(m in chunk[:20] for m in markers)
        
        if i < 3 or is_marker_chunk:
            print(f"\n>>> Chunk {i} (长度: {len(chunk)})")
            clean_preview = chunk[:60].replace('\n', ' ')
            print(f"内容: {clean_preview}...")
            print("-" * 30)

    # 存入 Milvus
    print(f"\n正在加载嵌入模型 '{MODEL_NAME}'...")
    model = SentenceTransformer(MODEL_NAME)
    embeddings = model.encode(chunks, show_progress_bar=True)
    
    print("正在连接 Milvus...")
    if USE_MILVUS_LITE:
        print("使用 Milvus Lite 本地模式...")
        connections.connect("default", uri="./milvus_demo.db")
    else:
        print(f"使用 Docker 模式 ({MILVUS_HOST})...")
        connections.connect("default", host=MILVUS_HOST, port=MILVUS_PORT)
    
    if utility.has_collection(COLLECTION_NAME):
        utility.drop_collection(COLLECTION_NAME)
        
    fields = [
        FieldSchema(name=ID_FIELD, dtype=DataType.INT64, is_primary=True, auto_id=True),
        FieldSchema(name=VECTOR_FIELD, dtype=DataType.FLOAT_VECTOR, dim=EMBEDDING_DIM),
        FieldSchema(name=TEXT_FIELD, dtype=DataType.VARCHAR, max_length=65535)
    ]
    collection = Collection(name=COLLECTION_NAME, schema=CollectionSchema(fields))
    
    print(f"正在插入 {len(chunks)} 条数据...")
    collection.insert([embeddings, chunks])
    
    print("正在写入磁盘 (Flush)...")
    collection.flush() 
    
    print("正在创建索引...")
    index_params = {"metric_type": "L2", "index_type": "IVF_FLAT", "params": {"nlist": 128}}
    collection.create_index(VECTOR_FIELD, index_params)
    collection.load()
    
    print(f"\n任务完成！共存入 {collection.num_entities} 条纯净数据（已去除相关链接）。")

if __name__ == "__main__":
    main()

正在调用 LibreOffice 加载文档...
正在进行智能语义切割 (Max: 600, Min: 100)...
已移除 '相关链接' 部分。
切割完成，共得到 24 个语义完整的文本块。

      切分效果预览      

>>> Chunk 0 (长度: 101)
内容: 〖特别报告〗      政府左右房价？（下）2021年06月02日福卡分析                       ...
------------------------------

>>> Chunk 1 (长度: 3)
内容: 摘要：...
------------------------------

>>> Chunk 2 (长度: 475)
内容: 摘要：文娱产业偏离文化的背后折射出时尚与经典文化的对冲，在人类文明转型阶段，文化上的变革是不可避免的，只不过，时尚化显然...
------------------------------

>>> Chunk 3 (长度: 2)
内容: 一，...
------------------------------

>>> Chunk 4 (长度: 149)
内容: 一，明星IP化。明星效应膨胀式放大，并通过多个出口变现。最典型的便是拟上市公司和明星进行深度绑定，借其人气从资本市场获得...
------------------------------

>>> Chunk 5 (长度: 2)
内容: 二，...
------------------------------

>>> Chunk 6 (长度: 175)
内容: 二，饭圈邪教化。“饭圈”粉丝构建了一套包括打投组、反黑组、安利站等在内的严密的组织体系，往往乐于将偶像标签化，如“美强惨...
------------------------------

>>> Chunk 7 (长度: 2)
内容: 三，...
------------------------------

>>> Chunk 8 (长度: 169)
内容: 三，运作金融化。文娱产业俨然成为了一场裹挟着各路资本套路的“买卖”，如近期“倒奶事件”便是在“唯钱是举”的打投应援机制下...
------------------------------

>>> Chunk 9 (长度: 2)
内容

Batches: 100%|██████████| 1/1 [00:03<00:00,  3.66s/it]


正在连接 Milvus...
使用 Docker 模式 (192.168.16.138)...
正在插入 24 条数据...
正在写入磁盘 (Flush)...
正在创建索引...

任务完成！共存入 24 条纯净数据（已去除相关链接）。
