# In [None]:
import os  
import pandas as pd  
from ChineseTextSplitter import ChineseTextSplitter  # 确保这个类或模块存在  
  
# To process .docx files, make sure the python-docx library is installed:
# pip install python-docx
try:
    from docx import Document
except ImportError:
    # Only a notice is printed here (it asks the user to install python-docx).
    # Document is left undefined, so read_docx will raise NameError if it is
    # called without the library installed.
    print("请安装python-docx库来支持.docx文件的处理。")
  
def read_docx(file_path):
    """Return the text content of a .docx file.

    The file is opened with python-docx's ``Document`` and the ``text`` of
    every entry in ``paragraphs`` is joined with newline characters.
    """
    document = Document(file_path)
    return '\n'.join(paragraph.text for paragraph in document.paragraphs)
  
def split_and_save_to_excel(text_folder, output_excel):
    """Split every .txt/.docx file in a folder into chunks and write them to Excel.

    Each file directly inside ``text_folder`` whose name ends with ``.txt`` or
    ``.docx`` is read, split with ``ChineseTextSplitter`` and appended to one
    table with the columns ``"Chunk ID"`` and ``"Text Chunk"``. A chunk ID has
    the form ``"Chunk <n>_<file name without extension>"``, where ``<n>`` starts
    at 1 for every file.

    Args:
        text_folder: Path of the folder containing the input files.
        output_excel: Path of the .xlsx file to write.
    """
    all_chunks = []

    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order of the output reproducible between runs and platforms.
    for filename in sorted(os.listdir(text_folder)):
        if not filename.endswith(('.txt', '.docx')):
            continue
        file_path = os.path.join(text_folder, filename)

        # Read the text according to the file type.
        if filename.endswith('.txt'):
            with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read()
        else:
            text = read_docx(file_path)

        # NOTE(review): the splitter is built per file, as before; whether it
        # keeps state between calls is not visible here, so it is not hoisted.
        textsplitter = ChineseTextSplitter(1, sentence_size=300, overlap_size=0.2)
        chunks = textsplitter.split_text(text)

        # Strip the extension at the last dot. A fixed [:-4] slice is only
        # right for ".txt" and would leave a trailing "." for ".docx" files.
        stem = filename.rpartition('.')[0]

        # Tag every chunk with its index and the name of its source file.
        for i, chunk in enumerate(chunks, start=1):
            all_chunks.append((f"Chunk {i}_{stem}", chunk.strip()))

    # Convert the collected (id, text) pairs into a DataFrame.
    chunks_df = pd.DataFrame(all_chunks, columns=["Chunk ID", "Text Chunk"])

    # Write the table to the Excel file through pandas' ExcelWriter.
    with pd.ExcelWriter(output_excel, engine='openpyxl') as writer:
        chunks_df.to_excel(writer, index=False)
  
# Example usage
text_folder = 'path_to_your_text_folder'  # replace with the folder that holds your text files
output_excel = 'text_chunks_all.xlsx'  # name of the Excel file to write
split_and_save_to_excel(text_folder=text_folder, output_excel=output_excel)