# In [None]:
import os
import re
from pathlib import Path
from tqdm import tqdm
import pandas as pd
import tiktoken
import numpy as np

# Character-length limits used when assembling chunks.
# NOTE(review): TARGET_CHAR is not referenced anywhere in the visible code —
# presumably the intended typical chunk size; confirm it is still needed.
TARGET_CHAR = 500
# Chunks whose text is shorter than this are dropped by split_text_into_chunks.
MIN_CHAR = 200
# A chunk is closed before a section is added that would push it past this size.
MAX_CHAR = 800

# Tokenizer used to count tokens per chunk (tiktoken "cl100k_base" encoding).
enc = tiktoken.get_encoding("cl100k_base")

def split_text_into_chunks(md_text: str, pdf_name: str, *, min_char=None, max_char=None) -> list:
    """Split Markdown text into size-bounded chunks tagged with a page number.

    Lines are gathered into *sections*: a run of Markdown table rows is kept
    together as one section, and ordinary text is cut after a line that ends
    with sentence punctuation (``. ! ? 。 ！ ？``). Sections are then packed
    into chunks; a chunk is closed as soon as adding the next section would
    push its character count past ``max_char``. A single section longer than
    ``max_char`` is never split and becomes an oversized chunk of its own.

    ``## Page: N`` heading lines update the current page. Each chunk is
    labelled with the page on which its first non-blank line appeared (the
    previous implementation used the page of whatever line happened to
    trigger the emit, which mislabelled chunks at page boundaries).

    Args:
        md_text: Markdown source to split.
        pdf_name: Value stored in every chunk's ``"pdf_name"`` field.
        min_char: Optional override (int) for the minimum chunk length;
            shorter chunks are discarded. Defaults to module-level ``MIN_CHAR``.
        max_char: Optional override (int) for the size at which a chunk is
            closed. Defaults to module-level ``MAX_CHAR``.

    Returns:
        A list of dicts with keys ``"pdf_name"``, ``"page_number"`` and
        ``"chunk_text"``, in document order.
    """
    # Resolve limits at call time so the module constants remain the defaults.
    min_len = MIN_CHAR if min_char is None else min_char
    max_len = MAX_CHAR if max_char is None else max_char

    # Compile once per call instead of re-parsing the patterns for every line.
    page_re = re.compile(r"^## Page: (\d+)")
    table_row_re = re.compile(r"^\|.*\|$")
    table_rule_re = re.compile(r"^[-| ]+$")
    sentence_end_re = re.compile(r"[.!?。！？]$")

    chunks = []
    current_chunk = []        # sections accumulated for the chunk being built
    current_char_count = 0    # sum of section lengths (joining newlines not counted)
    chunk_page = 1            # page on which the current chunk started
    buffer = []               # lines of the section being built
    buffer_page = None        # page of the first non-blank line in `buffer`
    inside_table = False
    current_page = 1

    def emit_chunk():
        """Append the accumulated sections as one chunk and reset the accumulator."""
        nonlocal current_char_count
        chunks.append({
            "pdf_name": pdf_name,
            "page_number": chunk_page,
            "chunk_text": "\n".join(current_chunk).strip()
        })
        current_chunk.clear()
        current_char_count = 0

    def buffer_line(text_line):
        """Add a line to the pending section, remembering where its content began."""
        nonlocal buffer_page
        # Blank separator lines must not pin the section to the previous page.
        if text_line and buffer_page is None:
            buffer_page = current_page
        buffer.append(text_line)

    def flush_buffer():
        """Move the pending section into the current chunk, closing the chunk first if full."""
        nonlocal current_char_count, chunk_page, buffer_page
        section = "\n".join(buffer).strip()
        section_page = buffer_page
        # Always reset, even when the buffer held only blank lines.
        buffer.clear()
        buffer_page = None
        if not section:
            return
        if current_chunk and current_char_count + len(section) > max_len:
            emit_chunk()
        if not current_chunk:
            chunk_page = section_page
        current_chunk.append(section)
        current_char_count += len(section)

    for raw_line in md_text.splitlines():
        line = raw_line.rstrip()
        page_match = page_re.match(line)
        if page_match:
            current_page = int(page_match.group(1))

        if table_row_re.match(line):
            inside_table = True
            buffer_line(line)
        elif inside_table and (line.startswith("|") or table_rule_re.match(line)):
            # Continuation rows / separator rules stay with the table.
            buffer_line(line)
        else:
            if inside_table:
                # The table just ended: keep it as one section.
                flush_buffer()
                inside_table = False
            buffer_line(line)
            if sentence_end_re.search(line):
                flush_buffer()

    flush_buffer()
    if current_chunk:
        emit_chunk()

    return [chunk for chunk in chunks if len(chunk["chunk_text"]) >= min_len]

def process_md_folder(input_dir, output_dir):
    """Chunk every ``*.md`` file in a folder and write per-file and summary CSVs.

    For each Markdown file directly inside ``input_dir`` the text is split
    with ``split_text_into_chunks`` and written to ``<stem>.csv`` in
    ``output_dir`` with one row per chunk (name, 1-based chunk id, page
    number, character count, token count, content). A
    ``chunking_summary.csv`` with per-file totals and mean/std statistics is
    written to the same directory.

    Args:
        input_dir: Directory containing the Markdown files (str or Path).
        output_dir: Directory for the CSV output; created if missing.
    """
    input_dir = Path(input_dir)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Sorted so processing order and summary row order are reproducible.
    md_files = sorted(input_dir.glob("*.md"))
    summary_data = []

    # Explicit column lists guarantee a header row even when there is no data
    # (an empty DataFrame would otherwise be written without any columns).
    chunk_columns = [
        "pdf_name",
        "chunk_id",
        "page_number",
        "character_count",
        "token_count",
        "content",
    ]
    summary_columns = [
        "pdf_name",
        "total_chunks",
        "total_characters",
        "total_tokens",
        "avg_chars",
        "std_chars",
        "avg_tokens",
        "std_tokens",
        "precision",
        "recall",
        "f1_score",
    ]

    print(f"📁 处理 {len(md_files)} 个 Markdown 文件...")

    for md_file in tqdm(md_files):
        # Same name is used in the chunk rows and in the summary so the two
        # outputs can be joined on "pdf_name".
        pdf_name = md_file.stem + ".pdf"
        text = md_file.read_text(encoding="utf-8")
        chunks = split_text_into_chunks(text, pdf_name)

        csv_rows = []
        char_counts = []
        token_counts = []

        for i, chunk in enumerate(chunks, 1):
            chunk_text = chunk["chunk_text"]
            char_count = len(chunk_text)
            token_count = len(enc.encode(chunk_text))
            char_counts.append(char_count)
            token_counts.append(token_count)

            csv_rows.append({
                "pdf_name": chunk["pdf_name"],
                "chunk_id": i,
                "page_number": chunk["page_number"],
                "character_count": char_count,
                "token_count": token_count,
                "content": chunk_text
            })

        df = pd.DataFrame(csv_rows, columns=chunk_columns)
        df.to_csv(output_dir / f"{md_file.stem}.csv", index=False, encoding="utf-8")

        summary_data.append({
            "pdf_name": pdf_name,
            "total_chunks": len(chunks),
            "total_characters": sum(char_counts),
            "total_tokens": sum(token_counts),
            # Guard against empty lists: np.mean/np.std of [] yields NaN with a warning.
            "avg_chars": np.mean(char_counts) if char_counts else 0,
            "std_chars": np.std(char_counts) if char_counts else 0,
            "avg_tokens": np.mean(token_counts) if token_counts else 0,
            "std_tokens": np.std(token_counts) if token_counts else 0,
            "precision": None,  # placeholder
            "recall": None,     # placeholder
            "f1_score": None    # placeholder
        })

    # Save the overall summary.
    summary_df = pd.DataFrame(summary_data, columns=summary_columns)
    summary_df.to_csv(output_dir / "chunking_summary.csv", index=False, encoding="utf-8")

    print(f"✅ 所有 chunking 完成，CSV 输出目录：{output_dir}")
    print(f"📊 汇总表格：{output_dir/'chunking_summary.csv'}")


if __name__ == "__main__":
    # Script entry point: chunk the Markdown in "output_md" and write the
    # resulting CSVs to "Gradient_Chunks_CSV_withpages".
    process_md_folder(
        input_dir="output_md",
        output_dir="Gradient_Chunks_CSV_withpages",
    )
