### ƒê·ªçc v√† ti·ªÅn x·ª≠ l√Ω

In [17]:
# preprocess_vanhoa_pdf.py (optimized)
import re, json, unicodedata
from pathlib import Path
from tqdm import tqdm
import PyPDF2
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss

PDF_PATH = Path("../data/VanHoaVaDuLichVN.pdf")
OUT_DIR = Path("../data")
OUT_DIR.mkdir(exist_ok=True)

# ========== Cleaning improved ==========
def clean_text_advanced(s: str) -> str:
    if not s:
        return ""
    s = unicodedata.normalize("NFC", s)
    # Remove common headers/footers patterns and page numbers
    s = re.sub(r'(?mi)^(?:page|trang)\s*\d+\b', ' ', s)
    s = re.sub(r'\f', ' ', s)
    # remove leading/trailing digits sticking to text like "581Nh√†"
    s = re.sub(r'(?<=\s)\d{2,4}(?=[A-Za-z√Ä-·ªπ])', ' ', s)
    s = re.sub(r'^\s*\d{1,4}\s*', ' ', s)
    s = re.sub(r'\s*\d{1,4}\s*$', ' ', s)
    # fix hyphenation/newline breaks and many whitespace
    s = re.sub(r'(\w)-\s+(\w)', r'\1\2', s)       # foo-\nbar -> foobar
    s = re.sub(r'[\r\n\t]+', ' ', s)
    s = re.sub(r'\s+', ' ', s).strip()
    # Fix Vietnamese token fragmentation heuristics (merge 1-2 char tokens)
    s = re.sub(r'\b([a-zƒë√Ä-·ªπ]{1,2})\s+([a-zƒë√Ä-·ªπ]{1,3})\b', r'\1\2', s, flags=re.I)
    s = s.strip()
    s = s.lower()
    return s

# ========== Read PDF (PyPDF2; can swap to pdfminer) ==========
def read_pdf(pdf_path: Path):
    reader = PyPDF2.PdfReader(str(pdf_path))
    pages = []
    for i, page in enumerate(reader.pages):
        raw = page.extract_text() or ""
        cleaned = clean_text_advanced(raw)
        if cleaned:
            pages.append({"page": i+1, "text": cleaned})
    print(f"‚úÖ ƒê√£ ƒë·ªçc {len(pages)} trang.")
    return pages

# ========== Sentence split & chunk ==========
def split_sentences(text: str):
    sents = re.split(r'(?<=[\.\?\!‚Ä¶])\s+', text)
    return [s.strip() for s in sents if s.strip()]

def chunk_by_sentences(pages, max_words=300, overlap_sentences=2):
    chunks = []
    cid = 0
    for p in tqdm(pages, desc="Chunking pages"):
        sents = split_sentences(p["text"])
        i = 0
        while i < len(sents):
            cur, cnt = [], 0
            j = i
            while j < len(sents) and cnt + len(sents[j].split()) <= max_words:
                cur.append(sents[j])
                cnt += len(sents[j].split())
                j += 1
            if not cur:
                cur = [sents[i]]
                j = i + 1
            cid += 1
            text_chunk = " ".join(cur)
            chunks.append({
                "id": cid,
                "page": p["page"],
                "text": text_chunk,
                "first_sentence": cur[0] if cur else "",
                "char_len": len(text_chunk)
            })
            i = max(i + 1, j - overlap_sentences)
    print(f"‚úÖ T·ªïng chunk: {len(chunks)}")
    return chunks

# ========== Embed + FAISS (use multilingual model) ==========
def embed_and_index(chunks, model_name="paraphrase-multilingual-MiniLM-L12-v2"):
    model = SentenceTransformer(model_name)
    texts = [c["text"] for c in chunks]
    print("üîç T·∫°o embedding...")
    embeddings = model.encode(texts, show_progress_bar=True, convert_to_numpy=True).astype("float32")
    # normalize for cosine
    embeddings = embeddings / (np.linalg.norm(embeddings, axis=1, keepdims=True) + 1e-10)
    dim = embeddings.shape[1]
    index = faiss.IndexFlatIP(dim)
    index.add(embeddings)
    # Save
    faiss.write_index(index, str(OUT_DIR / "pdf_index.faiss"))
    np.save(OUT_DIR / "pdf_embeddings.npy", embeddings)
    with open(OUT_DIR / "pdf_chunks.json", "w", encoding="utf-8") as f:
        json.dump(chunks, f, ensure_ascii=False, indent=2)
    print("‚úÖ L∆∞u xong index + embeddings + chunks.")
    return index

if __name__ == "__main__":
    pages = read_pdf(PDF_PATH)
    chunks = chunk_by_sentences(pages, max_words=300, overlap_sentences=2)
    embed_and_index(chunks)


‚úÖ ƒê√£ ƒë·ªçc 1027 trang.


Chunking pages: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1027/1027 [00:00<00:00, 8318.95it/s]

‚úÖ T·ªïng chunk: 3892





üîç T·∫°o embedding...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 122/122 [00:12<00:00,  9.77it/s]


‚úÖ L∆∞u xong index + embeddings + chunks.


### Re-chunk


In [18]:

"""
M·ª•c ƒë√≠ch:
- Load ../data/pdf_chunks.json (chunk hi·ªán c√≥)
- Clean: fix hyphenation, remove page-number residues, separate digits/letters,
  tokenise VN b·∫±ng underthesea n·∫øu c√≥, post-process punctuation
- Re-chunk n·∫øu chunk qu√° d√†i (max_words configurable)
- L∆∞u pdf_chunks_cleaned.json v√† mapping old_chunk_id -> new_chunk_ids
"""

import re, json, unicodedata
from pathlib import Path
from tqdm import tqdm

# config
IN_PATH = Path("../data/pdf_chunks.json")
OUT_PATH = Path("../data/pdf_chunks_cleaned.json")
MAPPING_PATH = Path("../data/pdf_chunks_mapping.json")
MAX_WORDS = 200          # t·ªëi ƒëa t·ª´ 1 chunk sau clean
OVERLAP_SENTENCES = 1    # overlap (s·ªë c√¢u)
KEEP_ORIGINAL_PAGE = True

# try underthesea
try:
    from underthesea import word_tokenize, sent_tokenize
    HAS_UT = True
except Exception:
    HAS_UT = False
    print("[WARN] underthesea kh√¥ng kh·∫£ d·ª•ng ‚Äî tokenization fallback d√πng regex.")

# -----------------------------
# cleaning function for chunk text
# -----------------------------
def clean_chunk_text(s: str) -> str:
    if not s: return ""
    s = unicodedata.normalize("NFC", s)

    # remove stray form feeds
    s = s.replace('\f', ' ')

    # remove leading/trailing page numbers or isolated numbers at line starts
    s = re.sub(r'(?m)^\s*\d{1,4}\s*', ' ', s)
    s = re.sub(r'\s*\d{1,4}\s*$', ' ', s)

    # fix hyphenation broken across line breaks or spaces: "th - ·ªù" / "foo-\nbar"
    s = re.sub(r'(\w)-\s+(\w)', r'\1\2', s)
    s = s.replace('\u2013', '-').replace('\u2014', '-')

    # isolate punctuation so tokenizer can handle consistently
    s = re.sub(r'([,.;:!?()"‚Äú‚Äù¬´¬ª\[\]])', r' \1 ', s)

    # separate digits stuck to letters (e.g., "1877nƒÉm" -> "1877 nƒÉm")
    s = re.sub(r'([0-9])([^\s0-9\W])', r'\1 \2', s)
    s = re.sub(r'([^\s0-9\W])([0-9])', r'\1 \2', s)

    # collapse whitespace
    s = re.sub(r'\s+', ' ', s).strip()

    # Vietnamese tokenization: prefer underthesea.word_tokenize (format="text")
    if HAS_UT:
        try:
            s_tok = word_tokenize(s, format="text")
        except Exception:
            s_tok = s
    else:
        s_tok = s

    # Post-process punctuation spacing
    s_tok = re.sub(r'\s+([,.;:!?()\[\]"])', r'\1', s_tok)
    s_tok = re.sub(r'\s+', ' ', s_tok).strip()

    # lowercase optional (keep lowercase for embedding)
    s_tok = s_tok.lower()

    return s_tok

# -----------------------------
# sentence splitter (fallback to regex if underthesea not available)
# -----------------------------
def split_sentences(text: str):
    if HAS_UT:
        try:
            sents = sent_tokenize(text)
            # underthesea.sent_tokenize returns list
            if isinstance(sents, list) and len(sents) > 0:
                return [s.strip() for s in sents if s.strip()]
        except Exception:
            pass
    # fallback simple regex split on punctuation
    sents = re.split(r'(?<=[\.\?\!‚Ä¶])\s+', text)
    return [s.strip() for s in sents if s.strip()]

# -----------------------------
# re-chunk sentences into chunks (max_words, overlap_sentences)
# -----------------------------
def re_chunk_from_text(text: str, page: int, source_chunk_id: int, start_new_cid):
    sents = split_sentences(text)
    new_chunks = []
    cid = start_new_cid
    i = 0
    while i < len(sents):
        cur = []
        cnt = 0
        j = i
        while j < len(sents) and cnt + len(sents[j].split()) <= MAX_WORDS:
            cur.append(sents[j])
            cnt += len(sents[j].split())
            j += 1
        if not cur:
            cur = [sents[i]]
            j = i + 1
        cid += 1
        chunk_text = " ".join(cur)
        new_chunks.append({
            "id": cid,
            "page": page,
            "text": chunk_text,
            "first_sentence": cur[0] if cur else "",
            "char_len": len(chunk_text),
            "source_chunk_id": source_chunk_id
        })
        i = max(i + 1, j - OVERLAP_SENTENCES)
    return new_chunks, cid

# -----------------------------
# main process
# -----------------------------
def main():
    assert IN_PATH.exists(), f"Kh√¥ng t√¨m th·∫•y {IN_PATH}"
    raw_chunks = json.load(open(IN_PATH, 'r', encoding='utf-8'))
    cleaned_chunks = []
    mapping = {}  # old_id -> list of new_ids

    next_cid = 0
    for c in tqdm(raw_chunks, desc="Processing chunks"):
        old_id = c.get("id")
        page = c.get("page", None)
        text = c.get("text", "")
        # 1) clean text
        cleaned = clean_chunk_text(text)
        # 2) re-chunk cleaned text to ensure chunks not too long and sentences intact
        new_chunks, next_cid = re_chunk_from_text(cleaned, page if KEEP_ORIGINAL_PAGE else None, old_id, next_cid)
        # append and map
        cleaned_chunks.extend(new_chunks)
        mapping[str(old_id)] = [nc["id"] for nc in new_chunks]

    # save
    json.dump(cleaned_chunks, open(OUT_PATH, "w", encoding='utf-8'), ensure_ascii=False, indent=2)
    json.dump(mapping, open(MAPPING_PATH, "w", encoding='utf-8'), ensure_ascii=False, indent=2)

    print(f"Done. Saved cleaned chunks: {OUT_PATH} (count={len(cleaned_chunks)})")
    print(f"Mapping saved: {MAPPING_PATH}")

if __name__ == "__main__":
    main()


Processing chunks: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3892/3892 [00:52<00:00, 74.24it/s] 


Done. Saved cleaned chunks: ..\data\pdf_chunks_cleaned.json (count=7688)
Mapping saved: ..\data\pdf_chunks_mapping.json


### X·ª≠ l√Ω pdf_chunks_cleaned.json s·∫°ch s·∫Ω

In [1]:
# X·ª≠ l√Ω chunks ƒë·ªÉ chu·∫©n b·ªã cho embedding.

import json
import os
import re
import statistics
from collections import Counter

# ===== C·∫§U H√åNH =====
IN_PATH  = "../data/pdf_chunks_cleaned.json"
OUT_PATH = "../data/pdf_chunks_cleaned_for_embed.json"

SHORT_DROP_CHARLEN = 50      # lo·∫°i chunk < N k√Ω t·ª±
MERGE_TARGET_LEN    = 250    # g·ªôp t·ªõi khi >= N k√Ω t·ª±

# ===== H√ÄM H·ªñ TR·ª¢ =====
def normalize_text(t: str) -> str:
    return re.sub(r"\s+", " ", t.strip().lower())

def load_json(path):
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

def save_json(obj, path):
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(obj, f, ensure_ascii=False, indent=2)

# ===== N·∫†P D·ªÆ LI·ªÜU =====
chunks = load_json(IN_PATH)
n_orig = len(chunks)

# ƒê·∫£m b·∫£o tr∆∞·ªùng text v√† char_len
for c in chunks:
    if "text" not in c:
        c["text"] = ""
    if "char_len" not in c or not isinstance(c["char_len"], int):
        c["char_len"] = len(c["text"])

lengths = [c["char_len"] for c in chunks] if chunks else [0]

# ===== TH·ªêNG K√ä TR∆Ø·ªöC =====
exact_counts = Counter([c["text"].strip() for c in chunks])
exact_dup_count = sum(1 for t,cnt in exact_counts.items() if cnt>1)

norm_counts = Counter([normalize_text(c["text"]) for c in chunks])
norm_dup_count = sum(1 for t,cnt in norm_counts.items() if cnt>1)

underscore_pct = sum(1 for c in chunks if "_" in c["text"]) / n_orig * 100 if n_orig else 0

# ===== 1) L·ªåC CHUNK QU√Å NG·∫ÆN =====
filtered = [c for c in chunks if c["char_len"] >= SHORT_DROP_CHARLEN]

# ===== 2) DEDUPE (normalized) =====
seen = {}
uniq = []
for idx, c in enumerate(filtered):
    key = normalize_text(c["text"])
    if key in seen:
        continue
    seen[key] = idx
    new = dict(c)
    new["_orig_index"] = idx
    uniq.append(new)

# ===== 3) G·ªòP CHUNK NG·∫ÆN LI·ªÄN K·ªÄ c√πng source_chunk_id =====
merged = []
i = 0
while i < len(uniq):
    cur = uniq[i]
    cur_text = cur["text"]
    cur_len = cur.get("char_len", len(cur_text))
    src = cur.get("source_chunk_id")
    if cur_len < MERGE_TARGET_LEN:
        j = i + 1
        merged_text = cur_text
        merged_len = cur_len
        while j < len(uniq) and merged_len < MERGE_TARGET_LEN and uniq[j].get("source_chunk_id") == src:
            merged_text = merged_text.rstrip() + " " + uniq[j]["text"].lstrip()
            merged_len = len(merged_text)
            j += 1
        new = dict(cur)
        new["text"] = merged_text
        new["char_len"] = merged_len
        merged.append(new)
        i = j
    else:
        merged.append(cur)
        i += 1

# ===== 4) DEDUPE CU·ªêI & T·∫†O text_for_embed =====
final = []
seen_final = set()
for c in merged:
    key = normalize_text(c["text"])
    if key in seen_final:
        continue
    seen_final.add(key)
    c["text_for_embed"] = c["text"].replace("_", " ")
    final.append(c)

# ===== L∆ØU K·∫æT QU·∫¢ =====
save_json(final, OUT_PATH)

# ===== B√ÅO C√ÅO IN RA =====
lengths_all = [c["char_len"] for c in final] if final else [0]
median_len = statistics.median(lengths_all)
mean_len = statistics.mean(lengths_all)
min_len = min(lengths_all)
max_len = max(lengths_all)

very_long_count = sum(1 for L in lengths_all if L > 2000)

buckets = {
    "0-49": sum(1 for L in lengths_all if 0 <= L <= 49),
    "50-99": sum(1 for L in lengths_all if 50 <= L <= 99),
    "100-199": sum(1 for L in lengths_all if 100 <= L <= 199),
    "200-399": sum(1 for L in lengths_all if 200 <= L <= 399),
    "400-799": sum(1 for L in lengths_all if 400 <= L <= 799),
    "800-1999": sum(1 for L in lengths_all if 800 <= L <= 1999),
    "2000+": very_long_count
}

print("=== B√ÅO C√ÅO T√ìM T·∫ÆT ===")
print("Chunks ban ƒë·∫ßu:", n_orig)
print("Sau lo·∫°i <{} k√Ω t·ª±:".format(SHORT_DROP_CHARLEN), len(filtered))
print("Sau dedupe (pre-merge):", len(uniq))
print("Sau g·ªôp:", len(merged))
print("Chunks cu·ªëi c√πng (ƒë√£ dedupe):", len(final))
print("Exact duplicate (g·ªëc):", exact_dup_count)
print("Normalized duplicate (g·ªëc):", norm_dup_count)
print("Ph·∫ßn trƒÉm chunk c√≥ d·∫•u g·∫°ch d∆∞·ªõi (_): {:.2f}%".format(underscore_pct))
print("ƒê·ªô d√†i (min/median/mean/max):", f"{min_len}/{median_len:.1f}/{mean_len:.1f}/{max_len}")
print("Bucket ƒë·ªô d√†i:", buckets)
print("S·ªë chunk qu√° d√†i (>2000):", very_long_count)
print("File ƒë·∫ßu ra:", OUT_PATH)


=== B√ÅO C√ÅO T√ìM T·∫ÆT ===
Chunks ban ƒë·∫ßu: 7688
Sau lo·∫°i <50 k√Ω t·ª±: 6328
Sau dedupe (pre-merge): 5320
Sau g·ªôp: 5165
Chunks cu·ªëi c√πng (ƒë√£ dedupe): 5165
Exact duplicate (g·ªëc): 851
Normalized duplicate (g·ªëc): 851
Ph·∫ßn trƒÉm chunk c√≥ d·∫•u g·∫°ch d∆∞·ªõi (_): 92.60%
ƒê·ªô d√†i (min/median/mean/max): 50/315.0/491.2/3917
Bucket ƒë·ªô d√†i: {'0-49': 0, '50-99': 675, '100-199': 1105, '200-399': 1394, '400-799': 894, '800-1999': 1010, '2000+': 87}
S·ªë chunk qu√° d√†i (>2000): 87
File ƒë·∫ßu ra: ../data/pdf_chunks_cleaned_for_embed.json


### Ki·ªÉm tra chunks d√†i v√† replace "_"

In [2]:
# X·ª≠ l√Ω: t√°ch chunk qu√° d√†i (>2000), replace "_" -> " " cho text_for_embed, in m·∫´u ki·ªÉm tra.
import json, os, re
from typing import List

IN_PATH  = "../data/pdf_chunks_cleaned_for_embed.json"   # file hi·ªán t·∫°i
OUT_PATH = "../data/pdf_chunks_cleaned_for_embed_v3.json"
MAX_PIECE_LEN = 1000      # m·ªói ph·∫ßn sau t√°ch t·ªëi ƒëa ~1000 k√Ω t·ª± (ch·ªânh ƒë∆∞·ª£c)
LONG_THRESHOLD = 2000     # chunk > threshold s·∫Ω b·ªã t√°ch
SAMPLE_COUNT = 10

# --- h√†m t√°ch theo c√¢u; n·∫øu c√¢u v·∫´n qu√° d√†i d√πng split theo whitespace
SENTENCE_SPLIT_RE = re.compile(r'(?<=[\.\?\!\‡•§\u3002])\s+|\n+')

def split_long_text(text: str, max_len: int) -> List[str]:
    pieces = []
    # t√°ch theo c√¢u
    sents = [s.strip() for s in SENTENCE_SPLIT_RE.split(text) if s.strip()]
    cur = ""
    for s in sents:
        if len(cur) + 1 + len(s) <= max_len:
            cur = (cur + " " + s).strip() if cur else s
        else:
            if cur:
                pieces.append(cur)
            # n·∫øu c√¢u ƒë∆°n d√†i h∆°n max_len, split by whitespace
            if len(s) <= max_len:
                cur = s
            else:
                words = s.split()
                buf = ""
                for w in words:
                    if len(buf) + 1 + len(w) <= max_len:
                        buf = (buf + " " + w).strip() if buf else w
                    else:
                        if buf:
                            pieces.append(buf)
                        buf = w
                if buf:
                    cur = buf
                else:
                    cur = ""
    if cur:
        pieces.append(cur)
    # n·∫øu kh√¥ng t√°ch ƒë∆∞·ª£c c√¢u (v√≠ d·ª• text kh√¥ng c√≥ d·∫•u c√¢u), fallback split by whitespace
    if not pieces:
        tokens = text.split()
        buf = ""
        for t in tokens:
            if len(buf) + 1 + len(t) <= max_len:
                buf = (buf + " " + t).strip() if buf else t
            else:
                pieces.append(buf)
                buf = t
        if buf:
            pieces.append(buf)
    return pieces

# --- t·∫£i file
with open(IN_PATH, "r", encoding="utf-8") as f:
    chunks = json.load(f)

n_orig = len(chunks)

# --- th·ªëng k√™ v√† l·∫•y m·∫´u tr∆∞·ªõc x·ª≠ l√Ω
underscore_chunks = [c for c in chunks if "_" in c.get("text","")]
long_chunks = [c for c in chunks if c.get("char_len", len(c.get("text",""))) > LONG_THRESHOLD]

print(f"Chunks ban ƒë·∫ßu: {n_orig}")
print(f"Chunk c√≥ '_' : {len(underscore_chunks)}")
print(f"Chunk > {LONG_THRESHOLD}: {len(long_chunks)}\n")

print("=== V√≠ d·ª• 10 chunk ch·ª©a '_' (tr√≠ch ƒëo·∫°n ƒë·∫ßu 200 k√Ω t·ª±) ===")
for c in underscore_chunks[:SAMPLE_COUNT]:
    print("-", repr(c["text"][:200]))

print("\n=== V√≠ d·ª• 10 chunk qu√° d√†i (tr∆∞·ªõc t√°ch, tr√≠ch 200 k√Ω t·ª± ƒë·∫ßu) ===")
for c in long_chunks[:SAMPLE_COUNT]:
    print("-", repr(c["text"][:200]))

# --- x·ª≠ l√Ω t√°ch
out_chunks = []
count_split = 0
for c in chunks:
    text = c.get("text","")
    char_len = c.get("char_len", len(text))
    if char_len > LONG_THRESHOLD:
        parts = split_long_text(text, MAX_PIECE_LEN)
        # n·∫øu split tr·∫£ v·ªÅ 1 ph·∫ßn (kh√¥ng t√°ch ƒë∆∞·ª£c) v·∫´n push l·∫°i
        if len(parts) == 1:
            new = dict(c)
            new["text"] = parts[0]
            new["char_len"] = len(parts[0])
            new["text_for_embed"] = parts[0].replace("_"," ")
            out_chunks.append(new)
        else:
            # t·∫°o c√°c chunk m·ªõi, preserve metadata, g√°n source_chunk_id_partN
            base_id = c.get("source_chunk_id", "unknown")
            for i, p in enumerate(parts, start=1):
                new = dict(c)
                new["text"] = p
                new["char_len"] = len(p)
                new["source_chunk_id"] = f"{base_id}_part{i}"
                new["text_for_embed"] = p.replace("_"," ")
                out_chunks.append(new)
            count_split += 1
    else:
        new = dict(c)
        new["text_for_embed"] = text.replace("_"," ")
        out_chunks.append(new)

# --- sau x·ª≠ l√Ω: in 10 v√≠ d·ª• chunk ƒë√£ t√°ch
after_long_chunks = [c for c in out_chunks if c.get("char_len",0) > LONG_THRESHOLD]
print(f"\nS·ªë chunk ban ƒë·∫ßu >{LONG_THRESHOLD}: {len(long_chunks)}")
print(f"S·ªë chunk v·∫´n >{LONG_THRESHOLD} sau t√°ch: {len(after_long_chunks)} (n√™n =0)")

print("\n=== V√≠ d·ª• 10 chunk ƒë√£ t√°ch (tr√≠ch 200 k√Ω t·ª± ƒë·∫ßu) ===")
examples = [c for c in out_chunks if c.get("char_len",0) <= MAX_PIECE_LEN][:SAMPLE_COUNT]
for c in examples[:SAMPLE_COUNT]:
    print("-", repr(c["text"][:200]))

# --- l∆∞u file
os.makedirs(os.path.dirname(OUT_PATH), exist_ok=True)
with open(OUT_PATH, "w", encoding="utf-8") as f:
    json.dump(out_chunks, f, ensure_ascii=False, indent=2)

# --- b√°o c√°o cu·ªëi
lens = [c.get("char_len",0) for c in out_chunks]
print("\n=== B√ÅO C√ÅO CU·ªêI ===")
print("T·ªïng chunk sau x·ª≠ l√Ω:", len(out_chunks))
print("S·ªë b·∫£n ghi t√°ch (original chunks c√≥ > LONG_THRESHOLD):", count_split)
print("Min/Median/Mean/Max k√Ω t·ª±:",
      min(lens), 
      sorted(lens)[len(lens)//2] if lens else 0,
      sum(lens)/len(lens) if lens else 0,
      max(lens) if lens else 0)
print("File ƒë·∫ßu ra:", OUT_PATH)


Chunks ban ƒë·∫ßu: 5165
Chunk c√≥ '_' : 5133
Chunk > 2000: 87

=== V√≠ d·ª• 10 chunk ch·ª©a '_' (tr√≠ch ƒëo·∫°n ƒë·∫ßu 200 k√Ω t·ª±) ===
- 'vi·ªát_nam vƒÉn_h√≥a v√† du_l·ªãch tr·∫ßn m·∫°nh_th∆∞·ªùng bi√™n_so·∫°n nguy·ªÖn_minh ti·∫øn_hi·ªáu ƒë√≠nh ph√°t_h√†nh theo th·ªèa_thu·∫≠n gi·ªØa c√¥ng_ty vƒÉn_h√≥a h∆∞∆°ng_trang v√† t√°c_gi·∫£. nghi√™m_c·∫•m m·ªçi s·ª± sao_ch√©p, tr√≠ch_d·ªãch ho·∫∑c in '
- 'no part of_this book may be reproduced byany means without prior written permission from the publisher.'
- 'all rights reserved. no part of_this book may be reproduced byany means without prior written permission from the publisher.'
- 'no part of_this book may be reproduced by any means without prior written permission from the publisher.'
- 'tr·∫ßn m·∫°nh_th∆∞·ªùng bi√™n_so·∫°n nguy·ªÖn_minh ti·∫øn_hi·ªáu ƒë√≠nh vi·ªát_nam vƒÉn_h√≥a v√† du_l·ªãch nh√†_xu·∫•t_b·∫£n th√¥ng_t·∫•n h√†_n·ªôi vi·ªát_nam vƒÉn_h√≥a v√†du l·ªãch nh√†_xu·∫•t_b·∫£n th√¥ng_t·∫•n'
- 'l·ªùi nh√†_xu·∫•t_b·∫£n vi·ªát_nam l√