In [7]:
import json
from pathlib import Path
from transformers import AutoTokenizer
from concurrent.futures import ThreadPoolExecutor, as_completed
import warnings

warnings.filterwarnings("ignore", message="Token indices sequence length is longer")

# Tải tokenizer 1 lần (ở đây dùng bert multilingual)
TOKENIZER = AutoTokenizer.from_pretrained('bert-base-multilingual-cased', use_fast=True)

def chunk_text_with_tokens(text, chunk_size=350, overlap=80, max_len=512):
    # token ids thay vì gọi tokenizer.tokenize() nhiều lần
    # use_fast=True cho tokenizer nhanh hơn
    token_ids = TOKENIZER.encode(text, add_special_tokens=False)
    chunks = []
    start = 0
    L = len(token_ids)
    while start < L:
        end = min(start + chunk_size, L)
        sub_ids = token_ids[start:end]
        if len(sub_ids) > max_len:
            sub_ids = sub_ids[:max_len]
        chunk = TOKENIZER.decode(sub_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        chunks.append(chunk)
        start += chunk_size - overlap
    return chunks

def process_doc(doc, chunk_size=350, overlap=80):
    chunks_out = []
    text_chunks = chunk_text_with_tokens(doc['content'], chunk_size, overlap)
    for i, chunk in enumerate(text_chunks):
        meta = doc.get('metadata', {})
        if meta.get('source') == 'pdf' and len(chunk.split()) > 25:
            chunk = f"[Paraphrased] {chunk[:100]}... (Cite page {meta.get('page','?')})"
        chunks_out.append({
            'content': chunk,
            'metadata': {**meta, 'chunk_id': f"{meta.get('original_id','unk')}_{i}"}
        })
    return chunks_out

def main(parallel_workers=4):
    base_dir = Path.cwd().parent if Path.cwd().name == "scripts" else Path.cwd()
    data_dir = base_dir / "data"
    input_file = data_dir / "merged_docs.json"
    output_file = data_dir / "chunked_docs.json"

    with open(input_file, 'r', encoding='utf-8') as f:
        docs = json.load(f)

    all_chunks = []
    # xử lý song song docs (mỗi doc là 1 unit)
    with ThreadPoolExecutor(max_workers=parallel_workers) as exe:
        futures = [exe.submit(process_doc, doc) for doc in docs]
        for fut in as_completed(futures):
            all_chunks.extend(fut.result())

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(all_chunks, f, ensure_ascii=False, indent=2)

    print(f"Đã tạo xong file chunk: {output_file}")

# chạy
main(parallel_workers=4)


Token indices sequence length is longer than the specified maximum sequence length for this model (574 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (693 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (579 > 512). Running this sequence through the model will result in indexing errors


Đã tạo xong file chunk: d:\RAG\data\chunked_docs.json
