In [None]:
import sys
sys.path.append("..")
import tiktoken
from typing import List
from scripts.chunkingAlgorithm import HierarchicalChunker
from scripts.chunkingAlgorithm import merge_text
from scripts.config import EXTRACTED_DATA_PATH
from scripts.config import CHUNKS_PATH

# Tokenizer encoding that is handed to the chunker as its `model` argument.
cl100k_base = tiktoken.get_encoding("cl100k_base")

# Chunker instance configured with max_tokens=500.
chunker = HierarchicalChunker(
    model=cl100k_base,
    max_tokens=500,
)

In [None]:
import json
from pathlib import Path
from tqdm import tqdm

def process_directory(input_dir: str, output_dir: str, chunker: HierarchicalChunker) -> None:
    """Chunk every ``*.json`` file in ``input_dir`` and save the result in ``output_dir``.

    Each input file is parsed as JSON and passed to ``chunker.chunk``. The
    resulting chunks are written as a JSON list to ``output_dir`` under the
    same base name as the input. Every record in that list has the keys
    ``file_source``, ``page_numbers``, ``chunk_number`` and ``text``.

    Processing is best-effort: a failure on one file is printed and the loop
    continues with the next file.

    Note: output files reuse the input base name, so pointing ``input_dir``
    and ``output_dir`` at the same directory overwrites the inputs.

    Args:
        input_dir: Directory scanned (non-recursively) for ``*.json`` files.
        output_dir: Directory receiving the chunk files; created if missing.
        chunker: Object whose ``chunk(data)`` yields mappings that have a
            ``"content"`` entry and, optionally, a ``"page_numbers"`` entry.
    """
    input_path = Path(input_dir)
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # Sort so the processing order (and therefore the progress bar and the
    # error log) is reproducible; glob() order depends on the filesystem.
    json_files = sorted(input_path.glob("*.json"))

    for file in tqdm(json_files, desc="Processing files"):
        try:
            with open(file, "r", encoding="utf-8") as f:
                structured_data = json.load(f)

            chunks = chunker.chunk(structured_data)

            file_data = []
            for i, chunk in enumerate(chunks):
                merged_text = merge_text(chunk["content"])
                file_data.append(
                    {
                        "file_source": file.name,
                        "page_numbers": chunk.get("page_numbers", []),
                        "chunk_number": i,
                        "text": merged_text,
                    }
                )

            # Serialize completely BEFORE opening the output file. Opening in
            # "w" mode truncates immediately, so dumping straight into the
            # handle would destroy a previous good result (and leave a partial
            # JSON file behind) whenever serialization fails midway.
            serialized = json.dumps(file_data, ensure_ascii=False, indent=2)

            output_file = output_path / f"{file.stem}.json"
            with open(output_file, "w", encoding="utf-8") as out:
                out.write(serialized)

        except Exception as e:
            # Deliberately broad: one bad file must not abort the whole batch.
            print(f"Error processing {file.name}: {e}")


In [None]:
# Build a chunker (same arguments as the one created in the setup cell) and
# run every extracted JSON file through it, writing results to CHUNKS_PATH.
chunker = HierarchicalChunker(
    model=cl100k_base,
    max_tokens=500,
)
process_directory(
    input_dir=EXTRACTED_DATA_PATH,
    output_dir=CHUNKS_PATH,
    chunker=chunker,
)