From 73548e4a437fb8d467fc03e5a3d9c71b6ec20339 Mon Sep 17 00:00:00 2001
From: Drew Minnear
Date: Mon, 21 Apr 2025 18:35:07 -0400
Subject: [PATCH] combine partitions inside each chunk for more complete context

---
 loaders/text.py | 68 ++++++++++++++++++++++++++++++++-----------------
 1 file changed, 44 insertions(+), 24 deletions(-)

diff --git a/loaders/text.py b/loaders/text.py
index 504e74b..c293ea8 100644
--- a/loaders/text.py
+++ b/loaders/text.py
@@ -39,38 +39,58 @@ def __init__(self, config: Config):
         )
 
     def load(self, paths: List[Path]) -> List[Document]:
-        """
-        Loads and splits a list of text files into structured and chunked LangChain documents.
-
-        Args:
-            paths (List[Path]): List of file paths to load.
-
-        Returns:
-            List[Document]: Chunked LangChain Document objects with metadata.
-        """
         all_chunks: List[Document] = []
 
         for path in paths:
             try:
-                logger.info("Loading and partitioning: %s", path)
+                logger.info("Partitioning %s", path)
                 elements = partition(filename=str(path), strategy="fast")
 
-                docs = [
-                    Document(
-                        page_content=element.text,
-                        metadata={
-                            "source": str(path),
-                            **(element.metadata.to_dict() if element.metadata else {}),
-                        },
+                # 1) concatenate elements until we hit ~chunk_size chars
+                buf: List[str] = []
+                buf_len = 0
+                for el in elements:
+                    if not getattr(el, "text", ""):
+                        continue
+                    t = el.text.strip()
+                    if not t:
+                        continue
+
+                    if buf_len + len(t) > self.config.chunk_size and buf:
+                        all_chunks.append(
+                            Document(
+                                page_content="\n".join(buf),
+                                metadata={"source": str(path)},
+                            )
+                        )
+                        buf, buf_len = [], 0
+
+                    buf.append(t)
+                    buf_len += len(t)
+
+                # flush remainder
+                if buf:
+                    all_chunks.append(
+                        Document(
+                            page_content="\n".join(buf),
+                            metadata={"source": str(path)},
+                        )
                     )
-                    for element in elements
-                    if hasattr(element, "text") and element.text
-                ]
-
-                chunks = self.splitter.split_documents(docs)
-                all_chunks.extend(chunks)
             except Exception as e:
                 logger.warning("Failed to load %s: %s", path, e)
 
-        return all_chunks
+        # 2) optional secondary splitter for *very* long docs
+        final_docs: List[Document] = []
+        for doc in all_chunks:
+            if len(doc.page_content) > self.config.chunk_size * 2:
+                final_docs.extend(self.splitter.split_documents([doc]))
+            else:
+                final_docs.append(doc)
+
+        logger.info(
+            "Produced %d chunks (avg %.0f chars)",
+            len(final_docs),
+            sum(len(d.page_content) for d in final_docs) / max(1, len(final_docs)),
+        )
+        return final_docs
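
For review, here is a minimal standalone sketch of the greedy buffering pass
in step 1. It assumes plain strings stand in for unstructured's partition
elements, and combine_texts is a hypothetical helper written only for
illustration; it is not part of this patch.

from typing import List


def combine_texts(texts: List[str], chunk_size: int) -> List[str]:
    # Greedily concatenate snippets until adding the next one would
    # overflow chunk_size, then start a new chunk (mirrors step 1 above).
    chunks: List[str] = []
    buf: List[str] = []
    buf_len = 0
    for raw in texts:
        t = raw.strip()
        if not t:
            continue
        if buf_len + len(t) > chunk_size and buf:
            chunks.append("\n".join(buf))
            buf, buf_len = [], 0
        buf.append(t)
        buf_len += len(t)
    if buf:  # flush the remainder, as the patch does after the loop
        chunks.append("\n".join(buf))
    return chunks


if __name__ == "__main__":
    parts = ["alpha " * 10, "beta " * 10, "gamma " * 10, ""]
    for c in combine_texts(parts, chunk_size=100):
        print(len(c), c[:24] + "...")

Running this prints three chunks of ~59 characters each: every snippet that
would push the buffer past chunk_size forces a flush first, so chunks stay
near (and never far above) the configured size unless a single element is
itself oversized.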
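
Step 2 exists because a single oversized element survives step 1 as one big
chunk. A hedged illustration of the 2 * chunk_size threshold, assuming
self.splitter is LangChain's RecursiveCharacterTextSplitter (the patch does
not show how the splitter is constructed in __init__):

from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter

chunk_size = 100  # stand-in for self.config.chunk_size
# assumption: the class's splitter is a RecursiveCharacterTextSplitter
splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=20)

# A single 300-char element passes through step 1 as one oversized chunk...
long_doc = Document(page_content="word " * 60, metadata={"source": "example.txt"})

# ...and step 2 catches it because it exceeds 2 * chunk_size.
if len(long_doc.page_content) > chunk_size * 2:
    pieces = splitter.split_documents([long_doc])
else:
    pieces = [long_doc]

print(len(pieces), [len(p.page_content) for p in pieces])

The doubled threshold avoids re-splitting chunks that only slightly exceed
chunk_size, which would otherwise undo the coalescing that step 1 just did.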