From 17b3adb61f2218901e1dce7d377fcadf822ae652 Mon Sep 17 00:00:00 2001
From: Oskar Stark
Date: Mon, 8 Sep 2025 10:49:24 +0200
Subject: [PATCH] Add constructor with configurable chunk size and overlap to TextSplitTransformer

- Add constructor to TextSplitTransformer with default values (chunkSize=1000, overlap=200)
- Add validation that overlap must be non-negative and less than chunk size
- Use constructor parameters as defaults when no options are provided in transform method
- Add comprehensive tests for constructor parameter validation
- Fix code style with PHP CS Fixer
---
 .../Transformer/TextSplitTransformer.php      | 13 ++++-
 .../Transformer/TextSplitTransformerTest.php  | 57 +++++++++++++++++++
 2 files changed, 68 insertions(+), 2 deletions(-)

diff --git a/src/store/src/Document/Transformer/TextSplitTransformer.php b/src/store/src/Document/Transformer/TextSplitTransformer.php
index 772de9bf8..2f24b976a 100644
--- a/src/store/src/Document/Transformer/TextSplitTransformer.php
+++ b/src/store/src/Document/Transformer/TextSplitTransformer.php
@@ -29,13 +29,22 @@
     public const OPTION_CHUNK_SIZE = 'chunk_size';
     public const OPTION_OVERLAP = 'overlap';
 
+    public function __construct(
+        private int $chunkSize = 1000,
+        private int $overlap = 200,
+    ) {
+        if ($this->overlap < 0 || $this->overlap >= $this->chunkSize) {
+            throw new InvalidArgumentException(\sprintf('Overlap must be non-negative and less than chunk size. Got chunk size: %d, overlap: %d', $this->chunkSize, $this->overlap));
+        }
+    }
+
     /**
      * @param array{chunk_size?: int, overlap?: int} $options
      */
     public function transform(iterable $documents, array $options = []): iterable
     {
-        $chunkSize = $options[self::OPTION_CHUNK_SIZE] ?? 1000;
-        $overlap = $options[self::OPTION_OVERLAP] ?? 200;
+        $chunkSize = $options[self::OPTION_CHUNK_SIZE] ?? $this->chunkSize;
+        $overlap = $options[self::OPTION_OVERLAP] ?? $this->overlap;
 
         if ($overlap < 0 || $overlap >= $chunkSize) {
             throw new InvalidArgumentException('Overlap must be non-negative and less than chunk size.');
diff --git a/src/store/tests/Document/Transformer/TextSplitTransformerTest.php b/src/store/tests/Document/Transformer/TextSplitTransformerTest.php
index 98d092caa..a13cfb71d 100644
--- a/src/store/tests/Document/Transformer/TextSplitTransformerTest.php
+++ b/src/store/tests/Document/Transformer/TextSplitTransformerTest.php
@@ -184,6 +184,63 @@ public function testSplitWithNegativeOverlap()
         ]));
     }
 
+    public function testConstructorWithValidParameters()
+    {
+        $transformer = new TextSplitTransformer(500, 100);
+        $document = new TextDocument(Uuid::v4(), 'short text');
+
+        $chunks = iterator_to_array($transformer->transform([$document]));
+
+        $this->assertCount(1, $chunks);
+        $this->assertSame('short text', $chunks[0]->content);
+    }
+
+    public function testConstructorWithDefaultParameters()
+    {
+        $transformer = new TextSplitTransformer();
+        $document = new TextDocument(Uuid::v4(), 'short text');
+
+        $chunks = iterator_to_array($transformer->transform([$document]));
+
+        $this->assertCount(1, $chunks);
+        $this->assertSame('short text', $chunks[0]->content);
+    }
+
+    public function testConstructorWithNegativeOverlap()
+    {
+        $this->expectException(InvalidArgumentException::class);
+        $this->expectExceptionMessage('Overlap must be non-negative and less than chunk size. Got chunk size: 1000, overlap: -1');
+
+        new TextSplitTransformer(1000, -1);
+    }
+
+    public function testConstructorWithOverlapEqualToChunkSize()
+    {
+        $this->expectException(InvalidArgumentException::class);
+        $this->expectExceptionMessage('Overlap must be non-negative and less than chunk size. Got chunk size: 500, overlap: 500');
+
+        new TextSplitTransformer(500, 500);
+    }
+
+    public function testConstructorWithOverlapGreaterThanChunkSize()
+    {
+        $this->expectException(InvalidArgumentException::class);
+        $this->expectExceptionMessage('Overlap must be non-negative and less than chunk size. Got chunk size: 100, overlap: 200');
+
+        new TextSplitTransformer(100, 200);
+    }
+
+    public function testConstructorParametersAreUsedAsDefaults()
+    {
+        $transformer = new TextSplitTransformer(150, 25);
+        $document = new TextDocument(Uuid::v4(), $this->getLongText());
+
+        $chunks = iterator_to_array($transformer->transform([$document]));
+
+        $this->assertCount(12, $chunks);
+        $this->assertSame(150, mb_strlen($chunks[0]->content));
+    }
+
     private function getLongText(): string
     {
         return trim(file_get_contents(\dirname(__DIR__, 5).'/fixtures/lorem.txt'));