diff --git a/examples/document/vectorizing-text-documents.php b/examples/document/vectorizing-text-documents.php index 24e74d094..a54f658b9 100644 --- a/examples/document/vectorizing-text-documents.php +++ b/examples/document/vectorizing-text-documents.php @@ -26,6 +26,6 @@ ]; $vectorizer = new Vectorizer($platform, 'text-embedding-3-large'); -$vectorDocuments = $vectorizer->vectorizeEmbeddableDocuments($textDocuments); +$vectorDocuments = $vectorizer->vectorize($textDocuments); dump(array_map(fn (VectorDocument $document) => $document->vector->getDimensions(), $vectorDocuments)); diff --git a/src/store/src/Document/Vectorizer.php b/src/store/src/Document/Vectorizer.php index f8e08f65e..3cb662c3b 100644 --- a/src/store/src/Document/Vectorizer.php +++ b/src/store/src/Document/Vectorizer.php @@ -27,7 +27,129 @@ public function __construct( ) { } - public function vectorizeEmbeddableDocuments(array $documents, array $options = []): array + public function vectorize(string|\Stringable|EmbeddableDocumentInterface|array $values, array $options = []): Vector|VectorDocument|array + { + if (\is_string($values) || $values instanceof \Stringable) { + return $this->vectorizeString($values, $options); + } + + if ($values instanceof EmbeddableDocumentInterface) { + return $this->vectorizeEmbeddableDocument($values, $options); + } + + if ([] === $values) { + return []; + } + + $firstElement = reset($values); + if ($firstElement instanceof EmbeddableDocumentInterface) { + $this->validateArray($values, EmbeddableDocumentInterface::class); + + return $this->vectorizeEmbeddableDocuments($values, $options); + } + + if (\is_string($firstElement) || $firstElement instanceof \Stringable) { + $this->validateArray($values, 'string|stringable'); + + return $this->vectorizeStrings($values, $options); + } + + throw new RuntimeException('Array must contain only strings, Stringable objects, or EmbeddableDocumentInterface instances.'); + } + + /** + * @param array $values + */ + private function 
validateArray(array $values, string $expectedType): void + { + foreach ($values as $value) { + if ('string|stringable' === $expectedType) { + if (!\is_string($value) && !$value instanceof \Stringable) { + throw new RuntimeException('Array must contain only strings or Stringable objects.'); + } + } elseif (!$value instanceof $expectedType) { + throw new RuntimeException(\sprintf('Array must contain only "%s" instances.', $expectedType)); + } + } + } + + /** + * @param array $options + */ + private function vectorizeString(string|\Stringable $string, array $options = []): Vector + { + $stringValue = (string) $string; + $this->logger->debug('Vectorizing string', ['string' => $stringValue]); + + $result = $this->platform->invoke($this->model, $stringValue, $options); + $vectors = $result->asVectors(); + + if (!isset($vectors[0])) { + throw new RuntimeException('No vector returned for string vectorization.'); + } + + return $vectors[0]; + } + + /** + * @param array $options + */ + private function vectorizeEmbeddableDocument(EmbeddableDocumentInterface $document, array $options = []): VectorDocument + { + $this->logger->debug('Vectorizing embeddable document', ['document_id' => $document->getId()]); + + $vector = $this->vectorizeString($document->getContent(), $options); + + return new VectorDocument($document->getId(), $vector, $document->getMetadata()); + } + + /** + * @param array $strings + * @param array $options + * + * @return array + */ + private function vectorizeStrings(array $strings, array $options = []): array + { + $stringCount = \count($strings); + $this->logger->info('Starting vectorization of strings', ['string_count' => $stringCount]); + + // Convert all values to strings + $stringValues = array_map(fn (string|\Stringable $s) => (string) $s, $strings); + + if ($this->platform->getModelCatalog()->getModel($this->model)->supports(Capability::INPUT_MULTIPLE)) { + $this->logger->debug('Using batch vectorization with model that supports multiple inputs'); + 
$result = $this->platform->invoke($this->model, $stringValues, $options); + + $vectors = $result->asVectors(); + $this->logger->debug('Batch vectorization completed', ['vector_count' => \count($vectors)]); + } else { + $this->logger->debug('Using sequential vectorization for model without multiple input support'); + $results = []; + foreach ($stringValues as $i => $string) { + $this->logger->debug('Vectorizing string', ['string_index' => $i]); + $results[] = $this->platform->invoke($this->model, $string, $options); + } + + $vectors = []; + foreach ($results as $result) { + $vectors = array_merge($vectors, $result->asVectors()); + } + $this->logger->debug('Sequential vectorization completed', ['vector_count' => \count($vectors)]); + } + + $this->logger->info('Vectorization process completed', ['string_count' => $stringCount, 'vector_count' => \count($vectors)]); + + return $vectors; + } + + /** + * @param array $documents + * @param array $options + * + * @return array + */ + private function vectorizeEmbeddableDocuments(array $documents, array $options = []): array { $documentCount = \count($documents); $this->logger->info('Starting vectorization process', ['document_count' => $documentCount]); @@ -65,18 +187,4 @@ public function vectorizeEmbeddableDocuments(array $documents, array $options = return $vectorDocuments; } - - public function vectorize(string|\Stringable $string, array $options = []): Vector - { - $this->logger->debug('Vectorizing string', ['string' => (string) $string]); - - $result = $this->platform->invoke($this->model, (string) $string, $options); - $vectors = $result->asVectors(); - - if (!isset($vectors[0])) { - throw new RuntimeException('No vector returned for string vectorization.'); - } - - return $vectors[0]; - } } diff --git a/src/store/src/Document/VectorizerInterface.php b/src/store/src/Document/VectorizerInterface.php index 3f08999a2..32ed57766 100644 --- a/src/store/src/Document/VectorizerInterface.php +++ 
b/src/store/src/Document/VectorizerInterface.php @@ -14,25 +14,27 @@ use Symfony\AI\Platform\Vector\Vector; /** - * Interface for converting a collection of Embeddable documents into VectorDocuments - * and for vectorizing individual strings. + * Interface for vectorizing strings and EmbeddableDocuments into Vectors and VectorDocuments. * * @author Oskar Stark */ interface VectorizerInterface { /** - * @param EmbeddableDocumentInterface[] $documents - * @param array $options Options to pass to the underlying platform + * Vectorizes strings or EmbeddableDocuments into Vectors or VectorDocuments. * - * @return VectorDocument[] - */ - public function vectorizeEmbeddableDocuments(array $documents, array $options = []): array; - - /** - * Vectorizes a single string or Stringable object into a Vector. + * @param string|\Stringable|EmbeddableDocumentInterface|array<string|\Stringable>|array<EmbeddableDocumentInterface> $values The values to vectorize + * @param array $options Options to pass to the underlying platform + * + * @return Vector|VectorDocument|array<Vector>|array<VectorDocument> * - * @param array $options Options to pass to the underlying platform + * @phpstan-return ( + * $values is string|\Stringable ? Vector : ( + * $values is EmbeddableDocumentInterface ? VectorDocument : ( + * $values is array<string|\Stringable> ? 
array<Vector> : array<VectorDocument> + * ) + * ) + * ) */ - public function vectorize(string|\Stringable $string, array $options = []): Vector; + public function vectorize(string|\Stringable|EmbeddableDocumentInterface|array $values, array $options = []): Vector|VectorDocument|array; } diff --git a/src/store/src/Indexer.php b/src/store/src/Indexer.php index 34bec91dd..d797c05f8 100644 --- a/src/store/src/Indexer.php +++ b/src/store/src/Indexer.php @@ -87,13 +87,13 @@ public function index(array $options = []): void ++$counter; if ($chunkSize === \count($chunk)) { - $this->store->add(...$this->vectorizer->vectorizeEmbeddableDocuments($chunk)); + $this->store->add(...$this->vectorizer->vectorize($chunk)); $chunk = []; } } if ([] !== $chunk) { - $this->store->add(...$this->vectorizer->vectorizeEmbeddableDocuments($chunk)); + $this->store->add(...$this->vectorizer->vectorize($chunk)); } $this->logger->debug('Document processing completed', ['total_documents' => $counter]); diff --git a/src/store/tests/Document/VectorizerTest.php b/src/store/tests/Document/VectorizerTest.php index 374f29835..3946e9e37 100644 --- a/src/store/tests/Document/VectorizerTest.php +++ b/src/store/tests/Document/VectorizerTest.php @@ -65,7 +65,7 @@ public function testVectorizeDocumentsWithBatchSupport() $platform = PlatformTestHandler::createPlatform(new VectorResult(...$vectors), $modelCatalog); $vectorizer = new Vectorizer($platform, 'test-embedding-with-batch'); - $vectorDocuments = $vectorizer->vectorizeEmbeddableDocuments($documents); + $vectorDocuments = $vectorizer->vectorize($documents); $this->assertCount(3, $vectorDocuments); @@ -84,7 +84,7 @@ public function testVectorizeDocumentsWithSingleDocument() $platform = PlatformTestHandler::createPlatform(new VectorResult($vector)); $vectorizer = new Vectorizer($platform, 'text-embedding-3-small'); - $vectorDocuments = $vectorizer->vectorizeEmbeddableDocuments([$document]); + $vectorDocuments = $vectorizer->vectorize([$document]); $this->assertCount(1, 
$vectorDocuments); $this->assertInstanceOf(VectorDocument::class, $vectorDocuments[0]); @@ -97,7 +97,7 @@ public function testVectorizeEmptyDocumentsArray() { $platform = PlatformTestHandler::createPlatform(new VectorResult()); $vectorizer = new Vectorizer($platform, 'text-embedding-3-small'); - $vectorDocuments = $vectorizer->vectorizeEmbeddableDocuments([]); + $vectorDocuments = $vectorizer->vectorize([]); $this->assertSame([], $vectorDocuments); } @@ -119,7 +119,7 @@ public function testVectorizeDocumentsPreservesMetadata() $platform = PlatformTestHandler::createPlatform(new VectorResult(...$vectors)); $vectorizer = new Vectorizer($platform, 'text-embedding-3-small'); - $vectorDocuments = $vectorizer->vectorizeEmbeddableDocuments($documents); + $vectorDocuments = $vectorizer->vectorize($documents); $this->assertCount(2, $vectorDocuments); $this->assertSame($metadata1, $vectorDocuments[0]->metadata); @@ -148,7 +148,7 @@ public function testVectorizeDocumentsPreservesDocumentIds() $platform = PlatformTestHandler::createPlatform(new VectorResult(...$vectors)); $vectorizer = new Vectorizer($platform, 'text-embedding-3-small'); - $vectorDocuments = $vectorizer->vectorizeEmbeddableDocuments($documents); + $vectorDocuments = $vectorizer->vectorize($documents); $this->assertCount(3, $vectorDocuments); $this->assertSame($id1, $vectorDocuments[0]->id); @@ -175,7 +175,7 @@ public function testVectorizeVariousDocumentCounts(int $count) $count > 0 ? 
new VectorResult(...$vectors) : new VectorResult() ); $vectorizer = new Vectorizer($platform, 'text-embedding-3-small'); - $vectorDocuments = $vectorizer->vectorizeEmbeddableDocuments($documents); + $vectorDocuments = $vectorizer->vectorize($documents); $this->assertCount($count, $vectorDocuments); @@ -212,7 +212,7 @@ public function testVectorizeDocumentsWithLargeVectors() $platform = PlatformTestHandler::createPlatform(new VectorResult($vector)); $vectorizer = new Vectorizer($platform, 'text-embedding-3-small'); - $vectorDocuments = $vectorizer->vectorizeEmbeddableDocuments([$document]); + $vectorDocuments = $vectorizer->vectorize([$document]); $this->assertCount(1, $vectorDocuments); $this->assertEquals($vector, $vectorDocuments[0]->vector); @@ -234,7 +234,7 @@ public function testVectorizeDocumentsWithSpecialCharacters() $platform = PlatformTestHandler::createPlatform(new VectorResult(...$vectors)); $vectorizer = new Vectorizer($platform, 'text-embedding-3-small'); - $vectorDocuments = $vectorizer->vectorizeEmbeddableDocuments($documents); + $vectorDocuments = $vectorizer->vectorize($documents); $this->assertCount(3, $vectorDocuments); @@ -272,7 +272,7 @@ public function testVectorizeDocumentsWithoutBatchSupportUsesNonBatchMode() $platform = PlatformTestHandler::createPlatform(new VectorResult(...$vectors), $modelCatalog); $vectorizer = new Vectorizer($platform, 'test-embedding-no-batch'); - $vectorDocuments = $vectorizer->vectorizeEmbeddableDocuments($documents); + $vectorDocuments = $vectorizer->vectorize($documents); $this->assertCount(2, $vectorDocuments); $this->assertEquals($vectors[0], $vectorDocuments[0]->vector); @@ -331,6 +331,111 @@ public function testVectorizeStringThrowsExceptionWhenNoVectorReturned() $vectorizer->vectorize($text); } + public function testVectorizeStringable() + { + $stringable = new class implements \Stringable { + public function __toString(): string + { + return 'This is a Stringable object'; + } + }; + + $vector = new 
Vector([0.1, 0.2, 0.3]); + + $platform = PlatformTestHandler::createPlatform(new VectorResult($vector)); + $vectorizer = new Vectorizer($platform, 'text-embedding-3-small'); + $result = $vectorizer->vectorize($stringable); + + $this->assertInstanceOf(Vector::class, $result); + $this->assertEquals($vector, $result); + } + + public function testVectorizeArrayOfStrings() + { + $strings = ['First string', 'Second string', 'Third string']; + $vectors = [ + new Vector([0.1, 0.2, 0.3]), + new Vector([0.4, 0.5, 0.6]), + new Vector([0.7, 0.8, 0.9]), + ]; + + $platform = PlatformTestHandler::createPlatform(new VectorResult(...$vectors)); + $vectorizer = new Vectorizer($platform, 'text-embedding-3-small'); + $result = $vectorizer->vectorize($strings); + + $this->assertIsArray($result); + $this->assertCount(3, $result); + foreach ($result as $i => $vector) { + $this->assertInstanceOf(Vector::class, $vector); + $this->assertEquals($vectors[$i], $vector); + } + } + + public function testVectorizeArrayOfStringables() + { + $stringables = [ + new class implements \Stringable { + public function __toString(): string + { + return 'First Stringable'; + } + }, + new class implements \Stringable { + public function __toString(): string + { + return 'Second Stringable'; + } + }, + ]; + + $vectors = [ + new Vector([0.1, 0.2]), + new Vector([0.3, 0.4]), + ]; + + $platform = PlatformTestHandler::createPlatform(new VectorResult(...$vectors)); + $vectorizer = new Vectorizer($platform, 'text-embedding-3-small'); + $result = $vectorizer->vectorize($stringables); + + $this->assertIsArray($result); + $this->assertCount(2, $result); + foreach ($result as $i => $vector) { + $this->assertInstanceOf(Vector::class, $vector); + $this->assertEquals($vectors[$i], $vector); + } + } + + public function testVectorizeArrayOfMixedStringsAndStringables() + { + $values = [ + 'Regular string', + new class implements \Stringable { + public function __toString(): string + { + return 'Stringable object'; + } + }, 
+ 'Another string', + ]; + + $vectors = [ + new Vector([0.1, 0.2]), + new Vector([0.3, 0.4]), + new Vector([0.5, 0.6]), + ]; + + $platform = PlatformTestHandler::createPlatform(new VectorResult(...$vectors)); + $vectorizer = new Vectorizer($platform, 'text-embedding-3-small'); + $result = $vectorizer->vectorize($values); + + $this->assertIsArray($result); + $this->assertCount(3, $result); + foreach ($result as $i => $vector) { + $this->assertInstanceOf(Vector::class, $vector); + $this->assertEquals($vectors[$i], $vector); + } + } + public function testVectorizeTextDocumentsPassesOptionsToInvoke() { $documents = [ @@ -344,7 +449,7 @@ public function testVectorizeTextDocumentsPassesOptionsToInvoke() // This ensures batch mode is used and the test expectation matches the behavior $platform = PlatformTestHandler::createPlatform(new VectorResult($vector)); $vectorizer = new Vectorizer($platform, 'test-embedding-with-batch'); - $result = $vectorizer->vectorizeEmbeddableDocuments($documents, $options); + $result = $vectorizer->vectorize($documents, $options); $this->assertCount(1, $result); $this->assertEquals($vector, $result[0]->vector); @@ -362,7 +467,7 @@ public function testVectorizeTextDocumentsWithEmptyOptions() // This ensures batch mode is used and the test expectation matches the behavior $platform = PlatformTestHandler::createPlatform(new VectorResult($vector)); $vectorizer = new Vectorizer($platform, 'test-embedding-with-batch'); - $result = $vectorizer->vectorizeEmbeddableDocuments($documents); + $result = $vectorizer->vectorize($documents); $this->assertCount(1, $result); $this->assertEquals($vector, $result[0]->vector); @@ -441,7 +546,7 @@ public function testVectorizeTextDocumentsWithoutBatchSupportPassesOptions() $platform = PlatformTestHandler::createPlatform(new VectorResult(...$vectors), $modelCatalog); $vectorizer = new Vectorizer($platform, 'test-embedding-no-batch-with-options'); - $result = $vectorizer->vectorizeEmbeddableDocuments($documents, 
$options); + $result = $vectorizer->vectorize($documents, $options); $this->assertCount(2, $result); $this->assertEquals($vectors[0], $result[0]->vector);