From 1bd8cc4d6b57d4b75dcc569b8b6afe7075b97356 Mon Sep 17 00:00:00 2001 From: Oskar Stark Date: Mon, 8 Sep 2025 10:52:22 +0200 Subject: [PATCH] [Store] Refactor `VectorizerInterface` to separate string and document vectorization --- demo/config/packages/ai.yaml | 2 +- .../document/vectorizing-text-documents.php | 33 ++++++++ examples/document/vectorizing.php | 20 +++-- .../src/Toolbox/Tool/SimilaritySearch.php | 10 +-- .../Toolbox/Tool/SimilaritySearchTest.php | 66 +++++----------- src/ai-bundle/config/options.php | 1 + src/store/src/Document/Vectorizer.php | 18 ++++- .../src/Document/VectorizerInterface.php | 12 ++- src/store/src/Indexer.php | 4 +- src/store/tests/Document/VectorizerTest.php | 78 ++++++++++++++++--- 10 files changed, 165 insertions(+), 79 deletions(-) create mode 100644 examples/document/vectorizing-text-documents.php diff --git a/demo/config/packages/ai.yaml b/demo/config/packages/ai.yaml index df1c3c3ba..e980bd9c3 100644 --- a/demo/config/packages/ai.yaml +++ b/demo/config/packages/ai.yaml @@ -74,4 +74,4 @@ services: # $apiKey: '%env(SERP_API_KEY)%' Symfony\AI\Agent\Toolbox\Tool\Wikipedia: ~ Symfony\AI\Agent\Toolbox\Tool\SimilaritySearch: - $model: '@ai.indexer.default.model' + $vectorizer: '@ai.vectorizer.openai_embeddings' diff --git a/examples/document/vectorizing-text-documents.php b/examples/document/vectorizing-text-documents.php new file mode 100644 index 000000000..425c6bc3a --- /dev/null +++ b/examples/document/vectorizing-text-documents.php @@ -0,0 +1,33 @@ + + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. + */ + +use Symfony\AI\Platform\Bridge\OpenAi\Embeddings; +use Symfony\AI\Platform\Bridge\OpenAi\PlatformFactory; +use Symfony\AI\Store\Document\TextDocument; +use Symfony\AI\Store\Document\VectorDocument; +use Symfony\AI\Store\Document\Vectorizer; +use Symfony\Component\Uid\Uuid; + +require_once dirname(__DIR__).'/bootstrap.php'; + +$platform = PlatformFactory::create(env('OPENAI_API_KEY'), http_client()); +$embeddings = new Embeddings(Embeddings::TEXT_3_LARGE); + +$textDocuments = [ + new TextDocument(Uuid::v4(), 'Hello World'), + new TextDocument(Uuid::v4(), 'Lorem ipsum dolor sit amet'), + new TextDocument(Uuid::v4(), 'PHP Hypertext Preprocessor'), +]; + +$vectorizer = new Vectorizer($platform, $embeddings); +$vectorDocuments = $vectorizer->vectorizeTextDocuments($textDocuments); + +dump(array_map(fn (VectorDocument $document) => $document->vector->getDimensions(), $vectorDocuments)); diff --git a/examples/document/vectorizing.php b/examples/document/vectorizing.php index 1fa1bb04b..8d70baaa4 100644 --- a/examples/document/vectorizing.php +++ b/examples/document/vectorizing.php @@ -11,23 +11,21 @@ use Symfony\AI\Platform\Bridge\OpenAi\Embeddings; use Symfony\AI\Platform\Bridge\OpenAi\PlatformFactory; -use Symfony\AI\Store\Document\TextDocument; -use Symfony\AI\Store\Document\VectorDocument; use Symfony\AI\Store\Document\Vectorizer; -use Symfony\Component\Uid\Uuid; require_once dirname(__DIR__).'/bootstrap.php'; $platform = PlatformFactory::create(env('OPENAI_API_KEY'), http_client()); $embeddings = new Embeddings(Embeddings::TEXT_3_LARGE); -$textDocuments = [ - new TextDocument(Uuid::v4(), 'Hello World'), - new TextDocument(Uuid::v4(), 'Lorem ipsum dolor sit amet'), - new TextDocument(Uuid::v4(), 'PHP Hypertext Preprocessor'), -]; - $vectorizer = new Vectorizer($platform, $embeddings); -$vectorDocuments = $vectorizer->vectorizeDocuments($textDocuments); -dump(array_map(fn (VectorDocument $document) => $document->vector->getDimensions(), $vectorDocuments)); +$string = 'Hello World'; +$vector = $vectorizer->vectorize($string); + +printf( + "String: %s\nVector dimensions: %d\nFirst 5 values: [%s]\n", + $string, + $vector->getDimensions(), + implode(', ', array_map(fn ($val) => number_format($val, 6), array_slice($vector->getData(), 0, 5))) +); diff --git a/src/agent/src/Toolbox/Tool/SimilaritySearch.php b/src/agent/src/Toolbox/Tool/SimilaritySearch.php index 4603ce8ff..1e85b49a6 100644 --- a/src/agent/src/Toolbox/Tool/SimilaritySearch.php +++ b/src/agent/src/Toolbox/Tool/SimilaritySearch.php @@ -12,9 +12,8 @@ namespace Symfony\AI\Agent\Toolbox\Tool; use Symfony\AI\Agent\Toolbox\Attribute\AsTool; -use Symfony\AI\Platform\Model; -use Symfony\AI\Platform\PlatformInterface; use Symfony\AI\Store\Document\VectorDocument; +use Symfony\AI\Store\Document\VectorizerInterface; use Symfony\AI\Store\StoreInterface; /** @@ -29,8 +28,7 @@ final class SimilaritySearch public array $usedDocuments = []; public function __construct( - private readonly PlatformInterface $platform, - private readonly Model $model, + private readonly VectorizerInterface $vectorizer, private readonly StoreInterface $store, ) { } @@ -40,8 +38,8 @@ public function __construct( */ public function __invoke(string $searchTerm): string { - $vectors = $this->platform->invoke($this->model, $searchTerm)->asVectors(); - $this->usedDocuments = $this->store->query($vectors[0]); + $vector = $this->vectorizer->vectorize($searchTerm); + $this->usedDocuments = $this->store->query($vector); if ([] === $this->usedDocuments) { return 'No results found'; diff --git a/src/agent/tests/Toolbox/Tool/SimilaritySearchTest.php b/src/agent/tests/Toolbox/Tool/SimilaritySearchTest.php index 0965cf429..d44e5b60a 100644 --- a/src/agent/tests/Toolbox/Tool/SimilaritySearchTest.php +++ b/src/agent/tests/Toolbox/Tool/SimilaritySearchTest.php @@ -14,14 +14,10 @@ use PHPUnit\Framework\Attributes\CoversClass; use PHPUnit\Framework\TestCase; use Symfony\AI\Agent\Toolbox\Tool\SimilaritySearch; -use Symfony\AI\Platform\Model; -use Symfony\AI\Platform\PlatformInterface; -use Symfony\AI\Platform\Result\RawResultInterface; -use Symfony\AI\Platform\Result\ResultPromise; -use Symfony\AI\Platform\Result\VectorResult; use Symfony\AI\Platform\Vector\Vector; use Symfony\AI\Store\Document\Metadata; use Symfony\AI\Store\Document\VectorDocument; +use Symfony\AI\Store\Document\VectorizerInterface; use Symfony\AI\Store\StoreInterface; use Symfony\Component\Uid\Uuid; @@ -44,18 +40,11 @@ public function testSearchWithResults() new Metadata(['title' => 'Document 2', 'content' => 'Second document content']), ); - $rawResult = $this->createMock(RawResultInterface::class); - $vectorResult = new VectorResult($vector); - $resultPromise = new ResultPromise( - fn () => $vectorResult, - $rawResult - ); - - $platform = $this->createMock(PlatformInterface::class); - $platform->expects($this->once()) - ->method('invoke') - ->with($this->isInstanceOf(Model::class), $searchTerm) - ->willReturn($resultPromise); + $vectorizer = $this->createMock(VectorizerInterface::class); + $vectorizer->expects($this->once()) + ->method('vectorize') + ->with($searchTerm) + ->willReturn($vector); $store = $this->createMock(StoreInterface::class); $store->expects($this->once()) @@ -63,8 +52,7 @@ public function testSearchWithResults() ->with($vector) ->willReturn([$document1, $document2]); - $model = new Model('test-model'); - $similaritySearch = new SimilaritySearch($platform, $model, $store); + $similaritySearch = new SimilaritySearch($vectorizer, $store); $result = $similaritySearch($searchTerm); @@ -77,18 +65,11 @@ public function testSearchWithoutResults() $searchTerm = 'find nothing'; $vector = new Vector([0.1, 0.2, 0.3]); - $rawResult = $this->createMock(RawResultInterface::class); - $vectorResult = new VectorResult($vector); - $resultPromise = new ResultPromise( - fn () => $vectorResult, - $rawResult - ); - - $platform = $this->createMock(PlatformInterface::class); - $platform->expects($this->once()) - ->method('invoke') - ->with($this->isInstanceOf(Model::class), $searchTerm) - ->willReturn($resultPromise); + $vectorizer = $this->createMock(VectorizerInterface::class); + $vectorizer->expects($this->once()) + ->method('vectorize') + ->with($searchTerm) + ->willReturn($vector); $store = $this->createMock(StoreInterface::class); $store->expects($this->once()) @@ -96,8 +77,7 @@ public function testSearchWithoutResults() ->with($vector) ->willReturn([]); - $model = new Model('test-model'); - $similaritySearch = new SimilaritySearch($platform, $model, $store); + $similaritySearch = new SimilaritySearch($vectorizer, $store); $result = $similaritySearch($searchTerm); @@ -116,18 +96,11 @@ public function testSearchWithSingleResult() new Metadata(['title' => 'Single Document', 'description' => 'Only one match']), ); - $rawResult = $this->createMock(RawResultInterface::class); - $vectorResult = new VectorResult($vector); - $resultPromise = new ResultPromise( - fn () => $vectorResult, - $rawResult - ); - - $platform = $this->createMock(PlatformInterface::class); - $platform->expects($this->once()) - ->method('invoke') - ->with($this->isInstanceOf(Model::class), $searchTerm) - ->willReturn($resultPromise); + $vectorizer = $this->createMock(VectorizerInterface::class); + $vectorizer->expects($this->once()) + ->method('vectorize') + ->with($searchTerm) + ->willReturn($vector); $store = $this->createMock(StoreInterface::class); $store->expects($this->once()) @@ -135,8 +108,7 @@ public function testSearchWithSingleResult() ->with($vector) ->willReturn([$document]); - $model = new Model('test-model'); - $similaritySearch = new SimilaritySearch($platform, $model, $store); + $similaritySearch = new SimilaritySearch($vectorizer, $store); $result = $similaritySearch($searchTerm); diff --git a/src/ai-bundle/config/options.php b/src/ai-bundle/config/options.php index 67b45c32f..d6a8440c7 100644 --- a/src/ai-bundle/config/options.php +++ b/src/ai-bundle/config/options.php @@ -373,6 +373,7 @@ ->end() ->end() ->arrayNode('vectorizer') + ->info('Vectorizers for converting strings to Vector objects and transforming TextDocument arrays to VectorDocument arrays') ->useAttributeAsKey('name') ->arrayPrototype() ->children() diff --git a/src/store/src/Document/Vectorizer.php b/src/store/src/Document/Vectorizer.php index 7a2d0bbc8..01792f024 100644 --- a/src/store/src/Document/Vectorizer.php +++ b/src/store/src/Document/Vectorizer.php @@ -16,6 +16,8 @@ use Symfony\AI\Platform\Capability; use Symfony\AI\Platform\Model; use Symfony\AI\Platform\PlatformInterface; +use Symfony\AI\Platform\Vector\Vector; +use Symfony\AI\Store\Exception\RuntimeException; final readonly class Vectorizer implements VectorizerInterface { @@ -26,7 +28,7 @@ public function __construct( ) { } - public function vectorize(array $documents): array + public function vectorizeTextDocuments(array $documents): array { $documentCount = \count($documents); $this->logger->info('Starting vectorization process', ['document_count' => $documentCount]); @@ -64,4 +66,18 @@ public function vectorize(array $documents): array return $vectorDocuments; } + + public function vectorize(string $string): Vector + { + $this->logger->debug('Vectorizing string', ['string' => $string]); + + $result = $this->platform->invoke($this->model, $string); + $vectors = $result->asVectors(); + + if (!isset($vectors[0])) { + throw new RuntimeException('No vector returned for string vectorization.'); + } + + return $vectors[0]; + } } diff --git a/src/store/src/Document/VectorizerInterface.php b/src/store/src/Document/VectorizerInterface.php index d87edce34..21d144236 100644 --- a/src/store/src/Document/VectorizerInterface.php +++ b/src/store/src/Document/VectorizerInterface.php @@ -11,8 +11,11 @@ namespace Symfony\AI\Store\Document; +use Symfony\AI\Platform\Vector\Vector; + /** - * Interface for converting a collection of TextDocuments into VectorDocuments. + * Interface for converting a collection of TextDocuments into VectorDocuments + * and for vectorizing individual strings. * * @author Oskar Stark */ @@ -23,5 +26,10 @@ interface VectorizerInterface * * @return VectorDocument[] */ - public function vectorize(array $documents): array; + public function vectorizeTextDocuments(array $documents): array; + + /** + * Vectorizes a single string into a Vector. + */ + public function vectorize(string $string): Vector; } diff --git a/src/store/src/Indexer.php b/src/store/src/Indexer.php index b3257ea6f..a7a3137c9 100644 --- a/src/store/src/Indexer.php +++ b/src/store/src/Indexer.php @@ -45,13 +45,13 @@ public function index(TextDocument|iterable $documents, int $chunkSize = 50): vo ++$counter; if ($chunkSize === \count($chunk)) { - $this->store->add(...$this->vectorizer->vectorize($chunk)); + $this->store->add(...$this->vectorizer->vectorizeTextDocuments($chunk)); $chunk = []; } } if (\count($chunk) > 0) { - $this->store->add(...$this->vectorizer->vectorize($chunk)); + $this->store->add(...$this->vectorizer->vectorizeTextDocuments($chunk)); } $this->logger->debug(0 === $counter ? 'No documents to index' : \sprintf('Indexed %d documents', $counter)); diff --git a/src/store/tests/Document/VectorizerTest.php b/src/store/tests/Document/VectorizerTest.php index 3e4059944..8a0cb7222 100644 --- a/src/store/tests/Document/VectorizerTest.php +++ b/src/store/tests/Document/VectorizerTest.php @@ -67,7 +67,7 @@ public function testVectorizeDocumentsWithBatchSupport() $model = new Embeddings(); $vectorizer = new Vectorizer($platform, $model); - $vectorDocuments = $vectorizer->vectorize($documents); + $vectorDocuments = $vectorizer->vectorizeTextDocuments($documents); $this->assertCount(3, $vectorDocuments); @@ -88,7 +88,7 @@ public function testVectorizeDocumentsWithSingleDocument() $model = new Embeddings(); $vectorizer = new Vectorizer($platform, $model); - $vectorDocuments = $vectorizer->vectorize([$document]); + $vectorDocuments = $vectorizer->vectorizeTextDocuments([$document]); $this->assertCount(1, $vectorDocuments); $this->assertInstanceOf(VectorDocument::class, $vectorDocuments[0]); @@ -103,7 +103,7 @@ public function testVectorizeEmptyDocumentsArray() $model = new Embeddings(); $vectorizer = new Vectorizer($platform, $model); - $vectorDocuments = $vectorizer->vectorize([]); + $vectorDocuments = $vectorizer->vectorizeTextDocuments([]); $this->assertSame([], $vectorDocuments); } @@ -127,7 +127,7 @@ public function testVectorizeDocumentsPreservesMetadata() $model = new Embeddings(); $vectorizer = new Vectorizer($platform, $model); - $vectorDocuments = $vectorizer->vectorize($documents); + $vectorDocuments = $vectorizer->vectorizeTextDocuments($documents); $this->assertCount(2, $vectorDocuments); $this->assertSame($metadata1, $vectorDocuments[0]->metadata); @@ -158,7 +158,7 @@ public function testVectorizeDocumentsPreservesDocumentIds() $model = new Embeddings(); $vectorizer = new Vectorizer($platform, $model); - $vectorDocuments = $vectorizer->vectorize($documents); + $vectorDocuments = $vectorizer->vectorizeTextDocuments($documents); $this->assertCount(3, $vectorDocuments); $this->assertSame($id1, $vectorDocuments[0]->id); @@ -187,7 +187,7 @@ public function testVectorizeVariousDocumentCounts(int $count) $model = new Embeddings(); $vectorizer = new Vectorizer($platform, $model); - $vectorDocuments = $vectorizer->vectorize($documents); + $vectorDocuments = $vectorizer->vectorizeTextDocuments($documents); $this->assertCount($count, $vectorDocuments); @@ -226,7 +226,7 @@ public function testVectorizeDocumentsWithLargeVectors() $model = new Embeddings(); $vectorizer = new Vectorizer($platform, $model); - $vectorDocuments = $vectorizer->vectorize([$document]); + $vectorDocuments = $vectorizer->vectorizeTextDocuments([$document]); $this->assertCount(1, $vectorDocuments); $this->assertEquals($vector, $vectorDocuments[0]->vector); @@ -250,7 +250,7 @@ public function testVectorizeDocumentsWithSpecialCharacters() $model = new Embeddings(); $vectorizer = new Vectorizer($platform, $model); - $vectorDocuments = $vectorizer->vectorize($documents); + $vectorDocuments = $vectorizer->vectorizeTextDocuments($documents); $this->assertCount(3, $vectorDocuments); @@ -313,10 +313,70 @@ public function convert(RawResultInterface $result, array $options = []): Result $platform = new Platform([$handler], [$handler]); $vectorizer = new Vectorizer($platform, $model); - $vectorDocuments = $vectorizer->vectorize($documents); + $vectorDocuments = $vectorizer->vectorizeTextDocuments($documents); $this->assertCount(2, $vectorDocuments); $this->assertEquals($vectors[0], $vectorDocuments[0]->vector); $this->assertEquals($vectors[1], $vectorDocuments[1]->vector); } + + public function testVectorizeString() + { + $text = 'This is a test string to vectorize'; + $vector = new Vector([0.1, 0.2, 0.3]); + + $platform = PlatformTestHandler::createPlatform(new VectorResult($vector)); + $model = new Embeddings(); + + $vectorizer = new Vectorizer($platform, $model); + $result = $vectorizer->vectorize($text); + + $this->assertInstanceOf(Vector::class, $result); + $this->assertEquals($vector, $result); + } + + public function testVectorizeStringWithSpecialCharacters() + { + $text = 'Test with émojis 🚀 and ünïcödé characters'; + $vector = new Vector([0.5, 0.6, 0.7]); + + $platform = PlatformTestHandler::createPlatform(new VectorResult($vector)); + $model = new Embeddings(); + + $vectorizer = new Vectorizer($platform, $model); + $result = $vectorizer->vectorize($text); + + $this->assertInstanceOf(Vector::class, $result); + $this->assertEquals($vector, $result); + } + + public function testVectorizeEmptyString() + { + $text = ''; + $vector = new Vector([0.0, 0.0, 0.0]); + + $platform = PlatformTestHandler::createPlatform(new VectorResult($vector)); + $model = new Embeddings(); + + $vectorizer = new Vectorizer($platform, $model); + $result = $vectorizer->vectorize($text); + + $this->assertInstanceOf(Vector::class, $result); + $this->assertEquals($vector, $result); + } + + public function testVectorizeStringThrowsExceptionWhenNoVectorReturned() + { + $text = 'Test string'; + + $platform = PlatformTestHandler::createPlatform(new VectorResult()); + $model = new Embeddings(); + + $vectorizer = new Vectorizer($platform, $model); + + $this->expectException(\Symfony\AI\Store\Exception\RuntimeException::class); + $this->expectExceptionMessage('No vector returned for string vectorization.'); + + $vectorizer->vectorize($text); + } }