From 29349c1104552e712a6f2f0e2f462b82ef8af311 Mon Sep 17 00:00:00 2001 From: Oskar Stark Date: Mon, 8 Sep 2025 12:56:08 +0200 Subject: [PATCH] [Examples][Store] Implement indexing pipeline --- demo/CLAUDE.md | 2 +- demo/README.md | 2 +- demo/config/packages/ai.yaml | 8 +- demo/src/Blog/Command/EmbedCommand.php | 37 --- demo/src/Blog/Embedder.php | 35 --- demo/src/Blog/FeedLoader.php | 22 +- examples/indexer/index-file-loader.php | 47 ++++ examples/indexer/index-inmemory-loader.php | 58 +++++ fixtures/movies/gladiator.md | 32 +++ fixtures/movies/inception.md | 28 ++ fixtures/movies/jurassic-park.md | 30 +++ src/ai-bundle/config/options.php | 13 + src/ai-bundle/config/services.php | 6 + src/ai-bundle/src/AiBundle.php | 9 + .../DependencyInjection/AiBundleTest.php | 244 +++++++++++++++++- src/store/src/Command/IndexCommand.php | 109 ++++++++ .../src/Document/Loader/InMemoryLoader.php | 2 +- .../src/Document/Loader/TextFileLoader.php | 7 +- src/store/src/Document/LoaderInterface.php | 4 +- .../Transformer/TextReplaceTransformer.php | 55 ++++ .../Transformer/TextSplitTransformer.php | 2 +- .../Transformer/TextTrimTransformer.php | 29 +++ src/store/src/Exception/LogicException.php | 19 ++ src/store/src/Indexer.php | 72 +++++- src/store/src/IndexerInterface.php | 18 +- .../Document/Loader/InMemoryLoaderTest.php | 24 +- .../Document/Loader/TextFileLoaderTest.php | 11 + .../TextReplaceTransformerTest.php | 196 ++++++++++++++ .../Transformer/TextTrimTransformerTest.php | 98 +++++++ src/store/tests/IndexerTest.php | 80 +++++- 30 files changed, 1173 insertions(+), 126 deletions(-) delete mode 100644 demo/src/Blog/Command/EmbedCommand.php delete mode 100644 demo/src/Blog/Embedder.php create mode 100644 examples/indexer/index-file-loader.php create mode 100644 examples/indexer/index-inmemory-loader.php create mode 100644 fixtures/movies/gladiator.md create mode 100644 fixtures/movies/inception.md create mode 100644 fixtures/movies/jurassic-park.md create mode 100644 
src/store/src/Command/IndexCommand.php create mode 100644 src/store/src/Document/Transformer/TextReplaceTransformer.php create mode 100644 src/store/src/Document/Transformer/TextTrimTransformer.php create mode 100644 src/store/src/Exception/LogicException.php create mode 100644 src/store/tests/Document/Transformer/TextReplaceTransformerTest.php create mode 100644 src/store/tests/Document/Transformer/TextTrimTransformerTest.php diff --git a/demo/CLAUDE.md b/demo/CLAUDE.md index 3d046078c..28d310398 100644 --- a/demo/CLAUDE.md +++ b/demo/CLAUDE.md @@ -36,7 +36,7 @@ composer install echo "OPENAI_API_KEY='sk-...'" > .env.local # Initialize vector store -symfony console app:blog:embed -vv +symfony console ai:store:index blog -vv # Test vector store symfony console app:blog:query diff --git a/demo/README.md b/demo/README.md index c41269847..61ffab2a5 100644 --- a/demo/README.md +++ b/demo/README.md @@ -74,7 +74,7 @@ The [Chroma DB](https://www.trychroma.com/) is a vector store that is used to st To initialize the Chroma DB, you need to run the following command: ```shell -symfony console app:blog:embed -vv +symfony console ai:store:index blog -vv ``` Now you should be able to run the test command and get some results: diff --git a/demo/config/packages/ai.yaml b/demo/config/packages/ai.yaml index e980bd9c3..9c39b449f 100644 --- a/demo/config/packages/ai.yaml +++ b/demo/config/packages/ai.yaml @@ -59,7 +59,11 @@ ai: class: 'Symfony\AI\Platform\Bridge\OpenAi\Embeddings' name: !php/const Symfony\AI\Platform\Bridge\OpenAi\Embeddings::TEXT_ADA_002 indexer: - default: + blog: + loader: 'App\Blog\FeedLoader' + source: 'https://feeds.feedburner.com/symfony/blog' + transformers: + - 'Symfony\AI\Store\Document\Transformer\TextTrimTransformer' vectorizer: 'ai.vectorizer.openai_embeddings' store: 'ai.store.chroma_db.symfonycon' @@ -75,3 +79,5 @@ services: Symfony\AI\Agent\Toolbox\Tool\Wikipedia: ~ Symfony\AI\Agent\Toolbox\Tool\SimilaritySearch: $vectorizer: 
'@ai.vectorizer.openai_embeddings' + + Symfony\AI\Store\Document\Transformer\TextTrimTransformer: ~ diff --git a/demo/src/Blog/Command/EmbedCommand.php b/demo/src/Blog/Command/EmbedCommand.php deleted file mode 100644 index a97ec064e..000000000 --- a/demo/src/Blog/Command/EmbedCommand.php +++ /dev/null @@ -1,37 +0,0 @@ - - * - * For the full copyright and license information, please view the LICENSE - * file that was distributed with this source code. - */ - -namespace App\Blog\Command; - -use App\Blog\Embedder; -use Symfony\Component\Console\Attribute\AsCommand; -use Symfony\Component\Console\Command\Command; -use Symfony\Component\Console\Style\SymfonyStyle; - -#[AsCommand('app:blog:embed', description: 'Create embeddings for Symfony blog and push to ChromaDB.')] -final class EmbedCommand -{ - public function __construct( - private readonly Embedder $embedder, - ) { - } - - public function __invoke(SymfonyStyle $io): int - { - $io->title('Loading RSS of Symfony blog as embeddings into ChromaDB'); - - $this->embedder->embedBlog(); - - $io->success('Symfony Blog Successfully Embedded!'); - - return Command::SUCCESS; - } -} diff --git a/demo/src/Blog/Embedder.php b/demo/src/Blog/Embedder.php deleted file mode 100644 index ae3a92d03..000000000 --- a/demo/src/Blog/Embedder.php +++ /dev/null @@ -1,35 +0,0 @@ - - * - * For the full copyright and license information, please view the LICENSE - * file that was distributed with this source code. 
- */ - -namespace App\Blog; - -use Symfony\AI\Store\Document\Metadata; -use Symfony\AI\Store\Document\TextDocument; -use Symfony\AI\Store\IndexerInterface; - -final readonly class Embedder -{ - public function __construct( - private FeedLoader $loader, - private IndexerInterface $indexer, - ) { - } - - public function embedBlog(): void - { - $documents = []; - foreach ($this->loader->load() as $post) { - $documents[] = new TextDocument($post->id, $post->toString(), new Metadata($post->toArray())); - } - - $this->indexer->index($documents); - } -} diff --git a/demo/src/Blog/FeedLoader.php b/demo/src/Blog/FeedLoader.php index d02eb058f..92676f4c7 100644 --- a/demo/src/Blog/FeedLoader.php +++ b/demo/src/Blog/FeedLoader.php @@ -11,11 +11,15 @@ namespace App\Blog; +use Symfony\AI\Store\Document\LoaderInterface; +use Symfony\AI\Store\Document\Metadata; +use Symfony\AI\Store\Document\TextDocument; +use Symfony\AI\Store\Exception\InvalidArgumentException; use Symfony\Component\DomCrawler\Crawler; use Symfony\Component\Uid\Uuid; use Symfony\Contracts\HttpClient\HttpClientInterface; -class FeedLoader +final class FeedLoader implements LoaderInterface { public function __construct( private HttpClientInterface $httpClient, @@ -23,11 +27,17 @@ public function __construct( } /** - * @return Post[] + * @param ?string $source RSS feed URL + * @param array $options + * + * @return iterable */ - public function load(): array + public function load(?string $source, array $options = []): iterable { - $result = $this->httpClient->request('GET', 'https://feeds.feedburner.com/symfony/blog'); + if (null === $source) { + throw new InvalidArgumentException('FeedLoader requires a RSS feed URL as source, null given.'); + } + $result = $this->httpClient->request('GET', $source); $posts = []; $crawler = new Crawler($result->getContent()); @@ -44,6 +54,8 @@ public function load(): array ); }); - return $posts; + foreach ($posts as $post) { + yield new TextDocument($post->id, $post->toString(), 
new Metadata($post->toArray())); + } } } diff --git a/examples/indexer/index-file-loader.php b/examples/indexer/index-file-loader.php new file mode 100644 index 000000000..8a8b22e71 --- /dev/null +++ b/examples/indexer/index-file-loader.php @@ -0,0 +1,47 @@ + + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. + */ + +use Symfony\AI\Platform\Bridge\OpenAi\Embeddings; +use Symfony\AI\Platform\Bridge\OpenAi\PlatformFactory; +use Symfony\AI\Store\Bridge\Local\InMemoryStore; +use Symfony\AI\Store\Document\Loader\TextFileLoader; +use Symfony\AI\Store\Document\Transformer\TextReplaceTransformer; +use Symfony\AI\Store\Document\Transformer\TextSplitTransformer; +use Symfony\AI\Store\Document\Vectorizer; +use Symfony\AI\Store\Indexer; + +require_once dirname(__DIR__).'/bootstrap.php'; + +$platform = PlatformFactory::create(env('OPENAI_API_KEY'), http_client()); +$store = new InMemoryStore(); +$vectorizer = new Vectorizer($platform, new Embeddings('text-embedding-3-small')); +$indexer = new Indexer( + loader: new TextFileLoader(), + vectorizer: $vectorizer, + store: $store, + source: [ + dirname(__DIR__, 2).'/fixtures/movies/gladiator.md', + dirname(__DIR__, 2).'/fixtures/movies/inception.md', + dirname(__DIR__, 2).'/fixtures/movies/jurassic-park.md', + ], + transformers: [ + new TextReplaceTransformer(search: '## Plot', replace: '## Synopsis'), + new TextSplitTransformer(chunkSize: 500, overlap: 100), + ], +); + +$indexer->index(); + +$vector = $vectorizer->vectorize('Roman gladiator revenge'); +$results = $store->query($vector); +foreach ($results as $i => $document) { + echo sprintf("%d. 
%s\n", $i + 1, substr($document->id, 0, 40).'...'); +} diff --git a/examples/indexer/index-inmemory-loader.php b/examples/indexer/index-inmemory-loader.php new file mode 100644 index 000000000..20356b7b1 --- /dev/null +++ b/examples/indexer/index-inmemory-loader.php @@ -0,0 +1,58 @@ + + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. + */ + +use Symfony\AI\Platform\Bridge\OpenAi\Embeddings; +use Symfony\AI\Platform\Bridge\OpenAi\PlatformFactory; +use Symfony\AI\Store\Bridge\Local\InMemoryStore; +use Symfony\AI\Store\Document\Loader\InMemoryLoader; +use Symfony\AI\Store\Document\Metadata; +use Symfony\AI\Store\Document\TextDocument; +use Symfony\AI\Store\Document\Transformer\TextSplitTransformer; +use Symfony\AI\Store\Document\Vectorizer; +use Symfony\AI\Store\Indexer; +use Symfony\Component\Uid\Uuid; + +require_once dirname(__DIR__).'/bootstrap.php'; + +$platform = PlatformFactory::create(env('OPENAI_API_KEY'), http_client()); +$store = new InMemoryStore(); +$vectorizer = new Vectorizer($platform, new Embeddings('text-embedding-3-small')); + +$documents = [ + new TextDocument( + Uuid::v4(), + 'Artificial Intelligence is transforming the way we work and live. Machine learning algorithms can now process vast amounts of data and make predictions with remarkable accuracy.', + new Metadata(['title' => 'AI Revolution']) + ), + new TextDocument( + Uuid::v4(), + 'Climate change is one of the most pressing challenges of our time. 
Renewable energy sources like solar and wind power are becoming increasingly important for a sustainable future.', + new Metadata(['title' => 'Climate Action']) + ), +]; + +$indexer = new Indexer( + loader: new InMemoryLoader($documents), + vectorizer: $vectorizer, + store: $store, + source: null, + transformers: [ + new TextSplitTransformer(chunkSize: 100, overlap: 20), + ], +); + +$indexer->index(); + +$vector = $vectorizer->vectorize('machine learning artificial intelligence'); +$results = $store->query($vector); +foreach ($results as $i => $document) { + echo sprintf("%d. %s\n", $i + 1, substr($document->id, 0, 40).'...'); +} diff --git a/fixtures/movies/gladiator.md b/fixtures/movies/gladiator.md new file mode 100644 index 000000000..5217fb5e0 --- /dev/null +++ b/fixtures/movies/gladiator.md @@ -0,0 +1,32 @@ +# Gladiator (2000) + +**IMDB**: https://www.imdb.com/title/tt0172495/ + +**Director:** Ridley Scott + +## Cast + +- **Russell Crowe** as Maximus Decimus Meridius +- **Joaquin Phoenix** as Emperor Commodus +- **Connie Nielsen** as Lucilla +- **Oliver Reed** as Proximo +- **Derek Jacobi** as Senator Gracchus +- **Djimon Hounsou** as Juba +- **Richard Harris** as Marcus Aurelius +- **Ralf Möller** as Hagen +- **Tommy Flanagan** as Cicero +- **David Schofield** as Falco + +## Plot + +A former Roman General sets out to exact vengeance against the corrupt emperor who murdered his family and sent him into slavery. + +**Maximus Decimus Meridius** is a powerful Roman general beloved by the people and the aging Emperor **Marcus Aurelius**. As Marcus Aurelius lies dying, he makes known his wish that Maximus should succeed him and return Rome to the former glory of the Republic rather than the corrupt Empire it has become. + +However, Marcus Aurelius's son **Commodus** learns of his father's plan and murders him before he can publicly name Maximus as his successor. Commodus then orders the execution of Maximus and his family. 
Maximus escapes the execution but arrives at his farm too late to save his wife and son. + +Wounded and devastated, Maximus is captured by slave traders and forced to become a gladiator. Under the training of **Proximo**, a former gladiator, Maximus becomes a skilled fighter and eventually makes his way to the **Colosseum** in Rome, where he gains fame and the crowd's favor. + +Using his newfound popularity with the people, Maximus seeks to avenge the murder of his family and fulfill his promise to Marcus Aurelius to restore Rome to a republic. The film culminates in a final confrontation between Maximus and Commodus in the arena. + +The film explores themes of *honor*, *revenge*, *political corruption*, and the struggle between personal desires and duty to the greater good. diff --git a/fixtures/movies/inception.md b/fixtures/movies/inception.md new file mode 100644 index 000000000..bed14fb9e --- /dev/null +++ b/fixtures/movies/inception.md @@ -0,0 +1,28 @@ +# Inception (2010) + +**IMDB**: https://www.imdb.com/title/tt1375666/ + +**Director:** Christopher Nolan + +## Cast + +- **Leonardo DiCaprio** as Dom Cobb +- **Marion Cotillard** as Mal Cobb +- **Tom Hardy** as Eames +- **Elliot Page** as Ariadne +- **Ken Watanabe** as Saito +- **Dileep Rao** as Yusuf +- **Cillian Murphy** as Robert Fischer Jr. +- **Tom Berenger** as Peter Browning +- **Michael Caine** as Professor Stephen Miles +- **Lukas Haas** as Nash + +## Plot + +A skilled thief is given a chance at redemption if he can successfully perform inception, the act of planting an idea in someone's subconscious. + +**Dom Cobb** is a skilled thief who specializes in *extraction* - stealing secrets from people's subconscious minds while they dream. This unique skill has made him a valuable player in the world of corporate espionage, but it has also cost him everything he loves. 
Cobb's rare ability has made him a coveted player in this treacherous new world of corporate espionage, but it has also made him an international fugitive and cost him everything he has ever loved. + +Now Cobb is being offered a chance at redemption. One last job could give him his life back but only if he can accomplish the impossible - **inception**. Instead of the perfect heist, Cobb and his team of specialists have to pull off the reverse: their task is not to steal an idea but to plant one. If they succeed, it could be the perfect crime. + +The film explores themes of *reality*, *dreams*, *memory*, and the nature of consciousness through multiple layers of dream states, creating a complex narrative structure that challenges both characters and audience to question what is real. diff --git a/fixtures/movies/jurassic-park.md b/fixtures/movies/jurassic-park.md new file mode 100644 index 000000000..cf8b7251a --- /dev/null +++ b/fixtures/movies/jurassic-park.md @@ -0,0 +1,30 @@ +# Jurassic Park (1993) + +**IMDB**: https://www.imdb.com/title/tt0107290/ + +**Director:** Steven Spielberg + +## Cast + +- **Sam Neill** as Dr. Alan Grant +- **Laura Dern** as Dr. Ellie Sattler +- **Jeff Goldblum** as Dr. Ian Malcolm +- **Richard Attenborough** as John Hammond +- **Bob Peck** as Robert Muldoon +- **Martin Ferrero** as Donald Gennaro +- **BD Wong** as Dr. Henry Wu +- **Joseph Mazzello** as Tim Murphy +- **Ariana Richards** as Lex Murphy +- **Wayne Knight** as Dennis Nedry + +## Plot + +During a preview tour, a theme park suffers a major power breakdown that allows its cloned dinosaur exhibits to run amok. + +Billionaire **John Hammond** has created a theme park on a remote island where he has successfully cloned dinosaurs from ancient DNA found in prehistoric mosquitoes preserved in amber. Before opening to the public, Hammond invites a select group of people to tour the park, including paleontologist **Dr. Alan Grant**, paleobotanist **Dr. 
Ellie Sattler**, and mathematician **Dr. Ian Malcolm**. + +The tour begins smoothly, but things quickly go wrong when the park's computer systems are sabotaged by the disgruntled programmer **Dennis Nedry**, who is attempting to steal dinosaur embryos. The security systems fail, and the dinosaurs break free from their enclosures. + +As the island descends into chaos, the visitors must survive encounters with various dangerous dinosaurs, including the intelligent and deadly **Velociraptors** and the massive **Tyrannosaurus Rex**. Dr. Grant finds himself responsible for Hammond's grandchildren, Tim and Lex, as they attempt to reach safety. + +The film explores themes of *scientific ethics*, the *hubris of trying to control nature*, and the *unintended consequences of genetic engineering*. It questions whether humans have the right to resurrect extinct species and whether scientific advancement should be pursued without considering the potential risks and moral implications. diff --git a/src/ai-bundle/config/options.php b/src/ai-bundle/config/options.php index d6a8440c7..d2c11f1e0 100644 --- a/src/ai-bundle/config/options.php +++ b/src/ai-bundle/config/options.php @@ -397,6 +397,19 @@ ->useAttributeAsKey('name') ->arrayPrototype() ->children() + ->scalarNode('loader') + ->info('Service name of loader') + ->isRequired() + ->end() + ->variableNode('source') + ->info('Source identifier (file path, URL, etc.) 
or array of sources') + ->defaultNull() + ->end() + ->arrayNode('transformers') + ->info('Array of transformer service names') + ->scalarPrototype()->end() + ->defaultValue([]) + ->end() ->scalarNode('vectorizer') ->info('Service name of vectorizer') ->defaultValue(VectorizerInterface::class) diff --git a/src/ai-bundle/config/services.php b/src/ai-bundle/config/services.php index f0812f6fd..88be4784f 100644 --- a/src/ai-bundle/config/services.php +++ b/src/ai-bundle/config/services.php @@ -38,6 +38,7 @@ use Symfony\AI\Platform\Contract\JsonSchema\DescriptionParser; use Symfony\AI\Platform\Contract\JsonSchema\Factory as SchemaFactory; use Symfony\AI\Store\Command\DropStoreCommand; +use Symfony\AI\Store\Command\IndexCommand; use Symfony\AI\Store\Command\SetupStoreCommand; return static function (ContainerConfigurator $container): void { @@ -157,5 +158,10 @@ tagged_locator('ai.store', indexAttribute: 'name'), ]) ->tag('console.command') + ->set('ai.command.index', IndexCommand::class) + ->args([ + tagged_locator('ai.indexer', indexAttribute: 'name'), + ]) + ->tag('console.command') ; }; diff --git a/src/ai-bundle/src/AiBundle.php b/src/ai-bundle/src/AiBundle.php index c7c6d6b34..f52212ec2 100644 --- a/src/ai-bundle/src/AiBundle.php +++ b/src/ai-bundle/src/AiBundle.php @@ -1068,11 +1068,20 @@ private function processVectorizerConfig(string $name, array $config, ContainerB */ private function processIndexerConfig(int|string $name, array $config, ContainerBuilder $container): void { + $transformers = []; + foreach ($config['transformers'] ?? 
[] as $transformer) { + $transformers[] = new Reference($transformer); + } + $definition = new Definition(Indexer::class, [ + new Reference($config['loader']), new Reference($config['vectorizer']), new Reference($config['store']), + $config['source'], + $transformers, new Reference('logger', ContainerInterface::IGNORE_ON_INVALID_REFERENCE), ]); + $definition->addTag('ai.indexer', ['name' => $name]); $container->setDefinition('ai.indexer.'.$name, $definition); } diff --git a/src/ai-bundle/tests/DependencyInjection/AiBundleTest.php b/src/ai-bundle/tests/DependencyInjection/AiBundleTest.php index 038f34dff..35e340b01 100644 --- a/src/ai-bundle/tests/DependencyInjection/AiBundleTest.php +++ b/src/ai-bundle/tests/DependencyInjection/AiBundleTest.php @@ -19,6 +19,8 @@ use PHPUnit\Framework\TestCase; use Symfony\AI\AiBundle\AiBundle; use Symfony\AI\Platform\Bridge\OpenAi\Embeddings; +use Symfony\AI\Store\Document\Loader\InMemoryLoader; +use Symfony\AI\Store\Document\Transformer\TextTrimTransformer; use Symfony\AI\Store\Document\Vectorizer; use Symfony\Component\Config\Definition\Exception\InvalidConfigurationException; use Symfony\Component\DependencyInjection\ContainerBuilder; @@ -678,6 +680,7 @@ public function testIndexerWithConfiguredVectorizer() ], 'indexer' => [ 'my_indexer' => [ + 'loader' => InMemoryLoader::class, 'vectorizer' => 'ai.vectorizer.my_vectorizer', 'store' => 'ai.store.memory.my_store', ], @@ -691,15 +694,251 @@ public function testIndexerWithConfiguredVectorizer() $indexerDefinition = $container->getDefinition('ai.indexer.my_indexer'); $arguments = $indexerDefinition->getArguments(); - // First argument should be a reference to the vectorizer $this->assertInstanceOf(Reference::class, $arguments[0]); - $this->assertSame('ai.vectorizer.my_vectorizer', (string) $arguments[0]); + $this->assertSame(InMemoryLoader::class, (string) $arguments[0]); + + $this->assertInstanceOf(Reference::class, $arguments[1]); + $this->assertSame('ai.vectorizer.my_vectorizer', 
(string) $arguments[1]); // Should not create model-specific vectorizer when using configured one $this->assertFalse($container->hasDefinition('ai.indexer.my_indexer.vectorizer')); $this->assertFalse($container->hasDefinition('ai.indexer.my_indexer.model')); } + public function testIndexerWithStringSource() + { + $container = $this->buildContainer([ + 'ai' => [ + 'store' => [ + 'memory' => [ + 'my_store' => [], + ], + ], + 'indexer' => [ + 'my_indexer' => [ + 'loader' => InMemoryLoader::class, + 'source' => 'https://example.com/feed.xml', + 'vectorizer' => 'my_vectorizer_service', + 'store' => 'ai.store.memory.my_store', + ], + ], + ], + ]); + + $this->assertTrue($container->hasDefinition('ai.indexer.my_indexer')); + $indexerDefinition = $container->getDefinition('ai.indexer.my_indexer'); + $arguments = $indexerDefinition->getArguments(); + + $this->assertSame('https://example.com/feed.xml', $arguments[3]); + } + + public function testIndexerWithArraySource() + { + $container = $this->buildContainer([ + 'ai' => [ + 'store' => [ + 'memory' => [ + 'my_store' => [], + ], + ], + 'indexer' => [ + 'my_indexer' => [ + 'loader' => InMemoryLoader::class, + 'source' => [ + '/path/to/file1.txt', + '/path/to/file2.txt', + 'https://example.com/feed.xml', + ], + 'vectorizer' => 'my_vectorizer_service', + 'store' => 'ai.store.memory.my_store', + ], + ], + ], + ]); + + $this->assertTrue($container->hasDefinition('ai.indexer.my_indexer')); + $indexerDefinition = $container->getDefinition('ai.indexer.my_indexer'); + $arguments = $indexerDefinition->getArguments(); + + $this->assertIsArray($arguments[3]); + $this->assertCount(3, $arguments[3]); + $this->assertSame([ + '/path/to/file1.txt', + '/path/to/file2.txt', + 'https://example.com/feed.xml', + ], $arguments[3]); + } + + public function testIndexerWithNullSource() + { + $container = $this->buildContainer([ + 'ai' => [ + 'store' => [ + 'memory' => [ + 'my_store' => [], + ], + ], + 'indexer' => [ + 'my_indexer' => [ + 'loader' => 
InMemoryLoader::class, + 'vectorizer' => 'my_vectorizer_service', + 'store' => 'ai.store.memory.my_store', + // source not configured, should default to null + ], + ], + ], + ]); + + $this->assertTrue($container->hasDefinition('ai.indexer.my_indexer')); + $indexerDefinition = $container->getDefinition('ai.indexer.my_indexer'); + $arguments = $indexerDefinition->getArguments(); + + $this->assertNull($arguments[3]); + } + + public function testIndexerWithConfiguredTransformers() + { + $container = $this->buildContainer([ + 'ai' => [ + 'store' => [ + 'memory' => [ + 'my_store' => [], + ], + ], + 'indexer' => [ + 'my_indexer' => [ + 'loader' => InMemoryLoader::class, + 'transformers' => [ + TextTrimTransformer::class, + 'App\CustomTransformer', + ], + 'vectorizer' => 'my_vectorizer_service', + 'store' => 'ai.store.memory.my_store', + ], + ], + ], + ]); + + $this->assertTrue($container->hasDefinition('ai.indexer.my_indexer')); + $indexerDefinition = $container->getDefinition('ai.indexer.my_indexer'); + $arguments = $indexerDefinition->getArguments(); + + $this->assertIsArray($arguments[4]); + $this->assertCount(2, $arguments[4]); + + $this->assertInstanceOf(Reference::class, $arguments[4][0]); + $this->assertSame(TextTrimTransformer::class, (string) $arguments[4][0]); + + $this->assertInstanceOf(Reference::class, $arguments[4][1]); + $this->assertSame('App\CustomTransformer', (string) $arguments[4][1]); + } + + public function testIndexerWithEmptyTransformers() + { + $container = $this->buildContainer([ + 'ai' => [ + 'store' => [ + 'memory' => [ + 'my_store' => [], + ], + ], + 'indexer' => [ + 'my_indexer' => [ + 'loader' => InMemoryLoader::class, + 'transformers' => [], + 'vectorizer' => 'my_vectorizer_service', + 'store' => 'ai.store.memory.my_store', + ], + ], + ], + ]); + + $this->assertTrue($container->hasDefinition('ai.indexer.my_indexer')); + $indexerDefinition = $container->getDefinition('ai.indexer.my_indexer'); + $arguments = 
$indexerDefinition->getArguments(); + + $this->assertSame([], $arguments[4]); + } + + public function testIndexerWithoutTransformers() + { + $container = $this->buildContainer([ + 'ai' => [ + 'store' => [ + 'memory' => [ + 'my_store' => [], + ], + ], + 'indexer' => [ + 'my_indexer' => [ + 'loader' => InMemoryLoader::class, + 'vectorizer' => 'my_vectorizer_service', + 'store' => 'ai.store.memory.my_store', + // transformers not configured, should default to empty array + ], + ], + ], + ]); + + $this->assertTrue($container->hasDefinition('ai.indexer.my_indexer')); + $indexerDefinition = $container->getDefinition('ai.indexer.my_indexer'); + $arguments = $indexerDefinition->getArguments(); + + $this->assertSame([], $arguments[4]); + } + + public function testIndexerWithSourceAndTransformers() + { + $container = $this->buildContainer([ + 'ai' => [ + 'store' => [ + 'memory' => [ + 'my_store' => [], + ], + ], + 'indexer' => [ + 'my_indexer' => [ + 'loader' => InMemoryLoader::class, + 'source' => [ + '/path/to/file1.txt', + '/path/to/file2.txt', + ], + 'transformers' => [ + TextTrimTransformer::class, + ], + 'vectorizer' => 'my_vectorizer_service', + 'store' => 'ai.store.memory.my_store', + ], + ], + ], + ]); + + $this->assertTrue($container->hasDefinition('ai.indexer.my_indexer')); + $indexerDefinition = $container->getDefinition('ai.indexer.my_indexer'); + $arguments = $indexerDefinition->getArguments(); + + $this->assertInstanceOf(Reference::class, $arguments[0]); + $this->assertSame(InMemoryLoader::class, (string) $arguments[0]); + + $this->assertInstanceOf(Reference::class, $arguments[1]); + $this->assertSame('my_vectorizer_service', (string) $arguments[1]); + + $this->assertInstanceOf(Reference::class, $arguments[2]); + $this->assertSame('ai.store.memory.my_store', (string) $arguments[2]); + + $this->assertIsArray($arguments[3]); + $this->assertCount(2, $arguments[3]); + $this->assertSame([ + '/path/to/file1.txt', + '/path/to/file2.txt', + ], $arguments[3]); + + 
$this->assertIsArray($arguments[4]); + $this->assertCount(1, $arguments[4]); + $this->assertInstanceOf(Reference::class, $arguments[4][0]); + $this->assertSame(TextTrimTransformer::class, (string) $arguments[4][0]); + } + private function buildContainer(array $configuration): ContainerBuilder { $container = new ContainerBuilder(); @@ -959,6 +1198,7 @@ private function getFullConfig(): array ], 'indexer' => [ 'my_text_indexer' => [ + 'loader' => InMemoryLoader::class, 'vectorizer' => 'ai.vectorizer.test_vectorizer', 'store' => 'my_azure_search_store_service_id', ], diff --git a/src/store/src/Command/IndexCommand.php b/src/store/src/Command/IndexCommand.php new file mode 100644 index 000000000..97ae1fae9 --- /dev/null +++ b/src/store/src/Command/IndexCommand.php @@ -0,0 +1,109 @@ + + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. + */ + +namespace Symfony\AI\Store\Command; + +use Symfony\AI\Store\Exception\RuntimeException; +use Symfony\AI\Store\IndexerInterface; +use Symfony\Component\Console\Attribute\AsCommand; +use Symfony\Component\Console\Command\Command; +use Symfony\Component\Console\Completion\CompletionInput; +use Symfony\Component\Console\Completion\CompletionSuggestions; +use Symfony\Component\Console\Input\InputArgument; +use Symfony\Component\Console\Input\InputInterface; +use Symfony\Component\Console\Input\InputOption; +use Symfony\Component\Console\Output\OutputInterface; +use Symfony\Component\Console\Style\SymfonyStyle; +use Symfony\Component\DependencyInjection\ServiceLocator; + +/** + * @author Oskar Stark + */ +#[AsCommand( + name: 'ai:store:index', + description: 'Index documents into a store', +)] +final class IndexCommand extends Command +{ + /** + * @param ServiceLocator $indexers + */ + public function __construct( + private readonly ServiceLocator $indexers, + ) { + parent::__construct(); + } + + public function complete(CompletionInput $input, 
CompletionSuggestions $suggestions): void + { + if ($input->mustSuggestArgumentValuesFor('indexer')) { + $suggestions->suggestValues(array_keys($this->indexers->getProvidedServices())); + } + } + + protected function configure(): void + { + $this + ->addArgument('indexer', InputArgument::REQUIRED, 'Name of the indexer to run') + ->addOption('source', 's', InputOption::VALUE_OPTIONAL | InputOption::VALUE_IS_ARRAY, 'Source(s) to index (overrides configured source)') + ->setHelp(<<<'EOF' +The %command.name% command indexes documents into a store using the specified indexer. + +Basic usage: + php %command.full_name% blog + +Override the configured source with a single source: + php %command.full_name% blog --source=/path/to/file.txt + +Override with multiple sources: + php %command.full_name% blog --source=/path/to/file1.txt --source=/path/to/file2.txt +EOF + ) + ; + } + + protected function execute(InputInterface $input, OutputInterface $output): int + { + $io = new SymfonyStyle($input, $output); + + $indexer = $input->getArgument('indexer'); + $sources = $input->getOption('source'); + // Convert array of sources to single source or null + $source = match (true) { + [] === $sources => null, + 1 === \count($sources) => $sources[0], + default => $sources, + }; + + if (!$this->indexers->has($indexer)) { + throw new RuntimeException(\sprintf('The "%s" indexer does not exist.', $indexer)); + } + + $indexerService = $this->indexers->get($indexer); + + // If source override is provided, use withSource to create a new indexer instance + if (null !== $source) { + $indexerService = $indexerService->withSource($source); + } + + $io->title(\sprintf('Indexing documents using "%s" indexer', $indexer)); + + try { + $indexerService->index([]); + + $io->success(\sprintf('Documents indexed successfully using "%s" indexer.', $indexer)); + } catch (\Exception $e) { + throw new RuntimeException(\sprintf('An error occurred while indexing with "%s": ', $indexer).$e->getMessage(), previous: 
$e); + } + + return Command::SUCCESS; + } +} diff --git a/src/store/src/Document/Loader/InMemoryLoader.php b/src/store/src/Document/Loader/InMemoryLoader.php index cfea1c298..5c09a35ac 100644 --- a/src/store/src/Document/Loader/InMemoryLoader.php +++ b/src/store/src/Document/Loader/InMemoryLoader.php @@ -30,7 +30,7 @@ public function __construct( ) { } - public function load(string $source, array $options = []): iterable + public function load(?string $source, array $options = []): iterable { yield from $this->documents; } diff --git a/src/store/src/Document/Loader/TextFileLoader.php b/src/store/src/Document/Loader/TextFileLoader.php index 975437681..bf365ff48 100644 --- a/src/store/src/Document/Loader/TextFileLoader.php +++ b/src/store/src/Document/Loader/TextFileLoader.php @@ -14,6 +14,7 @@ use Symfony\AI\Store\Document\LoaderInterface; use Symfony\AI\Store\Document\Metadata; use Symfony\AI\Store\Document\TextDocument; +use Symfony\AI\Store\Exception\InvalidArgumentException; use Symfony\AI\Store\Exception\RuntimeException; use Symfony\Component\Uid\Uuid; @@ -22,8 +23,12 @@ */ final readonly class TextFileLoader implements LoaderInterface { - public function load(string $source, array $options = []): iterable + public function load(?string $source, array $options = []): iterable { + if (null === $source) { + throw new InvalidArgumentException('TextFileLoader requires a file path as source, null given.'); + } + if (!is_file($source)) { throw new RuntimeException(\sprintf('File "%s" does not exist.', $source)); } diff --git a/src/store/src/Document/LoaderInterface.php b/src/store/src/Document/LoaderInterface.php index 549283d79..339ebb6b2 100644 --- a/src/store/src/Document/LoaderInterface.php +++ b/src/store/src/Document/LoaderInterface.php @@ -17,10 +17,10 @@ interface LoaderInterface { /** - * @param string $source Identifier for the loader to load the documents from, e.g. file path, folder, or URL. 
/**
 * Replaces specified text within document content.
 *
 * The search and replace strings passed to the constructor act as defaults; a
 * single transform() call can override either of them through the
 * OPTION_SEARCH / OPTION_REPLACE options.
 *
 * @author Oskar Stark
 */
final readonly class TextReplaceTransformer implements TransformerInterface
{
    public const OPTION_SEARCH = 'search';
    public const OPTION_REPLACE = 'replace';

    /**
     * @throws InvalidArgumentException When search and replace are identical (this includes leaving both at their empty default)
     */
    public function __construct(
        private string $search = '',
        private string $replace = '',
    ) {
        self::validate($search, $replace);
    }

    /**
     * Resolves the effective search/replace pair and returns a lazy iterable of the rewritten documents.
     *
     * @param array{search?: string, replace?: string} $options
     *
     * @throws InvalidArgumentException When the effective search and replace strings are identical
     */
    public function transform(iterable $documents, array $options = []): iterable
    {
        $search = $options[self::OPTION_SEARCH] ?? $this->search;
        $replace = $options[self::OPTION_REPLACE] ?? $this->replace;

        // This method is deliberately NOT a generator: a generator body only runs once iteration
        // starts, which would postpone this check until the first document is consumed.
        self::validate($search, $replace);

        return self::replaceInDocuments($documents, $search, $replace);
    }

    /**
     * Lazily yields each document with every occurrence of $search replaced by $replace (case-sensitive).
     */
    private static function replaceInDocuments(iterable $documents, string $search, string $replace): iterable
    {
        foreach ($documents as $document) {
            yield $document->withContent(str_replace($search, $replace, $document->content));
        }
    }

    /**
     * @throws InvalidArgumentException When both strings are identical
     */
    private static function validate(string $search, string $replace): void
    {
        if ($search === $replace) {
            throw new InvalidArgumentException('Search and replace strings must be different.');
        }
    }
}
/**
 * Trims whitespace from document content.
 *
 * @author Oskar Stark
 */
final readonly class TextTrimTransformer implements TransformerInterface
{
    /**
     * Lazily yields, for each incoming document, the result of withContent()
     * called with the document's content stripped of leading and trailing
     * whitespace (PHP trim() defaults). The $options argument is not used.
     */
    public function transform(iterable $documents, array $options = []): iterable
    {
        foreach ($documents as $textDocument) {
            $trimmedContent = trim($textDocument->content);

            yield $textDocument->withContent($trimmedContent);
        }
    }
}
[] : (array) $source; } - /** - * @param TextDocument|iterable $documents - * @param int $chunkSize number of documents to vectorize and store in one batch - */ - public function index(TextDocument|iterable $documents, int $chunkSize = 50): void + public function withSource(string|array $source): self + { + return new self($this->loader, $this->vectorizer, $this->store, $source, $this->transformers, $this->logger); + } + + public function index(array $options = []): void { - if ($documents instanceof TextDocument) { - $documents = [$documents]; + $this->logger->debug('Starting document processing', ['sources' => $this->sources, 'options' => $options]); + + $documents = []; + if ([] === $this->sources) { + // No specific source provided, load with null + $documents = $this->loadSource(null); + } else { + foreach ($this->sources as $singleSource) { + $documents = array_merge($documents, $this->loadSource($singleSource)); + } } + if ([] === $documents) { + $this->logger->debug('No documents to process', ['sources' => $this->sources]); + + return; + } + + // Transform documents through all transformers + foreach ($this->transformers as $transformer) { + $documents = $transformer->transform($documents); + } + + // Vectorize and store documents in chunks + $chunkSize = $options['chunk_size'] ?? 50; $counter = 0; $chunk = []; foreach ($documents as $document) { @@ -50,10 +88,24 @@ public function index(TextDocument|iterable $documents, int $chunkSize = 50): vo } } - if (\count($chunk) > 0) { + // Handle remaining documents + if ([] !== $chunk) { $this->store->add(...$this->vectorizer->vectorizeTextDocuments($chunk)); } - $this->logger->debug(0 === $counter ? 
'No documents to index' : \sprintf('Indexed %d documents', $counter)); + $this->logger->debug('Document processing completed', ['total_documents' => $counter]); + } + + /** + * @return TextDocument[] + */ + private function loadSource(?string $source): array + { + $documents = []; + foreach ($this->loader->load($source) as $document) { + $documents[] = $document; + } + + return $documents; } } diff --git a/src/store/src/IndexerInterface.php b/src/store/src/IndexerInterface.php index 18be074ac..fcb27494f 100644 --- a/src/store/src/IndexerInterface.php +++ b/src/store/src/IndexerInterface.php @@ -11,18 +11,24 @@ namespace Symfony\AI\Store; -use Symfony\AI\Store\Document\TextDocument; - /** - * Converts a collection of TextDocuments into VectorDocuments and pushes them to a store implementation. + * Handles the complete document processing pipeline: load → transform → vectorize → store. * * @author Oskar Stark */ interface IndexerInterface { /** - * @param TextDocument|iterable $documents - * @param int $chunkSize number of documents to vectorize and store in one batch + * Process sources through the complete document pipeline: load → transform → vectorize → store. + * + * @param array{chunk_size?: int} $options Processing options + */ + public function index(array $options = []): void; + + /** + * Create a new instance with a different source. + * + * @param string|array $source Source identifier (file path, URL, etc.) 
or array of sources */ - public function index(TextDocument|iterable $documents, int $chunkSize = 50): void; + public function withSource(string|array $source): self; } diff --git a/src/store/tests/Document/Loader/InMemoryLoaderTest.php b/src/store/tests/Document/Loader/InMemoryLoaderTest.php index fcfc0784e..6a3b989a2 100644 --- a/src/store/tests/Document/Loader/InMemoryLoaderTest.php +++ b/src/store/tests/Document/Loader/InMemoryLoaderTest.php @@ -27,9 +27,9 @@ final class InMemoryLoaderTest extends TestCase public function testLoadWithEmptyDocuments() { $loader = new InMemoryLoader(); - $documents = iterator_to_array($loader->load('ignored-source')); + $documents = iterator_to_array($loader->load(null)); - $this->assertEmpty($documents); + $this->assertSame([], $documents); } public function testLoadWithSingleDocument() @@ -37,7 +37,7 @@ public function testLoadWithSingleDocument() $document = new TextDocument(Uuid::v4(), 'This is test content'); $loader = new InMemoryLoader([$document]); - $documents = iterator_to_array($loader->load('ignored-source')); + $documents = iterator_to_array($loader->load(null)); $this->assertCount(1, $documents); $this->assertSame($document, $documents[0]); @@ -50,7 +50,7 @@ public function testLoadWithMultipleDocuments() $document2 = new TextDocument(Uuid::v4(), 'Second document', new Metadata(['type' => 'test'])); $loader = new InMemoryLoader([$document1, $document2]); - $documents = iterator_to_array($loader->load('ignored-source')); + $documents = iterator_to_array($loader->load(null)); $this->assertCount(2, $documents); $this->assertSame($document1, $documents[0]); @@ -62,18 +62,14 @@ public function testLoadWithMultipleDocuments() public function testLoadIgnoresSourceParameter() { - $document = new TextDocument(Uuid::v4(), 'test content'); + $document = new TextDocument(Uuid::v4(), 'Test content'); $loader = new InMemoryLoader([$document]); - $documents1 = iterator_to_array($loader->load('source1')); - $documents2 = 
iterator_to_array($loader->load('source2')); - $documents3 = iterator_to_array($loader->load('any-source')); + // Source parameter should be ignored - same result regardless of value + $documentsWithNull = iterator_to_array($loader->load(null)); + $documentsWithString = iterator_to_array($loader->load('ignored-source')); - $this->assertCount(1, $documents1); - $this->assertCount(1, $documents2); - $this->assertCount(1, $documents3); - $this->assertSame($document, $documents1[0]); - $this->assertSame($document, $documents2[0]); - $this->assertSame($document, $documents3[0]); + $this->assertSame($documentsWithNull, $documentsWithString); + $this->assertSame($document, $documentsWithNull[0]); } } diff --git a/src/store/tests/Document/Loader/TextFileLoaderTest.php b/src/store/tests/Document/Loader/TextFileLoaderTest.php index b39a710de..76275da53 100644 --- a/src/store/tests/Document/Loader/TextFileLoaderTest.php +++ b/src/store/tests/Document/Loader/TextFileLoaderTest.php @@ -15,11 +15,22 @@ use PHPUnit\Framework\TestCase; use Symfony\AI\Store\Document\Loader\TextFileLoader; use Symfony\AI\Store\Document\TextDocument; +use Symfony\AI\Store\Exception\InvalidArgumentException; use Symfony\AI\Store\Exception\RuntimeException; #[CoversClass(TextFileLoader::class)] final class TextFileLoaderTest extends TestCase { + public function testLoadWithNullSource() + { + $loader = new TextFileLoader(); + + $this->expectException(InvalidArgumentException::class); + $this->expectExceptionMessage('TextFileLoader requires a file path as source, null given.'); + + iterator_to_array($loader->load(null)); + } + public function testLoadWithInvalidSource() { $loader = new TextFileLoader(); diff --git a/src/store/tests/Document/Transformer/TextReplaceTransformerTest.php b/src/store/tests/Document/Transformer/TextReplaceTransformerTest.php new file mode 100644 index 000000000..957305948 --- /dev/null +++ b/src/store/tests/Document/Transformer/TextReplaceTransformerTest.php @@ -0,0 +1,196 @@ + 
+ * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. + */ + +namespace Symfony\AI\Store\Tests\Document\Transformer; + +use PHPUnit\Framework\Attributes\CoversClass; +use PHPUnit\Framework\TestCase; +use Symfony\AI\Store\Document\Metadata; +use Symfony\AI\Store\Document\TextDocument; +use Symfony\AI\Store\Document\Transformer\TextReplaceTransformer; +use Symfony\AI\Store\Exception\InvalidArgumentException; +use Symfony\Component\Uid\Uuid; + +/** + * @author Oskar Stark + */ +#[CoversClass(TextReplaceTransformer::class)] +final class TextReplaceTransformerTest extends TestCase +{ + public function testReplaceWithConstructorParameters() + { + $transformer = new TextReplaceTransformer('foo', 'bar'); + $document = new TextDocument(Uuid::v4(), 'foo is foo'); + + $result = iterator_to_array($transformer->transform([$document])); + + $this->assertCount(1, $result); + $this->assertSame('bar is bar', $result[0]->content); + } + + public function testReplaceWithOptions() + { + $transformer = new TextReplaceTransformer('initial', 'value'); + $document = new TextDocument(Uuid::v4(), 'hello world'); + + $result = iterator_to_array($transformer->transform([$document], [ + TextReplaceTransformer::OPTION_SEARCH => 'hello', + TextReplaceTransformer::OPTION_REPLACE => 'goodbye', + ])); + + $this->assertCount(1, $result); + $this->assertSame('goodbye world', $result[0]->content); + } + + public function testOptionsOverrideConstructorParameters() + { + $transformer = new TextReplaceTransformer('foo', 'bar'); + $document = new TextDocument(Uuid::v4(), 'foo hello'); + + $result = iterator_to_array($transformer->transform([$document], [ + TextReplaceTransformer::OPTION_SEARCH => 'hello', + TextReplaceTransformer::OPTION_REPLACE => 'world', + ])); + + $this->assertCount(1, $result); + $this->assertSame('foo world', $result[0]->content); + } + + public function testReplaceMultipleOccurrences() + { + $transformer 
= new TextReplaceTransformer('a', 'b'); + $document = new TextDocument(Uuid::v4(), 'a a a'); + + $result = iterator_to_array($transformer->transform([$document])); + + $this->assertCount(1, $result); + $this->assertSame('b b b', $result[0]->content); + } + + public function testReplaceWithEmptyString() + { + $transformer = new TextReplaceTransformer('remove', ''); + $document = new TextDocument(Uuid::v4(), 'remove this word'); + + $result = iterator_to_array($transformer->transform([$document])); + + $this->assertCount(1, $result); + $this->assertSame(' this word', $result[0]->content); + } + + public function testReplacePreservesMetadata() + { + $metadata = new Metadata(['key' => 'value']); + $transformer = new TextReplaceTransformer('old', 'new'); + $document = new TextDocument(Uuid::v4(), 'old text', $metadata); + + $result = iterator_to_array($transformer->transform([$document])); + + $this->assertCount(1, $result); + $this->assertSame('new text', $result[0]->content); + $this->assertSame($metadata, $result[0]->metadata); + } + + public function testReplacePreservesDocumentId() + { + $id = Uuid::v4(); + $transformer = new TextReplaceTransformer('old', 'new'); + $document = new TextDocument($id, 'old text'); + + $result = iterator_to_array($transformer->transform([$document])); + + $this->assertCount(1, $result); + $this->assertSame($id, $result[0]->id); + } + + public function testReplaceProcessesMultipleDocuments() + { + $transformer = new TextReplaceTransformer('x', 'y'); + $documents = [ + new TextDocument(Uuid::v4(), 'x marks the spot'), + new TextDocument(Uuid::v4(), 'find x here'), + new TextDocument(Uuid::v4(), 'no match'), + ]; + + $result = iterator_to_array($transformer->transform($documents)); + + $this->assertCount(3, $result); + $this->assertSame('y marks the spot', $result[0]->content); + $this->assertSame('find y here', $result[1]->content); + $this->assertSame('no match', $result[2]->content); + } + + public function testReplaceCaseSensitive() + 
{ + $transformer = new TextReplaceTransformer('Hello', 'Goodbye'); + $document = new TextDocument(Uuid::v4(), 'Hello hello HELLO'); + + $result = iterator_to_array($transformer->transform([$document])); + + $this->assertCount(1, $result); + $this->assertSame('Goodbye hello HELLO', $result[0]->content); + } + + public function testReplaceHandlesNoMatch() + { + $transformer = new TextReplaceTransformer('notfound', 'replacement'); + $document = new TextDocument(Uuid::v4(), 'original text'); + + $result = iterator_to_array($transformer->transform([$document])); + + $this->assertCount(1, $result); + $this->assertSame('original text', $result[0]->content); + } + + public function testConstructorThrowsExceptionWhenSearchEqualsReplace() + { + $this->expectException(InvalidArgumentException::class); + $this->expectExceptionMessage('Search and replace strings must be different.'); + + new TextReplaceTransformer('same', 'same'); + } + + public function testTransformThrowsExceptionWhenSearchEqualsReplaceInOptions() + { + $transformer = new TextReplaceTransformer('initial', 'value'); + $document = new TextDocument(Uuid::v4(), 'text'); + + $this->expectException(InvalidArgumentException::class); + $this->expectExceptionMessage('Search and replace strings must be different.'); + + iterator_to_array($transformer->transform([$document], [ + TextReplaceTransformer::OPTION_SEARCH => 'same', + TextReplaceTransformer::OPTION_REPLACE => 'same', + ])); + } + + public function testEmptySearchAndReplaceThrowsException() + { + $this->expectException(InvalidArgumentException::class); + $this->expectExceptionMessage('Search and replace strings must be different.'); + + new TextReplaceTransformer('', ''); + } + + public function testPartialOptionsUseConstructorDefaults() + { + $transformer = new TextReplaceTransformer('default', 'replacement'); + $document = new TextDocument(Uuid::v4(), 'default text'); + + // Only provide search option, should use constructor's replace value + $result = 
/**
 * @author Oskar Stark
 */
#[CoversClass(TextTrimTransformer::class)]
final class TextTrimTransformerTest extends TestCase
{
    /**
     * Leading and trailing whitespace of any kind is removed; whitespace inside the text is kept.
     */
    #[TestWith([' text with spaces ', 'text with spaces'])]
    #[TestWith(["\n\ntext with newlines\n\n", 'text with newlines'])]
    #[TestWith(["\t\ttext with tabs\t\t", 'text with tabs'])]
    #[TestWith([' text with middle spaces ', 'text with middle spaces'])]
    #[TestWith(['already trimmed', 'already trimmed'])]
    #[TestWith([' mixed whitespace ', 'mixed whitespace'])]
    #[TestWith(["\r\ncarriage return and newline\r\n", 'carriage return and newline'])]
    public function testTrim(string $input, string $expected)
    {
        $trimmed = self::runTransformer(new TextDocument(Uuid::v4(), $input));

        $this->assertCount(1, $trimmed);
        $this->assertSame($expected, $trimmed[0]->content);
    }

    public function testTrimHandlesOnlyWhitespace()
    {
        // A whitespace-only document cannot be built in the first place, so "trim to empty string"
        // is not reachable through the transformer; this pins the TextDocument guard instead.
        $this->expectException(InvalidArgumentException::class);
        $this->expectExceptionMessage('The content shall not be an empty string.');

        new TextDocument(Uuid::v4(), ' ');
    }

    public function testTrimProcessesMultipleDocuments()
    {
        $trimmed = self::runTransformer(
            new TextDocument(Uuid::v4(), ' first '),
            new TextDocument(Uuid::v4(), ' second '),
            new TextDocument(Uuid::v4(), ' third '),
        );

        $this->assertCount(3, $trimmed);
        $this->assertSame('first', $trimmed[0]->content);
        $this->assertSame('second', $trimmed[1]->content);
        $this->assertSame('third', $trimmed[2]->content);
    }

    public function testTrimPreservesMetadata()
    {
        $metadata = new Metadata(['key' => 'value']);

        $trimmed = self::runTransformer(new TextDocument(Uuid::v4(), ' text ', $metadata));

        $this->assertCount(1, $trimmed);
        $this->assertSame('text', $trimmed[0]->content);
        $this->assertSame($metadata, $trimmed[0]->metadata);
    }

    public function testTrimPreservesDocumentId()
    {
        $id = Uuid::v4();

        $trimmed = self::runTransformer(new TextDocument($id, ' text '));

        $this->assertCount(1, $trimmed);
        $this->assertSame($id, $trimmed[0]->id);
    }

    /**
     * Runs a fresh TextTrimTransformer over the given documents and collects everything it yields.
     *
     * @return TextDocument[]
     */
    private static function runTransformer(TextDocument ...$documents): array
    {
        return iterator_to_array((new TextTrimTransformer())->transform($documents));
    }
}
PHPUnit\Framework\TestCase; -use Psr\Log\LoggerInterface; use Symfony\AI\Platform\Bridge\OpenAi\Embeddings; use Symfony\AI\Platform\Message\ToolCallMessage; use Symfony\AI\Platform\Platform; @@ -23,6 +22,7 @@ use Symfony\AI\Platform\Result\ToolCall; use Symfony\AI\Platform\Result\VectorResult; use Symfony\AI\Platform\Vector\Vector; +use Symfony\AI\Store\Document\Loader\InMemoryLoader; use Symfony\AI\Store\Document\Metadata; use Symfony\AI\Store\Document\TextDocument; use Symfony\AI\Store\Document\VectorDocument; @@ -34,6 +34,7 @@ #[CoversClass(Indexer::class)] #[Medium] +#[UsesClass(InMemoryLoader::class)] #[UsesClass(TextDocument::class)] #[UsesClass(Vector::class)] #[UsesClass(VectorDocument::class)] @@ -49,10 +50,11 @@ public function testIndexSingleDocument() { $document = new TextDocument($id = Uuid::v4(), 'Test content'); $vector = new Vector([0.1, 0.2, 0.3]); + $loader = new InMemoryLoader([$document]); $vectorizer = new Vectorizer(PlatformTestHandler::createPlatform(new VectorResult($vector)), new Embeddings()); - $indexer = new Indexer($vectorizer, $store = new TestStore()); - $indexer->index($document); + $indexer = new Indexer($loader, $vectorizer, $store = new TestStore()); + $indexer->index(); $this->assertCount(1, $store->documents); $this->assertInstanceOf(VectorDocument::class, $store->documents[0]); @@ -62,12 +64,11 @@ public function testIndexSingleDocument() public function testIndexEmptyDocumentList() { - $logger = self::createMock(LoggerInterface::class); - $logger->expects($this->once())->method('debug')->with('No documents to index'); + $loader = new InMemoryLoader([]); $vectorizer = new Vectorizer(PlatformTestHandler::createPlatform(), new Embeddings()); - $indexer = new Indexer($vectorizer, $store = new TestStore(), $logger); - $indexer->index([]); + $indexer = new Indexer($loader, $vectorizer, $store = new TestStore()); + $indexer->index(); $this->assertSame([], $store->documents); } @@ -77,10 +78,11 @@ public function 
/**
 * withSource() must return a distinct indexer that loads from the new source
 * while the original keeps loading from the source it was constructed with.
 */
public function testWithSource()
{
    $document = new TextDocument(Uuid::v4(), 'Document 1');
    $vector = new Vector([0.1, 0.2, 0.3]);

    // Recording loader: InMemoryLoader ignores the source, so it cannot show which source an indexer asks for.
    $loader = new class([$document]) implements \Symfony\AI\Store\Document\LoaderInterface {
        /** @var array<int, string|null> */
        public array $requestedSources = [];

        /**
         * @param TextDocument[] $documents
         */
        public function __construct(private readonly array $documents)
        {
        }

        public function load(?string $source, array $options = []): iterable
        {
            $this->requestedSources[] = $source;

            return $this->documents;
        }
    };
    $vectorizer = new Vectorizer(PlatformTestHandler::createPlatform(new VectorResult($vector)), new Embeddings());

    $indexer = new Indexer($loader, $vectorizer, $store = new TestStore(), 'source1');

    // Derive before indexing so a mutation of the original instance would be caught below.
    $indexerWithNewSource = $indexer->withSource('source2');

    $this->assertNotSame($indexer, $indexerWithNewSource);

    // The original indexer still uses the source it was constructed with.
    $indexer->index();
    $this->assertSame(['source1'], $loader->requestedSources);
    $this->assertCount(1, $store->documents);

    // The derived indexer is the one actually exercised here, not a hand-built look-alike.
    $indexerWithNewSource->index();
    $this->assertSame(['source1', 'source2'], $loader->requestedSources);
}

/**
 * withSource() with a list of sources must make the derived indexer call the
 * loader once per source, in order, without affecting the original indexer.
 */
public function testWithSourceArray()
{
    $document1 = new TextDocument(Uuid::v4(), 'Document 1');
    $document2 = new TextDocument(Uuid::v4(), 'Document 2');
    $vector = new Vector([0.1, 0.2, 0.3]);

    // Recording loader: returns all documents for every source and remembers each requested source.
    $loader = new class([$document1, $document2]) implements \Symfony\AI\Store\Document\LoaderInterface {
        /** @var array<int, string|null> */
        public array $requestedSources = [];

        /**
         * @param TextDocument[] $documents
         */
        public function __construct(private readonly array $documents)
        {
        }

        public function load(?string $source, array $options = []): iterable
        {
            $this->requestedSources[] = $source;

            return $this->documents;
        }
    };
    $vectorizer = new Vectorizer(PlatformTestHandler::createPlatform(new VectorResult($vector)), new Embeddings());

    $indexer = new Indexer($loader, $vectorizer, $store = new TestStore(), 'source1');
    $indexerWithMultipleSources = $indexer->withSource(['source2', 'source3']);

    $this->assertNotSame($indexer, $indexerWithMultipleSources);

    // Single configured source: one load call, both documents indexed.
    $indexer->index();
    $this->assertSame(['source1'], $loader->requestedSources);
    $this->assertCount(2, $store->documents);

    // Array override: the derived indexer loads once per source, in the given order.
    $indexerWithMultipleSources->index();
    $this->assertSame(['source1', 'source2', 'source3'], $loader->requestedSources);
}