From e56e022ed79a7a89eaf8b56ae5782b4786122ff1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gr=C3=A9goire=20Pineau?= Date: Wed, 26 Nov 2025 15:39:56 +0100 Subject: [PATCH] [Store] Do not allow to store state in Indexer type Life is always easier without storing state in a service --- demo/config/packages/ai.yaml | 8 +- docs/components/store.rst | 2 +- docs/cookbook/rag-implementation.rst | 4 +- examples/indexer/index-file-loader.php | 11 +- examples/indexer/index-inmemory-loader.php | 3 +- examples/indexer/index-rss-loader.php | 9 +- examples/indexer/index-with-filters.php | 3 +- examples/memory/mariadb.php | 2 +- examples/ollama/indexer.php | 11 +- examples/ollama/rag.php | 2 +- examples/rag/cache.php | 2 +- examples/rag/chromadb.php | 2 +- examples/rag/clickhouse.php | 2 +- examples/rag/cloudflare.php | 2 +- examples/rag/in-memory.php | 2 +- examples/rag/manticore.php | 2 +- examples/rag/mariadb-gemini.php | 2 +- examples/rag/mariadb-openai.php | 2 +- examples/rag/meilisearch-hybrid.php | 2 +- examples/rag/meilisearch.php | 2 +- examples/rag/milvus.php | 2 +- examples/rag/mongodb.php | 2 +- examples/rag/neo4j.php | 2 +- examples/rag/pinecone.php | 2 +- examples/rag/postgres.php | 2 +- examples/rag/qdrant.php | 2 +- examples/rag/redis.php | 2 +- examples/rag/supabase.php | 2 +- examples/rag/surrealdb.php | 2 +- examples/rag/typesense.php | 2 +- examples/rag/weaviate.php | 2 +- examples/retriever/basic.php | 11 +- examples/retriever/movies.php | 2 +- src/ai-bundle/config/options.php | 4 - src/ai-bundle/src/AiBundle.php | 1 - .../DependencyInjection/AiBundleTest.php | 261 ++---------------- src/store/src/Command/IndexCommand.php | 6 +- .../src/Document/Loader/RssFeedLoader.php | 3 + .../src/Document/Loader/TextFileLoader.php | 7 + src/store/src/Indexer.php | 59 ++-- src/store/src/IndexerInterface.php | 12 +- src/store/tests/IndexerTest.php | 82 ++---- 42 files changed, 138 insertions(+), 407 deletions(-) diff --git a/demo/config/packages/ai.yaml 
b/demo/config/packages/ai.yaml index 06499c33e..6fea63fa0 100644 --- a/demo/config/packages/ai.yaml +++ b/demo/config/packages/ai.yaml @@ -91,8 +91,7 @@ ai: model: 'text-embedding-ada-002' indexer: blog: - loader: 'Symfony\AI\Store\Document\Loader\RssFeedLoader' - source: 'https://feeds.feedburner.com/symfony/blog' + loader: app.loader.blog filters: - 'app.filter.week_of_symfony' transformers: @@ -123,3 +122,8 @@ services: arguments: $needle: 'Week of Symfony' $caseSensitive: false + + app.loader.blog: + class: 'Symfony\AI\Store\Document\Loader\RssFeedLoader' + arguments: + $defaultSource: 'https://feeds.feedburner.com/symfony/blog' diff --git a/docs/components/store.rst b/docs/components/store.rst index 7b518944c..126d5cc57 100644 --- a/docs/components/store.rst +++ b/docs/components/store.rst @@ -31,7 +31,7 @@ used vector store:: $indexer = new Indexer($platform, $model, $store); $document = new TextDocument('This is a sample document.'); - $indexer->index($document); + $indexer->loadAndIndex($document); You can find more advanced usage in combination with an Agent using the store for RAG in the examples folder. 
diff --git a/docs/cookbook/rag-implementation.rst b/docs/cookbook/rag-implementation.rst index 8a208abb1..98ccd81b8 100644 --- a/docs/cookbook/rag-implementation.rst +++ b/docs/cookbook/rag-implementation.rst @@ -97,7 +97,7 @@ Use a vectorizer to convert documents into embeddings and store them:: $vectorizer, $store ); - $indexer->index($documents); + $indexer->loadAndIndex(); The indexer handles: @@ -324,7 +324,7 @@ Index documents in batches for better performance:: $batchSize = 100; foreach (array_chunk($documents, $batchSize) as $batch) { - $indexer->index($batch); + $indexer->loadAndIndex(options: $batch); } Caching Embeddings diff --git a/examples/indexer/index-file-loader.php b/examples/indexer/index-file-loader.php index 37c4e1b34..36dfc8f82 100644 --- a/examples/indexer/index-file-loader.php +++ b/examples/indexer/index-file-loader.php @@ -26,18 +26,17 @@ loader: new TextFileLoader(), vectorizer: $vectorizer, store: $store, - source: [ - dirname(__DIR__, 2).'/fixtures/movies/gladiator.md', - dirname(__DIR__, 2).'/fixtures/movies/inception.md', - dirname(__DIR__, 2).'/fixtures/movies/jurassic-park.md', - ], transformers: [ new TextReplaceTransformer(search: '## Plot', replace: '## Synopsis'), new TextSplitTransformer(chunkSize: 500, overlap: 100), ], ); -$indexer->index(); +$indexer->loadAndIndex([ + dirname(__DIR__, 2).'/fixtures/movies/gladiator.md', + dirname(__DIR__, 2).'/fixtures/movies/inception.md', + dirname(__DIR__, 2).'/fixtures/movies/jurassic-park.md', +]); $vector = $vectorizer->vectorize('Roman gladiator revenge'); $results = $store->query($vector); diff --git a/examples/indexer/index-inmemory-loader.php b/examples/indexer/index-inmemory-loader.php index 083db0ba8..99a76448a 100644 --- a/examples/indexer/index-inmemory-loader.php +++ b/examples/indexer/index-inmemory-loader.php @@ -42,13 +42,12 @@ loader: new InMemoryLoader($documents), vectorizer: $vectorizer, store: $store, - source: null, transformers: [ new TextSplitTransformer(chunkSize: 
100, overlap: 20), ], ); -$indexer->index(); +$indexer->loadAndIndex(); $vector = $vectorizer->vectorize('machine learning artificial intelligence'); $results = $store->query($vector); diff --git a/examples/indexer/index-rss-loader.php b/examples/indexer/index-rss-loader.php index f9b0e48ec..f9ff2c350 100644 --- a/examples/indexer/index-rss-loader.php +++ b/examples/indexer/index-rss-loader.php @@ -26,16 +26,15 @@ loader: new RssFeedLoader(HttpClient::create()), vectorizer: $vectorizer, store: $store, - source: [ - 'https://feeds.feedburner.com/symfony/blog', - 'https://www.tagesschau.de/index~rss2.xml', - ], transformers: [ new TextSplitTransformer(chunkSize: 500, overlap: 100), ], ); -$indexer->index(); +$indexer->loadAndIndex([ + 'https://feeds.feedburner.com/symfony/blog', + 'https://www.tagesschau.de/index~rss2.xml', +]); $vector = $vectorizer->vectorize('Week of Symfony'); $results = $store->query($vector); diff --git a/examples/indexer/index-with-filters.php b/examples/indexer/index-with-filters.php index 34f76b076..66825e17c 100644 --- a/examples/indexer/index-with-filters.php +++ b/examples/indexer/index-with-filters.php @@ -60,14 +60,13 @@ loader: new InMemoryLoader($documents), vectorizer: $vectorizer, store: $store, - source: null, filters: $filters, transformers: [ new TextTrimTransformer(), ], ); -$indexer->index(); +$indexer->loadAndIndex(); $vector = $vectorizer->vectorize('technology artificial intelligence'); $results = $store->query($vector); diff --git a/examples/memory/mariadb.php b/examples/memory/mariadb.php index d3641098c..66473e08f 100644 --- a/examples/memory/mariadb.php +++ b/examples/memory/mariadb.php @@ -58,7 +58,7 @@ $platform = PlatformFactory::create(env('OPENAI_API_KEY'), http_client()); $vectorizer = new Vectorizer($platform, $embeddings = 'text-embedding-3-small'); $indexer = new Indexer(new InMemoryLoader($documents), $vectorizer, $store, logger: logger()); -$indexer->index($documents); +$indexer->loadAndIndex(); // Execute a 
chat call that is utilizing the memory $embeddingsModel = $platform->getModelCatalog()->getModel($embeddings); diff --git a/examples/ollama/indexer.php b/examples/ollama/indexer.php index 2eb4d51df..ae7b6fb7d 100644 --- a/examples/ollama/indexer.php +++ b/examples/ollama/indexer.php @@ -26,18 +26,17 @@ loader: new TextFileLoader(), vectorizer: $vectorizer, store: $store, - source: [ - dirname(__DIR__, 2).'/fixtures/movies/gladiator.md', - dirname(__DIR__, 2).'/fixtures/movies/inception.md', - dirname(__DIR__, 2).'/fixtures/movies/jurassic-park.md', - ], transformers: [ new TextReplaceTransformer(search: '## Plot', replace: '## Synopsis'), new TextSplitTransformer(chunkSize: 500, overlap: 100), ], ); -$indexer->index(); +$indexer->loadAndIndex([ + dirname(__DIR__, 2).'/fixtures/movies/gladiator.md', + dirname(__DIR__, 2).'/fixtures/movies/inception.md', + dirname(__DIR__, 2).'/fixtures/movies/jurassic-park.md', +]); $vector = $vectorizer->vectorize('Roman gladiator revenge'); $results = $store->query($vector); diff --git a/examples/ollama/rag.php b/examples/ollama/rag.php index 1946d291e..59a4ec2b4 100644 --- a/examples/ollama/rag.php +++ b/examples/ollama/rag.php @@ -44,7 +44,7 @@ $platform = PlatformFactory::create(env('OLLAMA_HOST_URL'), http_client()); $vectorizer = new Vectorizer($platform, env('OLLAMA_EMBEDDINGS'), logger()); $indexer = new Indexer(new InMemoryLoader($documents), $vectorizer, $store, logger: logger()); -$indexer->index($documents); +$indexer->loadAndIndex(); $similaritySearch = new SimilaritySearch($vectorizer, $store); $toolbox = new Toolbox([$similaritySearch], logger: logger()); diff --git a/examples/rag/cache.php b/examples/rag/cache.php index a48aaada3..9a33c2695 100644 --- a/examples/rag/cache.php +++ b/examples/rag/cache.php @@ -45,7 +45,7 @@ $platform = PlatformFactory::create(env('OPENAI_API_KEY'), http_client()); $vectorizer = new Vectorizer($platform, 'text-embedding-3-small', logger()); $indexer = new Indexer(new 
InMemoryLoader($documents), $vectorizer, $store, logger: logger()); -$indexer->index($documents); +$indexer->loadAndIndex(); $similaritySearch = new SimilaritySearch($vectorizer, $store); $toolbox = new Toolbox([$similaritySearch], logger: logger()); diff --git a/examples/rag/chromadb.php b/examples/rag/chromadb.php index ebdb684d0..5ebb46b73 100644 --- a/examples/rag/chromadb.php +++ b/examples/rag/chromadb.php @@ -52,7 +52,7 @@ $platform = PlatformFactory::create(env('OPENAI_API_KEY'), http_client()); $vectorizer = new Vectorizer($platform, 'text-embedding-3-small', logger()); $indexer = new Indexer(new InMemoryLoader($documents), $vectorizer, $store, logger: logger()); -$indexer->index($documents); +$indexer->loadAndIndex(); $similaritySearch = new SimilaritySearch($vectorizer, $store); $toolbox = new Toolbox([$similaritySearch], logger: logger()); diff --git a/examples/rag/clickhouse.php b/examples/rag/clickhouse.php index 52d9c8f2e..653a3065b 100644 --- a/examples/rag/clickhouse.php +++ b/examples/rag/clickhouse.php @@ -52,7 +52,7 @@ $platform = PlatformFactory::create(env('OPENAI_API_KEY'), http_client()); $vectorizer = new Vectorizer($platform, 'text-embedding-3-small', logger()); $indexer = new Indexer(new InMemoryLoader($documents), $vectorizer, $store, logger: logger()); -$indexer->index($documents); +$indexer->loadAndIndex(); $similaritySearch = new SimilaritySearch($vectorizer, $store); $toolbox = new Toolbox([$similaritySearch], logger: logger()); diff --git a/examples/rag/cloudflare.php b/examples/rag/cloudflare.php index f51e76637..7b5d2c5a7 100644 --- a/examples/rag/cloudflare.php +++ b/examples/rag/cloudflare.php @@ -52,7 +52,7 @@ $platform = PlatformFactory::create(env('OPENAI_API_KEY'), http_client()); $vectorizer = new Vectorizer($platform, 'text-embedding-3-small', logger()); $indexer = new Indexer(new InMemoryLoader($documents), $vectorizer, $store, logger: logger()); -$indexer->index($documents); +$indexer->loadAndIndex(); $similaritySearch = 
new SimilaritySearch($vectorizer, $store); $toolbox = new Toolbox([$similaritySearch], logger: logger()); diff --git a/examples/rag/in-memory.php b/examples/rag/in-memory.php index edc75a997..ad6694a01 100644 --- a/examples/rag/in-memory.php +++ b/examples/rag/in-memory.php @@ -44,7 +44,7 @@ $platform = PlatformFactory::create(env('OPENAI_API_KEY'), http_client()); $vectorizer = new Vectorizer($platform, 'text-embedding-3-small', logger()); $indexer = new Indexer(new InMemoryLoader($documents), $vectorizer, $store, logger: logger()); -$indexer->index($documents); +$indexer->loadAndIndex(); $similaritySearch = new SimilaritySearch($vectorizer, $store); $toolbox = new Toolbox([$similaritySearch], logger: logger()); diff --git a/examples/rag/manticore.php b/examples/rag/manticore.php index 481424a4b..819037437 100644 --- a/examples/rag/manticore.php +++ b/examples/rag/manticore.php @@ -52,7 +52,7 @@ $platform = PlatformFactory::create(env('OPENAI_API_KEY'), http_client()); $vectorizer = new Vectorizer($platform, 'text-embedding-3-small', logger()); $indexer = new Indexer(new InMemoryLoader($documents), $vectorizer, $store, logger: logger()); -$indexer->index($documents); +$indexer->loadAndIndex(); $similaritySearch = new SimilaritySearch($vectorizer, $store); $toolbox = new Toolbox([$similaritySearch], logger: logger()); diff --git a/examples/rag/mariadb-gemini.php b/examples/rag/mariadb-gemini.php index 2c44feb98..af3d0c7f5 100644 --- a/examples/rag/mariadb-gemini.php +++ b/examples/rag/mariadb-gemini.php @@ -54,7 +54,7 @@ $model = 'gemini-embedding-exp-03-07?dimensions=768&task_type=SEMANTIC_SIMILARITY'; $vectorizer = new Vectorizer($platform, $model, logger()); $indexer = new Indexer(new InMemoryLoader($documents), $vectorizer, $store, logger: logger()); -$indexer->index($documents); +$indexer->loadAndIndex(); $similaritySearch = new SimilaritySearch($vectorizer, $store); $toolbox = new Toolbox([$similaritySearch], logger: logger()); diff --git 
a/examples/rag/mariadb-openai.php b/examples/rag/mariadb-openai.php index b616958a9..22ec24e4d 100644 --- a/examples/rag/mariadb-openai.php +++ b/examples/rag/mariadb-openai.php @@ -53,7 +53,7 @@ $platform = PlatformFactory::create(env('OPENAI_API_KEY'), http_client()); $vectorizer = new Vectorizer($platform, 'text-embedding-3-small', logger()); $indexer = new Indexer(new InMemoryLoader($documents), $vectorizer, $store, logger: logger()); -$indexer->index($documents); +$indexer->loadAndIndex(); $similaritySearch = new SimilaritySearch($vectorizer, $store); $toolbox = new Toolbox([$similaritySearch], logger: logger()); diff --git a/examples/rag/meilisearch-hybrid.php b/examples/rag/meilisearch-hybrid.php index cb8346b66..9c72372e9 100644 --- a/examples/rag/meilisearch-hybrid.php +++ b/examples/rag/meilisearch-hybrid.php @@ -51,7 +51,7 @@ $platform = PlatformFactory::create(env('OPENAI_API_KEY'), http_client()); $vectorizer = new Vectorizer($platform, 'text-embedding-3-small', logger()); $indexer = new Indexer(new InMemoryLoader($documents), $vectorizer, $store, logger: logger()); -$indexer->index($documents); +$indexer->loadAndIndex(); // Create a query embedding $queryText = 'futuristic technology and artificial intelligence'; diff --git a/examples/rag/meilisearch.php b/examples/rag/meilisearch.php index e5ce5742e..2d5f19735 100644 --- a/examples/rag/meilisearch.php +++ b/examples/rag/meilisearch.php @@ -52,7 +52,7 @@ $platform = PlatformFactory::create(env('OPENAI_API_KEY'), http_client()); $vectorizer = new Vectorizer($platform, 'text-embedding-3-small', logger()); $indexer = new Indexer(new InMemoryLoader($documents), $vectorizer, $store, logger: logger()); -$indexer->index($documents); +$indexer->loadAndIndex(); $similaritySearch = new SimilaritySearch($vectorizer, $store); $toolbox = new Toolbox([$similaritySearch], logger: logger()); diff --git a/examples/rag/milvus.php b/examples/rag/milvus.php index 1f8411bf8..4b674c0ea 100644 --- a/examples/rag/milvus.php 
+++ b/examples/rag/milvus.php @@ -53,7 +53,7 @@ $platform = PlatformFactory::create(env('OPENAI_API_KEY'), http_client()); $vectorizer = new Vectorizer($platform, 'text-embedding-3-small', logger()); $indexer = new Indexer(new InMemoryLoader($documents), $vectorizer, $store, logger: logger()); -$indexer->index($documents); +$indexer->loadAndIndex(); $similaritySearch = new SimilaritySearch($vectorizer, $store); $toolbox = new Toolbox([$similaritySearch], logger: logger()); diff --git a/examples/rag/mongodb.php b/examples/rag/mongodb.php index 589f38838..769ca8b91 100644 --- a/examples/rag/mongodb.php +++ b/examples/rag/mongodb.php @@ -51,7 +51,7 @@ $platform = PlatformFactory::create(env('OPENAI_API_KEY')); $vectorizer = new Vectorizer($platform, 'text-embedding-3-small', logger()); $indexer = new Indexer(new InMemoryLoader($documents), $vectorizer, $store, logger: logger()); -$indexer->index($documents); +$indexer->loadAndIndex(); // initialize the index $store->setup(); diff --git a/examples/rag/neo4j.php b/examples/rag/neo4j.php index 8a109fbfe..99d82c80f 100644 --- a/examples/rag/neo4j.php +++ b/examples/rag/neo4j.php @@ -55,7 +55,7 @@ $platform = PlatformFactory::create($_SERVER['OPENAI_API_KEY']); $vectorizer = new Vectorizer($platform, 'text-embedding-3-small', logger()); $indexer = new Indexer(new InMemoryLoader($documents), $vectorizer, $store, logger: logger()); -$indexer->index($documents); +$indexer->loadAndIndex(); $similaritySearch = new SimilaritySearch($vectorizer, $store); $toolbox = new Toolbox([$similaritySearch], logger: logger()); diff --git a/examples/rag/pinecone.php b/examples/rag/pinecone.php index 072dd6b96..5ef55e387 100644 --- a/examples/rag/pinecone.php +++ b/examples/rag/pinecone.php @@ -45,7 +45,7 @@ $platform = PlatformFactory::create(env('OPENAI_API_KEY'), http_client()); $vectorizer = new Vectorizer($platform, 'text-embedding-3-small', logger()); $indexer = new Indexer(new InMemoryLoader($documents), $vectorizer, $store, logger: 
logger()); -$indexer->index($documents); +$indexer->loadAndIndex(); $similaritySearch = new SimilaritySearch($vectorizer, $store); $toolbox = new Toolbox([$similaritySearch], logger: logger()); diff --git a/examples/rag/postgres.php b/examples/rag/postgres.php index 5a9dc126a..e5e016fe1 100644 --- a/examples/rag/postgres.php +++ b/examples/rag/postgres.php @@ -52,7 +52,7 @@ $platform = PlatformFactory::create(env('OPENAI_API_KEY'), http_client()); $vectorizer = new Vectorizer($platform, 'text-embedding-3-small', logger()); $indexer = new Indexer(new InMemoryLoader($documents), $vectorizer, $store, logger: logger()); -$indexer->index($documents); +$indexer->loadAndIndex(); $similaritySearch = new SimilaritySearch($vectorizer, $store); $toolbox = new Toolbox([$similaritySearch], logger: logger()); diff --git a/examples/rag/qdrant.php b/examples/rag/qdrant.php index 7a142ca53..8a64a2030 100644 --- a/examples/rag/qdrant.php +++ b/examples/rag/qdrant.php @@ -52,7 +52,7 @@ $platform = PlatformFactory::create(env('OPENAI_API_KEY'), http_client()); $vectorizer = new Vectorizer($platform, 'text-embedding-3-small', logger()); $indexer = new Indexer(new InMemoryLoader($documents), $vectorizer, $store, logger: logger()); -$indexer->index($documents); +$indexer->loadAndIndex(); $similaritySearch = new SimilaritySearch($vectorizer, $store); $toolbox = new Toolbox([$similaritySearch], logger: logger()); diff --git a/examples/rag/redis.php b/examples/rag/redis.php index eb682a4ff..49354e6eb 100644 --- a/examples/rag/redis.php +++ b/examples/rag/redis.php @@ -54,7 +54,7 @@ $platform = PlatformFactory::create(env('OPENAI_API_KEY'), http_client()); $vectorizer = new Vectorizer($platform, 'text-embedding-3-small', logger()); $indexer = new Indexer(new InMemoryLoader($documents), $vectorizer, $store, logger: logger()); -$indexer->index($documents); +$indexer->loadAndIndex(); $similaritySearch = new SimilaritySearch($vectorizer, $store); $toolbox = new Toolbox([$similaritySearch], 
logger: logger()); diff --git a/examples/rag/supabase.php b/examples/rag/supabase.php index c7cdfca64..2818733e5 100644 --- a/examples/rag/supabase.php +++ b/examples/rag/supabase.php @@ -52,7 +52,7 @@ functionName: env('SUPABASE_MATCH_FUNCTION'), $vectorizer = new Vectorizer($platform, env('OLLAMA_EMBEDDINGS')); $loader = new InMemoryLoader($documents); $indexer = new Indexer($loader, $vectorizer, $store, logger: logger()); -$indexer->index(); +$indexer->loadAndIndex(); $similaritySearch = new SimilaritySearch($vectorizer, $store); $toolbox = new Toolbox([$similaritySearch], logger: logger()); diff --git a/examples/rag/surrealdb.php b/examples/rag/surrealdb.php index 4a99a39c0..8f905f6ea 100644 --- a/examples/rag/surrealdb.php +++ b/examples/rag/surrealdb.php @@ -55,7 +55,7 @@ $platform = PlatformFactory::create($_SERVER['OPENAI_API_KEY']); $vectorizer = new Vectorizer($platform, 'text-embedding-3-small', logger()); $indexer = new Indexer(new InMemoryLoader($documents), $vectorizer, $store, logger: logger()); -$indexer->index($documents); +$indexer->loadAndIndex(); $similaritySearch = new SimilaritySearch($vectorizer, $store); $toolbox = new Toolbox([$similaritySearch], logger: logger()); diff --git a/examples/rag/typesense.php b/examples/rag/typesense.php index bd2128d02..b0d62e3e8 100644 --- a/examples/rag/typesense.php +++ b/examples/rag/typesense.php @@ -52,7 +52,7 @@ $platform = PlatformFactory::create(env('OPENAI_API_KEY'), http_client()); $vectorizer = new Vectorizer($platform, 'text-embedding-3-small', logger()); $indexer = new Indexer(new InMemoryLoader($documents), $vectorizer, $store, logger: logger()); -$indexer->index($documents); +$indexer->loadAndIndex(); $similaritySearch = new SimilaritySearch($vectorizer, $store); $toolbox = new Toolbox([$similaritySearch], logger: logger()); diff --git a/examples/rag/weaviate.php b/examples/rag/weaviate.php index 15e14c46f..c8e6ebbcb 100644 --- a/examples/rag/weaviate.php +++ b/examples/rag/weaviate.php @@ -53,7 
+53,7 @@ $platform = PlatformFactory::create(env('OPENAI_API_KEY'), http_client()); $vectorizer = new Vectorizer($platform, 'text-embedding-3-small', logger()); $indexer = new Indexer(new InMemoryLoader($documents), $vectorizer, $store, logger: logger()); -$indexer->index($documents); +$indexer->loadAndIndex(); $similaritySearch = new SimilaritySearch($vectorizer, $store); $toolbox = new Toolbox([$similaritySearch], logger: logger()); diff --git a/examples/retriever/basic.php b/examples/retriever/basic.php index 55bbb1dbd..3b5cc1df1 100644 --- a/examples/retriever/basic.php +++ b/examples/retriever/basic.php @@ -28,16 +28,15 @@ loader: new TextFileLoader(), vectorizer: $vectorizer, store: $store, - source: [ - dirname(__DIR__, 2).'/fixtures/movies/gladiator.md', - dirname(__DIR__, 2).'/fixtures/movies/inception.md', - dirname(__DIR__, 2).'/fixtures/movies/jurassic-park.md', - ], transformers: [ new TextSplitTransformer(chunkSize: 500, overlap: 100), ], ); -$indexer->index(); +$indexer->loadAndIndex([ + dirname(__DIR__, 2).'/fixtures/movies/gladiator.md', + dirname(__DIR__, 2).'/fixtures/movies/inception.md', + dirname(__DIR__, 2).'/fixtures/movies/jurassic-park.md', +]); $retriever = new Retriever( vectorizer: $vectorizer, diff --git a/examples/retriever/movies.php b/examples/retriever/movies.php index a4699c4f7..b2cb383de 100644 --- a/examples/retriever/movies.php +++ b/examples/retriever/movies.php @@ -37,7 +37,7 @@ $vectorizer = new Vectorizer($platform, 'text-embedding-3-small', logger()); $indexer = new Indexer(new InMemoryLoader($documents), $vectorizer, $store, logger: logger()); -$indexer->index(); +$indexer->loadAndIndex(); $retriever = new Retriever($vectorizer, $store, logger()); diff --git a/src/ai-bundle/config/options.php b/src/ai-bundle/config/options.php index e7776f892..3d9e35e6c 100644 --- a/src/ai-bundle/config/options.php +++ b/src/ai-bundle/config/options.php @@ -1064,10 +1064,6 @@ ->info('Service name of loader') ->isRequired() ->end() - 
->variableNode('source') - ->info('Source identifier (file path, URL, etc.) or array of sources') - ->defaultNull() - ->end() ->arrayNode('transformers') ->info('Array of transformer service names') ->scalarPrototype()->end() diff --git a/src/ai-bundle/src/AiBundle.php b/src/ai-bundle/src/AiBundle.php index e0923fbc0..db64e3e43 100644 --- a/src/ai-bundle/src/AiBundle.php +++ b/src/ai-bundle/src/AiBundle.php @@ -1863,7 +1863,6 @@ private function processIndexerConfig(int|string $name, array $config, Container new Reference($config['loader']), new Reference($config['vectorizer']), new Reference($config['store']), - $config['source'], $filters, $transformers, new Reference('logger', ContainerInterface::IGNORE_ON_INVALID_REFERENCE), diff --git a/src/ai-bundle/tests/DependencyInjection/AiBundleTest.php b/src/ai-bundle/tests/DependencyInjection/AiBundleTest.php index 42cac8aca..e43fe080b 100644 --- a/src/ai-bundle/tests/DependencyInjection/AiBundleTest.php +++ b/src/ai-bundle/tests/DependencyInjection/AiBundleTest.php @@ -2828,97 +2828,6 @@ public function testIndexerWithConfiguredVectorizer() $this->assertFalse($container->hasDefinition('ai.indexer.my_indexer.model')); } - public function testIndexerWithStringSource() - { - $container = $this->buildContainer([ - 'ai' => [ - 'store' => [ - 'memory' => [ - 'my_store' => [], - ], - ], - 'indexer' => [ - 'my_indexer' => [ - 'loader' => InMemoryLoader::class, - 'source' => 'https://example.com/feed.xml', - 'vectorizer' => 'my_vectorizer_service', - 'store' => 'ai.store.memory.my_store', - ], - ], - ], - ]); - - $this->assertTrue($container->hasDefinition('ai.indexer.my_indexer')); - $indexerDefinition = $container->getDefinition('ai.indexer.my_indexer'); - $arguments = $indexerDefinition->getArguments(); - - $this->assertSame('https://example.com/feed.xml', $arguments[3]); - } - - public function testIndexerWithArraySource() - { - $container = $this->buildContainer([ - 'ai' => [ - 'store' => [ - 'memory' => [ - 'my_store' => 
[], - ], - ], - 'indexer' => [ - 'my_indexer' => [ - 'loader' => InMemoryLoader::class, - 'source' => [ - '/path/to/file1.txt', - '/path/to/file2.txt', - 'https://example.com/feed.xml', - ], - 'vectorizer' => 'my_vectorizer_service', - 'store' => 'ai.store.memory.my_store', - ], - ], - ], - ]); - - $this->assertTrue($container->hasDefinition('ai.indexer.my_indexer')); - $indexerDefinition = $container->getDefinition('ai.indexer.my_indexer'); - $arguments = $indexerDefinition->getArguments(); - - $this->assertIsArray($arguments[3]); - $this->assertCount(3, $arguments[3]); - $this->assertSame([ - '/path/to/file1.txt', - '/path/to/file2.txt', - 'https://example.com/feed.xml', - ], $arguments[3]); - } - - public function testIndexerWithNullSource() - { - $container = $this->buildContainer([ - 'ai' => [ - 'store' => [ - 'memory' => [ - 'my_store' => [], - ], - ], - 'indexer' => [ - 'my_indexer' => [ - 'loader' => InMemoryLoader::class, - 'vectorizer' => 'my_vectorizer_service', - 'store' => 'ai.store.memory.my_store', - // source not configured, should default to null - ], - ], - ], - ]); - - $this->assertTrue($container->hasDefinition('ai.indexer.my_indexer')); - $indexerDefinition = $container->getDefinition('ai.indexer.my_indexer'); - $arguments = $indexerDefinition->getArguments(); - - $this->assertNull($arguments[3]); - } - public function testIndexerWithConfiguredTransformers() { $container = $this->buildContainer([ @@ -2946,15 +2855,15 @@ public function testIndexerWithConfiguredTransformers() $indexerDefinition = $container->getDefinition('ai.indexer.my_indexer'); $arguments = $indexerDefinition->getArguments(); - $this->assertSame([], $arguments[4]); // Empty filters - $this->assertIsArray($arguments[5]); - $this->assertCount(2, $arguments[5]); + $this->assertSame([], $arguments[3]); // Empty filters + $this->assertIsArray($arguments[4]); + $this->assertCount(2, $arguments[4]); - $this->assertInstanceOf(Reference::class, $arguments[5][0]); - 
$this->assertSame(TextTrimTransformer::class, (string) $arguments[5][0]); + $this->assertInstanceOf(Reference::class, $arguments[4][0]); + $this->assertSame(TextTrimTransformer::class, (string) $arguments[4][0]); - $this->assertInstanceOf(Reference::class, $arguments[5][1]); - $this->assertSame('App\CustomTransformer', (string) $arguments[5][1]); + $this->assertInstanceOf(Reference::class, $arguments[4][1]); + $this->assertSame('App\CustomTransformer', (string) $arguments[4][1]); } public function testIndexerWithEmptyTransformers() @@ -2981,8 +2890,8 @@ public function testIndexerWithEmptyTransformers() $indexerDefinition = $container->getDefinition('ai.indexer.my_indexer'); $arguments = $indexerDefinition->getArguments(); - $this->assertSame([], $arguments[4]); // Empty filters - $this->assertSame([], $arguments[5]); // Empty transformers + $this->assertSame([], $arguments[3]); // Empty filters + $this->assertSame([], $arguments[4]); // Empty transformers } public function testIndexerWithoutTransformers() @@ -3009,61 +2918,8 @@ public function testIndexerWithoutTransformers() $indexerDefinition = $container->getDefinition('ai.indexer.my_indexer'); $arguments = $indexerDefinition->getArguments(); - $this->assertSame([], $arguments[4]); // Empty filters - $this->assertSame([], $arguments[5]); // Empty transformers - } - - public function testIndexerWithSourceAndTransformers() - { - $container = $this->buildContainer([ - 'ai' => [ - 'store' => [ - 'memory' => [ - 'my_store' => [], - ], - ], - 'indexer' => [ - 'my_indexer' => [ - 'loader' => InMemoryLoader::class, - 'source' => [ - '/path/to/file1.txt', - '/path/to/file2.txt', - ], - 'transformers' => [ - TextTrimTransformer::class, - ], - 'vectorizer' => 'my_vectorizer_service', - 'store' => 'ai.store.memory.my_store', - ], - ], - ], - ]); - - $this->assertTrue($container->hasDefinition('ai.indexer.my_indexer')); - $indexerDefinition = $container->getDefinition('ai.indexer.my_indexer'); - $arguments = 
$indexerDefinition->getArguments(); - - $this->assertInstanceOf(Reference::class, $arguments[0]); - $this->assertSame(InMemoryLoader::class, (string) $arguments[0]); - - $this->assertInstanceOf(Reference::class, $arguments[1]); - $this->assertSame('my_vectorizer_service', (string) $arguments[1]); - - $this->assertInstanceOf(Reference::class, $arguments[2]); - $this->assertSame('ai.store.memory.my_store', (string) $arguments[2]); - - $this->assertIsArray($arguments[3]); - $this->assertCount(2, $arguments[3]); - $this->assertSame([ - '/path/to/file1.txt', - '/path/to/file2.txt', - ], $arguments[3]); - - $this->assertSame([], $arguments[4]); // Empty filters - $this->assertIsArray($arguments[5]); - $this->assertCount(1, $arguments[5]); - $this->assertInstanceOf(Reference::class, $arguments[5][0]); - $this->assertSame(TextTrimTransformer::class, (string) $arguments[5][0]); + $this->assertSame([], $arguments[3]); // Empty filters + $this->assertSame([], $arguments[4]); // Empty transformers } public function testIndexerWithConfiguredFilters() @@ -3093,18 +2949,18 @@ public function testIndexerWithConfiguredFilters() $indexerDefinition = $container->getDefinition('ai.indexer.my_indexer'); $arguments = $indexerDefinition->getArguments(); - // Verify filters are in the correct position (index 4, before transformers) - $this->assertIsArray($arguments[4]); - $this->assertCount(2, $arguments[4]); + // Verify filters are in the correct position (index 3, before transformers) + $this->assertIsArray($arguments[3]); + $this->assertCount(2, $arguments[3]); - $this->assertInstanceOf(Reference::class, $arguments[4][0]); - $this->assertSame(TextContainsFilter::class, (string) $arguments[4][0]); + $this->assertInstanceOf(Reference::class, $arguments[3][0]); + $this->assertSame(TextContainsFilter::class, (string) $arguments[3][0]); - $this->assertInstanceOf(Reference::class, $arguments[4][1]); - $this->assertSame('App\CustomFilter', (string) $arguments[4][1]); + 
$this->assertInstanceOf(Reference::class, $arguments[3][1]); + $this->assertSame('App\CustomFilter', (string) $arguments[3][1]); - // Verify transformers are in the correct position (index 5, after filters) - $this->assertSame([], $arguments[5]); // Empty transformers + // Verify transformers are in the correct position (index 4, after filters) + $this->assertSame([], $arguments[4]); // Empty transformers } public function testIndexerWithEmptyFilters() @@ -3190,78 +3046,17 @@ public function testIndexerWithFiltersAndTransformers() $indexerDefinition = $container->getDefinition('ai.indexer.my_indexer'); $arguments = $indexerDefinition->getArguments(); - // Verify filters are at index 4 - $this->assertIsArray($arguments[4]); - $this->assertCount(1, $arguments[4]); - $this->assertInstanceOf(Reference::class, $arguments[4][0]); - $this->assertSame(TextContainsFilter::class, (string) $arguments[4][0]); - - // Verify transformers are at index 5 - $this->assertIsArray($arguments[5]); - $this->assertCount(1, $arguments[5]); - $this->assertInstanceOf(Reference::class, $arguments[5][0]); - $this->assertSame(TextTrimTransformer::class, (string) $arguments[5][0]); - } - - public function testIndexerWithSourceFiltersAndTransformers() - { - $container = $this->buildContainer([ - 'ai' => [ - 'store' => [ - 'memory' => [ - 'my_store' => [], - ], - ], - 'indexer' => [ - 'my_indexer' => [ - 'loader' => InMemoryLoader::class, - 'source' => [ - '/path/to/file1.txt', - '/path/to/file2.txt', - ], - 'filters' => [ - TextContainsFilter::class, - ], - 'transformers' => [ - TextTrimTransformer::class, - ], - 'vectorizer' => 'my_vectorizer_service', - 'store' => 'ai.store.memory.my_store', - ], - ], - ], - ]); - - $this->assertTrue($container->hasDefinition('ai.indexer.my_indexer')); - $indexerDefinition = $container->getDefinition('ai.indexer.my_indexer'); - $arguments = $indexerDefinition->getArguments(); - - // Verify correct order: loader, vectorizer, store, source, filters, 
transformers, logger - $this->assertInstanceOf(Reference::class, $arguments[0]); // loader - $this->assertSame(InMemoryLoader::class, (string) $arguments[0]); - - $this->assertInstanceOf(Reference::class, $arguments[1]); // vectorizer - $this->assertSame('my_vectorizer_service', (string) $arguments[1]); - - $this->assertInstanceOf(Reference::class, $arguments[2]); // store - $this->assertSame('ai.store.memory.my_store', (string) $arguments[2]); - - $this->assertIsArray($arguments[3]); // source - $this->assertCount(2, $arguments[3]); - $this->assertSame(['/path/to/file1.txt', '/path/to/file2.txt'], $arguments[3]); + // Verify filters are at index 3 + $this->assertIsArray($arguments[3]); + $this->assertCount(1, $arguments[3]); + $this->assertInstanceOf(Reference::class, $arguments[3][0]); + $this->assertSame(TextContainsFilter::class, (string) $arguments[3][0]); - $this->assertIsArray($arguments[4]); // filters + // Verify transformers are at index 4 + $this->assertIsArray($arguments[4]); $this->assertCount(1, $arguments[4]); $this->assertInstanceOf(Reference::class, $arguments[4][0]); - $this->assertSame(TextContainsFilter::class, (string) $arguments[4][0]); - - $this->assertIsArray($arguments[5]); // transformers - $this->assertCount(1, $arguments[5]); - $this->assertInstanceOf(Reference::class, $arguments[5][0]); - $this->assertSame(TextTrimTransformer::class, (string) $arguments[5][0]); - - $this->assertInstanceOf(Reference::class, $arguments[6]); // logger - $this->assertSame('logger', (string) $arguments[6]); + $this->assertSame(TextTrimTransformer::class, (string) $arguments[4][0]); } public function testInjectionIndexerAliasIsRegistered() diff --git a/src/store/src/Command/IndexCommand.php b/src/store/src/Command/IndexCommand.php index 65fc75224..21e346c76 100644 --- a/src/store/src/Command/IndexCommand.php +++ b/src/store/src/Command/IndexCommand.php @@ -88,14 +88,10 @@ protected function execute(InputInterface $input, OutputInterface $output): int 
$indexerService = $this->indexers->get($indexer); - if (null !== $source) { - $indexerService = $indexerService->withSource($source); - } - $io->title(\sprintf('Indexing documents using "%s" indexer', $indexer)); try { - $indexerService->index([]); + $indexerService->loadAndIndex((array) $source); $io->success(\sprintf('Documents indexed successfully using "%s" indexer.', $indexer)); } catch (\Exception $e) { diff --git a/src/store/src/Document/Loader/RssFeedLoader.php b/src/store/src/Document/Loader/RssFeedLoader.php index 927e47287..85714a841 100644 --- a/src/store/src/Document/Loader/RssFeedLoader.php +++ b/src/store/src/Document/Loader/RssFeedLoader.php @@ -35,6 +35,7 @@ final class RssFeedLoader implements LoaderInterface public function __construct( private readonly HttpClientInterface $httpClient, private readonly string $uuidNamespace = '6ba7b810-9dad-11d1-80b4-00c04fd430c8', + private readonly ?string $defaultSource = null, ) { } @@ -47,6 +48,8 @@ public function load(?string $source = null, array $options = []): iterable throw new RuntimeException('For using the RSS loader, the Symfony DomCrawler component is required.
Try running "composer require symfony/dom-crawler".'); } + $source ??= $this->defaultSource; + if (null === $source) { throw new InvalidArgumentException(\sprintf('"%s" requires a URL as source, null given.', self::class)); } diff --git a/src/store/src/Document/Loader/TextFileLoader.php b/src/store/src/Document/Loader/TextFileLoader.php index 18cec8cb5..5788453d3 100644 --- a/src/store/src/Document/Loader/TextFileLoader.php +++ b/src/store/src/Document/Loader/TextFileLoader.php @@ -23,8 +23,15 @@ */ final class TextFileLoader implements LoaderInterface { + public function __construct( + private readonly ?string $defaultSource = null, + ) { + } + public function load(?string $source = null, array $options = []): iterable { + $source ??= $this->defaultSource; + if (null === $source) { throw new InvalidArgumentException('TextFileLoader requires a file path as source, null given.'); } diff --git a/src/store/src/Indexer.php b/src/store/src/Indexer.php index d797c05f8..8300efc26 100644 --- a/src/store/src/Indexer.php +++ b/src/store/src/Indexer.php @@ -13,7 +13,6 @@ use Psr\Log\LoggerInterface; use Psr\Log\NullLogger; -use Symfony\AI\Store\Document\EmbeddableDocumentInterface; use Symfony\AI\Store\Document\FilterInterface; use Symfony\AI\Store\Document\LoaderInterface; use Symfony\AI\Store\Document\TransformerInterface; @@ -26,47 +25,40 @@ class Indexer implements IndexerInterface { /** - * @var array - */ - private array $sources = []; - - /** - * @param string|array|null $source Source identifier(s) for data loading (file paths, URLs, etc.) - * @param FilterInterface[] $filters Filters to apply after loading documents to remove unwanted content - * @param TransformerInterface[] $transformers Transformers to mutate documents after filtering (chunking, cleaning, etc.) 
+ * @param FilterInterface[] $filters Filters to apply after loading documents to remove unwanted content + * @param TransformerInterface[] $transformers Transformers to mutate documents after filtering (chunking, cleaning, etc.) */ public function __construct( private LoaderInterface $loader, private VectorizerInterface $vectorizer, private StoreInterface $store, - string|array|null $source = null, private array $filters = [], private array $transformers = [], private LoggerInterface $logger = new NullLogger(), ) { - $this->sources = null === $source ? [] : (array) $source; } - public function withSource(string|array $source): self + public function loadAndIndex(array $sources = [], array $options = []): void { - return new self($this->loader, $this->vectorizer, $this->store, $source, $this->filters, $this->transformers, $this->logger); - } - - public function index(array $options = []): void - { - $this->logger->debug('Starting document processing', ['sources' => $this->sources, 'options' => $options]); - - $documents = []; - if ([] === $this->sources) { - $documents = $this->loadSource(null); + $this->logger->debug('Starting document processing', ['sources' => $sources, 'options' => $options]); + + if ($sources) { + $documents = (function () use ($sources) { + foreach ($sources as $singleSource) { + yield from $this->loader->load($singleSource); + } + })(); } else { - foreach ($this->sources as $singleSource) { - $documents = array_merge($documents, $this->loadSource($singleSource)); - } + $documents = $this->loader->load(null); } - if ([] === $documents) { - $this->logger->debug('No documents to process', ['sources' => $this->sources]); + $this->index($documents, $options); + } + + public function index(iterable $documents, array $options = []): void + { + if (!$documents) { + $this->logger->debug('No documents to process.'); return; } @@ -98,17 +90,4 @@ public function index(array $options = []): void $this->logger->debug('Document processing completed', 
['total_documents' => $counter]); } - - /** - * @return EmbeddableDocumentInterface[] - */ - private function loadSource(?string $source): array - { - $documents = []; - foreach ($this->loader->load($source) as $document) { - $documents[] = $document; - } - - return $documents; - } } diff --git a/src/store/src/IndexerInterface.php b/src/store/src/IndexerInterface.php index fcb27494f..0116785b0 100644 --- a/src/store/src/IndexerInterface.php +++ b/src/store/src/IndexerInterface.php @@ -11,6 +11,8 @@ namespace Symfony\AI\Store; +use Symfony\AI\Store\Document\EmbeddableDocumentInterface; + /** * Handles the complete document processing pipeline: load → transform → vectorize → store. * @@ -21,14 +23,16 @@ interface IndexerInterface /** * Process sources through the complete document pipeline: load → transform → vectorize → store. * + * @param array<string> $sources Source identifier(s) for data loading (file paths, URLs, etc.) * @param array{chunk_size?: int} $options Processing options */ - public function index(array $options = []): void; + public function loadAndIndex(array $sources = [], array $options = []): void; /** - * Create a new instance with a different source. + * Process documents through the document pipeline: transform → vectorize → store. * - * @param string|array $source Source identifier (file path, URL, etc.)
or array of sources + * @param iterable<EmbeddableDocumentInterface> $documents + * @param array{chunk_size?: int} $options Processing options */ - public function withSource(string|array $source): self; + public function index(iterable $documents, array $options = []): void; } diff --git a/src/store/tests/IndexerTest.php b/src/store/tests/IndexerTest.php index 6ee808383..9de4549cd 100644 --- a/src/store/tests/IndexerTest.php +++ b/src/store/tests/IndexerTest.php @@ -37,7 +37,7 @@ public function testIndexSingleDocument() $vectorizer = new Vectorizer(PlatformTestHandler::createPlatform(new VectorResult($vector)), 'text-embedding-3-small'); $indexer = new Indexer($loader, $vectorizer, $store = new TestStore()); - $indexer->index(); + $indexer->loadAndIndex(); $this->assertCount(1, $store->documents); $this->assertInstanceOf(VectorDocument::class, $store->documents[0]); @@ -51,7 +51,7 @@ public function testIndexEmptyDocumentList() $vectorizer = new Vectorizer(PlatformTestHandler::createPlatform(), 'text-embedding-3-small'); $indexer = new Indexer($loader, $vectorizer, $store = new TestStore()); - $indexer->index(); + $indexer->loadAndIndex(); $this->assertSame([], $store->documents); } @@ -65,7 +65,7 @@ public function testIndexDocumentWithMetadata() $vectorizer = new Vectorizer(PlatformTestHandler::createPlatform(new VectorResult($vector)), 'text-embedding-3-small'); $indexer = new Indexer($loader, $vectorizer, $store = new TestStore()); - $indexer->index(); + $indexer->loadAndIndex(); $this->assertSame(1, $store->addCalls); $this->assertCount(1, $store->documents); @@ -75,32 +75,7 @@ public function testIndexDocumentWithMetadata() $this->assertSame(['key' => 'value'], $store->documents[0]->metadata->getArrayCopy()); } - public function testWithSource() - { - $document1 = new TextDocument(Uuid::v4(), 'Document 1'); - $vector = new Vector([0.1, 0.2, 0.3]); - - // InMemoryLoader doesn't use source parameter, so we'll test withSource method's immutability - $loader = new
InMemoryLoader([$document1]); - $vectorizer = new Vectorizer(PlatformTestHandler::createPlatform(new VectorResult($vector)), 'text-embedding-3-small'); - - $indexer = new Indexer($loader, $vectorizer, $store = new TestStore(), 'source1'); - - $indexerWithNewSource = $indexer->withSource('source2'); - - $this->assertNotSame($indexer, $indexerWithNewSource); - - // Both can index successfully - $indexer->index(); - $this->assertCount(1, $store->documents); - - $store2 = new TestStore(); - $indexer2 = new Indexer($loader, $vectorizer, $store2, 'source2'); - $indexer2->index(); - $this->assertCount(1, $store2->documents); - } - - public function testWithSourceArray() + public function testWithCustomSourceArray() { $document1 = new TextDocument(Uuid::v4(), 'Document 1'); $document2 = new TextDocument(Uuid::v4(), 'Document 2'); @@ -117,19 +92,15 @@ public function testWithSourceArray() $vectorizer = new Vectorizer(PlatformTestHandler::createPlatform(new VectorResult($vector1, $vector2, $vector3, $vector4, $vector5, $vector6)), 'test-embedding-model'); // Create indexer with single source - $indexer = new Indexer($loader, $vectorizer, $store1 = new TestStore(), 'source1'); - - $indexerWithMultipleSources = $indexer->withSource(['source2', 'source3']); - - $this->assertNotSame($indexer, $indexerWithMultipleSources); + $indexer = new Indexer($loader, $vectorizer, $store1 = new TestStore()); // Since InMemoryLoader ignores source, both will index all documents - $indexer->index(); + $indexer->loadAndIndex(); $this->assertCount(2, $store1->documents); $store2 = new TestStore(); - $indexer2 = new Indexer($loader, $vectorizer, $store2, ['source2', 'source3']); - $indexer2->index(); + $indexer2 = new Indexer($loader, $vectorizer, $store2); + $indexer2->loadAndIndex(['source2', 'source3']); // With array sources, loadSource is called for each source // Since InMemoryLoader ignores source, it returns all docs each time // So with 2 sources and 2 docs each time = 4 documents total 
@@ -150,8 +121,8 @@ public function testIndexWithTextContainsFilter() $vectorizer = new Vectorizer(PlatformTestHandler::createPlatform(new VectorResult($vector1, $vector2)), 'test-embedding-model'); $filter = new TextContainsFilter('Week of Symfony'); - $indexer = new Indexer($loader, $vectorizer, $store = new TestStore(), null, [$filter]); - $indexer->index(); + $indexer = new Indexer($loader, $vectorizer, $store = new TestStore(), [$filter]); + $indexer->loadAndIndex(); // Should only have 2 documents (the "Week of Symfony" one should be filtered out) $this->assertCount(2, $store->documents); @@ -175,8 +146,8 @@ public function testIndexWithMultipleFilters() new TextContainsFilter('SPAM'), ]; - $indexer = new Indexer($loader, $vectorizer, $store = new TestStore(), null, $filters); - $indexer->index(); + $indexer = new Indexer($loader, $vectorizer, $store = new TestStore(), $filters); + $indexer->loadAndIndex(); // Should only have 2 documents (filtered out "Week of Symfony" and "SPAM") $this->assertCount(2, $store->documents); @@ -207,8 +178,8 @@ public function transform(iterable $documents, array $options = []): iterable } }; - $indexer = new Indexer($loader, $vectorizer, $store = new TestStore(), null, [$filter], [$transformer]); - $indexer->index(); + $indexer = new Indexer($loader, $vectorizer, $store = new TestStore(), [$filter], [$transformer]); + $indexer->loadAndIndex(); // Should have 2 documents (filtered out "Week of Symfony"), and transformation should have occurred $this->assertCount(2, $store->documents); @@ -253,8 +224,8 @@ public function transform(iterable $documents, array $options = []): iterable } }; - $indexer = new Indexer($loader, $vectorizer, $store = new TestStore(), null, [$filter], [$transformer]); - $indexer->index(); + $indexer = new Indexer($loader, $vectorizer, $store = new TestStore(), [$filter], [$transformer]); + $indexer->loadAndIndex(); // Should have 2 documents (one filtered out) $this->assertCount(2, $store->documents); @@ 
-272,26 +243,9 @@ public function testIndexWithNoFilters() $loader = new InMemoryLoader([$document]); $vectorizer = new Vectorizer(PlatformTestHandler::createPlatform(new VectorResult($vector)), 'text-embedding-3-small'); - $indexer = new Indexer($loader, $vectorizer, $store = new TestStore(), null, []); - $indexer->index(); + $indexer = new Indexer($loader, $vectorizer, $store = new TestStore(), []); + $indexer->loadAndIndex(); $this->assertCount(1, $store->documents); } - - public function testWithSourcePreservesFilters() - { - $document = new TextDocument(Uuid::v4(), 'Test content'); - $vector = new Vector([0.1, 0.2, 0.3]); - $loader = new InMemoryLoader([$document]); - $vectorizer = new Vectorizer(PlatformTestHandler::createPlatform(new VectorResult($vector)), 'text-embedding-3-small'); - $filter = new TextContainsFilter('nonexistent'); - - $indexer = new Indexer($loader, $vectorizer, $store = new TestStore(), 'source1', [$filter]); - $indexerWithNewSource = $indexer->withSource('source2'); - - $this->assertNotSame($indexer, $indexerWithNewSource); - - $indexerWithNewSource->index(); - $this->assertCount(1, $store->documents); // Filter should still work - } }