Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion examples/document/vectorizing-text-documents.php
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,6 @@
];

$vectorizer = new Vectorizer($platform, 'text-embedding-3-large');
$vectorDocuments = $vectorizer->vectorizeTextDocuments($textDocuments);
$vectorDocuments = $vectorizer->vectorizeEmbeddableDocuments($textDocuments);

dump(array_map(fn (VectorDocument $document) => $document->vector->getDimensions(), $vectorDocuments));
21 changes: 21 additions & 0 deletions src/store/src/Document/EmbeddableDocumentInterface.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
<?php

/*
* This file is part of the Symfony package.
*
* (c) Fabien Potencier <fabien@symfony.com>
*
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*/

namespace Symfony\AI\Store\Document;

/**
 * Contract for a document whose textual content can be turned into a vector.
 *
 * Loaders, filters and transformers exchange instances of this interface, and
 * the vectorizer reads the three accessors below to build a VectorDocument.
 */
interface EmbeddableDocumentInterface
{
/**
 * Returns the identifier of the document.
 *
 * The vectorizer carries this value over to the resulting VectorDocument, and
 * the text split transformer stores it as the parent id of each chunk.
 * Implementations may narrow the return type (TextDocument returns a Uuid).
 */
public function getId(): mixed;

/**
 * Returns the text that is sent to the platform for embedding.
 */
public function getContent(): string;

/**
 * Returns the metadata attached to the document.
 *
 * The vectorizer passes it on unchanged to the resulting VectorDocument.
 */
public function getMetadata(): Metadata;
}
2 changes: 1 addition & 1 deletion src/store/src/Document/Filter/TextContainsFilter.php
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ public function filter(iterable $documents, array $options = []): iterable
$caseSensitive = $options[self::OPTION_CASE_SENSITIVE] ?? $this->caseSensitive;

foreach ($documents as $document) {
$content = $document->content;
$content = $document->getContent();

if ($caseSensitive) {
$contains = str_contains($content, $needle);
Expand Down
6 changes: 3 additions & 3 deletions src/store/src/Document/FilterInterface.php
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,10 @@
interface FilterInterface
{
/**
* @param iterable<TextDocument> $documents
* @param array<string, mixed> $options
* @param iterable<EmbeddableDocumentInterface> $documents
* @param array<string, mixed> $options
*
* @return iterable<TextDocument>
* @return iterable<EmbeddableDocumentInterface>
*/
public function filter(iterable $documents, array $options = []): iterable;
}
6 changes: 3 additions & 3 deletions src/store/src/Document/Loader/InMemoryLoader.php
Original file line number Diff line number Diff line change
Expand Up @@ -11,19 +11,19 @@

namespace Symfony\AI\Store\Document\Loader;

use Symfony\AI\Store\Document\EmbeddableDocumentInterface;
use Symfony\AI\Store\Document\LoaderInterface;
use Symfony\AI\Store\Document\TextDocument;

/**
* Loader that returns pre-loaded TextDocuments from memory.
* Loader that returns preloaded documents from memory.
* Useful for testing or when documents are already available as objects.
*
* @author Oskar Stark <oskarstark@googlemail.com>
*/
final readonly class InMemoryLoader implements LoaderInterface
{
/**
* @param TextDocument[] $documents
* @param EmbeddableDocumentInterface[] $documents
*/
public function __construct(
private array $documents = [],
Expand Down
2 changes: 1 addition & 1 deletion src/store/src/Document/LoaderInterface.php
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ interface LoaderInterface
* @param string|null $source Identifier for the loader to load the documents from, e.g. file path, folder, or URL. Can be null for InMemoryLoader.
* @param array<string, mixed> $options loader specific set of options to control the loading process
*
* @return iterable<TextDocument> iterable of TextDocuments loaded from the source
* @return iterable<EmbeddableDocumentInterface> iterable of embeddable documents loaded from the source
*/
public function load(?string $source, array $options = []): iterable;
}
23 changes: 19 additions & 4 deletions src/store/src/Document/TextDocument.php
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,12 @@
/**
* @author Christopher Hertel <mail@christopher-hertel.de>
*/
final readonly class TextDocument
final readonly class TextDocument implements EmbeddableDocumentInterface
{
public function __construct(
public Uuid $id,
public string $content,
public Metadata $metadata = new Metadata(),
private Uuid $id,
private string $content,
private Metadata $metadata = new Metadata(),
) {
if ('' === trim($this->content)) {
throw new InvalidArgumentException('The content shall not be an empty string.');
Expand All @@ -33,4 +33,19 @@ public function withContent(string $content): self
{
return new self($this->id, $content, $this->metadata);
}

public function getId(): Uuid
{
return $this->id;
}

public function getContent(): string
{
return $this->content;
}

public function getMetadata(): Metadata
{
return $this->metadata;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

namespace Symfony\AI\Store\Document\Transformer;

use Symfony\AI\Store\Document\TextDocument;
use Symfony\AI\Store\Document\TransformerInterface;
use Symfony\AI\Store\Exception\InvalidArgumentException;

Expand All @@ -32,6 +33,7 @@ public function __construct(
}

/**
* @param iterable<TextDocument> $documents
* @param array{search?: string, replace?: string} $options
*/
public function transform(iterable $documents, array $options = []): iterable
Expand All @@ -42,7 +44,7 @@ public function transform(iterable $documents, array $options = []): iterable
self::validate($search, $replace);

foreach ($documents as $document) {
yield $document->withContent(str_replace($search, $replace, $document->content));
yield $document->withContent(str_replace($search, $replace, $document->getContent()));
}
}

Expand Down
8 changes: 4 additions & 4 deletions src/store/src/Document/Transformer/TextSplitTransformer.php
Original file line number Diff line number Diff line change
Expand Up @@ -51,13 +51,13 @@ public function transform(iterable $documents, array $options = []): iterable
}

foreach ($documents as $document) {
if (mb_strlen($document->content) <= $chunkSize) {
if (mb_strlen($document->getContent()) <= $chunkSize) {
yield $document;

continue;
}

$text = $document->content;
$text = $document->getContent();
$length = mb_strlen($text);
$start = 0;

Expand All @@ -66,9 +66,9 @@ public function transform(iterable $documents, array $options = []): iterable
$chunkText = mb_substr($text, $start, $end - $start);

yield new TextDocument(Uuid::v4(), $chunkText, new Metadata([
Metadata::KEY_PARENT_ID => $document->id,
Metadata::KEY_PARENT_ID => $document->getId(),
Metadata::KEY_TEXT => $chunkText,
...$document->metadata,
...$document->getMetadata(),
]));

$start += ($chunkSize - $overlap);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

namespace Symfony\AI\Store\Document\Transformer;

use Symfony\AI\Store\Document\TextDocument;
use Symfony\AI\Store\Document\TransformerInterface;

/**
Expand All @@ -20,10 +21,13 @@
*/
final readonly class TextTrimTransformer implements TransformerInterface
{
/**
* @param iterable<TextDocument> $documents
*/
public function transform(iterable $documents, array $options = []): iterable
{
foreach ($documents as $document) {
yield $document->withContent(trim($document->content));
yield $document->withContent(trim($document->getContent()));
}
}
}
8 changes: 4 additions & 4 deletions src/store/src/Document/TransformerInterface.php
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
namespace Symfony\AI\Store\Document;

/**
* A Transformer is designed to mutate a stream of TextDocuments with the purpose of preparing them for indexing.
* A Transformer is designed to mutate a stream of embeddable documents with the purpose of preparing them for indexing.
* It can reduce or expand the number of documents, modify their content or metadata.
* It should not act blocking, but is expected to iterate over incoming documents and yield prepared ones.
*
Expand All @@ -21,10 +21,10 @@
interface TransformerInterface
{
/**
* @param iterable<TextDocument> $documents
* @param array<string, mixed> $options
* @param iterable<EmbeddableDocumentInterface> $documents
* @param array<string, mixed> $options
*
* @return iterable<TextDocument>
* @return iterable<EmbeddableDocumentInterface>
*/
public function transform(iterable $documents, array $options = []): iterable;
}
10 changes: 5 additions & 5 deletions src/store/src/Document/Vectorizer.php
Original file line number Diff line number Diff line change
Expand Up @@ -27,23 +27,23 @@ public function __construct(
) {
}

public function vectorizeTextDocuments(array $documents, array $options = []): array
public function vectorizeEmbeddableDocuments(array $documents, array $options = []): array
{
$documentCount = \count($documents);
$this->logger->info('Starting vectorization process', ['document_count' => $documentCount]);

if ($this->platform->getModelCatalog()->getModel($this->model)->supports(Capability::INPUT_MULTIPLE)) {
$this->logger->debug('Using batch vectorization with model that supports multiple inputs');
$result = $this->platform->invoke($this->model, array_map(fn (TextDocument $document) => $document->content, $documents), $options);
$result = $this->platform->invoke($this->model, array_map(fn (EmbeddableDocumentInterface $document) => $document->getContent(), $documents), $options);

$vectors = $result->asVectors();
$this->logger->debug('Batch vectorization completed', ['vector_count' => \count($vectors)]);
} else {
$this->logger->debug('Using sequential vectorization for model without multiple input support');
$results = [];
foreach ($documents as $i => $document) {
$this->logger->debug('Vectorizing document', ['document_index' => $i, 'document_id' => $document->id]);
$results[] = $this->platform->invoke($this->model, $document->content, $options);
$this->logger->debug('Vectorizing document', ['document_index' => $i, 'document_id' => $document->getId()]);
$results[] = $this->platform->invoke($this->model, $document->getContent(), $options);
}

$vectors = [];
Expand All @@ -55,7 +55,7 @@ public function vectorizeTextDocuments(array $documents, array $options = []): a

$vectorDocuments = [];
foreach ($documents as $i => $document) {
$vectorDocuments[] = new VectorDocument($document->id, $vectors[$i], $document->metadata);
$vectorDocuments[] = new VectorDocument($document->getId(), $vectors[$i], $document->getMetadata());
}

$this->logger->info('Vectorization process completed', [
Expand Down
8 changes: 4 additions & 4 deletions src/store/src/Document/VectorizerInterface.php
Original file line number Diff line number Diff line change
Expand Up @@ -14,20 +14,20 @@
use Symfony\AI\Platform\Vector\Vector;

/**
* Interface for converting a collection of TextDocuments into VectorDocuments
* Interface for converting a collection of embeddable documents into VectorDocuments
* and for vectorizing individual strings.
*
* @author Oskar Stark <oskarstark@googlemail.com>
*/
interface VectorizerInterface
{
/**
* @param TextDocument[] $documents
* @param array<string, mixed> $options Options to pass to the underlying platform
* @param EmbeddableDocumentInterface[] $documents
* @param array<string, mixed> $options Options to pass to the underlying platform
*
* @return VectorDocument[]
*/
public function vectorizeTextDocuments(array $documents, array $options = []): array;
public function vectorizeEmbeddableDocuments(array $documents, array $options = []): array;

/**
* Vectorizes a single string or Stringable object into a Vector.
Expand Down
8 changes: 4 additions & 4 deletions src/store/src/Indexer.php
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,9 @@

use Psr\Log\LoggerInterface;
use Psr\Log\NullLogger;
use Symfony\AI\Store\Document\EmbeddableDocumentInterface;
use Symfony\AI\Store\Document\FilterInterface;
use Symfony\AI\Store\Document\LoaderInterface;
use Symfony\AI\Store\Document\TextDocument;
use Symfony\AI\Store\Document\TransformerInterface;
use Symfony\AI\Store\Document\VectorizerInterface;

Expand Down Expand Up @@ -87,20 +87,20 @@ public function index(array $options = []): void
++$counter;

if ($chunkSize === \count($chunk)) {
$this->store->add(...$this->vectorizer->vectorizeTextDocuments($chunk));
$this->store->add(...$this->vectorizer->vectorizeEmbeddableDocuments($chunk));
$chunk = [];
}
}

if ([] !== $chunk) {
$this->store->add(...$this->vectorizer->vectorizeTextDocuments($chunk));
$this->store->add(...$this->vectorizer->vectorizeEmbeddableDocuments($chunk));
}

$this->logger->debug('Document processing completed', ['total_documents' => $counter]);
}

/**
* @return TextDocument[]
* @return EmbeddableDocumentInterface[]
*/
private function loadSource(?string $source): array
{
Expand Down
Loading