Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion examples/document/vectorizing-text-documents.php
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,6 @@
];

$vectorizer = new Vectorizer($platform, 'text-embedding-3-large');
$vectorDocuments = $vectorizer->vectorizeEmbeddableDocuments($textDocuments);
$vectorDocuments = $vectorizer->vectorize($textDocuments);

dump(array_map(fn (VectorDocument $document) => $document->vector->getDimensions(), $vectorDocuments));
138 changes: 123 additions & 15 deletions src/store/src/Document/Vectorizer.php
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,129 @@ public function __construct(
) {
}

public function vectorizeEmbeddableDocuments(array $documents, array $options = []): array
/**
 * Single entry point that dispatches to the matching private vectorizer.
 *
 * - a string or \Stringable yields one Vector
 * - an EmbeddableDocumentInterface yields one VectorDocument
 * - an empty array yields an empty array
 * - a non-empty array is classified by its first element and must be homogeneous
 *
 * NOTE(review): precedence differs between the two shapes. A single value that is both
 * \Stringable and an EmbeddableDocumentInterface is treated as text, whereas an array
 * whose first element is both is treated as a list of documents.
 *
 * @param array<string, mixed> $options Passed through to the selected vectorizer
 *
 * @throws RuntimeException When an array mixes element kinds or holds unsupported values
 */
public function vectorize(string|\Stringable|EmbeddableDocumentInterface|array $values, array $options = []): Vector|VectorDocument|array
{
    if (!\is_array($values)) {
        // The parameter type leaves exactly two non-array possibilities: text or a document.
        $isText = \is_string($values) || $values instanceof \Stringable;

        return $isText
            ? $this->vectorizeString($values, $options)
            : $this->vectorizeEmbeddableDocument($values, $options);
    }

    if ([] === $values) {
        return [];
    }

    // The first element decides which kind the whole array is expected to be.
    $head = $values[array_key_first($values)];

    if ($head instanceof EmbeddableDocumentInterface) {
        $this->validateArray($values, EmbeddableDocumentInterface::class);

        return $this->vectorizeEmbeddableDocuments($values, $options);
    }

    if (!\is_string($head) && !$head instanceof \Stringable) {
        throw new RuntimeException('Array must contain only strings, Stringable objects, or EmbeddableDocumentInterface instances.');
    }

    $this->validateArray($values, 'string|stringable');

    return $this->vectorizeStrings($values, $options);
}

/**
 * Ensures every element of $values matches the expected kind.
 *
 * $expectedType is either the sentinel 'string|stringable' (each element must be a
 * string or a \Stringable) or a class/interface name checked with instanceof.
 *
 * @param array<mixed> $values
 *
 * @throws RuntimeException On the first element that does not match
 */
private function validateArray(array $values, string $expectedType): void
{
    $expectsText = 'string|stringable' === $expectedType;

    foreach ($values as $item) {
        if ($expectsText) {
            if (\is_string($item) || $item instanceof \Stringable) {
                continue;
            }

            throw new RuntimeException('Array must contain only strings or Stringable objects.');
        }

        if ($item instanceof $expectedType) {
            continue;
        }

        throw new RuntimeException(\sprintf('Array must contain only "%s" instances.', $expectedType));
    }
}

/**
 * Vectorizes one piece of text and returns the first vector the platform produced.
 *
 * The value is cast to string once; that same string is logged at debug level and
 * sent to the platform.
 *
 * @param array<string, mixed> $options
 *
 * @throws RuntimeException When the platform result contains no vector at index 0
 */
private function vectorizeString(string|\Stringable $string, array $options = []): Vector
{
    $text = (string) $string;
    $this->logger->debug('Vectorizing string', ['string' => $text]);

    $vectors = $this->platform->invoke($this->model, $text, $options)->asVectors();

    return $vectors[0] ?? throw new RuntimeException('No vector returned for string vectorization.');
}

/**
 * Vectorizes the content of a single document.
 *
 * The resulting VectorDocument is built from the document's id, the vector computed
 * for its content, and the document's metadata.
 *
 * @param array<string, mixed> $options
 */
private function vectorizeEmbeddableDocument(EmbeddableDocumentInterface $document, array $options = []): VectorDocument
{
    $this->logger->debug('Vectorizing embeddable document', ['document_id' => $document->getId()]);

    $embedding = $this->vectorizeString($document->getContent(), $options);

    return new VectorDocument(
        $document->getId(),
        $embedding,
        $document->getMetadata(),
    );
}

/**
 * Vectorizes several strings, using one batched platform call when the model supports
 * multiple inputs and one call per string otherwise.
 *
 * Keys of $strings are discarded: the values are re-indexed into a plain list before
 * being handed to the platform, and the vectors are returned as a list.
 *
 * @param array<string|\Stringable> $strings
 * @param array<string, mixed>      $options Forwarded unchanged to every platform call
 *
 * @return array<Vector>
 */
private function vectorizeStrings(array $strings, array $options = []): array
{
    $stringCount = \count($strings);
    $this->logger->info('Starting vectorization of strings', ['string_count' => $stringCount]);

    // array_map() keeps the caller's keys when given a single array. Re-index with
    // array_values() so associative or sparse input (e.g. the output of array_filter())
    // still reaches the platform as a 0..n-1 list rather than a keyed map.
    $stringValues = array_values(array_map(static fn (string|\Stringable $s): string => (string) $s, $strings));

    if ($this->platform->getModelCatalog()->getModel($this->model)->supports(Capability::INPUT_MULTIPLE)) {
        $this->logger->debug('Using batch vectorization with model that supports multiple inputs');
        $result = $this->platform->invoke($this->model, $stringValues, $options);

        $vectors = $result->asVectors();
        $this->logger->debug('Batch vectorization completed', ['vector_count' => \count($vectors)]);
    } else {
        $this->logger->debug('Using sequential vectorization for model without multiple input support');

        // Every invocation is issued before any result is converted to vectors.
        // NOTE(review): presumably this lets deferred results resolve concurrently — confirm
        // before merging the two loops.
        $results = [];
        foreach ($stringValues as $i => $string) {
            $this->logger->debug('Vectorizing string', ['string_index' => $i]);
            $results[] = $this->platform->invoke($this->model, $string, $options);
        }

        // Collect the per-result lists and merge once, instead of re-copying the
        // accumulated array on every iteration.
        $vectorLists = [];
        foreach ($results as $result) {
            $vectorLists[] = $result->asVectors();
        }
        $vectors = array_merge(...$vectorLists);

        $this->logger->debug('Sequential vectorization completed', ['vector_count' => \count($vectors)]);
    }

    $this->logger->info('Vectorization process completed', ['string_count' => $stringCount, 'vector_count' => \count($vectors)]);

    return $vectors;
}

/**
* @param array<EmbeddableDocumentInterface> $documents
* @param array<string, mixed> $options
*
* @return array<VectorDocument>
*/
private function vectorizeEmbeddableDocuments(array $documents, array $options = []): array
{
$documentCount = \count($documents);
$this->logger->info('Starting vectorization process', ['document_count' => $documentCount]);
Expand Down Expand Up @@ -65,18 +187,4 @@ public function vectorizeEmbeddableDocuments(array $documents, array $options =

return $vectorDocuments;
}

/**
 * Vectorizes a single string or Stringable and returns the first vector produced.
 *
 * @param array<string, mixed> $options Passed to the platform invocation
 *
 * @throws RuntimeException When the platform result contains no vector at index 0
 */
public function vectorize(string|\Stringable $string, array $options = []): Vector
{
    $text = (string) $string;
    $this->logger->debug('Vectorizing string', ['string' => $text]);

    $vectors = $this->platform->invoke($this->model, $text, $options)->asVectors();

    return $vectors[0] ?? throw new RuntimeException('No vector returned for string vectorization.');
}
}
26 changes: 14 additions & 12 deletions src/store/src/Document/VectorizerInterface.php
Original file line number Diff line number Diff line change
Expand Up @@ -14,25 +14,27 @@
use Symfony\AI\Platform\Vector\Vector;

/**
* Interface for converting a collection of Embeddable documents into VectorDocuments
* and for vectorizing individual strings.
* Interface for vectorizing strings and EmbeddableDocuments into Vectors and VectorDocuments.
*
* @author Oskar Stark <oskarstark@googlemail.com>
*/
interface VectorizerInterface
{
/**
* @param EmbeddableDocumentInterface[] $documents
* @param array<string, mixed> $options Options to pass to the underlying platform
* Vectorizes strings or EmbeddableDocuments into Vectors or VectorDocuments.
*
* @return VectorDocument[]
*/
public function vectorizeEmbeddableDocuments(array $documents, array $options = []): array;

/**
* Vectorizes a single string or Stringable object into a Vector.
* @param string|\Stringable|EmbeddableDocumentInterface|array<string|\Stringable>|array<EmbeddableDocumentInterface> $values The values to vectorize
* @param array<string, mixed> $options Options to pass to the underlying platform
*
* @return Vector|VectorDocument|array<Vector>|array<VectorDocument>
*
* @param array<string, mixed> $options Options to pass to the underlying platform
* @phpstan-return (
* $values is string|\Stringable ? Vector : (
* $values is EmbeddableDocumentInterface ? VectorDocument : (
* $values is array<string|\Stringable> ? array<Vector> : array<VectorDocument>
* )
* )
* )
*/
public function vectorize(string|\Stringable $string, array $options = []): Vector;
public function vectorize(string|\Stringable|EmbeddableDocumentInterface|array $values, array $options = []): Vector|VectorDocument|array;
}
4 changes: 2 additions & 2 deletions src/store/src/Indexer.php
Original file line number Diff line number Diff line change
Expand Up @@ -87,13 +87,13 @@ public function index(array $options = []): void
++$counter;

if ($chunkSize === \count($chunk)) {
$this->store->add(...$this->vectorizer->vectorizeEmbeddableDocuments($chunk));
$this->store->add(...$this->vectorizer->vectorize($chunk));
$chunk = [];
}
}

if ([] !== $chunk) {
$this->store->add(...$this->vectorizer->vectorizeEmbeddableDocuments($chunk));
$this->store->add(...$this->vectorizer->vectorize($chunk));
}

$this->logger->debug('Document processing completed', ['total_documents' => $counter]);
Expand Down
Loading