Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
322 changes: 322 additions & 0 deletions src/store/tests/Document/VectorizerTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,322 @@
<?php

/*
* This file is part of the Symfony package.
*
* (c) Fabien Potencier <fabien@symfony.com>
*
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*/

namespace Symfony\AI\Store\Tests\Document;

use PHPUnit\Framework\Attributes\CoversClass;
use PHPUnit\Framework\Attributes\DataProvider;
use PHPUnit\Framework\Attributes\TestDox;
use PHPUnit\Framework\Attributes\UsesClass;
use PHPUnit\Framework\TestCase;
use Symfony\AI\Platform\Bridge\OpenAi\Embeddings;
use Symfony\AI\Platform\Capability;
use Symfony\AI\Platform\Model;
use Symfony\AI\Platform\ModelClientInterface;
use Symfony\AI\Platform\Platform;
use Symfony\AI\Platform\Result\RawHttpResult;
use Symfony\AI\Platform\Result\RawResultInterface;
use Symfony\AI\Platform\Result\ResultInterface;
use Symfony\AI\Platform\Result\ResultPromise;
use Symfony\AI\Platform\Result\VectorResult;
use Symfony\AI\Platform\ResultConverterInterface;
use Symfony\AI\Platform\Vector\Vector;
use Symfony\AI\Store\Document\Metadata;
use Symfony\AI\Store\Document\TextDocument;
use Symfony\AI\Store\Document\VectorDocument;
use Symfony\AI\Store\Document\Vectorizer;
use Symfony\AI\Store\Tests\Double\PlatformTestHandler;
use Symfony\Component\HttpClient\Response\MockResponse;
use Symfony\Component\Uid\Uuid;

#[CoversClass(Vectorizer::class)]
#[UsesClass(TextDocument::class)]
#[UsesClass(VectorDocument::class)]
#[UsesClass(Metadata::class)]
#[UsesClass(Vector::class)]
#[UsesClass(VectorResult::class)]
#[UsesClass(Platform::class)]
#[UsesClass(ResultPromise::class)]
#[UsesClass(Embeddings::class)]
#[TestDox('Tests for the Vectorizer class')]
final class VectorizerTest extends TestCase
{
public function testVectorizeDocumentsWithBatchSupport()
{
$documents = [
new TextDocument(Uuid::v4(), 'First document content', new Metadata(['source' => 'test1'])),
new TextDocument(Uuid::v4(), 'Second document content', new Metadata(['source' => 'test2'])),
new TextDocument(Uuid::v4(), 'Third document content', new Metadata(['source' => 'test3'])),
];

$vectors = [
new Vector([0.1, 0.2, 0.3]),
new Vector([0.4, 0.5, 0.6]),
new Vector([0.7, 0.8, 0.9]),
];

$platform = PlatformTestHandler::createPlatform(new VectorResult(...$vectors));

$model = new Embeddings();

$vectorizer = new Vectorizer($platform, $model);
$vectorDocuments = $vectorizer->vectorizeDocuments($documents);

$this->assertCount(3, $vectorDocuments);

foreach ($vectorDocuments as $i => $vectorDoc) {
$this->assertInstanceOf(VectorDocument::class, $vectorDoc);
$this->assertSame($documents[$i]->id, $vectorDoc->id);
$this->assertEquals($vectors[$i], $vectorDoc->vector);
$this->assertSame($documents[$i]->metadata, $vectorDoc->metadata);
}
}

public function testVectorizeDocumentsWithSingleDocument()
{
$document = new TextDocument(Uuid::v4(), 'Single document content', new Metadata(['test' => 'value']));
$vector = new Vector([0.1, 0.2, 0.3]);

$platform = PlatformTestHandler::createPlatform(new VectorResult($vector));
$model = new Embeddings();

$vectorizer = new Vectorizer($platform, $model);
$vectorDocuments = $vectorizer->vectorizeDocuments([$document]);

$this->assertCount(1, $vectorDocuments);
$this->assertInstanceOf(VectorDocument::class, $vectorDocuments[0]);
$this->assertSame($document->id, $vectorDocuments[0]->id);
$this->assertEquals($vector, $vectorDocuments[0]->vector);
$this->assertSame($document->metadata, $vectorDocuments[0]->metadata);
}

public function testVectorizeEmptyDocumentsArray()
{
$platform = PlatformTestHandler::createPlatform(new VectorResult());
$model = new Embeddings();

$vectorizer = new Vectorizer($platform, $model);
$vectorDocuments = $vectorizer->vectorizeDocuments([]);

$this->assertSame([], $vectorDocuments);
}

public function testVectorizeDocumentsPreservesMetadata()
{
$metadata1 = new Metadata(['source' => 'file1.txt', 'author' => 'Alice', 'tags' => ['important']]);
$metadata2 = new Metadata(['source' => 'file2.txt', 'author' => 'Bob', 'version' => 2]);

$documents = [
new TextDocument(Uuid::v4(), 'Content 1', $metadata1),
new TextDocument(Uuid::v4(), 'Content 2', $metadata2),
];

$vectors = [
new Vector([0.1, 0.2]),
new Vector([0.3, 0.4]),
];

$platform = PlatformTestHandler::createPlatform(new VectorResult(...$vectors));
$model = new Embeddings();

$vectorizer = new Vectorizer($platform, $model);
$vectorDocuments = $vectorizer->vectorizeDocuments($documents);

$this->assertCount(2, $vectorDocuments);
$this->assertSame($metadata1, $vectorDocuments[0]->metadata);
$this->assertSame($metadata2, $vectorDocuments[1]->metadata);
$this->assertSame(['source' => 'file1.txt', 'author' => 'Alice', 'tags' => ['important']], $vectorDocuments[0]->metadata->getArrayCopy());
$this->assertSame(['source' => 'file2.txt', 'author' => 'Bob', 'version' => 2], $vectorDocuments[1]->metadata->getArrayCopy());
}

public function testVectorizeDocumentsPreservesDocumentIds()
{
$id1 = Uuid::v4();
$id2 = Uuid::v4();
$id3 = Uuid::v4();

$documents = [
new TextDocument($id1, 'Document 1'),
new TextDocument($id2, 'Document 2'),
new TextDocument($id3, 'Document 3'),
];

$vectors = [
new Vector([0.1]),
new Vector([0.2]),
new Vector([0.3]),
];

$platform = PlatformTestHandler::createPlatform(new VectorResult(...$vectors));
$model = new Embeddings();

$vectorizer = new Vectorizer($platform, $model);
$vectorDocuments = $vectorizer->vectorizeDocuments($documents);

$this->assertCount(3, $vectorDocuments);
$this->assertSame($id1, $vectorDocuments[0]->id);
$this->assertSame($id2, $vectorDocuments[1]->id);
$this->assertSame($id3, $vectorDocuments[2]->id);
}

#[DataProvider('provideDocumentCounts')]
public function testVectorizeVariousDocumentCounts(int $count)
{
$documents = [];
$vectors = [];

for ($i = 0; $i < $count; ++$i) {
$documents[] = new TextDocument(
Uuid::v4(),
\sprintf('Document %d content', $i),
new Metadata(['index' => $i])
);
$vectors[] = new Vector([$i * 0.1, $i * 0.2, $i * 0.3]);
}

$platform = PlatformTestHandler::createPlatform(
$count > 0 ? new VectorResult(...$vectors) : new VectorResult()
);
$model = new Embeddings();

$vectorizer = new Vectorizer($platform, $model);
$vectorDocuments = $vectorizer->vectorizeDocuments($documents);

$this->assertCount($count, $vectorDocuments);

foreach ($vectorDocuments as $i => $vectorDoc) {
$this->assertInstanceOf(VectorDocument::class, $vectorDoc);
$this->assertSame($documents[$i]->id, $vectorDoc->id);
$this->assertEquals($vectors[$i], $vectorDoc->vector);
$this->assertSame($documents[$i]->metadata, $vectorDoc->metadata);
$this->assertSame(['index' => $i], $vectorDoc->metadata->getArrayCopy());
}
}

/**
* @return \Generator<string, array{int}>
*/
public static function provideDocumentCounts(): \Generator
{
yield 'no documents' => [0];
yield 'single document' => [1];
yield 'two documents' => [2];
yield 'three documents' => [3];
}

public function testVectorizeDocumentsWithLargeVectors()
{
$document = new TextDocument(Uuid::v4(), 'Test content');

// Create a large vector with 1536 dimensions (typical for OpenAI embeddings)
$dimensions = [];
for ($i = 0; $i < 1536; ++$i) {
$dimensions[] = $i * 0.001;
}
$vector = new Vector($dimensions);

$platform = PlatformTestHandler::createPlatform(new VectorResult($vector));
$model = new Embeddings();

$vectorizer = new Vectorizer($platform, $model);
$vectorDocuments = $vectorizer->vectorizeDocuments([$document]);

$this->assertCount(1, $vectorDocuments);
$this->assertEquals($vector, $vectorDocuments[0]->vector);
}

public function testVectorizeDocumentsWithSpecialCharacters()
{
$documents = [
new TextDocument(Uuid::v4(), 'Document with "quotes" and special chars: @#$%'),
new TextDocument(Uuid::v4(), "Document with\nnewlines\nand\ttabs"),
new TextDocument(Uuid::v4(), 'Document with émojis 🚀 and ünïcödé'),
];

$vectors = [
new Vector([0.1, 0.2]),
new Vector([0.3, 0.4]),
new Vector([0.5, 0.6]),
];

$platform = PlatformTestHandler::createPlatform(new VectorResult(...$vectors));
$model = new Embeddings();

$vectorizer = new Vectorizer($platform, $model);
$vectorDocuments = $vectorizer->vectorizeDocuments($documents);

$this->assertCount(3, $vectorDocuments);

foreach ($vectorDocuments as $i => $vectorDoc) {
$this->assertSame($documents[$i]->id, $vectorDoc->id);
$this->assertEquals($vectors[$i], $vectorDoc->vector);
}
}

public function testVectorizeDocumentsWithoutBatchSupportUsesNonBatchMode()
{
// Test with a model that doesn't support batch processing
$model = $this->createMock(Model::class);
$model->expects($this->once())
->method('supports')
->with(Capability::INPUT_MULTIPLE)
->willReturn(false);

$documents = [
new TextDocument(Uuid::v4(), 'Document 1'),
new TextDocument(Uuid::v4(), 'Document 2'),
];

// When batch is not supported, the platform should be invoked once per document
// We simulate this by providing separate vectors for each invocation
$vectors = [
new Vector([0.1, 0.2]),
new Vector([0.3, 0.4]),
];

// Create a custom platform handler for non-batch mode
$handler = new class($vectors) implements ModelClientInterface, ResultConverterInterface {
private int $callIndex = 0;

/**
* @param Vector[] $vectors
*/
public function __construct(
private readonly array $vectors,
) {
}

public function supports(Model $model): bool
{
return true;
}

public function request(Model $model, array|string|object $payload, array $options = []): RawHttpResult
{
return new RawHttpResult(new MockResponse());
}

public function convert(RawResultInterface $result, array $options = []): ResultInterface
{
// Return one vector at a time for non-batch mode
return new VectorResult($this->vectors[$this->callIndex++]);
}
};

$platform = new Platform([$handler], [$handler]);

$vectorizer = new Vectorizer($platform, $model);
$vectorDocuments = $vectorizer->vectorizeDocuments($documents);

$this->assertCount(2, $vectorDocuments);
$this->assertEquals($vectors[0], $vectorDocuments[0]->vector);
$this->assertEquals($vectors[1], $vectorDocuments[1]->vector);
}
}