From ad663e5c29f044bf5e054fdd7a8d5fff77b09c15 Mon Sep 17 00:00:00 2001 From: Oskar Stark Date: Sat, 27 Sep 2025 08:08:48 +0200 Subject: [PATCH] [Platform][ElevenLabs] Use capability-based speech-to-text detection --- .../src/Bridge/ElevenLabs/ElevenLabs.php | 20 ++- .../Bridge/ElevenLabs/ElevenLabsClient.php | 3 +- src/platform/src/Capability.php | 4 + src/platform/src/Model.php | 2 +- .../ElevenLabs/ElevenLabsClientTest.php | 30 ++-- .../Bridge/ElevenLabs/ElevenLabsTest.php | 135 ++++++++++++++++++ 6 files changed, 182 insertions(+), 12 deletions(-) create mode 100644 src/platform/tests/Bridge/ElevenLabs/ElevenLabsTest.php diff --git a/src/platform/src/Bridge/ElevenLabs/ElevenLabs.php b/src/platform/src/Bridge/ElevenLabs/ElevenLabs.php index 86fce604c..8948ec6c9 100644 --- a/src/platform/src/Bridge/ElevenLabs/ElevenLabs.php +++ b/src/platform/src/Bridge/ElevenLabs/ElevenLabs.php @@ -11,6 +11,7 @@ namespace Symfony\AI\Platform\Bridge\ElevenLabs; +use Symfony\AI\Platform\Capability; use Symfony\AI\Platform\Model; /** @@ -18,6 +19,7 @@ */ final class ElevenLabs extends Model { + // text-to-speech models public const ELEVEN_V3 = 'eleven_v3'; public const ELEVEN_TTV_V3 = 'eleven_ttv_v3'; public const ELEVEN_MULTILINGUAL_V2 = 'eleven_multilingual_v2'; @@ -28,11 +30,27 @@ final class ElevenLabs extends Model public const ELEVEN_MULTILINGUAL_STS_V2 = 'eleven_multilingual_sts_v2'; public const ELEVEN_MULTILINGUAL_ttv_V2 = 'eleven_multilingual_ttv_v2'; public const ELEVEN_ENGLISH_STS_V2 = 'eleven_english_sts_v2'; + + // speech-to-text models public const SCRIBE_V1 = 'scribe_v1'; public const SCRIBE_V1_EXPERIMENTAL = 'scribe_v1_experimental'; public function __construct(string $name, array $options = []) { - parent::__construct($name, [], $options); + $capabilities = [ + Capability::INPUT_TEXT, + Capability::OUTPUT_AUDIO, + Capability::TEXT_TO_SPEECH, + ]; + + if (\in_array($name, [self::SCRIBE_V1, self::SCRIBE_V1_EXPERIMENTAL], true)) { + $capabilities = [ + Capability::INPUT_AUDIO, + Capability::OUTPUT_TEXT, + Capability::SPEECH_TO_TEXT, + ]; + } + + parent::__construct($name, $capabilities, $options); } } diff --git a/src/platform/src/Bridge/ElevenLabs/ElevenLabsClient.php b/src/platform/src/Bridge/ElevenLabs/ElevenLabsClient.php index 100c05330..6ff61810c 100644 --- a/src/platform/src/Bridge/ElevenLabs/ElevenLabsClient.php +++ b/src/platform/src/Bridge/ElevenLabs/ElevenLabsClient.php @@ -11,6 +11,7 @@ namespace Symfony\AI\Platform\Bridge\ElevenLabs; +use Symfony\AI\Platform\Capability; use Symfony\AI\Platform\Exception\InvalidArgumentException; use Symfony\AI\Platform\Model; use Symfony\AI\Platform\ModelClientInterface; @@ -41,7 +42,7 @@ public function request(Model $model, array|string $payload, array $options = [] throw new InvalidArgumentException(\sprintf('The payload must be an array, received "%s".', get_debug_type($payload))); } - if (\in_array($model->getName(), [ElevenLabs::SCRIBE_V1, ElevenLabs::SCRIBE_V1_EXPERIMENTAL], true)) { + if ($model->supports(Capability::SPEECH_TO_TEXT)) { return $this->doSpeechToTextRequest($model, $payload); } diff --git a/src/platform/src/Capability.php b/src/platform/src/Capability.php index 455cbf4bf..7d65c2caf 100644 --- a/src/platform/src/Capability.php +++ b/src/platform/src/Capability.php @@ -11,11 +11,15 @@ namespace Symfony\AI\Platform; +use OskarStark\Enum\Trait\Comparable; + /** * @author Christopher Hertel */ enum Capability: string { + use Comparable; + // INPUT case INPUT_AUDIO = 'input-audio'; case INPUT_IMAGE = 'input-image'; diff --git a/src/platform/src/Model.php b/src/platform/src/Model.php index 393b1528e..62069f222 100644 --- a/src/platform/src/Model.php +++ b/src/platform/src/Model.php @@ -51,7 +51,7 @@ public function getCapabilities(): array public function supports(Capability $capability): bool { - return \in_array($capability, $this->capabilities, true); + return $capability->equalsOneOf($this->capabilities); } /** diff --git a/src/platform/tests/Bridge/ElevenLabs/ElevenLabsClientTest.php b/src/platform/tests/Bridge/ElevenLabs/ElevenLabsClientTest.php index c0639367a..21941544e 100644 --- a/src/platform/tests/Bridge/ElevenLabs/ElevenLabsClientTest.php +++ b/src/platform/tests/Bridge/ElevenLabs/ElevenLabsClientTest.php @@ -30,7 +30,6 @@ public function testSupportsModel() $client = new ElevenLabsClient( new MockHttpClient(), 'my-api-key', - 'https://api.elevenlabs.io/v1', ); $this->assertTrue($client->supports(new ElevenLabs(ElevenLabs::ELEVEN_MULTILINGUAL_V2))); @@ -53,7 +52,6 @@ public function testClientCannotPerformWithInvalidModel() $client = new ElevenLabsClient( $mockHttpClient, 'my-api-key', - 'https://api.elevenlabs.io/v1', ); $payload = $normalizer->normalize(Audio::fromFile(\dirname(__DIR__, 5).'/fixtures/audio.mp3')); @@ -69,7 +67,6 @@ public function testClientCannotPerformSpeechToTextRequestWithInvalidPayload() $client = new ElevenLabsClient( new MockHttpClient(), 'my-api-key', - 'https://api.elevenlabs.io/v1', ); $this->expectException(InvalidArgumentException::class); @@ -90,7 +87,6 @@ public function testClientCanPerformSpeechToTextRequest() $client = new ElevenLabsClient( $httpClient, 'my-api-key', - 'https://api.elevenlabs.io/v1', ); $payload = $normalizer->normalize(Audio::fromFile(\dirname(__DIR__, 5).'/fixtures/audio.mp3')); @@ -100,6 +96,27 @@ public function testClientCanPerformSpeechToTextRequest() $this->assertSame(1, $httpClient->getRequestsCount()); } + public function testClientCanPerformSpeechToTextRequestWithExperimentalModel() + { + $httpClient = new MockHttpClient([ + new JsonMockResponse([ + 'text' => 'foo', + ]), + ]); + $normalizer = new AudioNormalizer(); + + $client = new ElevenLabsClient( + $httpClient, + 'my-api-key', + ); + + $payload = $normalizer->normalize(Audio::fromFile(\dirname(__DIR__, 5).'/fixtures/audio.mp3')); + + $client->request(new ElevenLabs(ElevenLabs::SCRIBE_V1_EXPERIMENTAL), $payload); + + $this->assertSame(1, $httpClient->getRequestsCount()); + } + public function testClientCannotPerformTextToSpeechRequestWithoutValidPayload() { $mockHttpClient = new MockHttpClient([ @@ -115,7 +132,6 @@ public function testClientCannotPerformTextToSpeechRequestWithoutValidPayload() $client = new ElevenLabsClient( $mockHttpClient, 'my-api-key', - 'https://api.elevenlabs.io/v1', ); $this->expectException(InvalidArgumentException::class); @@ -143,7 +159,6 @@ public function testClientCanPerformTextToSpeechRequest() $client = new ElevenLabsClient( $httpClient, 'my-api-key', - 'https://api.elevenlabs.io/v1', ); $client->request(new ElevenLabs(ElevenLabs::ELEVEN_MULTILINGUAL_V2, [ @@ -172,7 +187,6 @@ public function testClientCanPerformTextToSpeechRequestWhenVoiceKeyIsProvidedAsR $client = new ElevenLabsClient( $httpClient, 'my-api-key', - 'https://api.elevenlabs.io/v1', ); $client->request(new ElevenLabs(ElevenLabs::ELEVEN_MULTILINGUAL_V2), [ @@ -201,7 +215,6 @@ public function testClientCanPerformTextToSpeechRequestAsStream() $client = new ElevenLabsClient( $httpClient, 'my-api-key', - 'https://api.elevenlabs.io/v1', ); $result = $client->request(new ElevenLabs(ElevenLabs::ELEVEN_MULTILINGUAL_V2, [ @@ -232,7 +245,6 @@ public function testClientCanPerformTextToSpeechRequestAsStreamVoiceKeyIsProvide $client = new ElevenLabsClient( $httpClient, 'my-api-key', - 'https://api.elevenlabs.io/v1', ); $result = $client->request(new ElevenLabs(ElevenLabs::ELEVEN_MULTILINGUAL_V2), [ diff --git a/src/platform/tests/Bridge/ElevenLabs/ElevenLabsTest.php b/src/platform/tests/Bridge/ElevenLabs/ElevenLabsTest.php new file mode 100644 index 000000000..e719c4bd2 --- /dev/null +++ b/src/platform/tests/Bridge/ElevenLabs/ElevenLabsTest.php @@ -0,0 +1,135 @@ + + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. + */ + +namespace Symfony\AI\Platform\Tests\Bridge\ElevenLabs; + +use PHPUnit\Framework\Attributes\DataProvider; +use PHPUnit\Framework\TestCase; +use Symfony\AI\Platform\Bridge\ElevenLabs\ElevenLabs; +use Symfony\AI\Platform\Capability; + +final class ElevenLabsTest extends TestCase +{ + public function testSpeechToTextModelHasCorrectCapabilities() + { + $model = new ElevenLabs(ElevenLabs::SCRIBE_V1); + + $this->assertTrue($model->supports(Capability::INPUT_AUDIO)); + $this->assertTrue($model->supports(Capability::OUTPUT_TEXT)); + $this->assertTrue($model->supports(Capability::SPEECH_TO_TEXT)); + $this->assertFalse($model->supports(Capability::INPUT_TEXT)); + $this->assertFalse($model->supports(Capability::OUTPUT_AUDIO)); + $this->assertFalse($model->supports(Capability::TEXT_TO_SPEECH)); + } + + public function testSpeechToTextExperimentalModelHasCorrectCapabilities() + { + $model = new ElevenLabs(ElevenLabs::SCRIBE_V1_EXPERIMENTAL); + + $this->assertTrue($model->supports(Capability::INPUT_AUDIO)); + $this->assertTrue($model->supports(Capability::OUTPUT_TEXT)); + $this->assertTrue($model->supports(Capability::SPEECH_TO_TEXT)); + $this->assertFalse($model->supports(Capability::INPUT_TEXT)); + $this->assertFalse($model->supports(Capability::OUTPUT_AUDIO)); + $this->assertFalse($model->supports(Capability::TEXT_TO_SPEECH)); + } + + public function testTextToSpeechModelHasCorrectCapabilities() + { + $model = new ElevenLabs(ElevenLabs::ELEVEN_MULTILINGUAL_V2); + + $this->assertTrue($model->supports(Capability::INPUT_TEXT)); + $this->assertTrue($model->supports(Capability::OUTPUT_AUDIO)); + $this->assertTrue($model->supports(Capability::TEXT_TO_SPEECH)); + $this->assertFalse($model->supports(Capability::INPUT_AUDIO)); + $this->assertFalse($model->supports(Capability::OUTPUT_TEXT)); + $this->assertFalse($model->supports(Capability::SPEECH_TO_TEXT)); + } + + public function testGetCapabilitiesReturnsSpeechToTextCapabilities() + { + $model = new ElevenLabs(ElevenLabs::SCRIBE_V1); + + $capabilities = $model->getCapabilities(); + + $this->assertCount(3, $capabilities); + $this->assertContains(Capability::INPUT_AUDIO, $capabilities); + $this->assertContains(Capability::OUTPUT_TEXT, $capabilities); + $this->assertContains(Capability::SPEECH_TO_TEXT, $capabilities); + } + + public function testGetCapabilitiesReturnsTextToSpeechCapabilities() + { + $model = new ElevenLabs(ElevenLabs::ELEVEN_V3); + + $capabilities = $model->getCapabilities(); + + $this->assertCount(3, $capabilities); + $this->assertContains(Capability::INPUT_TEXT, $capabilities); + $this->assertContains(Capability::OUTPUT_AUDIO, $capabilities); + $this->assertContains(Capability::TEXT_TO_SPEECH, $capabilities); + } + + public function testModelNameIsCorrectlySet() + { + $model = new ElevenLabs(ElevenLabs::SCRIBE_V1); + + $this->assertSame(ElevenLabs::SCRIBE_V1, $model->getName()); + } + + public function testModelOptionsAreCorrectlySet() + { + $options = ['voice' => 'test-voice', 'speed' => 1.2]; + $model = new ElevenLabs(ElevenLabs::ELEVEN_MULTILINGUAL_V2, $options); + + $this->assertSame($options, $model->getOptions()); + } + + #[DataProvider('speechToTextModelProvider')] + public function testAllSpeechToTextModelsHaveCorrectCapabilities(string $modelName) + { + $model = new ElevenLabs($modelName); + + $this->assertTrue($model->supports(Capability::SPEECH_TO_TEXT)); + $this->assertTrue($model->supports(Capability::INPUT_AUDIO)); + $this->assertTrue($model->supports(Capability::OUTPUT_TEXT)); + } + + #[DataProvider('textToSpeechModelProvider')] + public function testAllTextToSpeechModelsHaveCorrectCapabilities(string $modelName) + { + $model = new ElevenLabs($modelName); + + $this->assertTrue($model->supports(Capability::TEXT_TO_SPEECH)); + $this->assertTrue($model->supports(Capability::INPUT_TEXT)); + $this->assertTrue($model->supports(Capability::OUTPUT_AUDIO)); + } + + public static function speechToTextModelProvider(): iterable + { + yield [ElevenLabs::SCRIBE_V1]; + yield [ElevenLabs::SCRIBE_V1_EXPERIMENTAL]; + } + + public static function textToSpeechModelProvider(): iterable + { + yield [ElevenLabs::ELEVEN_V3]; + yield [ElevenLabs::ELEVEN_TTV_V3]; + yield [ElevenLabs::ELEVEN_MULTILINGUAL_V2]; + yield [ElevenLabs::ELEVEN_FLASH_V250]; + yield [ElevenLabs::ELEVEN_FLASH_V2]; + yield [ElevenLabs::ELEVEN_TURBO_V2_5]; + yield [ElevenLabs::ELEVEN_TURBO_v2]; + yield [ElevenLabs::ELEVEN_MULTILINGUAL_STS_V2]; + yield [ElevenLabs::ELEVEN_MULTILINGUAL_ttv_V2]; + yield [ElevenLabs::ELEVEN_ENGLISH_STS_V2]; + } +}