From baa789b870736e152d52fc3ea156fbb7aa426b7a Mon Sep 17 00:00:00 2001 From: Rajas Bansal Date: Thu, 30 Oct 2025 07:59:37 -0700 Subject: [PATCH 1/3] Add changes for audio speech and audio transcriptions --- src/together/resources/audio/speech.py | 10 +++++++-- .../resources/audio/transcriptions.py | 22 +++++++++++++++++-- 2 files changed, 28 insertions(+), 4 deletions(-) diff --git a/src/together/resources/audio/speech.py b/src/together/resources/audio/speech.py index da01586d..3c726b6c 100644 --- a/src/together/resources/audio/speech.py +++ b/src/together/resources/audio/speech.py @@ -30,7 +30,7 @@ def create( response_format: str = "wav", language: str = "en", response_encoding: str = "pcm_f32le", - sample_rate: int = 44100, + sample_rate: int | None = None, stream: bool = False, **kwargs: Any, ) -> AudioSpeechStreamResponse: @@ -49,7 +49,7 @@ def create( response_encoding (str, optional): Audio encoding of response. Defaults to "pcm_f32le". sample_rate (int, optional): Sampling rate to use for the output audio. - Defaults to 44100. + Defaults to None. If not provided, the default sampling rate for the model will be used. stream (bool, optional): If true, output is streamed for several characters at a time. Defaults to False. @@ -57,6 +57,12 @@ def create( Union[bytes, Iterator[AudioSpeechStreamChunk]]: The generated audio as bytes or an iterator over audio stream chunks. """ + if sample_rate is None: + if 'cartesia' in model: + sample_rate = 44100 + else: + sample_rate = 24000 + requestor = api_requestor.APIRequestor( client=self._client, ) diff --git a/src/together/resources/audio/transcriptions.py b/src/together/resources/audio/transcriptions.py index 49aea2aa..dc815079 100644 --- a/src/together/resources/audio/transcriptions.py +++ b/src/together/resources/audio/transcriptions.py @@ -30,6 +30,7 @@ def create( timestamp_granularities: Optional[ Union[str, AudioTimestampGranularities] ] = None, + diarize: bool = False, **kwargs: Any, ) -> Union[AudioTranscriptionResponse, AudioTranscriptionVerboseResponse]: """ @@ -52,7 +53,11 @@ def create( timestamp_granularities: The timestamp granularities to populate for this transcription. response_format must be set verbose_json to use timestamp granularities. Either or both of these options are supported: word, or segment. - + diarize: Whether to enable speaker diarization. When enabled, you will get the speaker id for each word in the transcription. + In the response, in the words array, you will get the speaker id for each word. + In addition, we also return the speaker_segments array which contains the speaker id for each speaker segment along with the start and end time of the segment along with all the words in the segment. + You can use the speaker_id to group the words by speaker. + You can use the speaker_segments to get the start and end time of each speaker segment. Returns: The transcribed text in the requested format. """ @@ -103,6 +108,9 @@ def create( else timestamp_granularities ) + if diarize: + params_data["diarize"] = diarize + # Add any additional kwargs # Convert boolean values to lowercase strings for proper form encoding for key, value in kwargs.items(): @@ -135,6 +143,7 @@ def create( if ( response_format == "verbose_json" or response_format == AudioTranscriptionResponseFormat.VERBOSE_JSON + or diarize ): # Create response with model validation that preserves extra fields return AudioTranscriptionVerboseResponse.model_validate(response.data) @@ -158,6 +167,7 @@ async def create( timestamp_granularities: Optional[ Union[str, AudioTimestampGranularities] ] = None, + diarize: bool = False, **kwargs: Any, ) -> Union[AudioTranscriptionResponse, AudioTranscriptionVerboseResponse]: """ @@ -180,7 +190,11 @@ async def create( timestamp_granularities: The timestamp granularities to populate for this transcription. response_format must be set verbose_json to use timestamp granularities. Either or both of these options are supported: word, or segment. - + diarize: Whether to enable speaker diarization. When enabled, you will get the speaker id for each word in the transcription. + In the response, in the words array, you will get the speaker id for each word. + In addition, we also return the speaker_segments array which contains the speaker id for each speaker segment along with the start and end time of the segment along with all the words in the segment. + You can use the speaker_id to group the words by speaker. + You can use the speaker_segments to get the start and end time of each speaker segment. Returns: The transcribed text in the requested format. """ @@ -239,6 +253,9 @@ async def create( ) ) + if diarize: + params_data["diarize"] = diarize + # Add any additional kwargs # Convert boolean values to lowercase strings for proper form encoding for key, value in kwargs.items(): @@ -271,6 +288,7 @@ async def create( if ( response_format == "verbose_json" or response_format == AudioTranscriptionResponseFormat.VERBOSE_JSON + or diarize ): # Create response with model validation that preserves extra fields return AudioTranscriptionVerboseResponse.model_validate(response.data) From 601e7e3bcb3773738f0f651d3c55e94679f5b8f2 Mon Sep 17 00:00:00 2001 From: Rajas Bansal Date: Thu, 30 Oct 2025 20:20:11 -0700 Subject: [PATCH 2/3] Remove testing word stuff --- tests/integration/resources/test_transcriptions.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/tests/integration/resources/test_transcriptions.py b/tests/integration/resources/test_transcriptions.py index aae0377e..9202b85a 100644 --- a/tests/integration/resources/test_transcriptions.py +++ b/tests/integration/resources/test_transcriptions.py @@ -36,19 +36,6 @@ def validate_diarization_response(response_dict): assert "end" in word assert "speaker_id" in word - # Validate top-level words field - assert "words" in response_dict - assert isinstance(response_dict["words"], list) - assert len(response_dict["words"]) > 0 - - # Validate each word in top-level words - for word in response_dict["words"]: - assert "id" in word - assert "word" in word - assert "start" in word - assert "end" in word - assert "speaker_id" in word - class TestTogetherTranscriptions: @pytest.fixture From e166e1586a07d197b22853fde8347937e63455bf Mon Sep 17 00:00:00 2001 From: Rajas Bansal Date: Thu, 30 Oct 2025 20:22:12 -0700 Subject: [PATCH 3/3] Black formatting' --- src/together/resources/audio/speech.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/together/resources/audio/speech.py b/src/together/resources/audio/speech.py index 3c726b6c..1c3231b7 100644 --- a/src/together/resources/audio/speech.py +++ b/src/together/resources/audio/speech.py @@ -58,7 +58,7 @@ def create( """ if sample_rate is None: - if 'cartesia' in model: + if "cartesia" in model: sample_rate = 44100 else: sample_rate = 24000