diff --git a/src/together/resources/audio/speech.py b/src/together/resources/audio/speech.py
index da01586..1c3231b 100644
--- a/src/together/resources/audio/speech.py
+++ b/src/together/resources/audio/speech.py
@@ -30,7 +30,7 @@ def create(
         response_format: str = "wav",
         language: str = "en",
         response_encoding: str = "pcm_f32le",
-        sample_rate: int = 44100,
+        sample_rate: int | None = None,
         stream: bool = False,
         **kwargs: Any,
     ) -> AudioSpeechStreamResponse:
@@ -49,7 +49,7 @@ def create(
             response_encoding (str, optional): Audio encoding of response.
                 Defaults to "pcm_f32le".
             sample_rate (int, optional): Sampling rate to use for the
                 output audio.
-                Defaults to 44100.
+                Defaults to None. If not provided, the model's default sampling rate is used.
             stream (bool, optional): If true, output is streamed for several
                 characters at a time. Defaults to False.
@@ -57,6 +57,12 @@ def create(
             Union[bytes, Iterator[AudioSpeechStreamChunk]]: The generated audio as bytes
             or an iterator over audio stream chunks.
         """
+        if sample_rate is None:
+            if "cartesia" in model:
+                sample_rate = 44100
+            else:
+                sample_rate = 24000
+
         requestor = api_requestor.APIRequestor(
             client=self._client,
         )
diff --git a/src/together/resources/audio/transcriptions.py b/src/together/resources/audio/transcriptions.py
index 49aea2a..dc81507 100644
--- a/src/together/resources/audio/transcriptions.py
+++ b/src/together/resources/audio/transcriptions.py
@@ -30,6 +30,7 @@ def create(
         timestamp_granularities: Optional[
             Union[str, AudioTimestampGranularities]
         ] = None,
+        diarize: bool = False,
         **kwargs: Any,
     ) -> Union[AudioTranscriptionResponse, AudioTranscriptionVerboseResponse]:
         """
@@ -52,7 +53,11 @@ def create(
             timestamp_granularities: The timestamp granularities to populate for this
                 transcription. response_format must be set verbose_json to use timestamp
                 granularities. Either or both of these options are supported: word, or segment.
-
+            diarize: Whether to enable speaker diarization. When enabled, each entry in
+                the response's words array includes the speaker_id of that word, and the
+                response also contains a speaker_segments array giving, for each speaker
+                segment, its speaker_id, start and end times, and words. Use speaker_id
+                to group words by speaker and speaker_segments for per-segment timing.
         Returns:
             The transcribed text in the requested format.
         """
@@ -103,6 +108,9 @@ def create(
             else timestamp_granularities
         )

+        if diarize:
+            params_data["diarize"] = diarize
+
         # Add any additional kwargs
         # Convert boolean values to lowercase strings for proper form encoding
         for key, value in kwargs.items():
@@ -135,6 +143,7 @@ def create(
         if (
             response_format == "verbose_json"
             or response_format == AudioTranscriptionResponseFormat.VERBOSE_JSON
+            or diarize
         ):
             # Create response with model validation that preserves extra fields
             return AudioTranscriptionVerboseResponse.model_validate(response.data)
@@ -158,6 +167,7 @@ async def create(
         timestamp_granularities: Optional[
             Union[str, AudioTimestampGranularities]
         ] = None,
+        diarize: bool = False,
         **kwargs: Any,
     ) -> Union[AudioTranscriptionResponse, AudioTranscriptionVerboseResponse]:
         """
@@ -180,7 +190,11 @@ async def create(
             timestamp_granularities: The timestamp granularities to populate for this
                 transcription. response_format must be set verbose_json to use timestamp
                 granularities. Either or both of these options are supported: word, or segment.
-
+            diarize: Whether to enable speaker diarization. When enabled, each entry in
+                the response's words array includes the speaker_id of that word, and the
+                response also contains a speaker_segments array giving, for each speaker
+                segment, its speaker_id, start and end times, and words. Use speaker_id
+                to group words by speaker and speaker_segments for per-segment timing.
         Returns:
             The transcribed text in the requested format.
         """
@@ -239,6 +253,9 @@ async def create(
                 )
             )

+        if diarize:
+            params_data["diarize"] = diarize
+
         # Add any additional kwargs
         # Convert boolean values to lowercase strings for proper form encoding
         for key, value in kwargs.items():
@@ -271,6 +288,7 @@ async def create(
         if (
             response_format == "verbose_json"
             or response_format == AudioTranscriptionResponseFormat.VERBOSE_JSON
+            or diarize
         ):
             # Create response with model validation that preserves extra fields
             return AudioTranscriptionVerboseResponse.model_validate(response.data)
diff --git a/tests/integration/resources/test_transcriptions.py b/tests/integration/resources/test_transcriptions.py
index aae0377..9202b85 100644
--- a/tests/integration/resources/test_transcriptions.py
+++ b/tests/integration/resources/test_transcriptions.py
@@ -36,19 +36,6 @@ def validate_diarization_response(response_dict):
             assert "end" in word
             assert "speaker_id" in word

-    # Validate top-level words field
-    assert "words" in response_dict
-    assert isinstance(response_dict["words"], list)
-    assert len(response_dict["words"]) > 0
-
-    # Validate each word in top-level words
-    for word in response_dict["words"]:
-        assert "id" in word
-        assert "word" in word
-        assert "start" in word
-        assert "end" in word
-        assert "speaker_id" in word
-

 class TestTogetherTranscriptions:
     @pytest.fixture
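
For reviewers, a minimal usage sketch of the new sample_rate defaulting in speech.py. It follows the client pattern from Together's documented examples (Together(), client.audio.speech.create, response.stream_to_file); the voice and output path are placeholders:

from together import Together

client = Together()  # assumes TOGETHER_API_KEY is set in the environment

# sample_rate omitted: the SDK now resolves a model-specific default
# (44100 Hz for Cartesia models, 24000 Hz for everything else).
response = client.audio.speech.create(
    model="cartesia/sonic",
    input="Hello from the speech endpoint.",
    voice="laidback woman",  # placeholder voice
)
response.stream_to_file("speech.wav")  # placeholder path

# Passing sample_rate explicitly still overrides the default, as before.
response = client.audio.speech.create(
    model="cartesia/sonic",
    input="Hello again.",
    voice="laidback woman",
    sample_rate=16000,
)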
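
And a sketch of the new diarize flag in transcriptions.py. The audio path is a placeholder, and the dict-style field access (via pydantic's model_dump) is an assumption that mirrors the response shape described by the docstring and the integration tests:

from together import Together

client = Together()

# diarize=True also routes the result through AudioTranscriptionVerboseResponse,
# so the extra diarization fields are preserved even without
# response_format="verbose_json".
transcript = client.audio.transcriptions.create(
    file="meeting.wav",  # placeholder path
    model="openai/whisper-large-v3",
    diarize=True,
)
data = transcript.model_dump()  # assumed dict view of the verbose response

# Group words by speaker via the per-word speaker_id.
by_speaker = {}
for word in data["words"]:
    by_speaker.setdefault(word["speaker_id"], []).append(word["word"])

# Or walk speaker_segments for the start/end time of each speaker turn.
for segment in data["speaker_segments"]:
    print(segment["speaker_id"], segment["start"], segment["end"])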