From baa789b870736e152d52fc3ea156fbb7aa426b7a Mon Sep 17 00:00:00 2001
From: Rajas Bansal <rbansal@together.ai>
Date: Thu, 30 Oct 2025 07:59:37 -0700
Subject: [PATCH 1/3] Add changes for audio speech and audio transcriptions

---
 src/together/resources/audio/speech.py        | 10 +++++++--
 .../resources/audio/transcriptions.py         | 22 +++++++++++++++++--
 2 files changed, 28 insertions(+), 4 deletions(-)

diff --git a/src/together/resources/audio/speech.py b/src/together/resources/audio/speech.py
index da01586d..3c726b6c 100644
--- a/src/together/resources/audio/speech.py
+++ b/src/together/resources/audio/speech.py
@@ -30,7 +30,7 @@ def create(
         response_format: str = "wav",
         language: str = "en",
         response_encoding: str = "pcm_f32le",
-        sample_rate: int = 44100,
+        sample_rate: int | None = None,
         stream: bool = False,
         **kwargs: Any,
     ) -> AudioSpeechStreamResponse:
@@ -49,7 +49,7 @@ def create(
             response_encoding (str, optional): Audio encoding of response.
                 Defaults to "pcm_f32le".
             sample_rate (int, optional): Sampling rate to use for the output audio.
-                Defaults to 44100.
+                Defaults to None. If not provided, the default sampling rate for the model will be used.
             stream (bool, optional): If true, output is streamed for several characters at a time.
                 Defaults to False.
 
@@ -57,6 +57,12 @@ def create(
             Union[bytes, Iterator[AudioSpeechStreamChunk]]: The generated audio as bytes or an iterator over audio stream chunks.
         """
 
+        if sample_rate is None:
+            if 'cartesia' in model:
+                sample_rate = 44100
+            else:
+                sample_rate = 24000
+
         requestor = api_requestor.APIRequestor(
             client=self._client,
         )
diff --git a/src/together/resources/audio/transcriptions.py b/src/together/resources/audio/transcriptions.py
index 49aea2aa..dc815079 100644
--- a/src/together/resources/audio/transcriptions.py
+++ b/src/together/resources/audio/transcriptions.py
@@ -30,6 +30,7 @@ def create(
         timestamp_granularities: Optional[
             Union[str, AudioTimestampGranularities]
         ] = None,
+        diarize: bool = False,
         **kwargs: Any,
     ) -> Union[AudioTranscriptionResponse, AudioTranscriptionVerboseResponse]:
         """
@@ -52,7 +53,11 @@ def create(
             timestamp_granularities: The timestamp granularities to populate for this
                 transcription. response_format must be set verbose_json to use timestamp
                 granularities. Either or both of these options are supported: word, or segment.
-
+            diarize: Whether to enable speaker diarization. When enabled, you will get the speaker id for each word in the transcription.
+                In the response, in the words array, you will get the speaker id for each word.
+                In addition, the response includes a speaker_segments array; each entry contains the segment's speaker id, its start and end time, and all the words in the segment.
+                You can use the speaker_id to group the words by speaker.
+                You can use the speaker_segments to get the start and end time of each speaker segment.
         Returns:
             The transcribed text in the requested format.
         """
@@ -103,6 +108,9 @@ def create(
                 else timestamp_granularities
             )
 
+        if diarize:
+            params_data["diarize"] = diarize
+
         # Add any additional kwargs
         # Convert boolean values to lowercase strings for proper form encoding
         for key, value in kwargs.items():
@@ -135,6 +143,7 @@ def create(
         if (
             response_format == "verbose_json"
             or response_format == AudioTranscriptionResponseFormat.VERBOSE_JSON
+            or diarize
         ):
             # Create response with model validation that preserves extra fields
             return AudioTranscriptionVerboseResponse.model_validate(response.data)
@@ -158,6 +167,7 @@ async def create(
         timestamp_granularities: Optional[
             Union[str, AudioTimestampGranularities]
         ] = None,
+        diarize: bool = False,
         **kwargs: Any,
     ) -> Union[AudioTranscriptionResponse, AudioTranscriptionVerboseResponse]:
         """
@@ -180,7 +190,11 @@ async def create(
             timestamp_granularities: The timestamp granularities to populate for this
                 transcription. response_format must be set verbose_json to use timestamp
                 granularities. Either or both of these options are supported: word, or segment.
-
+            diarize: Whether to enable speaker diarization. When enabled, you will get the speaker id for each word in the transcription.
+                In the response, in the words array, you will get the speaker id for each word.
+                In addition, the response includes a speaker_segments array; each entry contains the segment's speaker id, its start and end time, and all the words in the segment.
+                You can use the speaker_id to group the words by speaker.
+                You can use the speaker_segments to get the start and end time of each speaker segment.
         Returns:
             The transcribed text in the requested format.
         """
@@ -239,6 +253,9 @@ async def create(
                 )
             )
 
+        if diarize:
+            params_data["diarize"] = diarize
+
         # Add any additional kwargs
         # Convert boolean values to lowercase strings for proper form encoding
         for key, value in kwargs.items():
@@ -271,6 +288,7 @@ async def create(
         if (
             response_format == "verbose_json"
             or response_format == AudioTranscriptionResponseFormat.VERBOSE_JSON
+            or diarize
         ):
             # Create response with model validation that preserves extra fields
             return AudioTranscriptionVerboseResponse.model_validate(response.data)

From 601e7e3bcb3773738f0f651d3c55e94679f5b8f2 Mon Sep 17 00:00:00 2001
From: Rajas Bansal <rbansal@together.ai>
Date: Thu, 30 Oct 2025 20:20:11 -0700
Subject: [PATCH 2/3] Remove top-level words assertions from diarization test

---
 tests/integration/resources/test_transcriptions.py | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/tests/integration/resources/test_transcriptions.py b/tests/integration/resources/test_transcriptions.py
index aae0377e..9202b85a 100644
--- a/tests/integration/resources/test_transcriptions.py
+++ b/tests/integration/resources/test_transcriptions.py
@@ -36,19 +36,6 @@ def validate_diarization_response(response_dict):
             assert "end" in word
             assert "speaker_id" in word
 
-    # Validate top-level words field
-    assert "words" in response_dict
-    assert isinstance(response_dict["words"], list)
-    assert len(response_dict["words"]) > 0
-
-    # Validate each word in top-level words
-    for word in response_dict["words"]:
-        assert "id" in word
-        assert "word" in word
-        assert "start" in word
-        assert "end" in word
-        assert "speaker_id" in word
-
 
 class TestTogetherTranscriptions:
     @pytest.fixture

From e166e1586a07d197b22853fde8347937e63455bf Mon Sep 17 00:00:00 2001
From: Rajas Bansal <rbansal@together.ai>
Date: Thu, 30 Oct 2025 20:22:12 -0700
Subject: [PATCH 3/3] Black formatting

---
 src/together/resources/audio/speech.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/together/resources/audio/speech.py b/src/together/resources/audio/speech.py
index 3c726b6c..1c3231b7 100644
--- a/src/together/resources/audio/speech.py
+++ b/src/together/resources/audio/speech.py
@@ -58,7 +58,7 @@ def create(
         """
 
         if sample_rate is None:
-            if 'cartesia' in model:
+            if "cartesia" in model:
                 sample_rate = 44100
             else:
                 sample_rate = 24000