10 changes: 8 additions & 2 deletions src/together/resources/audio/speech.py
@@ -30,7 +30,7 @@ def create(
response_format: str = "wav",
language: str = "en",
response_encoding: str = "pcm_f32le",
sample_rate: int = 44100,
sample_rate: int | None = None,
stream: bool = False,
**kwargs: Any,
) -> AudioSpeechStreamResponse:
@@ -49,14 +49,20 @@ def create(
response_encoding (str, optional): Audio encoding of response.
Defaults to "pcm_f32le".
sample_rate (int, optional): Sampling rate to use for the output audio.
Defaults to 44100.
Defaults to None. If not provided, the default sampling rate for the model will be used.
stream (bool, optional): If true, output is streamed for several characters at a time.
Defaults to False.

Returns:
Union[bytes, Iterator[AudioSpeechStreamChunk]]: The generated audio as bytes or an iterator over audio stream chunks.
"""

if sample_rate is None:
if "cartesia" in model:
sample_rate = 44100
else:
sample_rate = 24000

requestor = api_requestor.APIRequestor(
client=self._client,
)
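A minimal usage sketch of the new default, assuming a Together client configured via TOGETHER_API_KEY; the model and voice names below are only illustrative. Omitting sample_rate now yields 44100 Hz for Cartesia models and 24000 Hz for everything else, while an explicit value still takes precedence.

```python
from together import Together

client = Together()  # assumes TOGETHER_API_KEY is set in the environment

# With this change, omitting sample_rate lets the SDK pick a model-appropriate
# default: 44100 Hz when "cartesia" appears in the model name, 24000 Hz otherwise.
speech = client.audio.speech.create(
    model="cartesia/sonic",             # illustrative model name
    input="Hello from the audio API.",
    voice="default voice",              # illustrative voice name
)

# An explicit sample_rate still overrides the model-based default.
speech_fixed = client.audio.speech.create(
    model="cartesia/sonic",
    input="Hello again.",
    voice="default voice",
    sample_rate=48000,
)
```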
22 changes: 20 additions & 2 deletions src/together/resources/audio/transcriptions.py
@@ -30,6 +30,7 @@ def create(
timestamp_granularities: Optional[
Union[str, AudioTimestampGranularities]
] = None,
diarize: bool = False,
**kwargs: Any,
) -> Union[AudioTranscriptionResponse, AudioTranscriptionVerboseResponse]:
"""
@@ -52,7 +53,11 @@ def create(
timestamp_granularities: The timestamp granularities to populate for this
transcription. response_format must be set to verbose_json to use timestamp
granularities. Either or both of these options are supported: word and segment.

diarize: Whether to enable speaker diarization. When enabled, every entry in the
words array carries a speaker_id, which can be used to group words by speaker.
The response also includes a speaker_segments array, where each segment has a
speaker_id, the start and end times of that speaker turn, and the words spoken
within it.
Returns:
The transcribed text in the requested format.
"""
@@ -103,6 +108,9 @@ def create(
else timestamp_granularities
)

if diarize:
params_data["diarize"] = diarize

# Add any additional kwargs
# Convert boolean values to lowercase strings for proper form encoding
for key, value in kwargs.items():
@@ -135,6 +143,7 @@ def create(
if (
response_format == "verbose_json"
or response_format == AudioTranscriptionResponseFormat.VERBOSE_JSON
or diarize
):
# Create response with model validation that preserves extra fields
return AudioTranscriptionVerboseResponse.model_validate(response.data)
@@ -158,6 +167,7 @@ async def create(
timestamp_granularities: Optional[
Union[str, AudioTimestampGranularities]
] = None,
diarize: bool = False,
**kwargs: Any,
) -> Union[AudioTranscriptionResponse, AudioTranscriptionVerboseResponse]:
"""
@@ -180,7 +190,11 @@ async def create(
timestamp_granularities: The timestamp granularities to populate for this
transcription. response_format must be set to verbose_json to use timestamp
granularities. Either or both of these options are supported: word and segment.

diarize: Whether to enable speaker diarization. When enabled, every entry in the
words array carries a speaker_id, which can be used to group words by speaker.
The response also includes a speaker_segments array, where each segment has a
speaker_id, the start and end times of that speaker turn, and the words spoken
within it.
Returns:
The transcribed text in the requested format.
"""
@@ -239,6 +253,9 @@ async def create(
)
)

if diarize:
params_data["diarize"] = diarize

# Add any additional kwargs
# Convert boolean values to lowercase strings for proper form encoding
for key, value in kwargs.items():
@@ -271,6 +288,7 @@ async def create(
if (
response_format == "verbose_json"
or response_format == AudioTranscriptionResponseFormat.VERBOSE_JSON
or diarize
):
# Create response with model validation that preserves extra fields
return AudioTranscriptionVerboseResponse.model_validate(response.data)
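A usage sketch of the new diarize flag. The file keyword and the model name are assumptions for illustration; the words, speaker_id, and speaker_segments fields follow the docstring above, and dict-style access goes through model_dump() since the verbose response is a pydantic model that preserves extra fields.

```python
from collections import defaultdict

from together import Together

client = Together()

# diarize=True forces the verbose response shape, so word-level metadata is
# available even without response_format="verbose_json".
with open("meeting.wav", "rb") as audio_file:
    transcript = client.audio.transcriptions.create(
        file=audio_file,                   # assumed keyword for the audio input
        model="openai/whisper-large-v3",   # illustrative model name
        diarize=True,
    )

data = transcript.model_dump()  # extra fields such as speaker_segments are preserved

# Group words by speaker via the per-word speaker_id.
words_by_speaker = defaultdict(list)
for word in data["words"]:
    words_by_speaker[word["speaker_id"]].append(word["word"])

# Or walk speaker_segments, which carry start/end times for each speaker turn.
for segment in data["speaker_segments"]:
    speaker = segment["speaker_id"]
    print(f'{speaker}: {segment["start"]:.2f}s-{segment["end"]:.2f}s')
```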
13 changes: 0 additions & 13 deletions tests/integration/resources/test_transcriptions.py
@@ -36,19 +36,6 @@ def validate_diarization_response(response_dict):
assert "end" in word
assert "speaker_id" in word

# Validate top-level words field
assert "words" in response_dict
assert isinstance(response_dict["words"], list)
assert len(response_dict["words"]) > 0

# Validate each word in top-level words
for word in response_dict["words"]:
assert "id" in word
assert "word" in word
assert "start" in word
assert "end" in word
assert "speaker_id" in word


class TestTogetherTranscriptions:
@pytest.fixture