From 6f73231202528d01d10c2a7a63db5ecaee2ac970 Mon Sep 17 00:00:00 2001 From: stainless-bot Date: Fri, 21 Jun 2024 16:28:48 +0000 Subject: [PATCH] feat(api): OpenAPI spec update via Stainless API --- .stats.yml | 2 +- src/together/_base_client.py | 25 +- src/together/_utils/__init__.py | 1 + src/together/_utils/_reflection.py | 8 + src/together/_utils/_sync.py | 19 +- src/together/resources/chat/completions.py | 390 +++++++++++------- src/together/resources/completions.py | 338 +++++++++------ .../types/chat/completion_create_params.py | 92 +++-- .../types/completion_create_params.py | 76 ++-- 9 files changed, 585 insertions(+), 366 deletions(-) create mode 100644 src/together/_utils/_reflection.py diff --git a/.stats.yml b/.stats.yml index 02655e1f..d6da9ca3 100644 --- a/.stats.yml +++ b/.stats.yml @@ -1,2 +1,2 @@ configured_endpoints: 15 -openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/togetherai%2FTogetherAI-5934359dd4fbab352cb5042ffbf08374bd3d3b6bc0550fd09797de44626772fe.yml +openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/togetherai%2FTogetherAI-33661dd8fd4c26ecd595dee22e2c9274e6c4699ad8de5ece233e0d37376c6b7c.yml diff --git a/src/together/_base_client.py b/src/together/_base_client.py index 66db09be..a806809b 100644 --- a/src/together/_base_client.py +++ b/src/together/_base_client.py @@ -60,7 +60,7 @@ RequestOptions, ModelBuilderProtocol, ) -from ._utils import is_dict, is_list, is_given, lru_cache, is_mapping +from ._utils import is_dict, is_list, asyncify, is_given, lru_cache, is_mapping from ._compat import model_copy, model_dump from ._models import GenericModel, FinalRequestOptions, validate_type, construct_type from ._response import ( @@ -358,6 +358,7 @@ def __init__( self._custom_query = custom_query or {} self._strict_response_validation = _strict_response_validation self._idempotency_header = None + self._platform: Platform | None = None if max_retries is None: # pyright: ignore[reportUnnecessaryComparison] raise TypeError( @@ -456,7 +457,7 @@ def _build_request( raise RuntimeError(f"Unexpected JSON data type, {type(json_data)}, cannot merge with `extra_body`") headers = self._build_headers(options) - params = _merge_mappings(self._custom_query, options.params) + params = _merge_mappings(self.default_query, options.params) content_type = headers.get("Content-Type") # If the given Content-Type header is multipart/form-data then it @@ -592,6 +593,12 @@ def default_headers(self) -> dict[str, str | Omit]: **self._custom_headers, } + @property + def default_query(self) -> dict[str, object]: + return { + **self._custom_query, + } + def _validate_headers( self, headers: Headers, # noqa: ARG002 @@ -616,7 +623,10 @@ def base_url(self, url: URL | str) -> None: self._base_url = self._enforce_trailing_slash(url if isinstance(url, URL) else URL(url)) def platform_headers(self) -> Dict[str, str]: - return platform_headers(self._version) + # the actual implementation is in a separate `lru_cache` decorated + # function because adding `lru_cache` to methods will leak memory + # https://github.com/python/cpython/issues/88476 + return platform_headers(self._version, platform=self._platform) def _parse_retry_after_header(self, response_headers: Optional[httpx.Headers] = None) -> float | None: """Returns a float of the number of seconds (not milliseconds) to wait after retrying, or None if unspecified. 
@@ -1492,6 +1502,11 @@ async def _request( stream_cls: type[_AsyncStreamT] | None, remaining_retries: int | None, ) -> ResponseT | _AsyncStreamT: + if self._platform is None: + # `get_platform` can make blocking IO calls so we + # execute it earlier while we are in an async context + self._platform = await asyncify(get_platform)() + cast_to = self._maybe_override_cast_to(cast_to, options) await self._prepare_options(options) @@ -1915,11 +1930,11 @@ def get_platform() -> Platform: @lru_cache(maxsize=None) -def platform_headers(version: str) -> Dict[str, str]: +def platform_headers(version: str, *, platform: Platform | None) -> Dict[str, str]: return { "X-Stainless-Lang": "python", "X-Stainless-Package-Version": version, - "X-Stainless-OS": str(get_platform()), + "X-Stainless-OS": str(platform or get_platform()), "X-Stainless-Arch": str(get_architecture()), "X-Stainless-Runtime": get_python_runtime(), "X-Stainless-Runtime-Version": get_python_version(), diff --git a/src/together/_utils/__init__.py b/src/together/_utils/__init__.py index 31b5b227..667e2473 100644 --- a/src/together/_utils/__init__.py +++ b/src/together/_utils/__init__.py @@ -49,3 +49,4 @@ maybe_transform as maybe_transform, async_maybe_transform as async_maybe_transform, ) +from ._reflection import function_has_argument as function_has_argument diff --git a/src/together/_utils/_reflection.py b/src/together/_utils/_reflection.py new file mode 100644 index 00000000..e134f58e --- /dev/null +++ b/src/together/_utils/_reflection.py @@ -0,0 +1,8 @@ +import inspect +from typing import Any, Callable + + +def function_has_argument(func: Callable[..., Any], arg_name: str) -> bool: + """Returns whether or not the given function has a specific parameter""" + sig = inspect.signature(func) + return arg_name in sig.parameters diff --git a/src/together/_utils/_sync.py b/src/together/_utils/_sync.py index 595924e5..d0d81033 100644 --- a/src/together/_utils/_sync.py +++ b/src/together/_utils/_sync.py @@ -7,6 +7,8 @@ import anyio import anyio.to_thread +from ._reflection import function_has_argument + T_Retval = TypeVar("T_Retval") T_ParamSpec = ParamSpec("T_ParamSpec") @@ -59,6 +61,21 @@ def do_work(arg1, arg2, kwarg1="", kwarg2="") -> str: async def wrapper(*args: T_ParamSpec.args, **kwargs: T_ParamSpec.kwargs) -> T_Retval: partial_f = functools.partial(function, *args, **kwargs) - return await anyio.to_thread.run_sync(partial_f, cancellable=cancellable, limiter=limiter) + + # In `v4.1.0` anyio added the `abandon_on_cancel` argument and deprecated the old + # `cancellable` argument, so we need to use the new `abandon_on_cancel` to avoid + # surfacing deprecation warnings. + if function_has_argument(anyio.to_thread.run_sync, "abandon_on_cancel"): + return await anyio.to_thread.run_sync( + partial_f, + abandon_on_cancel=cancellable, + limiter=limiter, + ) + + return await anyio.to_thread.run_sync( + partial_f, + cancellable=cancellable, + limiter=limiter, + ) return wrapper diff --git a/src/together/resources/chat/completions.py b/src/together/resources/chat/completions.py index 43125d39..0d69a7c8 100644 --- a/src/together/resources/chat/completions.py +++ b/src/together/resources/chat/completions.py @@ -81,53 +81,66 @@ def create( model: The name of the model to query. - echo: If set, the response will contain the prompt, and will also return prompt - logprobs if set with logprobs. + echo: If true, the response will contain the prompt. Can be used with `logprobs` to + return prompt logprobs. 
- frequency_penalty: The `frequency_penalty` parameter is a number between -2.0 and 2.0 where a - positive value will decrease the likelihood of repeating tokens that were - mentioned prior. + frequency_penalty: A number between -2.0 and 2.0 where a positive value decreases the likelihood of + repeating tokens that have already been mentioned. - logit_bias: The `logit_bias` parameter allows us to adjust the likelihood of specific tokens - appearing in the generated output. + logit_bias: Adjusts the likelihood of specific tokens appearing in the generated output. logprobs: Determines the number of most likely tokens to return at each token position log - probabilities to return + probabilities to return. max_tokens: The maximum number of tokens to generate. - min_p: The `min_p` parameter is a number between 0 and 1 and an alternative to - `temperature`. + min_p: A number between 0 and 1 that can be used as an alternative to temperature. - n: Number of generations to return + n: The number of completions to generate for each prompt. - presence_penalty: The `presence_penalty` parameter is a number between -2.0 and 2.0 where a - positive value will increase the likelihood of a model talking about new topics. + presence_penalty: A number between -2.0 and 2.0 where a positive value increases the likelihood of + a model talking about new topics. repetition_penalty: A number that controls the diversity of generated text by reducing the likelihood of repeated sequences. Higher values decrease repetition. - response_format: Specifies the format of the response. + response_format: An object specifying the format that the model must output. - safety_model: The name of the safety model to use. + safety_model: The name of the moderation model used to validate tokens. Choose from the + available moderation models found + [here](https://docs.together.ai/docs/inference-models#moderation-models). - stop: A list of string sequences that will truncate (stop) inference text output. + stop: A list of string sequences that will truncate (stop) inference text output. For + example, "" will stop generation as soon as the model generates the given + token. - stream: If set, tokens are returned as Server-Sent Events as they are made available. - Stream terminates with `data: [DONE]`. If false, return a single JSON object - containing the results. + stream: If true, stream tokens as Server-Sent Events as the model generates them instead + of waiting for the full model response. The stream terminates with + `data: [DONE]`. If false, return a single JSON object containing the results. - temperature: Determines the degree of randomness in the response. + temperature: A decimal number from 0-1 that determines the degree of randomness in the + response. A temperature less than 1 favors more correctness and is appropriate + for question answering or summarization. A value closer to 1 introduces more + randomness in the output. - tool_choice: The choice of tool to use. + tool_choice: Controls which (if any) function is called by the model. By default uses `auto`, + which lets the model pick between generating a message or calling a function. - tools: A list of tools to be used in the query. + tools: A list of tools the model may call. Currently, only functions are supported as a + tool. Use this to provide a list of functions the model may generate JSON inputs + for. - top_k: The `top_k` parameter is used to limit the number of choices for the next - predicted word or token. 
+ top_k: An integer that's used to limit the number of choices for the next predicted + word or token. It specifies the maximum number of tokens to consider at each + step, based on their probability of occurrence. This technique helps to speed up + the generation process and can improve the quality of the generated text by + focusing on the most likely options. - top_p: The `top_p` (nucleus) parameter is used to dynamically adjust the number of - choices for each predicted token based on the cumulative probabilities. + top_p: A percentage (also called the nucleus parameter) that's used to dynamically + adjust the number of choices for each predicted token based on the cumulative + probabilities. It specifies a probability threshold below which all less likely + tokens are filtered out. This technique helps maintain diversity and generate + more fluent and natural-sounding text. extra_headers: Send extra headers @@ -178,53 +191,66 @@ def create( model: The name of the model to query. - stream: If set, tokens are returned as Server-Sent Events as they are made available. - Stream terminates with `data: [DONE]`. If false, return a single JSON object - containing the results. + stream: If true, stream tokens as Server-Sent Events as the model generates them instead + of waiting for the full model response. The stream terminates with + `data: [DONE]`. If false, return a single JSON object containing the results. - echo: If set, the response will contain the prompt, and will also return prompt - logprobs if set with logprobs. + echo: If true, the response will contain the prompt. Can be used with `logprobs` to + return prompt logprobs. - frequency_penalty: The `frequency_penalty` parameter is a number between -2.0 and 2.0 where a - positive value will decrease the likelihood of repeating tokens that were - mentioned prior. + frequency_penalty: A number between -2.0 and 2.0 where a positive value decreases the likelihood of + repeating tokens that have already been mentioned. - logit_bias: The `logit_bias` parameter allows us to adjust the likelihood of specific tokens - appearing in the generated output. + logit_bias: Adjusts the likelihood of specific tokens appearing in the generated output. logprobs: Determines the number of most likely tokens to return at each token position log - probabilities to return + probabilities to return. max_tokens: The maximum number of tokens to generate. - min_p: The `min_p` parameter is a number between 0 and 1 and an alternative to - `temperature`. + min_p: A number between 0 and 1 that can be used as an alternative to temperature. - n: Number of generations to return + n: The number of completions to generate for each prompt. - presence_penalty: The `presence_penalty` parameter is a number between -2.0 and 2.0 where a - positive value will increase the likelihood of a model talking about new topics. + presence_penalty: A number between -2.0 and 2.0 where a positive value increases the likelihood of + a model talking about new topics. repetition_penalty: A number that controls the diversity of generated text by reducing the likelihood of repeated sequences. Higher values decrease repetition. - response_format: Specifies the format of the response. + response_format: An object specifying the format that the model must output. - safety_model: The name of the safety model to use. + safety_model: The name of the moderation model used to validate tokens. 
Choose from the + available moderation models found + [here](https://docs.together.ai/docs/inference-models#moderation-models). - stop: A list of string sequences that will truncate (stop) inference text output. + stop: A list of string sequences that will truncate (stop) inference text output. For + example, "" will stop generation as soon as the model generates the given + token. - temperature: Determines the degree of randomness in the response. + temperature: A decimal number from 0-1 that determines the degree of randomness in the + response. A temperature less than 1 favors more correctness and is appropriate + for question answering or summarization. A value closer to 1 introduces more + randomness in the output. - tool_choice: The choice of tool to use. + tool_choice: Controls which (if any) function is called by the model. By default uses `auto`, + which lets the model pick between generating a message or calling a function. - tools: A list of tools to be used in the query. + tools: A list of tools the model may call. Currently, only functions are supported as a + tool. Use this to provide a list of functions the model may generate JSON inputs + for. - top_k: The `top_k` parameter is used to limit the number of choices for the next - predicted word or token. + top_k: An integer that's used to limit the number of choices for the next predicted + word or token. It specifies the maximum number of tokens to consider at each + step, based on their probability of occurrence. This technique helps to speed up + the generation process and can improve the quality of the generated text by + focusing on the most likely options. - top_p: The `top_p` (nucleus) parameter is used to dynamically adjust the number of - choices for each predicted token based on the cumulative probabilities. + top_p: A percentage (also called the nucleus parameter) that's used to dynamically + adjust the number of choices for each predicted token based on the cumulative + probabilities. It specifies a probability threshold below which all less likely + tokens are filtered out. This technique helps maintain diversity and generate + more fluent and natural-sounding text. extra_headers: Send extra headers @@ -275,53 +301,66 @@ def create( model: The name of the model to query. - stream: If set, tokens are returned as Server-Sent Events as they are made available. - Stream terminates with `data: [DONE]`. If false, return a single JSON object - containing the results. + stream: If true, stream tokens as Server-Sent Events as the model generates them instead + of waiting for the full model response. The stream terminates with + `data: [DONE]`. If false, return a single JSON object containing the results. - echo: If set, the response will contain the prompt, and will also return prompt - logprobs if set with logprobs. + echo: If true, the response will contain the prompt. Can be used with `logprobs` to + return prompt logprobs. - frequency_penalty: The `frequency_penalty` parameter is a number between -2.0 and 2.0 where a - positive value will decrease the likelihood of repeating tokens that were - mentioned prior. + frequency_penalty: A number between -2.0 and 2.0 where a positive value decreases the likelihood of + repeating tokens that have already been mentioned. - logit_bias: The `logit_bias` parameter allows us to adjust the likelihood of specific tokens - appearing in the generated output. + logit_bias: Adjusts the likelihood of specific tokens appearing in the generated output. 
logprobs: Determines the number of most likely tokens to return at each token position log - probabilities to return + probabilities to return. max_tokens: The maximum number of tokens to generate. - min_p: The `min_p` parameter is a number between 0 and 1 and an alternative to - `temperature`. + min_p: A number between 0 and 1 that can be used as an alternative to temperature. - n: Number of generations to return + n: The number of completions to generate for each prompt. - presence_penalty: The `presence_penalty` parameter is a number between -2.0 and 2.0 where a - positive value will increase the likelihood of a model talking about new topics. + presence_penalty: A number between -2.0 and 2.0 where a positive value increases the likelihood of + a model talking about new topics. repetition_penalty: A number that controls the diversity of generated text by reducing the likelihood of repeated sequences. Higher values decrease repetition. - response_format: Specifies the format of the response. + response_format: An object specifying the format that the model must output. - safety_model: The name of the safety model to use. + safety_model: The name of the moderation model used to validate tokens. Choose from the + available moderation models found + [here](https://docs.together.ai/docs/inference-models#moderation-models). - stop: A list of string sequences that will truncate (stop) inference text output. + stop: A list of string sequences that will truncate (stop) inference text output. For + example, "" will stop generation as soon as the model generates the given + token. - temperature: Determines the degree of randomness in the response. + temperature: A decimal number from 0-1 that determines the degree of randomness in the + response. A temperature less than 1 favors more correctness and is appropriate + for question answering or summarization. A value closer to 1 introduces more + randomness in the output. - tool_choice: The choice of tool to use. + tool_choice: Controls which (if any) function is called by the model. By default uses `auto`, + which lets the model pick between generating a message or calling a function. - tools: A list of tools to be used in the query. + tools: A list of tools the model may call. Currently, only functions are supported as a + tool. Use this to provide a list of functions the model may generate JSON inputs + for. - top_k: The `top_k` parameter is used to limit the number of choices for the next - predicted word or token. + top_k: An integer that's used to limit the number of choices for the next predicted + word or token. It specifies the maximum number of tokens to consider at each + step, based on their probability of occurrence. This technique helps to speed up + the generation process and can improve the quality of the generated text by + focusing on the most likely options. - top_p: The `top_p` (nucleus) parameter is used to dynamically adjust the number of - choices for each predicted token based on the cumulative probabilities. + top_p: A percentage (also called the nucleus parameter) that's used to dynamically + adjust the number of choices for each predicted token based on the cumulative + probabilities. It specifies a probability threshold below which all less likely + tokens are filtered out. This technique helps maintain diversity and generate + more fluent and natural-sounding text. extra_headers: Send extra headers @@ -448,53 +487,66 @@ async def create( model: The name of the model to query. 
- echo: If set, the response will contain the prompt, and will also return prompt - logprobs if set with logprobs. + echo: If true, the response will contain the prompt. Can be used with `logprobs` to + return prompt logprobs. - frequency_penalty: The `frequency_penalty` parameter is a number between -2.0 and 2.0 where a - positive value will decrease the likelihood of repeating tokens that were - mentioned prior. + frequency_penalty: A number between -2.0 and 2.0 where a positive value decreases the likelihood of + repeating tokens that have already been mentioned. - logit_bias: The `logit_bias` parameter allows us to adjust the likelihood of specific tokens - appearing in the generated output. + logit_bias: Adjusts the likelihood of specific tokens appearing in the generated output. logprobs: Determines the number of most likely tokens to return at each token position log - probabilities to return + probabilities to return. max_tokens: The maximum number of tokens to generate. - min_p: The `min_p` parameter is a number between 0 and 1 and an alternative to - `temperature`. + min_p: A number between 0 and 1 that can be used as an alternative to temperature. - n: Number of generations to return + n: The number of completions to generate for each prompt. - presence_penalty: The `presence_penalty` parameter is a number between -2.0 and 2.0 where a - positive value will increase the likelihood of a model talking about new topics. + presence_penalty: A number between -2.0 and 2.0 where a positive value increases the likelihood of + a model talking about new topics. repetition_penalty: A number that controls the diversity of generated text by reducing the likelihood of repeated sequences. Higher values decrease repetition. - response_format: Specifies the format of the response. + response_format: An object specifying the format that the model must output. - safety_model: The name of the safety model to use. + safety_model: The name of the moderation model used to validate tokens. Choose from the + available moderation models found + [here](https://docs.together.ai/docs/inference-models#moderation-models). - stop: A list of string sequences that will truncate (stop) inference text output. + stop: A list of string sequences that will truncate (stop) inference text output. For + example, "" will stop generation as soon as the model generates the given + token. - stream: If set, tokens are returned as Server-Sent Events as they are made available. - Stream terminates with `data: [DONE]`. If false, return a single JSON object - containing the results. + stream: If true, stream tokens as Server-Sent Events as the model generates them instead + of waiting for the full model response. The stream terminates with + `data: [DONE]`. If false, return a single JSON object containing the results. - temperature: Determines the degree of randomness in the response. + temperature: A decimal number from 0-1 that determines the degree of randomness in the + response. A temperature less than 1 favors more correctness and is appropriate + for question answering or summarization. A value closer to 1 introduces more + randomness in the output. - tool_choice: The choice of tool to use. + tool_choice: Controls which (if any) function is called by the model. By default uses `auto`, + which lets the model pick between generating a message or calling a function. - tools: A list of tools to be used in the query. + tools: A list of tools the model may call. Currently, only functions are supported as a + tool. 
Use this to provide a list of functions the model may generate JSON inputs + for. - top_k: The `top_k` parameter is used to limit the number of choices for the next - predicted word or token. + top_k: An integer that's used to limit the number of choices for the next predicted + word or token. It specifies the maximum number of tokens to consider at each + step, based on their probability of occurrence. This technique helps to speed up + the generation process and can improve the quality of the generated text by + focusing on the most likely options. - top_p: The `top_p` (nucleus) parameter is used to dynamically adjust the number of - choices for each predicted token based on the cumulative probabilities. + top_p: A percentage (also called the nucleus parameter) that's used to dynamically + adjust the number of choices for each predicted token based on the cumulative + probabilities. It specifies a probability threshold below which all less likely + tokens are filtered out. This technique helps maintain diversity and generate + more fluent and natural-sounding text. extra_headers: Send extra headers @@ -545,53 +597,66 @@ async def create( model: The name of the model to query. - stream: If set, tokens are returned as Server-Sent Events as they are made available. - Stream terminates with `data: [DONE]`. If false, return a single JSON object - containing the results. + stream: If true, stream tokens as Server-Sent Events as the model generates them instead + of waiting for the full model response. The stream terminates with + `data: [DONE]`. If false, return a single JSON object containing the results. - echo: If set, the response will contain the prompt, and will also return prompt - logprobs if set with logprobs. + echo: If true, the response will contain the prompt. Can be used with `logprobs` to + return prompt logprobs. - frequency_penalty: The `frequency_penalty` parameter is a number between -2.0 and 2.0 where a - positive value will decrease the likelihood of repeating tokens that were - mentioned prior. + frequency_penalty: A number between -2.0 and 2.0 where a positive value decreases the likelihood of + repeating tokens that have already been mentioned. - logit_bias: The `logit_bias` parameter allows us to adjust the likelihood of specific tokens - appearing in the generated output. + logit_bias: Adjusts the likelihood of specific tokens appearing in the generated output. logprobs: Determines the number of most likely tokens to return at each token position log - probabilities to return + probabilities to return. max_tokens: The maximum number of tokens to generate. - min_p: The `min_p` parameter is a number between 0 and 1 and an alternative to - `temperature`. + min_p: A number between 0 and 1 that can be used as an alternative to temperature. - n: Number of generations to return + n: The number of completions to generate for each prompt. - presence_penalty: The `presence_penalty` parameter is a number between -2.0 and 2.0 where a - positive value will increase the likelihood of a model talking about new topics. + presence_penalty: A number between -2.0 and 2.0 where a positive value increases the likelihood of + a model talking about new topics. repetition_penalty: A number that controls the diversity of generated text by reducing the likelihood of repeated sequences. Higher values decrease repetition. - response_format: Specifies the format of the response. + response_format: An object specifying the format that the model must output. 
- safety_model: The name of the safety model to use. + safety_model: The name of the moderation model used to validate tokens. Choose from the + available moderation models found + [here](https://docs.together.ai/docs/inference-models#moderation-models). - stop: A list of string sequences that will truncate (stop) inference text output. + stop: A list of string sequences that will truncate (stop) inference text output. For + example, "" will stop generation as soon as the model generates the given + token. - temperature: Determines the degree of randomness in the response. + temperature: A decimal number from 0-1 that determines the degree of randomness in the + response. A temperature less than 1 favors more correctness and is appropriate + for question answering or summarization. A value closer to 1 introduces more + randomness in the output. - tool_choice: The choice of tool to use. + tool_choice: Controls which (if any) function is called by the model. By default uses `auto`, + which lets the model pick between generating a message or calling a function. - tools: A list of tools to be used in the query. + tools: A list of tools the model may call. Currently, only functions are supported as a + tool. Use this to provide a list of functions the model may generate JSON inputs + for. - top_k: The `top_k` parameter is used to limit the number of choices for the next - predicted word or token. + top_k: An integer that's used to limit the number of choices for the next predicted + word or token. It specifies the maximum number of tokens to consider at each + step, based on their probability of occurrence. This technique helps to speed up + the generation process and can improve the quality of the generated text by + focusing on the most likely options. - top_p: The `top_p` (nucleus) parameter is used to dynamically adjust the number of - choices for each predicted token based on the cumulative probabilities. + top_p: A percentage (also called the nucleus parameter) that's used to dynamically + adjust the number of choices for each predicted token based on the cumulative + probabilities. It specifies a probability threshold below which all less likely + tokens are filtered out. This technique helps maintain diversity and generate + more fluent and natural-sounding text. extra_headers: Send extra headers @@ -642,53 +707,66 @@ async def create( model: The name of the model to query. - stream: If set, tokens are returned as Server-Sent Events as they are made available. - Stream terminates with `data: [DONE]`. If false, return a single JSON object - containing the results. + stream: If true, stream tokens as Server-Sent Events as the model generates them instead + of waiting for the full model response. The stream terminates with + `data: [DONE]`. If false, return a single JSON object containing the results. - echo: If set, the response will contain the prompt, and will also return prompt - logprobs if set with logprobs. + echo: If true, the response will contain the prompt. Can be used with `logprobs` to + return prompt logprobs. - frequency_penalty: The `frequency_penalty` parameter is a number between -2.0 and 2.0 where a - positive value will decrease the likelihood of repeating tokens that were - mentioned prior. + frequency_penalty: A number between -2.0 and 2.0 where a positive value decreases the likelihood of + repeating tokens that have already been mentioned. 
- logit_bias: The `logit_bias` parameter allows us to adjust the likelihood of specific tokens - appearing in the generated output. + logit_bias: Adjusts the likelihood of specific tokens appearing in the generated output. logprobs: Determines the number of most likely tokens to return at each token position log - probabilities to return + probabilities to return. max_tokens: The maximum number of tokens to generate. - min_p: The `min_p` parameter is a number between 0 and 1 and an alternative to - `temperature`. + min_p: A number between 0 and 1 that can be used as an alternative to temperature. - n: Number of generations to return + n: The number of completions to generate for each prompt. - presence_penalty: The `presence_penalty` parameter is a number between -2.0 and 2.0 where a - positive value will increase the likelihood of a model talking about new topics. + presence_penalty: A number between -2.0 and 2.0 where a positive value increases the likelihood of + a model talking about new topics. repetition_penalty: A number that controls the diversity of generated text by reducing the likelihood of repeated sequences. Higher values decrease repetition. - response_format: Specifies the format of the response. + response_format: An object specifying the format that the model must output. - safety_model: The name of the safety model to use. + safety_model: The name of the moderation model used to validate tokens. Choose from the + available moderation models found + [here](https://docs.together.ai/docs/inference-models#moderation-models). - stop: A list of string sequences that will truncate (stop) inference text output. + stop: A list of string sequences that will truncate (stop) inference text output. For + example, "" will stop generation as soon as the model generates the given + token. - temperature: Determines the degree of randomness in the response. + temperature: A decimal number from 0-1 that determines the degree of randomness in the + response. A temperature less than 1 favors more correctness and is appropriate + for question answering or summarization. A value closer to 1 introduces more + randomness in the output. - tool_choice: The choice of tool to use. + tool_choice: Controls which (if any) function is called by the model. By default uses `auto`, + which lets the model pick between generating a message or calling a function. - tools: A list of tools to be used in the query. + tools: A list of tools the model may call. Currently, only functions are supported as a + tool. Use this to provide a list of functions the model may generate JSON inputs + for. - top_k: The `top_k` parameter is used to limit the number of choices for the next - predicted word or token. + top_k: An integer that's used to limit the number of choices for the next predicted + word or token. It specifies the maximum number of tokens to consider at each + step, based on their probability of occurrence. This technique helps to speed up + the generation process and can improve the quality of the generated text by + focusing on the most likely options. - top_p: The `top_p` (nucleus) parameter is used to dynamically adjust the number of - choices for each predicted token based on the cumulative probabilities. + top_p: A percentage (also called the nucleus parameter) that's used to dynamically + adjust the number of choices for each predicted token based on the cumulative + probabilities. It specifies a probability threshold below which all less likely + tokens are filtered out. 
This technique helps maintain diversity and generate + more fluent and natural-sounding text. extra_headers: Send extra headers diff --git a/src/together/resources/completions.py b/src/together/resources/completions.py index 163fc7a5..3f777147 100644 --- a/src/together/resources/completions.py +++ b/src/together/resources/completions.py @@ -76,46 +76,57 @@ def create( prompt: A string providing context for the model to complete. - echo: If set, the response will contain the prompt, and will also return prompt - logprobs if set with logprobs. + echo: If true, the response will contain the prompt. Can be used with `logprobs` to + return prompt logprobs. - frequency_penalty: The `frequency_penalty` parameter is a number between -2.0 and 2.0 where a - positive value will decrease the likelihood of repeating tokens that were - mentioned prior. + frequency_penalty: A number between -2.0 and 2.0 where a positive value decreases the likelihood of + repeating tokens that have already been mentioned. - logit_bias: The `logit_bias` parameter allows us to adjust the likelihood of specific tokens - appearing in the generated output. + logit_bias: Adjusts the likelihood of specific tokens appearing in the generated output. logprobs: Determines the number of most likely tokens to return at each token position log - probabilities to return + probabilities to return. max_tokens: The maximum number of tokens to generate. - min_p: The `min_p` parameter is a number between 0 and 1 and an alternative to - `temperature`. + min_p: A number between 0 and 1 that can be used as an alternative to temperature. - n: Number of generations to return + n: The number of completions to generate for each prompt. - presence_penalty: The `presence_penalty` parameter is a number between -2.0 and 2.0 where a - positive value will increase the likelihood of a model talking about new topics. + presence_penalty: A number between -2.0 and 2.0 where a positive value increases the likelihood of + a model talking about new topics. repetition_penalty: A number that controls the diversity of generated text by reducing the likelihood of repeated sequences. Higher values decrease repetition. - safety_model: The name of the safety model to use. + safety_model: The name of the moderation model used to validate tokens. Choose from the + available moderation models found + [here](https://docs.together.ai/docs/inference-models#moderation-models). - stop: A list of string sequences that will truncate (stop) inference text output. + stop: A list of string sequences that will truncate (stop) inference text output. For + example, "" will stop generation as soon as the model generates the given + token. - stream: If set, tokens are returned as Server-Sent Events as they are made available. - Stream terminates with `data: [DONE]` + stream: If true, stream tokens as Server-Sent Events as the model generates them instead + of waiting for the full model response. The stream terminates with + `data: [DONE]`. If false, return a single JSON object containing the results. - temperature: Determines the degree of randomness in the response. + temperature: A decimal number from 0-1 that determines the degree of randomness in the + response. A temperature less than 1 favors more correctness and is appropriate + for question answering or summarization. A value closer to 1 introduces more + randomness in the output. - top_k: The `top_k` parameter is used to limit the number of choices for the next - predicted word or token. 
+ top_k: An integer that's used to limit the number of choices for the next predicted + word or token. It specifies the maximum number of tokens to consider at each + step, based on their probability of occurrence. This technique helps to speed up + the generation process and can improve the quality of the generated text by + focusing on the most likely options. - top_p: The `top_p` (nucleus) parameter is used to dynamically adjust the number of - choices for each predicted token based on the cumulative probabilities. + top_p: A percentage (also called the nucleus parameter) that's used to dynamically + adjust the number of choices for each predicted token based on the cumulative + probabilities. It specifies a probability threshold below which all less likely + tokens are filtered out. This technique helps maintain diversity and generate + more fluent and natural-sounding text. extra_headers: Send extra headers @@ -163,46 +174,57 @@ def create( prompt: A string providing context for the model to complete. - stream: If set, tokens are returned as Server-Sent Events as they are made available. - Stream terminates with `data: [DONE]` + stream: If true, stream tokens as Server-Sent Events as the model generates them instead + of waiting for the full model response. The stream terminates with + `data: [DONE]`. If false, return a single JSON object containing the results. - echo: If set, the response will contain the prompt, and will also return prompt - logprobs if set with logprobs. + echo: If true, the response will contain the prompt. Can be used with `logprobs` to + return prompt logprobs. - frequency_penalty: The `frequency_penalty` parameter is a number between -2.0 and 2.0 where a - positive value will decrease the likelihood of repeating tokens that were - mentioned prior. + frequency_penalty: A number between -2.0 and 2.0 where a positive value decreases the likelihood of + repeating tokens that have already been mentioned. - logit_bias: The `logit_bias` parameter allows us to adjust the likelihood of specific tokens - appearing in the generated output. + logit_bias: Adjusts the likelihood of specific tokens appearing in the generated output. logprobs: Determines the number of most likely tokens to return at each token position log - probabilities to return + probabilities to return. max_tokens: The maximum number of tokens to generate. - min_p: The `min_p` parameter is a number between 0 and 1 and an alternative to - `temperature`. + min_p: A number between 0 and 1 that can be used as an alternative to temperature. - n: Number of generations to return + n: The number of completions to generate for each prompt. - presence_penalty: The `presence_penalty` parameter is a number between -2.0 and 2.0 where a - positive value will increase the likelihood of a model talking about new topics. + presence_penalty: A number between -2.0 and 2.0 where a positive value increases the likelihood of + a model talking about new topics. repetition_penalty: A number that controls the diversity of generated text by reducing the likelihood of repeated sequences. Higher values decrease repetition. - safety_model: The name of the safety model to use. + safety_model: The name of the moderation model used to validate tokens. Choose from the + available moderation models found + [here](https://docs.together.ai/docs/inference-models#moderation-models). - stop: A list of string sequences that will truncate (stop) inference text output. 
+ stop: A list of string sequences that will truncate (stop) inference text output. For + example, "" will stop generation as soon as the model generates the given + token. - temperature: Determines the degree of randomness in the response. + temperature: A decimal number from 0-1 that determines the degree of randomness in the + response. A temperature less than 1 favors more correctness and is appropriate + for question answering or summarization. A value closer to 1 introduces more + randomness in the output. - top_k: The `top_k` parameter is used to limit the number of choices for the next - predicted word or token. + top_k: An integer that's used to limit the number of choices for the next predicted + word or token. It specifies the maximum number of tokens to consider at each + step, based on their probability of occurrence. This technique helps to speed up + the generation process and can improve the quality of the generated text by + focusing on the most likely options. - top_p: The `top_p` (nucleus) parameter is used to dynamically adjust the number of - choices for each predicted token based on the cumulative probabilities. + top_p: A percentage (also called the nucleus parameter) that's used to dynamically + adjust the number of choices for each predicted token based on the cumulative + probabilities. It specifies a probability threshold below which all less likely + tokens are filtered out. This technique helps maintain diversity and generate + more fluent and natural-sounding text. extra_headers: Send extra headers @@ -250,46 +272,57 @@ def create( prompt: A string providing context for the model to complete. - stream: If set, tokens are returned as Server-Sent Events as they are made available. - Stream terminates with `data: [DONE]` + stream: If true, stream tokens as Server-Sent Events as the model generates them instead + of waiting for the full model response. The stream terminates with + `data: [DONE]`. If false, return a single JSON object containing the results. - echo: If set, the response will contain the prompt, and will also return prompt - logprobs if set with logprobs. + echo: If true, the response will contain the prompt. Can be used with `logprobs` to + return prompt logprobs. - frequency_penalty: The `frequency_penalty` parameter is a number between -2.0 and 2.0 where a - positive value will decrease the likelihood of repeating tokens that were - mentioned prior. + frequency_penalty: A number between -2.0 and 2.0 where a positive value decreases the likelihood of + repeating tokens that have already been mentioned. - logit_bias: The `logit_bias` parameter allows us to adjust the likelihood of specific tokens - appearing in the generated output. + logit_bias: Adjusts the likelihood of specific tokens appearing in the generated output. logprobs: Determines the number of most likely tokens to return at each token position log - probabilities to return + probabilities to return. max_tokens: The maximum number of tokens to generate. - min_p: The `min_p` parameter is a number between 0 and 1 and an alternative to - `temperature`. + min_p: A number between 0 and 1 that can be used as an alternative to temperature. - n: Number of generations to return + n: The number of completions to generate for each prompt. - presence_penalty: The `presence_penalty` parameter is a number between -2.0 and 2.0 where a - positive value will increase the likelihood of a model talking about new topics. 
+ presence_penalty: A number between -2.0 and 2.0 where a positive value increases the likelihood of + a model talking about new topics. repetition_penalty: A number that controls the diversity of generated text by reducing the likelihood of repeated sequences. Higher values decrease repetition. - safety_model: The name of the safety model to use. + safety_model: The name of the moderation model used to validate tokens. Choose from the + available moderation models found + [here](https://docs.together.ai/docs/inference-models#moderation-models). - stop: A list of string sequences that will truncate (stop) inference text output. + stop: A list of string sequences that will truncate (stop) inference text output. For + example, "" will stop generation as soon as the model generates the given + token. - temperature: Determines the degree of randomness in the response. + temperature: A decimal number from 0-1 that determines the degree of randomness in the + response. A temperature less than 1 favors more correctness and is appropriate + for question answering or summarization. A value closer to 1 introduces more + randomness in the output. - top_k: The `top_k` parameter is used to limit the number of choices for the next - predicted word or token. + top_k: An integer that's used to limit the number of choices for the next predicted + word or token. It specifies the maximum number of tokens to consider at each + step, based on their probability of occurrence. This technique helps to speed up + the generation process and can improve the quality of the generated text by + focusing on the most likely options. - top_p: The `top_p` (nucleus) parameter is used to dynamically adjust the number of - choices for each predicted token based on the cumulative probabilities. + top_p: A percentage (also called the nucleus parameter) that's used to dynamically + adjust the number of choices for each predicted token based on the cumulative + probabilities. It specifies a probability threshold below which all less likely + tokens are filtered out. This technique helps maintain diversity and generate + more fluent and natural-sounding text. extra_headers: Send extra headers @@ -407,46 +440,57 @@ async def create( prompt: A string providing context for the model to complete. - echo: If set, the response will contain the prompt, and will also return prompt - logprobs if set with logprobs. + echo: If true, the response will contain the prompt. Can be used with `logprobs` to + return prompt logprobs. - frequency_penalty: The `frequency_penalty` parameter is a number between -2.0 and 2.0 where a - positive value will decrease the likelihood of repeating tokens that were - mentioned prior. + frequency_penalty: A number between -2.0 and 2.0 where a positive value decreases the likelihood of + repeating tokens that have already been mentioned. - logit_bias: The `logit_bias` parameter allows us to adjust the likelihood of specific tokens - appearing in the generated output. + logit_bias: Adjusts the likelihood of specific tokens appearing in the generated output. logprobs: Determines the number of most likely tokens to return at each token position log - probabilities to return + probabilities to return. max_tokens: The maximum number of tokens to generate. - min_p: The `min_p` parameter is a number between 0 and 1 and an alternative to - `temperature`. + min_p: A number between 0 and 1 that can be used as an alternative to temperature. 
- n: Number of generations to return + n: The number of completions to generate for each prompt. - presence_penalty: The `presence_penalty` parameter is a number between -2.0 and 2.0 where a - positive value will increase the likelihood of a model talking about new topics. + presence_penalty: A number between -2.0 and 2.0 where a positive value increases the likelihood of + a model talking about new topics. repetition_penalty: A number that controls the diversity of generated text by reducing the likelihood of repeated sequences. Higher values decrease repetition. - safety_model: The name of the safety model to use. + safety_model: The name of the moderation model used to validate tokens. Choose from the + available moderation models found + [here](https://docs.together.ai/docs/inference-models#moderation-models). - stop: A list of string sequences that will truncate (stop) inference text output. + stop: A list of string sequences that will truncate (stop) inference text output. For + example, "" will stop generation as soon as the model generates the given + token. - stream: If set, tokens are returned as Server-Sent Events as they are made available. - Stream terminates with `data: [DONE]` + stream: If true, stream tokens as Server-Sent Events as the model generates them instead + of waiting for the full model response. The stream terminates with + `data: [DONE]`. If false, return a single JSON object containing the results. - temperature: Determines the degree of randomness in the response. + temperature: A decimal number from 0-1 that determines the degree of randomness in the + response. A temperature less than 1 favors more correctness and is appropriate + for question answering or summarization. A value closer to 1 introduces more + randomness in the output. - top_k: The `top_k` parameter is used to limit the number of choices for the next - predicted word or token. + top_k: An integer that's used to limit the number of choices for the next predicted + word or token. It specifies the maximum number of tokens to consider at each + step, based on their probability of occurrence. This technique helps to speed up + the generation process and can improve the quality of the generated text by + focusing on the most likely options. - top_p: The `top_p` (nucleus) parameter is used to dynamically adjust the number of - choices for each predicted token based on the cumulative probabilities. + top_p: A percentage (also called the nucleus parameter) that's used to dynamically + adjust the number of choices for each predicted token based on the cumulative + probabilities. It specifies a probability threshold below which all less likely + tokens are filtered out. This technique helps maintain diversity and generate + more fluent and natural-sounding text. extra_headers: Send extra headers @@ -494,46 +538,57 @@ async def create( prompt: A string providing context for the model to complete. - stream: If set, tokens are returned as Server-Sent Events as they are made available. - Stream terminates with `data: [DONE]` + stream: If true, stream tokens as Server-Sent Events as the model generates them instead + of waiting for the full model response. The stream terminates with + `data: [DONE]`. If false, return a single JSON object containing the results. - echo: If set, the response will contain the prompt, and will also return prompt - logprobs if set with logprobs. + echo: If true, the response will contain the prompt. Can be used with `logprobs` to + return prompt logprobs. 
- frequency_penalty: The `frequency_penalty` parameter is a number between -2.0 and 2.0 where a - positive value will decrease the likelihood of repeating tokens that were - mentioned prior. + frequency_penalty: A number between -2.0 and 2.0 where a positive value decreases the likelihood of + repeating tokens that have already been mentioned. - logit_bias: The `logit_bias` parameter allows us to adjust the likelihood of specific tokens - appearing in the generated output. + logit_bias: Adjusts the likelihood of specific tokens appearing in the generated output. logprobs: Determines the number of most likely tokens to return at each token position log - probabilities to return + probabilities to return. max_tokens: The maximum number of tokens to generate. - min_p: The `min_p` parameter is a number between 0 and 1 and an alternative to - `temperature`. + min_p: A number between 0 and 1 that can be used as an alternative to temperature. - n: Number of generations to return + n: The number of completions to generate for each prompt. - presence_penalty: The `presence_penalty` parameter is a number between -2.0 and 2.0 where a - positive value will increase the likelihood of a model talking about new topics. + presence_penalty: A number between -2.0 and 2.0 where a positive value increases the likelihood of + a model talking about new topics. repetition_penalty: A number that controls the diversity of generated text by reducing the likelihood of repeated sequences. Higher values decrease repetition. - safety_model: The name of the safety model to use. + safety_model: The name of the moderation model used to validate tokens. Choose from the + available moderation models found + [here](https://docs.together.ai/docs/inference-models#moderation-models). - stop: A list of string sequences that will truncate (stop) inference text output. + stop: A list of string sequences that will truncate (stop) inference text output. For + example, "" will stop generation as soon as the model generates the given + token. - temperature: Determines the degree of randomness in the response. + temperature: A decimal number from 0-1 that determines the degree of randomness in the + response. A temperature less than 1 favors more correctness and is appropriate + for question answering or summarization. A value closer to 1 introduces more + randomness in the output. - top_k: The `top_k` parameter is used to limit the number of choices for the next - predicted word or token. + top_k: An integer that's used to limit the number of choices for the next predicted + word or token. It specifies the maximum number of tokens to consider at each + step, based on their probability of occurrence. This technique helps to speed up + the generation process and can improve the quality of the generated text by + focusing on the most likely options. - top_p: The `top_p` (nucleus) parameter is used to dynamically adjust the number of - choices for each predicted token based on the cumulative probabilities. + top_p: A percentage (also called the nucleus parameter) that's used to dynamically + adjust the number of choices for each predicted token based on the cumulative + probabilities. It specifies a probability threshold below which all less likely + tokens are filtered out. This technique helps maintain diversity and generate + more fluent and natural-sounding text. extra_headers: Send extra headers @@ -581,46 +636,57 @@ async def create( prompt: A string providing context for the model to complete. 
- stream: If set, tokens are returned as Server-Sent Events as they are made available. - Stream terminates with `data: [DONE]` + stream: If true, stream tokens as Server-Sent Events as the model generates them instead + of waiting for the full model response. The stream terminates with + `data: [DONE]`. If false, return a single JSON object containing the results. - echo: If set, the response will contain the prompt, and will also return prompt - logprobs if set with logprobs. + echo: If true, the response will contain the prompt. Can be used with `logprobs` to + return prompt logprobs. - frequency_penalty: The `frequency_penalty` parameter is a number between -2.0 and 2.0 where a - positive value will decrease the likelihood of repeating tokens that were - mentioned prior. + frequency_penalty: A number between -2.0 and 2.0 where a positive value decreases the likelihood of + repeating tokens that have already been mentioned. - logit_bias: The `logit_bias` parameter allows us to adjust the likelihood of specific tokens - appearing in the generated output. + logit_bias: Adjusts the likelihood of specific tokens appearing in the generated output. logprobs: Determines the number of most likely tokens to return at each token position log - probabilities to return + probabilities to return. max_tokens: The maximum number of tokens to generate. - min_p: The `min_p` parameter is a number between 0 and 1 and an alternative to - `temperature`. + min_p: A number between 0 and 1 that can be used as an alternative to temperature. - n: Number of generations to return + n: The number of completions to generate for each prompt. - presence_penalty: The `presence_penalty` parameter is a number between -2.0 and 2.0 where a - positive value will increase the likelihood of a model talking about new topics. + presence_penalty: A number between -2.0 and 2.0 where a positive value increases the likelihood of + a model talking about new topics. repetition_penalty: A number that controls the diversity of generated text by reducing the likelihood of repeated sequences. Higher values decrease repetition. - safety_model: The name of the safety model to use. - - stop: A list of string sequences that will truncate (stop) inference text output. - - temperature: Determines the degree of randomness in the response. - - top_k: The `top_k` parameter is used to limit the number of choices for the next - predicted word or token. - - top_p: The `top_p` (nucleus) parameter is used to dynamically adjust the number of - choices for each predicted token based on the cumulative probabilities. + safety_model: The name of the moderation model used to validate tokens. Choose from the + available moderation models found + [here](https://docs.together.ai/docs/inference-models#moderation-models). + + stop: A list of string sequences that will truncate (stop) inference text output. For + example, "" will stop generation as soon as the model generates the given + token. + + temperature: A decimal number from 0-1 that determines the degree of randomness in the + response. A temperature less than 1 favors more correctness and is appropriate + for question answering or summarization. A value closer to 1 introduces more + randomness in the output. + + top_k: An integer that's used to limit the number of choices for the next predicted + word or token. It specifies the maximum number of tokens to consider at each + step, based on their probability of occurrence. 
+              step, based on their probability of occurrence. This technique helps to speed up
+              the generation process and can improve the quality of the generated text by
+              focusing on the most likely options.
+
+          top_p: A percentage (also called the nucleus parameter) that's used to dynamically
+              adjust the number of choices for each predicted token based on the cumulative
+              probabilities. It specifies a probability threshold below which all less likely
+              tokens are filtered out. This technique helps maintain diversity and generate
+              more fluent and natural-sounding text.

           extra_headers: Send extra headers
diff --git a/src/together/types/chat/completion_create_params.py b/src/together/types/chat/completion_create_params.py
index 05c6a86a..8b57ef5b 100644
--- a/src/together/types/chat/completion_create_params.py
+++ b/src/together/types/chat/completion_create_params.py
@@ -26,46 +26,39 @@ class CompletionCreateParamsBase(TypedDict, total=False):
     """The name of the model to query."""

     echo: bool
-    """
-    If set, the response will contain the prompt, and will also return prompt
-    logprobs if set with logprobs.
+    """If true, the response will contain the prompt.
+
+    Can be used with `logprobs` to return prompt logprobs.
     """

     frequency_penalty: float
     """
-    The `frequency_penalty` parameter is a number between -2.0 and 2.0 where a
-    positive value will decrease the likelihood of repeating tokens that were
-    mentioned prior.
+    A number between -2.0 and 2.0 where a positive value decreases the likelihood of
+    repeating tokens that have already been mentioned.
     """

     logit_bias: Dict[str, float]
-    """
-    The `logit_bias` parameter allows us to adjust the likelihood of specific tokens
-    appearing in the generated output.
-    """
+    """Adjusts the likelihood of specific tokens appearing in the generated output."""

     logprobs: int
     """
     Determines the number of most likely tokens to return at each token position log
-    probabilities to return
+    probabilities to return.
     """

     max_tokens: int
     """The maximum number of tokens to generate."""

     min_p: float
-    """
-    The `min_p` parameter is a number between 0 and 1 and an alternative to
-    `temperature`.
-    """
+    """A number between 0 and 1 that can be used as an alternative to temperature."""

     n: int
-    """Number of generations to return"""
+    """The number of completions to generate for each prompt."""

     presence_penalty: float
     """
-    The `presence_penalty` parameter is a number between -2.0 and 2.0 where a
-    positive value will increase the likelihood of a model talking about new topics.
+    A number between -2.0 and 2.0 where a positive value increases the likelihood of
+    a model talking about new topics.
     """

     repetition_penalty: float
@@ -75,33 +68,60 @@ class CompletionCreateParamsBase(TypedDict, total=False):
     """

     response_format: ResponseFormat
-    """Specifies the format of the response."""
+    """An object specifying the format that the model must output."""

     safety_model: str
-    """The name of the safety model to use."""
+    """The name of the moderation model used to validate tokens.
+
+    Choose from the available moderation models found
+    [here](https://docs.together.ai/docs/inference-models#moderation-models).
+    """

     stop: List[str]
-    """A list of string sequences that will truncate (stop) inference text output."""
+    """A list of string sequences that will truncate (stop) inference text output.
+
+    For example, "</s>" will stop generation as soon as the model generates the
+    given token.
+    """

     temperature: float
-    """Determines the degree of randomness in the response."""
+    """
+    A decimal number from 0-1 that determines the degree of randomness in the
+    response. A temperature less than 1 favors more correctness and is appropriate
+    for question answering or summarization. A value closer to 1 introduces more
+    randomness in the output.
+    """

     tool_choice: ToolChoice
-    """The choice of tool to use."""
+    """Controls which (if any) function is called by the model.
+
+    By default uses `auto`, which lets the model pick between generating a message
+    or calling a function.
+    """

     tools: Iterable[ToolsParam]
-    """A list of tools to be used in the query."""
+    """A list of tools the model may call.
+
+    Currently, only functions are supported as a tool. Use this to provide a list of
+    functions the model may generate JSON inputs for.
+    """

     top_k: int
     """
-    The `top_k` parameter is used to limit the number of choices for the next
-    predicted word or token.
+    An integer that's used to limit the number of choices for the next predicted
+    word or token. It specifies the maximum number of tokens to consider at each
+    step, based on their probability of occurrence. This technique helps to speed up
+    the generation process and can improve the quality of the generated text by
+    focusing on the most likely options.
     """

     top_p: float
     """
-    The `top_p` (nucleus) parameter is used to dynamically adjust the number of
-    choices for each predicted token based on the cumulative probabilities.
+    A percentage (also called the nucleus parameter) that's used to dynamically
+    adjust the number of choices for each predicted token based on the cumulative
+    probabilities. It specifies a probability threshold below which all less likely
+    tokens are filtered out. This technique helps maintain diversity and generate
+    more fluent and natural-sounding text.
     """

@@ -126,19 +146,19 @@ class ResponseFormat(TypedDict, total=False):

 class CompletionCreateParamsNonStreaming(CompletionCreateParamsBase):
     stream: Literal[False]
-    """If set, tokens are returned as Server-Sent Events as they are made available.
-
-    Stream terminates with `data: [DONE]`. If false, return a single JSON object
-    containing the results.
+    """
+    If true, stream tokens as Server-Sent Events as the model generates them instead
+    of waiting for the full model response. The stream terminates with
+    `data: [DONE]`. If false, return a single JSON object containing the results.
     """


 class CompletionCreateParamsStreaming(CompletionCreateParamsBase):
     stream: Required[Literal[True]]
-    """If set, tokens are returned as Server-Sent Events as they are made available.
-
-    Stream terminates with `data: [DONE]`. If false, return a single JSON object
-    containing the results.
+    """
+    If true, stream tokens as Server-Sent Events as the model generates them instead
+    of waiting for the full model response. The stream terminates with
+    `data: [DONE]`. If false, return a single JSON object containing the results.
     """

diff --git a/src/together/types/completion_create_params.py b/src/together/types/completion_create_params.py
index 7f4e1fef..050a5477 100644
--- a/src/together/types/completion_create_params.py
+++ b/src/together/types/completion_create_params.py
@@ -16,46 +16,39 @@ class CompletionCreateParamsBase(TypedDict, total=False):
     """A string providing context for the model to complete."""

     echo: bool
-    """
-    If set, the response will contain the prompt, and will also return prompt
-    logprobs if set with logprobs.
+    """If true, the response will contain the prompt.
+
+    Can be used with `logprobs` to return prompt logprobs.
     """

     frequency_penalty: float
     """
-    The `frequency_penalty` parameter is a number between -2.0 and 2.0 where a
-    positive value will decrease the likelihood of repeating tokens that were
-    mentioned prior.
+    A number between -2.0 and 2.0 where a positive value decreases the likelihood of
+    repeating tokens that have already been mentioned.
     """

     logit_bias: Dict[str, float]
-    """
-    The `logit_bias` parameter allows us to adjust the likelihood of specific tokens
-    appearing in the generated output.
-    """
+    """Adjusts the likelihood of specific tokens appearing in the generated output."""

     logprobs: int
     """
     Determines the number of most likely tokens to return at each token position log
-    probabilities to return
+    probabilities to return.
     """

     max_tokens: int
     """The maximum number of tokens to generate."""

     min_p: float
-    """
-    The `min_p` parameter is a number between 0 and 1 and an alternative to
-    `temperature`.
-    """
+    """A number between 0 and 1 that can be used as an alternative to temperature."""

     n: int
-    """Number of generations to return"""
+    """The number of completions to generate for each prompt."""

     presence_penalty: float
     """
-    The `presence_penalty` parameter is a number between -2.0 and 2.0 where a
-    positive value will increase the likelihood of a model talking about new topics.
+    A number between -2.0 and 2.0 where a positive value increases the likelihood of
+    a model talking about new topics.
     """

     repetition_penalty: float
@@ -65,40 +58,61 @@ class CompletionCreateParamsBase(TypedDict, total=False):
     """

     safety_model: str
-    """The name of the safety model to use."""
+    """The name of the moderation model used to validate tokens.
+
+    Choose from the available moderation models found
+    [here](https://docs.together.ai/docs/inference-models#moderation-models).
+    """

     stop: List[str]
-    """A list of string sequences that will truncate (stop) inference text output."""
+    """A list of string sequences that will truncate (stop) inference text output.
+
+    For example, "</s>" will stop generation as soon as the model generates the
+    given token.
+    """

     temperature: float
-    """Determines the degree of randomness in the response."""
+    """
+    A decimal number from 0-1 that determines the degree of randomness in the
+    response. A temperature less than 1 favors more correctness and is appropriate
+    for question answering or summarization. A value closer to 1 introduces more
+    randomness in the output.
+    """

     top_k: int
     """
-    The `top_k` parameter is used to limit the number of choices for the next
-    predicted word or token.
+    An integer that's used to limit the number of choices for the next predicted
+    word or token. It specifies the maximum number of tokens to consider at each
+    step, based on their probability of occurrence. This technique helps to speed up
+    the generation process and can improve the quality of the generated text by
+    focusing on the most likely options.
     """

     top_p: float
     """
-    The `top_p` (nucleus) parameter is used to dynamically adjust the number of
-    choices for each predicted token based on the cumulative probabilities.
+    A percentage (also called the nucleus parameter) that's used to dynamically
+    adjust the number of choices for each predicted token based on the cumulative
+    probabilities. It specifies a probability threshold below which all less likely
+    tokens are filtered out. This technique helps maintain diversity and generate
+    more fluent and natural-sounding text.
     """


 class CompletionCreateParamsNonStreaming(CompletionCreateParamsBase):
     stream: Literal[False]
-    """If set, tokens are returned as Server-Sent Events as they are made available.
-
-    Stream terminates with `data: [DONE]`
+    """
+    If true, stream tokens as Server-Sent Events as the model generates them instead
+    of waiting for the full model response. The stream terminates with
+    `data: [DONE]`. If false, return a single JSON object containing the results.
     """


 class CompletionCreateParamsStreaming(CompletionCreateParamsBase):
     stream: Required[Literal[True]]
-    """If set, tokens are returned as Server-Sent Events as they are made available.
-
-    Stream terminates with `data: [DONE]`
+    """
+    If true, stream tokens as Server-Sent Events as the model generates them instead
+    of waiting for the full model response. The stream terminates with
+    `data: [DONE]`. If false, return a single JSON object containing the results.
     """
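Reviewer note (not part of the patch): the sketch below shows how the re-documented chat parameters read in practice. It is a minimal, assumption-laden example: it presumes the generated client is exposed as `together.Together` (the usual Stainless convention), that `TOGETHER_API_KEY` is set in the environment, and that streaming chunks follow the OpenAI-style `choices[0].delta` shape described by the generated chat types. The model name and messages are purely illustrative.

import os

from together import Together  # assumed entry point; check the package README

client = Together(api_key=os.environ["TOGETHER_API_KEY"])

# Non-streaming call: returns a single JSON object containing the results.
completion = client.chat.completions.create(
    model="mistralai/Mixtral-8x7B-Instruct-v0.1",  # illustrative model name
    messages=[{"role": "user", "content": "Name three uses for sampling temperature."}],
    max_tokens=256,
    temperature=0.7,         # 0-1; values closer to 1 introduce more randomness
    top_p=0.9,               # nucleus threshold: filter out the least likely tokens
    top_k=50,                # consider at most 50 candidate tokens per step
    repetition_penalty=1.1,  # higher values decrease repetition
    stop=["</s>"],           # truncate output as soon as this sequence is generated
)
print(completion)

# Streaming call: tokens arrive as Server-Sent Events and the iterator
# stops once the server sends `data: [DONE]`.
stream = client.chat.completions.create(
    model="mistralai/Mixtral-8x7B-Instruct-v0.1",
    messages=[{"role": "user", "content": "Write one sentence about nucleus sampling."}],
    stream=True,
)
for chunk in stream:
    delta = chunk.choices[0].delta  # chunk layout assumed from the generated chat types
    if delta and delta.content:
        print(delta.content, end="", flush=True)
print()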
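The TypedDicts touched in `src/together/types/completion_create_params.py` can be exercised directly as well. The sketch below is likewise an assumption rather than part of the patch: the import path comes from the files in this diff, while the client construction and the unpacking of the params dict into `client.completions.create(...)` mirror the previous example.

import os

from together import Together  # assumed entry point, as above
from together.types.completion_create_params import CompletionCreateParamsNonStreaming

client = Together(api_key=os.environ["TOGETHER_API_KEY"])

# `stream: Literal[False]` selects the non-streaming variant, so the call
# returns a single JSON object rather than Server-Sent Events.
params: CompletionCreateParamsNonStreaming = {
    "model": "mistralai/Mixtral-8x7B-v0.1",             # illustrative model name
    "prompt": "The second law of thermodynamics says",  # context for the model to complete
    "max_tokens": 64,
    "echo": True,              # include the prompt in the response
    "logprobs": 1,             # also return per-token log probabilities
    "min_p": 0.05,             # alternative to temperature
    "frequency_penalty": 0.5,  # positive values discourage already-seen tokens
    "stop": ["\n\n"],
    "stream": False,
}

completion = client.completions.create(**params)
print(completion)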