From 6f73231202528d01d10c2a7a63db5ecaee2ac970 Mon Sep 17 00:00:00 2001 From: stainless-bot Date: Fri, 21 Jun 2024 16:28:48 +0000 Subject: [PATCH] feat(api): OpenAPI spec update via Stainless API --- .stats.yml | 2 +- src/together/_base_client.py | 25 +- src/together/_utils/__init__.py | 1 + src/together/_utils/_reflection.py | 8 + src/together/_utils/_sync.py | 19 +- src/together/resources/chat/completions.py | 390 +++++++++++------- src/together/resources/completions.py | 338 +++++++++------ .../types/chat/completion_create_params.py | 92 +++-- .../types/completion_create_params.py | 76 ++-- 9 files changed, 585 insertions(+), 366 deletions(-) create mode 100644 src/together/_utils/_reflection.py diff --git a/.stats.yml b/.stats.yml index 02655e1f..d6da9ca3 100644 --- a/.stats.yml +++ b/.stats.yml @@ -1,2 +1,2 @@ configured_endpoints: 15 -openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/togetherai%2FTogetherAI-5934359dd4fbab352cb5042ffbf08374bd3d3b6bc0550fd09797de44626772fe.yml +openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/togetherai%2FTogetherAI-33661dd8fd4c26ecd595dee22e2c9274e6c4699ad8de5ece233e0d37376c6b7c.yml diff --git a/src/together/_base_client.py b/src/together/_base_client.py index 66db09be..a806809b 100644 --- a/src/together/_base_client.py +++ b/src/together/_base_client.py @@ -60,7 +60,7 @@ RequestOptions, ModelBuilderProtocol, ) -from ._utils import is_dict, is_list, is_given, lru_cache, is_mapping +from ._utils import is_dict, is_list, asyncify, is_given, lru_cache, is_mapping from ._compat import model_copy, model_dump from ._models import GenericModel, FinalRequestOptions, validate_type, construct_type from ._response import ( @@ -358,6 +358,7 @@ def __init__( self._custom_query = custom_query or {} self._strict_response_validation = _strict_response_validation self._idempotency_header = None + self._platform: Platform | None = None if max_retries is None: # pyright: ignore[reportUnnecessaryComparison] raise TypeError( @@ -456,7 +457,7 @@ def _build_request( raise RuntimeError(f"Unexpected JSON data type, {type(json_data)}, cannot merge with `extra_body`") headers = self._build_headers(options) - params = _merge_mappings(self._custom_query, options.params) + params = _merge_mappings(self.default_query, options.params) content_type = headers.get("Content-Type") # If the given Content-Type header is multipart/form-data then it @@ -592,6 +593,12 @@ def default_headers(self) -> dict[str, str | Omit]: **self._custom_headers, } + @property + def default_query(self) -> dict[str, object]: + return { + **self._custom_query, + } + def _validate_headers( self, headers: Headers, # noqa: ARG002 @@ -616,7 +623,10 @@ def base_url(self, url: URL | str) -> None: self._base_url = self._enforce_trailing_slash(url if isinstance(url, URL) else URL(url)) def platform_headers(self) -> Dict[str, str]: - return platform_headers(self._version) + # the actual implementation is in a separate `lru_cache` decorated + # function because adding `lru_cache` to methods will leak memory + # https://github.com/python/cpython/issues/88476 + return platform_headers(self._version, platform=self._platform) def _parse_retry_after_header(self, response_headers: Optional[httpx.Headers] = None) -> float | None: """Returns a float of the number of seconds (not milliseconds) to wait after retrying, or None if unspecified. 
@@ -1492,6 +1502,11 @@ async def _request( stream_cls: type[_AsyncStreamT] | None, remaining_retries: int | None, ) -> ResponseT | _AsyncStreamT: + if self._platform is None: + # `get_platform` can make blocking IO calls so we + # execute it earlier while we are in an async context + self._platform = await asyncify(get_platform)() + cast_to = self._maybe_override_cast_to(cast_to, options) await self._prepare_options(options) @@ -1915,11 +1930,11 @@ def get_platform() -> Platform: @lru_cache(maxsize=None) -def platform_headers(version: str) -> Dict[str, str]: +def platform_headers(version: str, *, platform: Platform | None) -> Dict[str, str]: return { "X-Stainless-Lang": "python", "X-Stainless-Package-Version": version, - "X-Stainless-OS": str(get_platform()), + "X-Stainless-OS": str(platform or get_platform()), "X-Stainless-Arch": str(get_architecture()), "X-Stainless-Runtime": get_python_runtime(), "X-Stainless-Runtime-Version": get_python_version(), diff --git a/src/together/_utils/__init__.py b/src/together/_utils/__init__.py index 31b5b227..667e2473 100644 --- a/src/together/_utils/__init__.py +++ b/src/together/_utils/__init__.py @@ -49,3 +49,4 @@ maybe_transform as maybe_transform, async_maybe_transform as async_maybe_transform, ) +from ._reflection import function_has_argument as function_has_argument diff --git a/src/together/_utils/_reflection.py b/src/together/_utils/_reflection.py new file mode 100644 index 00000000..e134f58e --- /dev/null +++ b/src/together/_utils/_reflection.py @@ -0,0 +1,8 @@ +import inspect +from typing import Any, Callable + + +def function_has_argument(func: Callable[..., Any], arg_name: str) -> bool: + """Returns whether or not the given function has a specific parameter""" + sig = inspect.signature(func) + return arg_name in sig.parameters diff --git a/src/together/_utils/_sync.py b/src/together/_utils/_sync.py index 595924e5..d0d81033 100644 --- a/src/together/_utils/_sync.py +++ b/src/together/_utils/_sync.py @@ -7,6 +7,8 @@ import anyio import anyio.to_thread +from ._reflection import function_has_argument + T_Retval = TypeVar("T_Retval") T_ParamSpec = ParamSpec("T_ParamSpec") @@ -59,6 +61,21 @@ def do_work(arg1, arg2, kwarg1="", kwarg2="") -> str: async def wrapper(*args: T_ParamSpec.args, **kwargs: T_ParamSpec.kwargs) -> T_Retval: partial_f = functools.partial(function, *args, **kwargs) - return await anyio.to_thread.run_sync(partial_f, cancellable=cancellable, limiter=limiter) + + # In `v4.1.0` anyio added the `abandon_on_cancel` argument and deprecated the old + # `cancellable` argument, so we need to use the new `abandon_on_cancel` to avoid + # surfacing deprecation warnings. + if function_has_argument(anyio.to_thread.run_sync, "abandon_on_cancel"): + return await anyio.to_thread.run_sync( + partial_f, + abandon_on_cancel=cancellable, + limiter=limiter, + ) + + return await anyio.to_thread.run_sync( + partial_f, + cancellable=cancellable, + limiter=limiter, + ) return wrapper diff --git a/src/together/resources/chat/completions.py b/src/together/resources/chat/completions.py index 43125d39..0d69a7c8 100644 --- a/src/together/resources/chat/completions.py +++ b/src/together/resources/chat/completions.py @@ -81,53 +81,66 @@ def create( model: The name of the model to query. - echo: If set, the response will contain the prompt, and will also return prompt - logprobs if set with logprobs. + echo: If true, the response will contain the prompt. Can be used with `logprobs` to + return prompt logprobs. 
- frequency_penalty: The `frequency_penalty` parameter is a number between -2.0 and 2.0 where a - positive value will decrease the likelihood of repeating tokens that were - mentioned prior. + frequency_penalty: A number between -2.0 and 2.0 where a positive value decreases the likelihood of + repeating tokens that have already been mentioned. - logit_bias: The `logit_bias` parameter allows us to adjust the likelihood of specific tokens - appearing in the generated output. + logit_bias: Adjusts the likelihood of specific tokens appearing in the generated output. logprobs: Determines the number of most likely tokens to return at each token position log - probabilities to return + probabilities to return. max_tokens: The maximum number of tokens to generate. - min_p: The `min_p` parameter is a number between 0 and 1 and an alternative to - `temperature`. + min_p: A number between 0 and 1 that can be used as an alternative to temperature. - n: Number of generations to return + n: The number of completions to generate for each prompt. - presence_penalty: The `presence_penalty` parameter is a number between -2.0 and 2.0 where a - positive value will increase the likelihood of a model talking about new topics. + presence_penalty: A number between -2.0 and 2.0 where a positive value increases the likelihood of + a model talking about new topics. repetition_penalty: A number that controls the diversity of generated text by reducing the likelihood of repeated sequences. Higher values decrease repetition. - response_format: Specifies the format of the response. + response_format: An object specifying the format that the model must output. - safety_model: The name of the safety model to use. + safety_model: The name of the moderation model used to validate tokens. Choose from the + available moderation models found + [here](https://docs.together.ai/docs/inference-models#moderation-models). - stop: A list of string sequences that will truncate (stop) inference text output. + stop: A list of string sequences that will truncate (stop) inference text output. For + example, "" will stop generation as soon as the model generates the given + token. - stream: If set, tokens are returned as Server-Sent Events as they are made available. - Stream terminates with `data: [DONE]`. If false, return a single JSON object - containing the results. + stream: If true, stream tokens as Server-Sent Events as the model generates them instead + of waiting for the full model response. The stream terminates with + `data: [DONE]`. If false, return a single JSON object containing the results. - temperature: Determines the degree of randomness in the response. + temperature: A decimal number from 0-1 that determines the degree of randomness in the + response. A temperature less than 1 favors more correctness and is appropriate + for question answering or summarization. A value closer to 1 introduces more + randomness in the output. - tool_choice: The choice of tool to use. + tool_choice: Controls which (if any) function is called by the model. By default uses `auto`, + which lets the model pick between generating a message or calling a function. - tools: A list of tools to be used in the query. + tools: A list of tools the model may call. Currently, only functions are supported as a + tool. Use this to provide a list of functions the model may generate JSON inputs + for. - top_k: The `top_k` parameter is used to limit the number of choices for the next - predicted word or token. 
+ top_k: An integer that's used to limit the number of choices for the next predicted + word or token. It specifies the maximum number of tokens to consider at each + step, based on their probability of occurrence. This technique helps to speed up + the generation process and can improve the quality of the generated text by + focusing on the most likely options. - top_p: The `top_p` (nucleus) parameter is used to dynamically adjust the number of - choices for each predicted token based on the cumulative probabilities. + top_p: A percentage (also called the nucleus parameter) that's used to dynamically + adjust the number of choices for each predicted token based on the cumulative + probabilities. It specifies a probability threshold below which all less likely + tokens are filtered out. This technique helps maintain diversity and generate + more fluent and natural-sounding text. extra_headers: Send extra headers @@ -178,53 +191,66 @@ def create( model: The name of the model to query. - stream: If set, tokens are returned as Server-Sent Events as they are made available. - Stream terminates with `data: [DONE]`. If false, return a single JSON object - containing the results. + stream: If true, stream tokens as Server-Sent Events as the model generates them instead + of waiting for the full model response. The stream terminates with + `data: [DONE]`. If false, return a single JSON object containing the results. - echo: If set, the response will contain the prompt, and will also return prompt - logprobs if set with logprobs. + echo: If true, the response will contain the prompt. Can be used with `logprobs` to + return prompt logprobs. - frequency_penalty: The `frequency_penalty` parameter is a number between -2.0 and 2.0 where a - positive value will decrease the likelihood of repeating tokens that were - mentioned prior. + frequency_penalty: A number between -2.0 and 2.0 where a positive value decreases the likelihood of + repeating tokens that have already been mentioned. - logit_bias: The `logit_bias` parameter allows us to adjust the likelihood of specific tokens - appearing in the generated output. + logit_bias: Adjusts the likelihood of specific tokens appearing in the generated output. logprobs: Determines the number of most likely tokens to return at each token position log - probabilities to return + probabilities to return. max_tokens: The maximum number of tokens to generate. - min_p: The `min_p` parameter is a number between 0 and 1 and an alternative to - `temperature`. + min_p: A number between 0 and 1 that can be used as an alternative to temperature. - n: Number of generations to return + n: The number of completions to generate for each prompt. - presence_penalty: The `presence_penalty` parameter is a number between -2.0 and 2.0 where a - positive value will increase the likelihood of a model talking about new topics. + presence_penalty: A number between -2.0 and 2.0 where a positive value increases the likelihood of + a model talking about new topics. repetition_penalty: A number that controls the diversity of generated text by reducing the likelihood of repeated sequences. Higher values decrease repetition. - response_format: Specifies the format of the response. + response_format: An object specifying the format that the model must output. - safety_model: The name of the safety model to use. + safety_model: The name of the moderation model used to validate tokens. 
Choose from the + available moderation models found + [here](https://docs.together.ai/docs/inference-models#moderation-models). - stop: A list of string sequences that will truncate (stop) inference text output. + stop: A list of string sequences that will truncate (stop) inference text output. For + example, "" will stop generation as soon as the model generates the given + token. - temperature: Determines the degree of randomness in the response. + temperature: A decimal number from 0-1 that determines the degree of randomness in the + response. A temperature less than 1 favors more correctness and is appropriate + for question answering or summarization. A value closer to 1 introduces more + randomness in the output. - tool_choice: The choice of tool to use. + tool_choice: Controls which (if any) function is called by the model. By default uses `auto`, + which lets the model pick between generating a message or calling a function. - tools: A list of tools to be used in the query. + tools: A list of tools the model may call. Currently, only functions are supported as a + tool. Use this to provide a list of functions the model may generate JSON inputs + for. - top_k: The `top_k` parameter is used to limit the number of choices for the next - predicted word or token. + top_k: An integer that's used to limit the number of choices for the next predicted + word or token. It specifies the maximum number of tokens to consider at each + step, based on their probability of occurrence. This technique helps to speed up + the generation process and can improve the quality of the generated text by + focusing on the most likely options. - top_p: The `top_p` (nucleus) parameter is used to dynamically adjust the number of - choices for each predicted token based on the cumulative probabilities. + top_p: A percentage (also called the nucleus parameter) that's used to dynamically + adjust the number of choices for each predicted token based on the cumulative + probabilities. It specifies a probability threshold below which all less likely + tokens are filtered out. This technique helps maintain diversity and generate + more fluent and natural-sounding text. extra_headers: Send extra headers @@ -275,53 +301,66 @@ def create( model: The name of the model to query. - stream: If set, tokens are returned as Server-Sent Events as they are made available. - Stream terminates with `data: [DONE]`. If false, return a single JSON object - containing the results. + stream: If true, stream tokens as Server-Sent Events as the model generates them instead + of waiting for the full model response. The stream terminates with + `data: [DONE]`. If false, return a single JSON object containing the results. - echo: If set, the response will contain the prompt, and will also return prompt - logprobs if set with logprobs. + echo: If true, the response will contain the prompt. Can be used with `logprobs` to + return prompt logprobs. - frequency_penalty: The `frequency_penalty` parameter is a number between -2.0 and 2.0 where a - positive value will decrease the likelihood of repeating tokens that were - mentioned prior. + frequency_penalty: A number between -2.0 and 2.0 where a positive value decreases the likelihood of + repeating tokens that have already been mentioned. - logit_bias: The `logit_bias` parameter allows us to adjust the likelihood of specific tokens - appearing in the generated output. + logit_bias: Adjusts the likelihood of specific tokens appearing in the generated output. 
logprobs: Determines the number of most likely tokens to return at each token position log - probabilities to return + probabilities to return. max_tokens: The maximum number of tokens to generate. - min_p: The `min_p` parameter is a number between 0 and 1 and an alternative to - `temperature`. + min_p: A number between 0 and 1 that can be used as an alternative to temperature. - n: Number of generations to return + n: The number of completions to generate for each prompt. - presence_penalty: The `presence_penalty` parameter is a number between -2.0 and 2.0 where a - positive value will increase the likelihood of a model talking about new topics. + presence_penalty: A number between -2.0 and 2.0 where a positive value increases the likelihood of + a model talking about new topics. repetition_penalty: A number that controls the diversity of generated text by reducing the likelihood of repeated sequences. Higher values decrease repetition. - response_format: Specifies the format of the response. + response_format: An object specifying the format that the model must output. - safety_model: The name of the safety model to use. + safety_model: The name of the moderation model used to validate tokens. Choose from the + available moderation models found + [here](https://docs.together.ai/docs/inference-models#moderation-models). - stop: A list of string sequences that will truncate (stop) inference text output. + stop: A list of string sequences that will truncate (stop) inference text output. For + example, "" will stop generation as soon as the model generates the given + token. - temperature: Determines the degree of randomness in the response. + temperature: A decimal number from 0-1 that determines the degree of randomness in the + response. A temperature less than 1 favors more correctness and is appropriate + for question answering or summarization. A value closer to 1 introduces more + randomness in the output. - tool_choice: The choice of tool to use. + tool_choice: Controls which (if any) function is called by the model. By default uses `auto`, + which lets the model pick between generating a message or calling a function. - tools: A list of tools to be used in the query. + tools: A list of tools the model may call. Currently, only functions are supported as a + tool. Use this to provide a list of functions the model may generate JSON inputs + for. - top_k: The `top_k` parameter is used to limit the number of choices for the next - predicted word or token. + top_k: An integer that's used to limit the number of choices for the next predicted + word or token. It specifies the maximum number of tokens to consider at each + step, based on their probability of occurrence. This technique helps to speed up + the generation process and can improve the quality of the generated text by + focusing on the most likely options. - top_p: The `top_p` (nucleus) parameter is used to dynamically adjust the number of - choices for each predicted token based on the cumulative probabilities. + top_p: A percentage (also called the nucleus parameter) that's used to dynamically + adjust the number of choices for each predicted token based on the cumulative + probabilities. It specifies a probability threshold below which all less likely + tokens are filtered out. This technique helps maintain diversity and generate + more fluent and natural-sounding text. extra_headers: Send extra headers @@ -448,53 +487,66 @@ async def create( model: The name of the model to query. 
- echo: If set, the response will contain the prompt, and will also return prompt - logprobs if set with logprobs. + echo: If true, the response will contain the prompt. Can be used with `logprobs` to + return prompt logprobs. - frequency_penalty: The `frequency_penalty` parameter is a number between -2.0 and 2.0 where a - positive value will decrease the likelihood of repeating tokens that were - mentioned prior. + frequency_penalty: A number between -2.0 and 2.0 where a positive value decreases the likelihood of + repeating tokens that have already been mentioned. - logit_bias: The `logit_bias` parameter allows us to adjust the likelihood of specific tokens - appearing in the generated output. + logit_bias: Adjusts the likelihood of specific tokens appearing in the generated output. logprobs: Determines the number of most likely tokens to return at each token position log - probabilities to return + probabilities to return. max_tokens: The maximum number of tokens to generate. - min_p: The `min_p` parameter is a number between 0 and 1 and an alternative to - `temperature`. + min_p: A number between 0 and 1 that can be used as an alternative to temperature. - n: Number of generations to return + n: The number of completions to generate for each prompt. - presence_penalty: The `presence_penalty` parameter is a number between -2.0 and 2.0 where a - positive value will increase the likelihood of a model talking about new topics. + presence_penalty: A number between -2.0 and 2.0 where a positive value increases the likelihood of + a model talking about new topics. repetition_penalty: A number that controls the diversity of generated text by reducing the likelihood of repeated sequences. Higher values decrease repetition. - response_format: Specifies the format of the response. + response_format: An object specifying the format that the model must output. - safety_model: The name of the safety model to use. + safety_model: The name of the moderation model used to validate tokens. Choose from the + available moderation models found + [here](https://docs.together.ai/docs/inference-models#moderation-models). - stop: A list of string sequences that will truncate (stop) inference text output. + stop: A list of string sequences that will truncate (stop) inference text output. For + example, "" will stop generation as soon as the model generates the given + token. - stream: If set, tokens are returned as Server-Sent Events as they are made available. - Stream terminates with `data: [DONE]`. If false, return a single JSON object - containing the results. + stream: If true, stream tokens as Server-Sent Events as the model generates them instead + of waiting for the full model response. The stream terminates with + `data: [DONE]`. If false, return a single JSON object containing the results. - temperature: Determines the degree of randomness in the response. + temperature: A decimal number from 0-1 that determines the degree of randomness in the + response. A temperature less than 1 favors more correctness and is appropriate + for question answering or summarization. A value closer to 1 introduces more + randomness in the output. - tool_choice: The choice of tool to use. + tool_choice: Controls which (if any) function is called by the model. By default uses `auto`, + which lets the model pick between generating a message or calling a function. - tools: A list of tools to be used in the query. + tools: A list of tools the model may call. Currently, only functions are supported as a + tool. 
Use this to provide a list of functions the model may generate JSON inputs + for. - top_k: The `top_k` parameter is used to limit the number of choices for the next - predicted word or token. + top_k: An integer that's used to limit the number of choices for the next predicted + word or token. It specifies the maximum number of tokens to consider at each + step, based on their probability of occurrence. This technique helps to speed up + the generation process and can improve the quality of the generated text by + focusing on the most likely options. - top_p: The `top_p` (nucleus) parameter is used to dynamically adjust the number of - choices for each predicted token based on the cumulative probabilities. + top_p: A percentage (also called the nucleus parameter) that's used to dynamically + adjust the number of choices for each predicted token based on the cumulative + probabilities. It specifies a probability threshold below which all less likely + tokens are filtered out. This technique helps maintain diversity and generate + more fluent and natural-sounding text. extra_headers: Send extra headers @@ -545,53 +597,66 @@ async def create( model: The name of the model to query. - stream: If set, tokens are returned as Server-Sent Events as they are made available. - Stream terminates with `data: [DONE]`. If false, return a single JSON object - containing the results. + stream: If true, stream tokens as Server-Sent Events as the model generates them instead + of waiting for the full model response. The stream terminates with + `data: [DONE]`. If false, return a single JSON object containing the results. - echo: If set, the response will contain the prompt, and will also return prompt - logprobs if set with logprobs. + echo: If true, the response will contain the prompt. Can be used with `logprobs` to + return prompt logprobs. - frequency_penalty: The `frequency_penalty` parameter is a number between -2.0 and 2.0 where a - positive value will decrease the likelihood of repeating tokens that were - mentioned prior. + frequency_penalty: A number between -2.0 and 2.0 where a positive value decreases the likelihood of + repeating tokens that have already been mentioned. - logit_bias: The `logit_bias` parameter allows us to adjust the likelihood of specific tokens - appearing in the generated output. + logit_bias: Adjusts the likelihood of specific tokens appearing in the generated output. logprobs: Determines the number of most likely tokens to return at each token position log - probabilities to return + probabilities to return. max_tokens: The maximum number of tokens to generate. - min_p: The `min_p` parameter is a number between 0 and 1 and an alternative to - `temperature`. + min_p: A number between 0 and 1 that can be used as an alternative to temperature. - n: Number of generations to return + n: The number of completions to generate for each prompt. - presence_penalty: The `presence_penalty` parameter is a number between -2.0 and 2.0 where a - positive value will increase the likelihood of a model talking about new topics. + presence_penalty: A number between -2.0 and 2.0 where a positive value increases the likelihood of + a model talking about new topics. repetition_penalty: A number that controls the diversity of generated text by reducing the likelihood of repeated sequences. Higher values decrease repetition. - response_format: Specifies the format of the response. + response_format: An object specifying the format that the model must output. 
- safety_model: The name of the safety model to use. + safety_model: The name of the moderation model used to validate tokens. Choose from the + available moderation models found + [here](https://docs.together.ai/docs/inference-models#moderation-models). - stop: A list of string sequences that will truncate (stop) inference text output. + stop: A list of string sequences that will truncate (stop) inference text output. For + example, "" will stop generation as soon as the model generates the given + token. - temperature: Determines the degree of randomness in the response. + temperature: A decimal number from 0-1 that determines the degree of randomness in the + response. A temperature less than 1 favors more correctness and is appropriate + for question answering or summarization. A value closer to 1 introduces more + randomness in the output. - tool_choice: The choice of tool to use. + tool_choice: Controls which (if any) function is called by the model. By default uses `auto`, + which lets the model pick between generating a message or calling a function. - tools: A list of tools to be used in the query. + tools: A list of tools the model may call. Currently, only functions are supported as a + tool. Use this to provide a list of functions the model may generate JSON inputs + for. - top_k: The `top_k` parameter is used to limit the number of choices for the next - predicted word or token. + top_k: An integer that's used to limit the number of choices for the next predicted + word or token. It specifies the maximum number of tokens to consider at each + step, based on their probability of occurrence. This technique helps to speed up + the generation process and can improve the quality of the generated text by + focusing on the most likely options. - top_p: The `top_p` (nucleus) parameter is used to dynamically adjust the number of - choices for each predicted token based on the cumulative probabilities. + top_p: A percentage (also called the nucleus parameter) that's used to dynamically + adjust the number of choices for each predicted token based on the cumulative + probabilities. It specifies a probability threshold below which all less likely + tokens are filtered out. This technique helps maintain diversity and generate + more fluent and natural-sounding text. extra_headers: Send extra headers @@ -642,53 +707,66 @@ async def create( model: The name of the model to query. - stream: If set, tokens are returned as Server-Sent Events as they are made available. - Stream terminates with `data: [DONE]`. If false, return a single JSON object - containing the results. + stream: If true, stream tokens as Server-Sent Events as the model generates them instead + of waiting for the full model response. The stream terminates with + `data: [DONE]`. If false, return a single JSON object containing the results. - echo: If set, the response will contain the prompt, and will also return prompt - logprobs if set with logprobs. + echo: If true, the response will contain the prompt. Can be used with `logprobs` to + return prompt logprobs. - frequency_penalty: The `frequency_penalty` parameter is a number between -2.0 and 2.0 where a - positive value will decrease the likelihood of repeating tokens that were - mentioned prior. + frequency_penalty: A number between -2.0 and 2.0 where a positive value decreases the likelihood of + repeating tokens that have already been mentioned. 
- logit_bias: The `logit_bias` parameter allows us to adjust the likelihood of specific tokens - appearing in the generated output. + logit_bias: Adjusts the likelihood of specific tokens appearing in the generated output. logprobs: Determines the number of most likely tokens to return at each token position log - probabilities to return + probabilities to return. max_tokens: The maximum number of tokens to generate. - min_p: The `min_p` parameter is a number between 0 and 1 and an alternative to - `temperature`. + min_p: A number between 0 and 1 that can be used as an alternative to temperature. - n: Number of generations to return + n: The number of completions to generate for each prompt. - presence_penalty: The `presence_penalty` parameter is a number between -2.0 and 2.0 where a - positive value will increase the likelihood of a model talking about new topics. + presence_penalty: A number between -2.0 and 2.0 where a positive value increases the likelihood of + a model talking about new topics. repetition_penalty: A number that controls the diversity of generated text by reducing the likelihood of repeated sequences. Higher values decrease repetition. - response_format: Specifies the format of the response. + response_format: An object specifying the format that the model must output. - safety_model: The name of the safety model to use. + safety_model: The name of the moderation model used to validate tokens. Choose from the + available moderation models found + [here](https://docs.together.ai/docs/inference-models#moderation-models). - stop: A list of string sequences that will truncate (stop) inference text output. + stop: A list of string sequences that will truncate (stop) inference text output. For + example, "" will stop generation as soon as the model generates the given + token. - temperature: Determines the degree of randomness in the response. + temperature: A decimal number from 0-1 that determines the degree of randomness in the + response. A temperature less than 1 favors more correctness and is appropriate + for question answering or summarization. A value closer to 1 introduces more + randomness in the output. - tool_choice: The choice of tool to use. + tool_choice: Controls which (if any) function is called by the model. By default uses `auto`, + which lets the model pick between generating a message or calling a function. - tools: A list of tools to be used in the query. + tools: A list of tools the model may call. Currently, only functions are supported as a + tool. Use this to provide a list of functions the model may generate JSON inputs + for. - top_k: The `top_k` parameter is used to limit the number of choices for the next - predicted word or token. + top_k: An integer that's used to limit the number of choices for the next predicted + word or token. It specifies the maximum number of tokens to consider at each + step, based on their probability of occurrence. This technique helps to speed up + the generation process and can improve the quality of the generated text by + focusing on the most likely options. - top_p: The `top_p` (nucleus) parameter is used to dynamically adjust the number of - choices for each predicted token based on the cumulative probabilities. + top_p: A percentage (also called the nucleus parameter) that's used to dynamically + adjust the number of choices for each predicted token based on the cumulative + probabilities. It specifies a probability threshold below which all less likely + tokens are filtered out. 
This technique helps maintain diversity and generate + more fluent and natural-sounding text. extra_headers: Send extra headers diff --git a/src/together/resources/completions.py b/src/together/resources/completions.py index 163fc7a5..3f777147 100644 --- a/src/together/resources/completions.py +++ b/src/together/resources/completions.py @@ -76,46 +76,57 @@ def create( prompt: A string providing context for the model to complete. - echo: If set, the response will contain the prompt, and will also return prompt - logprobs if set with logprobs. + echo: If true, the response will contain the prompt. Can be used with `logprobs` to + return prompt logprobs. - frequency_penalty: The `frequency_penalty` parameter is a number between -2.0 and 2.0 where a - positive value will decrease the likelihood of repeating tokens that were - mentioned prior. + frequency_penalty: A number between -2.0 and 2.0 where a positive value decreases the likelihood of + repeating tokens that have already been mentioned. - logit_bias: The `logit_bias` parameter allows us to adjust the likelihood of specific tokens - appearing in the generated output. + logit_bias: Adjusts the likelihood of specific tokens appearing in the generated output. logprobs: Determines the number of most likely tokens to return at each token position log - probabilities to return + probabilities to return. max_tokens: The maximum number of tokens to generate. - min_p: The `min_p` parameter is a number between 0 and 1 and an alternative to - `temperature`. + min_p: A number between 0 and 1 that can be used as an alternative to temperature. - n: Number of generations to return + n: The number of completions to generate for each prompt. - presence_penalty: The `presence_penalty` parameter is a number between -2.0 and 2.0 where a - positive value will increase the likelihood of a model talking about new topics. + presence_penalty: A number between -2.0 and 2.0 where a positive value increases the likelihood of + a model talking about new topics. repetition_penalty: A number that controls the diversity of generated text by reducing the likelihood of repeated sequences. Higher values decrease repetition. - safety_model: The name of the safety model to use. + safety_model: The name of the moderation model used to validate tokens. Choose from the + available moderation models found + [here](https://docs.together.ai/docs/inference-models#moderation-models). - stop: A list of string sequences that will truncate (stop) inference text output. + stop: A list of string sequences that will truncate (stop) inference text output. For + example, "" will stop generation as soon as the model generates the given + token. - stream: If set, tokens are returned as Server-Sent Events as they are made available. - Stream terminates with `data: [DONE]` + stream: If true, stream tokens as Server-Sent Events as the model generates them instead + of waiting for the full model response. The stream terminates with + `data: [DONE]`. If false, return a single JSON object containing the results. - temperature: Determines the degree of randomness in the response. + temperature: A decimal number from 0-1 that determines the degree of randomness in the + response. A temperature less than 1 favors more correctness and is appropriate + for question answering or summarization. A value closer to 1 introduces more + randomness in the output. - top_k: The `top_k` parameter is used to limit the number of choices for the next - predicted word or token. 
+ top_k: An integer that's used to limit the number of choices for the next predicted + word or token. It specifies the maximum number of tokens to consider at each + step, based on their probability of occurrence. This technique helps to speed up + the generation process and can improve the quality of the generated text by + focusing on the most likely options. - top_p: The `top_p` (nucleus) parameter is used to dynamically adjust the number of - choices for each predicted token based on the cumulative probabilities. + top_p: A percentage (also called the nucleus parameter) that's used to dynamically + adjust the number of choices for each predicted token based on the cumulative + probabilities. It specifies a probability threshold below which all less likely + tokens are filtered out. This technique helps maintain diversity and generate + more fluent and natural-sounding text. extra_headers: Send extra headers @@ -163,46 +174,57 @@ def create( prompt: A string providing context for the model to complete. - stream: If set, tokens are returned as Server-Sent Events as they are made available. - Stream terminates with `data: [DONE]` + stream: If true, stream tokens as Server-Sent Events as the model generates them instead + of waiting for the full model response. The stream terminates with + `data: [DONE]`. If false, return a single JSON object containing the results. - echo: If set, the response will contain the prompt, and will also return prompt - logprobs if set with logprobs. + echo: If true, the response will contain the prompt. Can be used with `logprobs` to + return prompt logprobs. - frequency_penalty: The `frequency_penalty` parameter is a number between -2.0 and 2.0 where a - positive value will decrease the likelihood of repeating tokens that were - mentioned prior. + frequency_penalty: A number between -2.0 and 2.0 where a positive value decreases the likelihood of + repeating tokens that have already been mentioned. - logit_bias: The `logit_bias` parameter allows us to adjust the likelihood of specific tokens - appearing in the generated output. + logit_bias: Adjusts the likelihood of specific tokens appearing in the generated output. logprobs: Determines the number of most likely tokens to return at each token position log - probabilities to return + probabilities to return. max_tokens: The maximum number of tokens to generate. - min_p: The `min_p` parameter is a number between 0 and 1 and an alternative to - `temperature`. + min_p: A number between 0 and 1 that can be used as an alternative to temperature. - n: Number of generations to return + n: The number of completions to generate for each prompt. - presence_penalty: The `presence_penalty` parameter is a number between -2.0 and 2.0 where a - positive value will increase the likelihood of a model talking about new topics. + presence_penalty: A number between -2.0 and 2.0 where a positive value increases the likelihood of + a model talking about new topics. repetition_penalty: A number that controls the diversity of generated text by reducing the likelihood of repeated sequences. Higher values decrease repetition. - safety_model: The name of the safety model to use. + safety_model: The name of the moderation model used to validate tokens. Choose from the + available moderation models found + [here](https://docs.together.ai/docs/inference-models#moderation-models). - stop: A list of string sequences that will truncate (stop) inference text output. 
+ stop: A list of string sequences that will truncate (stop) inference text output. For + example, "" will stop generation as soon as the model generates the given + token. - temperature: Determines the degree of randomness in the response. + temperature: A decimal number from 0-1 that determines the degree of randomness in the + response. A temperature less than 1 favors more correctness and is appropriate + for question answering or summarization. A value closer to 1 introduces more + randomness in the output. - top_k: The `top_k` parameter is used to limit the number of choices for the next - predicted word or token. + top_k: An integer that's used to limit the number of choices for the next predicted + word or token. It specifies the maximum number of tokens to consider at each + step, based on their probability of occurrence. This technique helps to speed up + the generation process and can improve the quality of the generated text by + focusing on the most likely options. - top_p: The `top_p` (nucleus) parameter is used to dynamically adjust the number of - choices for each predicted token based on the cumulative probabilities. + top_p: A percentage (also called the nucleus parameter) that's used to dynamically + adjust the number of choices for each predicted token based on the cumulative + probabilities. It specifies a probability threshold below which all less likely + tokens are filtered out. This technique helps maintain diversity and generate + more fluent and natural-sounding text. extra_headers: Send extra headers @@ -250,46 +272,57 @@ def create( prompt: A string providing context for the model to complete. - stream: If set, tokens are returned as Server-Sent Events as they are made available. - Stream terminates with `data: [DONE]` + stream: If true, stream tokens as Server-Sent Events as the model generates them instead + of waiting for the full model response. The stream terminates with + `data: [DONE]`. If false, return a single JSON object containing the results. - echo: If set, the response will contain the prompt, and will also return prompt - logprobs if set with logprobs. + echo: If true, the response will contain the prompt. Can be used with `logprobs` to + return prompt logprobs. - frequency_penalty: The `frequency_penalty` parameter is a number between -2.0 and 2.0 where a - positive value will decrease the likelihood of repeating tokens that were - mentioned prior. + frequency_penalty: A number between -2.0 and 2.0 where a positive value decreases the likelihood of + repeating tokens that have already been mentioned. - logit_bias: The `logit_bias` parameter allows us to adjust the likelihood of specific tokens - appearing in the generated output. + logit_bias: Adjusts the likelihood of specific tokens appearing in the generated output. logprobs: Determines the number of most likely tokens to return at each token position log - probabilities to return + probabilities to return. max_tokens: The maximum number of tokens to generate. - min_p: The `min_p` parameter is a number between 0 and 1 and an alternative to - `temperature`. + min_p: A number between 0 and 1 that can be used as an alternative to temperature. - n: Number of generations to return + n: The number of completions to generate for each prompt. - presence_penalty: The `presence_penalty` parameter is a number between -2.0 and 2.0 where a - positive value will increase the likelihood of a model talking about new topics. 
+ presence_penalty: A number between -2.0 and 2.0 where a positive value increases the likelihood of + a model talking about new topics. repetition_penalty: A number that controls the diversity of generated text by reducing the likelihood of repeated sequences. Higher values decrease repetition. - safety_model: The name of the safety model to use. + safety_model: The name of the moderation model used to validate tokens. Choose from the + available moderation models found + [here](https://docs.together.ai/docs/inference-models#moderation-models). - stop: A list of string sequences that will truncate (stop) inference text output. + stop: A list of string sequences that will truncate (stop) inference text output. For + example, "" will stop generation as soon as the model generates the given + token. - temperature: Determines the degree of randomness in the response. + temperature: A decimal number from 0-1 that determines the degree of randomness in the + response. A temperature less than 1 favors more correctness and is appropriate + for question answering or summarization. A value closer to 1 introduces more + randomness in the output. - top_k: The `top_k` parameter is used to limit the number of choices for the next - predicted word or token. + top_k: An integer that's used to limit the number of choices for the next predicted + word or token. It specifies the maximum number of tokens to consider at each + step, based on their probability of occurrence. This technique helps to speed up + the generation process and can improve the quality of the generated text by + focusing on the most likely options. - top_p: The `top_p` (nucleus) parameter is used to dynamically adjust the number of - choices for each predicted token based on the cumulative probabilities. + top_p: A percentage (also called the nucleus parameter) that's used to dynamically + adjust the number of choices for each predicted token based on the cumulative + probabilities. It specifies a probability threshold below which all less likely + tokens are filtered out. This technique helps maintain diversity and generate + more fluent and natural-sounding text. extra_headers: Send extra headers @@ -407,46 +440,57 @@ async def create( prompt: A string providing context for the model to complete. - echo: If set, the response will contain the prompt, and will also return prompt - logprobs if set with logprobs. + echo: If true, the response will contain the prompt. Can be used with `logprobs` to + return prompt logprobs. - frequency_penalty: The `frequency_penalty` parameter is a number between -2.0 and 2.0 where a - positive value will decrease the likelihood of repeating tokens that were - mentioned prior. + frequency_penalty: A number between -2.0 and 2.0 where a positive value decreases the likelihood of + repeating tokens that have already been mentioned. - logit_bias: The `logit_bias` parameter allows us to adjust the likelihood of specific tokens - appearing in the generated output. + logit_bias: Adjusts the likelihood of specific tokens appearing in the generated output. logprobs: Determines the number of most likely tokens to return at each token position log - probabilities to return + probabilities to return. max_tokens: The maximum number of tokens to generate. - min_p: The `min_p` parameter is a number between 0 and 1 and an alternative to - `temperature`. + min_p: A number between 0 and 1 that can be used as an alternative to temperature. 
- n: Number of generations to return + n: The number of completions to generate for each prompt. - presence_penalty: The `presence_penalty` parameter is a number between -2.0 and 2.0 where a - positive value will increase the likelihood of a model talking about new topics. + presence_penalty: A number between -2.0 and 2.0 where a positive value increases the likelihood of + a model talking about new topics. repetition_penalty: A number that controls the diversity of generated text by reducing the likelihood of repeated sequences. Higher values decrease repetition. - safety_model: The name of the safety model to use. + safety_model: The name of the moderation model used to validate tokens. Choose from the + available moderation models found + [here](https://docs.together.ai/docs/inference-models#moderation-models). - stop: A list of string sequences that will truncate (stop) inference text output. + stop: A list of string sequences that will truncate (stop) inference text output. For + example, "" will stop generation as soon as the model generates the given + token. - stream: If set, tokens are returned as Server-Sent Events as they are made available. - Stream terminates with `data: [DONE]` + stream: If true, stream tokens as Server-Sent Events as the model generates them instead + of waiting for the full model response. The stream terminates with + `data: [DONE]`. If false, return a single JSON object containing the results. - temperature: Determines the degree of randomness in the response. + temperature: A decimal number from 0-1 that determines the degree of randomness in the + response. A temperature less than 1 favors more correctness and is appropriate + for question answering or summarization. A value closer to 1 introduces more + randomness in the output. - top_k: The `top_k` parameter is used to limit the number of choices for the next - predicted word or token. + top_k: An integer that's used to limit the number of choices for the next predicted + word or token. It specifies the maximum number of tokens to consider at each + step, based on their probability of occurrence. This technique helps to speed up + the generation process and can improve the quality of the generated text by + focusing on the most likely options. - top_p: The `top_p` (nucleus) parameter is used to dynamically adjust the number of - choices for each predicted token based on the cumulative probabilities. + top_p: A percentage (also called the nucleus parameter) that's used to dynamically + adjust the number of choices for each predicted token based on the cumulative + probabilities. It specifies a probability threshold below which all less likely + tokens are filtered out. This technique helps maintain diversity and generate + more fluent and natural-sounding text. extra_headers: Send extra headers @@ -494,46 +538,57 @@ async def create( prompt: A string providing context for the model to complete. - stream: If set, tokens are returned as Server-Sent Events as they are made available. - Stream terminates with `data: [DONE]` + stream: If true, stream tokens as Server-Sent Events as the model generates them instead + of waiting for the full model response. The stream terminates with + `data: [DONE]`. If false, return a single JSON object containing the results. - echo: If set, the response will contain the prompt, and will also return prompt - logprobs if set with logprobs. + echo: If true, the response will contain the prompt. Can be used with `logprobs` to + return prompt logprobs. 
- frequency_penalty: The `frequency_penalty` parameter is a number between -2.0 and 2.0 where a - positive value will decrease the likelihood of repeating tokens that were - mentioned prior. + frequency_penalty: A number between -2.0 and 2.0 where a positive value decreases the likelihood of + repeating tokens that have already been mentioned. - logit_bias: The `logit_bias` parameter allows us to adjust the likelihood of specific tokens - appearing in the generated output. + logit_bias: Adjusts the likelihood of specific tokens appearing in the generated output. logprobs: Determines the number of most likely tokens to return at each token position log - probabilities to return + probabilities to return. max_tokens: The maximum number of tokens to generate. - min_p: The `min_p` parameter is a number between 0 and 1 and an alternative to - `temperature`. + min_p: A number between 0 and 1 that can be used as an alternative to temperature. - n: Number of generations to return + n: The number of completions to generate for each prompt. - presence_penalty: The `presence_penalty` parameter is a number between -2.0 and 2.0 where a - positive value will increase the likelihood of a model talking about new topics. + presence_penalty: A number between -2.0 and 2.0 where a positive value increases the likelihood of + a model talking about new topics. repetition_penalty: A number that controls the diversity of generated text by reducing the likelihood of repeated sequences. Higher values decrease repetition. - safety_model: The name of the safety model to use. + safety_model: The name of the moderation model used to validate tokens. Choose from the + available moderation models found + [here](https://docs.together.ai/docs/inference-models#moderation-models). - stop: A list of string sequences that will truncate (stop) inference text output. + stop: A list of string sequences that will truncate (stop) inference text output. For + example, "" will stop generation as soon as the model generates the given + token. - temperature: Determines the degree of randomness in the response. + temperature: A decimal number from 0-1 that determines the degree of randomness in the + response. A temperature less than 1 favors more correctness and is appropriate + for question answering or summarization. A value closer to 1 introduces more + randomness in the output. - top_k: The `top_k` parameter is used to limit the number of choices for the next - predicted word or token. + top_k: An integer that's used to limit the number of choices for the next predicted + word or token. It specifies the maximum number of tokens to consider at each + step, based on their probability of occurrence. This technique helps to speed up + the generation process and can improve the quality of the generated text by + focusing on the most likely options. - top_p: The `top_p` (nucleus) parameter is used to dynamically adjust the number of - choices for each predicted token based on the cumulative probabilities. + top_p: A percentage (also called the nucleus parameter) that's used to dynamically + adjust the number of choices for each predicted token based on the cumulative + probabilities. It specifies a probability threshold below which all less likely + tokens are filtered out. This technique helps maintain diversity and generate + more fluent and natural-sounding text. extra_headers: Send extra headers @@ -581,46 +636,57 @@ async def create( prompt: A string providing context for the model to complete. 
- stream: If set, tokens are returned as Server-Sent Events as they are made available. - Stream terminates with `data: [DONE]` + stream: If true, stream tokens as Server-Sent Events as the model generates them instead + of waiting for the full model response. The stream terminates with + `data: [DONE]`. If false, return a single JSON object containing the results. - echo: If set, the response will contain the prompt, and will also return prompt - logprobs if set with logprobs. + echo: If true, the response will contain the prompt. Can be used with `logprobs` to + return prompt logprobs. - frequency_penalty: The `frequency_penalty` parameter is a number between -2.0 and 2.0 where a - positive value will decrease the likelihood of repeating tokens that were - mentioned prior. + frequency_penalty: A number between -2.0 and 2.0 where a positive value decreases the likelihood of + repeating tokens that have already been mentioned. - logit_bias: The `logit_bias` parameter allows us to adjust the likelihood of specific tokens - appearing in the generated output. + logit_bias: Adjusts the likelihood of specific tokens appearing in the generated output. logprobs: Determines the number of most likely tokens to return at each token position log - probabilities to return + probabilities to return. max_tokens: The maximum number of tokens to generate. - min_p: The `min_p` parameter is a number between 0 and 1 and an alternative to - `temperature`. + min_p: A number between 0 and 1 that can be used as an alternative to temperature. - n: Number of generations to return + n: The number of completions to generate for each prompt. - presence_penalty: The `presence_penalty` parameter is a number between -2.0 and 2.0 where a - positive value will increase the likelihood of a model talking about new topics. + presence_penalty: A number between -2.0 and 2.0 where a positive value increases the likelihood of + a model talking about new topics. repetition_penalty: A number that controls the diversity of generated text by reducing the likelihood of repeated sequences. Higher values decrease repetition. - safety_model: The name of the safety model to use. - - stop: A list of string sequences that will truncate (stop) inference text output. - - temperature: Determines the degree of randomness in the response. - - top_k: The `top_k` parameter is used to limit the number of choices for the next - predicted word or token. - - top_p: The `top_p` (nucleus) parameter is used to dynamically adjust the number of - choices for each predicted token based on the cumulative probabilities. + safety_model: The name of the moderation model used to validate tokens. Choose from the + available moderation models found + [here](https://docs.together.ai/docs/inference-models#moderation-models). + + stop: A list of string sequences that will truncate (stop) inference text output. For + example, "" will stop generation as soon as the model generates the given + token. + + temperature: A decimal number from 0-1 that determines the degree of randomness in the + response. A temperature less than 1 favors more correctness and is appropriate + for question answering or summarization. A value closer to 1 introduces more + randomness in the output. + + top_k: An integer that's used to limit the number of choices for the next predicted + word or token. It specifies the maximum number of tokens to consider at each + step, based on their probability of occurrence. 
+              step, based on their probability of occurrence. This technique helps to speed up
+              the generation process and can improve the quality of the generated text by
+              focusing on the most likely options.
+
+          top_p: A percentage (also called the nucleus parameter) that's used to dynamically
+              adjust the number of choices for each predicted token based on the cumulative
+              probabilities. It specifies a probability threshold below which all less likely
+              tokens are filtered out. This technique helps maintain diversity and generate
+              more fluent and natural-sounding text.

           extra_headers: Send extra headers
diff --git a/src/together/types/chat/completion_create_params.py b/src/together/types/chat/completion_create_params.py
index 05c6a86a..8b57ef5b 100644
--- a/src/together/types/chat/completion_create_params.py
+++ b/src/together/types/chat/completion_create_params.py
@@ -26,46 +26,39 @@ class CompletionCreateParamsBase(TypedDict, total=False):
     """The name of the model to query."""

     echo: bool
-    """
-    If set, the response will contain the prompt, and will also return prompt
-    logprobs if set with logprobs.
+    """If true, the response will contain the prompt.
+
+    Can be used with `logprobs` to return prompt logprobs.
     """

     frequency_penalty: float
     """
-    The `frequency_penalty` parameter is a number between -2.0 and 2.0 where a
-    positive value will decrease the likelihood of repeating tokens that were
-    mentioned prior.
+    A number between -2.0 and 2.0 where a positive value decreases the likelihood of
+    repeating tokens that have already been mentioned.
     """

     logit_bias: Dict[str, float]
-    """
-    The `logit_bias` parameter allows us to adjust the likelihood of specific tokens
-    appearing in the generated output.
-    """
+    """Adjusts the likelihood of specific tokens appearing in the generated output."""

     logprobs: int
     """
     Determines the number of most likely tokens to return at each token position log
-    probabilities to return
+    probabilities to return.
     """

     max_tokens: int
     """The maximum number of tokens to generate."""

     min_p: float
-    """
-    The `min_p` parameter is a number between 0 and 1 and an alternative to
-    `temperature`.
-    """
+    """A number between 0 and 1 that can be used as an alternative to temperature."""

     n: int
-    """Number of generations to return"""
+    """The number of completions to generate for each prompt."""

     presence_penalty: float
     """
-    The `presence_penalty` parameter is a number between -2.0 and 2.0 where a
-    positive value will increase the likelihood of a model talking about new topics.
+    A number between -2.0 and 2.0 where a positive value increases the likelihood of
+    a model talking about new topics.
     """

     repetition_penalty: float
@@ -75,33 +68,60 @@ class CompletionCreateParamsBase(TypedDict, total=False):
     """

     response_format: ResponseFormat
-    """Specifies the format of the response."""
+    """An object specifying the format that the model must output."""

     safety_model: str
-    """The name of the safety model to use."""
+    """The name of the moderation model used to validate tokens.
+
+    Choose from the available moderation models found
+    [here](https://docs.together.ai/docs/inference-models#moderation-models).
+    """

     stop: List[str]
-    """A list of string sequences that will truncate (stop) inference text output."""
+    """A list of string sequences that will truncate (stop) inference text output.
+
+    For example, "</s>" will stop generation as soon as the model generates the
+    given token.
+    """

     temperature: float
-    """Determines the degree of randomness in the response."""
+    """
+    A decimal number from 0-1 that determines the degree of randomness in the
+    response. A temperature less than 1 favors more correctness and is appropriate
+    for question answering or summarization. A value closer to 1 introduces more
+    randomness in the output.
+    """

     tool_choice: ToolChoice
-    """The choice of tool to use."""
+    """Controls which (if any) function is called by the model.
+
+    By default uses `auto`, which lets the model pick between generating a message
+    or calling a function.
+    """

     tools: Iterable[ToolsParam]
-    """A list of tools to be used in the query."""
+    """A list of tools the model may call.
+
+    Currently, only functions are supported as a tool. Use this to provide a list of
+    functions the model may generate JSON inputs for.
+    """

     top_k: int
     """
-    The `top_k` parameter is used to limit the number of choices for the next
-    predicted word or token.
+    An integer that's used to limit the number of choices for the next predicted
+    word or token. It specifies the maximum number of tokens to consider at each
+    step, based on their probability of occurrence. This technique helps to speed up
+    the generation process and can improve the quality of the generated text by
+    focusing on the most likely options.
     """

     top_p: float
     """
-    The `top_p` (nucleus) parameter is used to dynamically adjust the number of
-    choices for each predicted token based on the cumulative probabilities.
+    A percentage (also called the nucleus parameter) that's used to dynamically
+    adjust the number of choices for each predicted token based on the cumulative
+    probabilities. It specifies a probability threshold below which all less likely
+    tokens are filtered out. This technique helps maintain diversity and generate
+    more fluent and natural-sounding text.
     """

@@ -126,19 +146,19 @@ class ResponseFormat(TypedDict, total=False):

 class CompletionCreateParamsNonStreaming(CompletionCreateParamsBase):
     stream: Literal[False]
-    """If set, tokens are returned as Server-Sent Events as they are made available.
-
-    Stream terminates with `data: [DONE]`. If false, return a single JSON object
-    containing the results.
+    """
+    If true, stream tokens as Server-Sent Events as the model generates them instead
+    of waiting for the full model response. The stream terminates with
+    `data: [DONE]`. If false, return a single JSON object containing the results.
     """


 class CompletionCreateParamsStreaming(CompletionCreateParamsBase):
     stream: Required[Literal[True]]
-    """If set, tokens are returned as Server-Sent Events as they are made available.
-
-    Stream terminates with `data: [DONE]`. If false, return a single JSON object
-    containing the results.
+    """
+    If true, stream tokens as Server-Sent Events as the model generates them instead
+    of waiting for the full model response. The stream terminates with
+    `data: [DONE]`. If false, return a single JSON object containing the results.
     """

diff --git a/src/together/types/completion_create_params.py b/src/together/types/completion_create_params.py
index 7f4e1fef..050a5477 100644
--- a/src/together/types/completion_create_params.py
+++ b/src/together/types/completion_create_params.py
@@ -16,46 +16,39 @@ class CompletionCreateParamsBase(TypedDict, total=False):
     """A string providing context for the model to complete."""

     echo: bool
-    """
-    If set, the response will contain the prompt, and will also return prompt
-    logprobs if set with logprobs.
+    """If true, the response will contain the prompt.
+
+    Can be used with `logprobs` to return prompt logprobs.
     """

     frequency_penalty: float
     """
-    The `frequency_penalty` parameter is a number between -2.0 and 2.0 where a
-    positive value will decrease the likelihood of repeating tokens that were
-    mentioned prior.
+    A number between -2.0 and 2.0 where a positive value decreases the likelihood of
+    repeating tokens that have already been mentioned.
     """

     logit_bias: Dict[str, float]
-    """
-    The `logit_bias` parameter allows us to adjust the likelihood of specific tokens
-    appearing in the generated output.
-    """
+    """Adjusts the likelihood of specific tokens appearing in the generated output."""

     logprobs: int
     """
     Determines the number of most likely tokens to return at each token position log
-    probabilities to return
+    probabilities to return.
     """

     max_tokens: int
     """The maximum number of tokens to generate."""

     min_p: float
-    """
-    The `min_p` parameter is a number between 0 and 1 and an alternative to
-    `temperature`.
-    """
+    """A number between 0 and 1 that can be used as an alternative to temperature."""

     n: int
-    """Number of generations to return"""
+    """The number of completions to generate for each prompt."""

     presence_penalty: float
     """
-    The `presence_penalty` parameter is a number between -2.0 and 2.0 where a
-    positive value will increase the likelihood of a model talking about new topics.
+    A number between -2.0 and 2.0 where a positive value increases the likelihood of
+    a model talking about new topics.
     """

     repetition_penalty: float
@@ -65,40 +58,61 @@ class CompletionCreateParamsBase(TypedDict, total=False):
     """

     safety_model: str
-    """The name of the safety model to use."""
+    """The name of the moderation model used to validate tokens.
+
+    Choose from the available moderation models found
+    [here](https://docs.together.ai/docs/inference-models#moderation-models).
+    """

     stop: List[str]
-    """A list of string sequences that will truncate (stop) inference text output."""
+    """A list of string sequences that will truncate (stop) inference text output.
+
+    For example, "</s>" will stop generation as soon as the model generates the
+    given token.
+    """

     temperature: float
-    """Determines the degree of randomness in the response."""
+    """
+    A decimal number from 0-1 that determines the degree of randomness in the
+    response. A temperature less than 1 favors more correctness and is appropriate
+    for question answering or summarization. A value closer to 1 introduces more
+    randomness in the output.
+    """

     top_k: int
     """
-    The `top_k` parameter is used to limit the number of choices for the next
-    predicted word or token.
+    An integer that's used to limit the number of choices for the next predicted
+    word or token. It specifies the maximum number of tokens to consider at each
+    step, based on their probability of occurrence. This technique helps to speed up
+    the generation process and can improve the quality of the generated text by
+    focusing on the most likely options.
     """

     top_p: float
     """
-    The `top_p` (nucleus) parameter is used to dynamically adjust the number of
-    choices for each predicted token based on the cumulative probabilities.
+    A percentage (also called the nucleus parameter) that's used to dynamically
+    adjust the number of choices for each predicted token based on the cumulative
+    probabilities. It specifies a probability threshold below which all less likely
+    tokens are filtered out. This technique helps maintain diversity and generate
+    more fluent and natural-sounding text.
     """


 class CompletionCreateParamsNonStreaming(CompletionCreateParamsBase):
     stream: Literal[False]
-    """If set, tokens are returned as Server-Sent Events as they are made available.
-
-    Stream terminates with `data: [DONE]`
+    """
+    If true, stream tokens as Server-Sent Events as the model generates them instead
+    of waiting for the full model response. The stream terminates with
+    `data: [DONE]`. If false, return a single JSON object containing the results.
     """


 class CompletionCreateParamsStreaming(CompletionCreateParamsBase):
     stream: Required[Literal[True]]
-    """If set, tokens are returned as Server-Sent Events as they are made available.
-
-    Stream terminates with `data: [DONE]`
+    """
+    If true, stream tokens as Server-Sent Events as the model generates them instead
+    of waiting for the full model response. The stream terminates with
+    `data: [DONE]`. If false, return a single JSON object containing the results.
     """
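Reviewer note (not part of the patch): the sketch below shows how the re-documented chat parameters read in practice. It is a minimal, assumption-laden example: it presumes the generated client is exposed as `together.Together` (the usual Stainless convention), that `TOGETHER_API_KEY` is set in the environment, and that streaming chunks follow the OpenAI-style `choices[0].delta` shape described by the generated chat types. The model name and messages are purely illustrative.

import os

from together import Together  # assumed entry point; check the package README

client = Together(api_key=os.environ["TOGETHER_API_KEY"])

# Non-streaming call: returns a single JSON object containing the results.
completion = client.chat.completions.create(
    model="mistralai/Mixtral-8x7B-Instruct-v0.1",  # illustrative model name
    messages=[{"role": "user", "content": "Name three uses for sampling temperature."}],
    max_tokens=256,
    temperature=0.7,         # 0-1; values closer to 1 introduce more randomness
    top_p=0.9,               # nucleus threshold: filter out the least likely tokens
    top_k=50,                # consider at most 50 candidate tokens per step
    repetition_penalty=1.1,  # higher values decrease repetition
    stop=["</s>"],           # truncate output as soon as this sequence is generated
)
print(completion)

# Streaming call: tokens arrive as Server-Sent Events and the iterator
# stops once the server sends `data: [DONE]`.
stream = client.chat.completions.create(
    model="mistralai/Mixtral-8x7B-Instruct-v0.1",
    messages=[{"role": "user", "content": "Write one sentence about nucleus sampling."}],
    stream=True,
)
for chunk in stream:
    delta = chunk.choices[0].delta  # chunk layout assumed from the generated chat types
    if delta and delta.content:
        print(delta.content, end="", flush=True)
print()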
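The TypedDicts touched in `src/together/types/completion_create_params.py` can be exercised directly as well. The sketch below is likewise an assumption rather than part of the patch: the import path comes from the files in this diff, while the client construction and the unpacking of the params dict into `client.completions.create(...)` mirror the previous example.

import os

from together import Together  # assumed entry point, as above
from together.types.completion_create_params import CompletionCreateParamsNonStreaming

client = Together(api_key=os.environ["TOGETHER_API_KEY"])

# `stream: Literal[False]` selects the non-streaming variant, so the call
# returns a single JSON object rather than Server-Sent Events.
params: CompletionCreateParamsNonStreaming = {
    "model": "mistralai/Mixtral-8x7B-v0.1",             # illustrative model name
    "prompt": "The second law of thermodynamics says",  # context for the model to complete
    "max_tokens": 64,
    "echo": True,              # include the prompt in the response
    "logprobs": 1,             # also return per-token log probabilities
    "min_p": 0.05,             # alternative to temperature
    "frequency_penalty": 0.5,  # positive values discourage already-seen tokens
    "stop": ["\n\n"],
    "stream": False,
}

completion = client.completions.create(**params)
print(completion)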