diff --git a/speech-to-text/v1-generated.ts b/speech-to-text/v1-generated.ts index 3673feefea..5deebd1bf8 100644 --- a/speech-to-text/v1-generated.ts +++ b/speech-to-text/v1-generated.ts @@ -1,5 +1,5 @@ /** - * (C) Copyright IBM Corp. 2017, 2020. + * (C) Copyright IBM Corp. 2017, 2021. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,9 +15,9 @@ */ /** - * IBM OpenAPI SDK Code Generator Version: 99-SNAPSHOT-be3b4618-20201221-123327 + * IBM OpenAPI SDK Code Generator Version: 99-SNAPSHOT-902c9336-20210507-162723 */ - + import * as extend from 'extend'; import { IncomingHttpHeaders, OutgoingHttpHeaders } from 'http'; @@ -27,9 +27,14 @@ import { getSdkHeaders } from '../lib/common'; /** * The IBM Watson™ Speech to Text service provides APIs that use IBM's speech-recognition capabilities to produce * transcripts of spoken audio. The service can transcribe speech from various languages and audio formats. In addition - * to basic transcription, the service can produce detailed information about many different aspects of the audio. For - * most languages, the service supports two sampling rates, broadband and narrowband. It returns all JSON response - * content in the UTF-8 character set. + * to basic transcription, the service can produce detailed information about many different aspects of the audio. It + * returns all JSON response content in the UTF-8 character set. + * + * The service supports two types of models: previous-generation models that include the terms `Broadband` and + * `Narrowband` in their names, and beta next-generation models that include the terms `Multimedia` and `Telephony` in + * their names. Broadband and multimedia models have minimum sampling rates of 16 kHz. Narrowband and telephony models + * have minimum sampling rates of 8 kHz. The beta next-generation models currently support fewer languages and features, + * but they offer high throughput and greater transcription accuracy. * * For speech recognition, the service supports synchronous and asynchronous HTTP Representational State Transfer (REST) * interfaces. It also supports a WebSocket interface that provides a full-duplex, low-latency communication channel: @@ -41,8 +46,8 @@ import { getSdkHeaders } from '../lib/common'; * formal language specification that lets you restrict the phrases that the service can recognize. * * Language model customization and acoustic model customization are generally available for production use with all - * language models that are generally available. Grammars are beta functionality for all language models that support - * language model customization. + * previous-generation models that are generally available. Grammars are beta functionality for all previous-generation + * models that support language model customization. Next-generation models do not support customization at this time. */ class SpeechToTextV1 extends BaseService { @@ -89,7 +94,7 @@ class SpeechToTextV1 extends BaseService { * model and its minimum sampling rate in Hertz, among other things. The ordering of the list of models can change * from call to call; do not rely on an alphabetized or static list of models. * - * **See also:** [Languages and models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-models#models). + * **See also:** [Listing models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-models-list). * * @param {Object} [params] - The parameters to send to the service. 
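For orientation alongside the updated model-listing documentation, here is a minimal usage sketch. It is written against the published `ibm-watson` package layout rather than this generated source file, and the API key, service URL, and model name are placeholders.

```ts
import SpeechToTextV1 = require('ibm-watson/speech-to-text/v1');
import { IamAuthenticator } from 'ibm-watson/auth';

// Placeholder credentials and endpoint; substitute values for your service instance.
const speechToText = new SpeechToTextV1({
  authenticator: new IamAuthenticator({ apikey: '{apikey}' }),
  serviceUrl: '{url}',
});

// List every model the instance supports, then look up a single model by name.
async function showModels(): Promise<void> {
  const { result } = await speechToText.listModels();
  for (const model of result.models) {
    console.log(`${model.name}: ${model.rate} Hz (${model.language})`);
  }

  const { result: model } = await speechToText.getModel({ modelId: 'en-US_Telephony' });
  console.log(model.description);
}

showModels().catch(console.error);
```

The later sketches in this diff reuse this `speechToText` client and the same placeholder conventions.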
* @param {OutgoingHttpHeaders} [params.headers] - Custom request headers @@ -121,7 +126,7 @@ class SpeechToTextV1 extends BaseService { * Gets information for a single specified language model that is available for use with the service. The information * includes the name of the model and its minimum sampling rate in Hertz, among other things. * - * **See also:** [Languages and models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-models#models). + * **See also:** [Listing models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-models-list). * * @param {Object} params - The parameters to send to the service. * @param {string} params.modelId - The identifier of the model in the form of its name from the output of the **Get a @@ -221,8 +226,36 @@ class SpeechToTextV1 extends BaseService { * the minimum required rate, the service down-samples the audio to the appropriate rate. If the sampling rate of the * audio is lower than the minimum required rate, the request fails. * - * **See also:** [Audio - * formats](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-audio-formats#audio-formats). + * **See also:** [Supported audio + * formats](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-audio-formats). + * + * ### Next-generation models + * + * **Note:** The next-generation language models are beta functionality. They support a limited number of languages + * and features at this time. The supported languages, models, and features will increase with future releases. + * + * The service supports next-generation `Multimedia` (16 kHz) and `Telephony` (8 kHz) models for many languages. + * Next-generation models have higher throughput than the service's previous generation of `Broadband` and + * `Narrowband` models. When you use next-generation models, the service can return transcriptions more quickly and + * also provide noticeably better transcription accuracy. + * + * You specify a next-generation model by using the `model` query parameter, as you do a previous-generation model. + * Next-generation models support the same request headers as previous-generation models, but they support only the + * following additional query parameters: + * * `background_audio_suppression` + * * `inactivity_timeout` + * * `profanity_filter` + * * `redaction` + * * `smart_formatting` + * * `speaker_labels` + * * `speech_detector_sensitivity` + * * `timestamps` + * + * Many next-generation models also support the beta `low_latency` parameter, which is not available with + * previous-generation models. + * + * **See also:** [Next-generation languages and + * models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-models-ng). * * ### Multipart speech recognition * @@ -246,24 +279,26 @@ class SpeechToTextV1 extends BaseService { * an audio format, see **Audio formats (content types)** in the method description. * @param {string} [params.model] - The identifier of the model that is to be used for the recognition request. * (**Note:** The model `ar-AR_BroadbandModel` is deprecated; use `ar-MS_BroadbandModel` instead.) See [Languages and - * models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-models#models). + * models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-models) and [Next-generation languages and + * models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-models-ng). 
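As a sketch of the `recognize` request whose `model` parameter is documented above, the following transcribes a local file with a next-generation model. The file name is illustrative, and `speechToText` is the client from the earlier sketch.

```ts
import * as fs from 'fs';

// Basic synchronous recognition with a next-generation Telephony model.
async function transcribe(): Promise<void> {
  const { result } = await speechToText.recognize({
    audio: fs.createReadStream('call-recording.wav'), // illustrative file name
    contentType: 'audio/wav',
    model: 'en-US_Telephony',
  });
  for (const utterance of result.results || []) {
    console.log(utterance.alternatives[0].transcript);
  }
}

transcribe().catch(console.error);
```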
* @param {string} [params.languageCustomizationId] - The customization ID (GUID) of a custom language model that is * to be used with the recognition request. The base model of the specified custom language model must match the model * specified with the `model` parameter. You must make the request with credentials for the instance of the service - * that owns the custom model. By default, no custom language model is used. See [Custom - * models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-input#custom-input). + * that owns the custom model. By default, no custom language model is used. See [Using a custom language model for + * speech recognition](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-languageUse). * * **Note:** Use this parameter instead of the deprecated `customization_id` parameter. * @param {string} [params.acousticCustomizationId] - The customization ID (GUID) of a custom acoustic model that is * to be used with the recognition request. The base model of the specified custom acoustic model must match the model * specified with the `model` parameter. You must make the request with credentials for the instance of the service - * that owns the custom model. By default, no custom acoustic model is used. See [Custom - * models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-input#custom-input). + * that owns the custom model. By default, no custom acoustic model is used. See [Using a custom acoustic model for + * speech recognition](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-acousticUse). * @param {string} [params.baseModelVersion] - The version of the specified base model that is to be used with the * recognition request. Multiple versions of a base model can exist when a model is updated for internal improvements. * The parameter is intended primarily for use with custom models that have been upgraded for a new base model. The - * default value depends on whether the parameter is used with or without a custom model. See [Base model - * version](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-input#version). + * default value depends on whether the parameter is used with or without a custom model. See [Making speech + * recognition requests with upgraded custom + * models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-custom-upgrade-use#custom-upgrade-use-recognition). * @param {number} [params.customizationWeight] - If you specify the customization ID (GUID) of a custom language * model with the recognition request, the customization weight tells the service how much weight to give to words * from the custom language model compared to those from the base model for the current request. @@ -276,7 +311,8 @@ class SpeechToTextV1 extends BaseService { * OOV words from the custom model. Use caution when setting the weight: a higher value can improve the accuracy of * phrases from the custom model's domain, but it can negatively affect performance on non-domain phrases. * - * See [Custom models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-input#custom-input). + * See [Using customization + * weight](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-languageUse#weight). * @param {number} [params.inactivityTimeout] - The time in seconds after which, if only silence (no speech) is * detected in streaming audio, the connection is closed with a 400 error. 
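A hedged sketch of the custom-model parameters described above, reusing the client and `fs` import from the earlier sketches; the customization ID is a placeholder and the weight value is illustrative.

```ts
// Recognition with a custom language model and an explicit customization weight.
// The base model specified with `model` must match the custom model's base model.
async function transcribeWithCustomModel(): Promise<void> {
  const { result } = await speechToText.recognize({
    audio: fs.createReadStream('audio-file.flac'), // illustrative file name
    contentType: 'audio/flac',
    model: 'en-US_BroadbandModel',
    languageCustomizationId: '{customization_id}', // placeholder GUID
    customizationWeight: 0.5,
  });
  console.log(JSON.stringify(result, null, 2));
}
```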
The parameter is useful for stopping audio * submission from a live microphone when a user simply walks away. Use `-1` for infinity. See [Inactivity @@ -290,31 +326,31 @@ class SpeechToTextV1 extends BaseService { * characters, though the maximum effective length for double-byte languages might be shorter. Keywords are * case-insensitive. * - * See [Keyword spotting](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#keyword_spotting). + * See [Keyword spotting](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-spotting#keyword-spotting). * @param {number} [params.keywordsThreshold] - A confidence value that is the lower bound for spotting a keyword. A * word is considered to match a keyword if its confidence is greater than or equal to the threshold. Specify a * probability between 0.0 and 1.0. If you specify a threshold, you must also specify one or more keywords. The * service performs no keyword spotting if you omit either parameter. See [Keyword - * spotting](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#keyword_spotting). + * spotting](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-spotting#keyword-spotting). * @param {number} [params.maxAlternatives] - The maximum number of alternative transcripts that the service is to * return. By default, the service returns a single transcript. If you specify a value of `0`, the service uses the * default value, `1`. See [Maximum - * alternatives](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#max_alternatives). + * alternatives](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-metadata#max-alternatives). * @param {number} [params.wordAlternativesThreshold] - A confidence value that is the lower bound for identifying a * hypothesis as a possible word alternative (also known as "Confusion Networks"). An alternative word is considered * if its confidence is greater than or equal to the threshold. Specify a probability between 0.0 and 1.0. By default, * the service computes no alternative words. See [Word - * alternatives](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#word_alternatives). + * alternatives](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-spotting#word-alternatives). * @param {boolean} [params.wordConfidence] - If `true`, the service returns a confidence measure in the range of 0.0 * to 1.0 for each word. By default, the service returns no word confidence scores. See [Word - * confidence](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#word_confidence). + * confidence](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-metadata#word-confidence). * @param {boolean} [params.timestamps] - If `true`, the service returns time alignment for each word. By default, no * timestamps are returned. See [Word - * timestamps](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#word_timestamps). + * timestamps](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-metadata#word-timestamps). * @param {boolean} [params.profanityFilter] - If `true`, the service filters profanity from all output except for * keyword results by replacing inappropriate words with a series of asterisks. Set the parameter to `false` to return - * results with no censoring. Applies to US English transcription only. See [Profanity - * filtering](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#profanity_filter). + * results with no censoring. 
Applies to US English and Japanese transcription only. See [Profanity + * filtering](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-formatting#profanity-filtering). * @param {boolean} [params.smartFormatting] - If `true`, the service converts dates, times, series of digits and * numbers, phone numbers, currency values, and internet addresses into more readable, conventional representations in * the final transcript of a recognition request. For US English, the service also converts certain keyword strings to @@ -322,24 +358,26 @@ class SpeechToTextV1 extends BaseService { * * **Note:** Applies to US English, Japanese, and Spanish transcription only. * - * See [Smart formatting](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#smart_formatting). + * See [Smart formatting](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-formatting#smart-formatting). * @param {boolean} [params.speakerLabels] - If `true`, the response includes labels that identify which words were * spoken by which participants in a multi-person exchange. By default, the service returns no speaker labels. Setting * `speaker_labels` to `true` forces the `timestamps` parameter to be `true`, regardless of whether you specify * `false` for the parameter. + * * For previous-generation models, can be used for US English, Australian English, German, Japanese, Korean, and + * Spanish (both broadband and narrowband models) and UK English (narrowband model) transcription only. + * * For next-generation models, can be used for English (Australian, UK, and US), German, and Spanish transcription + * only. * - * **Note:** Applies to US English, Australian English, German, Japanese, Korean, and Spanish (both broadband and - * narrowband models) and UK English (narrowband model) transcription only. - * - * See [Speaker labels](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#speaker_labels). + * Restrictions and limitations apply to the use of speaker labels for both types of models. See [Speaker + * labels](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-speaker-labels). * @param {string} [params.customizationId] - **Deprecated.** Use the `language_customization_id` parameter to specify * the customization ID (GUID) of a custom language model that is to be used with the recognition request. Do not * specify both parameters with a request. * @param {string} [params.grammarName] - The name of a grammar that is to be used with the recognition request. If * you specify a grammar, you must also use the `language_customization_id` parameter to specify the name of the * custom language model for which the grammar is defined. The service recognizes only strings that are recognized by - * the specified grammar; it does not recognize other custom words from the model's words resource. See - * [Grammars](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-input#grammars-input). + * the specified grammar; it does not recognize other custom words from the model's words resource. See [Using a + * grammar for speech recognition](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-grammarUse). * @param {boolean} [params.redaction] - If `true`, the service redacts, or masks, numeric data from final * transcripts. The feature redacts any number that has three or more consecutive digits by replacing each digit with * an `X` character. It is intended to redact sensitive numeric data, such as credit card numbers. 
By default, the @@ -352,12 +390,13 @@ class SpeechToTextV1 extends BaseService { * * **Note:** Applies to US English, Japanese, and Korean transcription only. * - * See [Numeric redaction](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#redaction). + * See [Numeric + * redaction](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-formatting#numeric-redaction). * @param {boolean} [params.audioMetrics] - If `true`, requests detailed information about the signal characteristics * of the input audio. The service returns audio metrics with the final transcription results. By default, the service * returns no audio metrics. * - * See [Audio metrics](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-metrics#audio_metrics). + * See [Audio metrics](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-metrics#audio-metrics). * @param {number} [params.endOfPhraseSilenceTime] - If `true`, specifies the duration of the pause interval at which * the service splits a transcript into multiple final results. If the service detects pauses or extended silence * before it reaches the end of the audio stream, its response can include multiple final results. Silence indicates a @@ -371,7 +410,7 @@ class SpeechToTextV1 extends BaseService { * The default pause interval for most languages is 0.8 seconds; the default for Chinese is 0.6 seconds. * * See [End of phrase silence - * time](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#silence_time). + * time](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-parsing#silence-time). * @param {boolean} [params.splitTranscriptAtPhraseEnd] - If `true`, directs the service to split the transcript into * multiple final results based on semantic features of the input, for example, at the conclusion of meaningful * phrases such as sentences. The service bases its understanding of semantic features on the base language model that @@ -379,7 +418,7 @@ class SpeechToTextV1 extends BaseService { * transcript. By default, the service splits transcripts based solely on the pause interval. * * See [Split transcript at phrase - * end](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#split_transcript). + * end](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-parsing#split-transcript). * @param {number} [params.speechDetectorSensitivity] - The sensitivity of speech activity detection that the service * is to perform. Use the parameter to suppress word insertions from music, coughing, and other non-speech events. The * service biases the audio it passes for speech recognition by evaluating the input audio against prior models of @@ -390,8 +429,8 @@ class SpeechToTextV1 extends BaseService { * * 0.5 (the default) provides a reasonable compromise for the level of sensitivity. * * 1.0 suppresses no audio (speech detection sensitivity is disabled). * - * The values increase on a monotonic curve. See [Speech Activity - * Detection](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-input#detection). + * The values increase on a monotonic curve. See [Speech detector + * sensitivity](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#detection-parameters-sensitivity). * @param {number} [params.backgroundAudioSuppression] - The level to which the service is to suppress background * audio based on its volume to prevent it from being transcribed as speech. Use the parameter to suppress side * conversations or background noise. 
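A brief sketch of the speech activity detection parameters described above (`speech_detector_sensitivity` and `background_audio_suppression`), reusing the client and `fs` import from the earlier sketches; the values and file name are illustrative.

```ts
// Suppress word insertions from non-speech events and from quieter background audio.
// Both parameters accept values between 0.0 and 1.0; these settings are illustrative.
async function transcribeNoisyAudio(): Promise<void> {
  const { result } = await speechToText.recognize({
    audio: fs.createReadStream('noisy-meeting.wav'), // illustrative file name
    contentType: 'audio/wav',
    model: 'en-US_BroadbandModel',
    speechDetectorSensitivity: 0.4,
    backgroundAudioSuppression: 0.5,
  });
  console.log(JSON.stringify(result, null, 2));
}
```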
@@ -401,8 +440,21 @@ class SpeechToTextV1 extends BaseService { * * 0.5 provides a reasonable level of audio suppression for general usage. * * 1.0 suppresses all audio (no audio is transcribed). * - * The values increase on a monotonic curve. See [Speech Activity - * Detection](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-input#detection). + * The values increase on a monotonic curve. See [Background audio + * suppression](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#detection-parameters-suppression). + * @param {boolean} [params.lowLatency] - If `true` for next-generation `Multimedia` and `Telephony` models that + * support low latency, directs the service to produce results even more quickly than it usually does. Next-generation + * models produce transcription results faster than previous-generation models. The `low_latency` parameter causes the + * models to produce results even more quickly, though the results might be less accurate when the parameter is used. + * + * **Note:** The parameter is beta functionality. It is not available for previous-generation `Broadband` and + * `Narrowband` models. It is available only for some next-generation models. + * + * * For a list of next-generation models that support low latency, see [Supported language + * models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-models-ng#models-ng-supported) for + * next-generation models. + * * For more information about the `low_latency` parameter, see [Low + * latency](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-interim#low-latency). * @param {OutgoingHttpHeaders} [params.headers] - Custom request headers * @returns {Promise>} */ @@ -439,7 +491,8 @@ class SpeechToTextV1 extends BaseService { 'end_of_phrase_silence_time': _params.endOfPhraseSilenceTime, 'split_transcript_at_phrase_end': _params.splitTranscriptAtPhraseEnd, 'speech_detector_sensitivity': _params.speechDetectorSensitivity, - 'background_audio_suppression': _params.backgroundAudioSuppression + 'background_audio_suppression': _params.backgroundAudioSuppression, + 'low_latency': _params.lowLatency }; const sdkHeaders = getSdkHeaders(SpeechToTextV1.DEFAULT_SERVICE_NAME, 'v1', 'recognize'); @@ -664,8 +717,36 @@ class SpeechToTextV1 extends BaseService { * the minimum required rate, the service down-samples the audio to the appropriate rate. If the sampling rate of the * audio is lower than the minimum required rate, the request fails. * - * **See also:** [Audio - * formats](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-audio-formats#audio-formats). + * **See also:** [Supported audio + * formats](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-audio-formats). + * + * ### Next-generation models + * + * **Note:** The next-generation language models are beta functionality. They support a limited number of languages + * and features at this time. The supported languages, models, and features will increase with future releases. + * + * The service supports next-generation `Multimedia` (16 kHz) and `Telephony` (8 kHz) models for many languages. + * Next-generation models have higher throughput than the service's previous generation of `Broadband` and + * `Narrowband` models. When you use next-generation models, the service can return transcriptions more quickly and + * also provide noticeably better transcription accuracy. + * + * You specify a next-generation model by using the `model` query parameter, as you do a previous-generation model. 
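A sketch of the beta `low_latency` parameter introduced above, again reusing the earlier client; the model and file name are illustrative, and only some next-generation models accept the parameter.

```ts
// Trade a little accuracy for speed on a next-generation model that supports low latency.
// `low_latency` is beta functionality and is not available for previous-generation models.
async function transcribeQuickly(): Promise<void> {
  const { result } = await speechToText.recognize({
    audio: fs.createReadStream('short-prompt.wav'), // illustrative file name
    contentType: 'audio/wav',
    model: 'en-US_Telephony',
    lowLatency: true,
  });
  console.log(result.results?.[0]?.alternatives[0].transcript);
}
```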
+ * Next-generation models support the same request headers as previous-generation models, but they support only the + * following additional query parameters: + * * `background_audio_suppression` + * * `inactivity_timeout` + * * `profanity_filter` + * * `redaction` + * * `smart_formatting` + * * `speaker_labels` + * * `speech_detector_sensitivity` + * * `timestamps` + * + * Many next-generation models also support the beta `low_latency` parameter, which is not available with + * previous-generation models. + * + * **See also:** [Next-generation languages and + * models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-models-ng). * * @param {Object} params - The parameters to send to the service. * @param {NodeJS.ReadableStream|Buffer} params.audio - The audio to transcribe. @@ -673,7 +754,8 @@ class SpeechToTextV1 extends BaseService { * an audio format, see **Audio formats (content types)** in the method description. * @param {string} [params.model] - The identifier of the model that is to be used for the recognition request. * (**Note:** The model `ar-AR_BroadbandModel` is deprecated; use `ar-MS_BroadbandModel` instead.) See [Languages and - * models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-models#models). + * models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-models) and [Next-generation languages and + * models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-models-ng). * @param {string} [params.callbackUrl] - A URL to which callback notifications are to be sent. The URL must already * be successfully allowlisted by using the **Register a callback** method. You can include the same callback URL with * any number of job creation requests. Omit the parameter to poll the service for job completion and results. @@ -705,20 +787,21 @@ class SpeechToTextV1 extends BaseService { * @param {string} [params.languageCustomizationId] - The customization ID (GUID) of a custom language model that is * to be used with the recognition request. The base model of the specified custom language model must match the model * specified with the `model` parameter. You must make the request with credentials for the instance of the service - * that owns the custom model. By default, no custom language model is used. See [Custom - * models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-input#custom-input). + * that owns the custom model. By default, no custom language model is used. See [Using a custom language model for + * speech recognition](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-languageUse). * * **Note:** Use this parameter instead of the deprecated `customization_id` parameter. * @param {string} [params.acousticCustomizationId] - The customization ID (GUID) of a custom acoustic model that is * to be used with the recognition request. The base model of the specified custom acoustic model must match the model * specified with the `model` parameter. You must make the request with credentials for the instance of the service - * that owns the custom model. By default, no custom acoustic model is used. See [Custom - * models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-input#custom-input). + * that owns the custom model. By default, no custom acoustic model is used. See [Using a custom acoustic model for + * speech recognition](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-acousticUse). 
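For the callback-based flow that the `createJob` documentation above describes, a hedged sketch follows; the callback URL, event name, and file name are illustrative, and the URL must already be allowlisted with the Register a callback method.

```ts
// Register a callback URL once, then create a job that posts results to it.
async function transcribeWithCallback(): Promise<void> {
  await speechToText.registerCallback({
    callbackUrl: 'https://example.com/stt-results', // placeholder endpoint
  });

  const { result: job } = await speechToText.createJob({
    audio: fs.createReadStream('interview.mp3'), // illustrative file name
    contentType: 'audio/mp3',
    model: 'en-US_Telephony',
    callbackUrl: 'https://example.com/stt-results',
    events: 'recognitions.completed_with_results',
  });
  console.log(`Created job ${job.id} with status ${job.status}`);
}
```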
* @param {string} [params.baseModelVersion] - The version of the specified base model that is to be used with the * recognition request. Multiple versions of a base model can exist when a model is updated for internal improvements. * The parameter is intended primarily for use with custom models that have been upgraded for a new base model. The - * default value depends on whether the parameter is used with or without a custom model. See [Base model - * version](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-input#version). + * default value depends on whether the parameter is used with or without a custom model. See [Making speech + * recognition requests with upgraded custom + * models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-custom-upgrade-use#custom-upgrade-use-recognition). * @param {number} [params.customizationWeight] - If you specify the customization ID (GUID) of a custom language * model with the recognition request, the customization weight tells the service how much weight to give to words * from the custom language model compared to those from the base model for the current request. @@ -731,7 +814,8 @@ class SpeechToTextV1 extends BaseService { * OOV words from the custom model. Use caution when setting the weight: a higher value can improve the accuracy of * phrases from the custom model's domain, but it can negatively affect performance on non-domain phrases. * - * See [Custom models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-input#custom-input). + * See [Using customization + * weight](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-languageUse#weight). * @param {number} [params.inactivityTimeout] - The time in seconds after which, if only silence (no speech) is * detected in streaming audio, the connection is closed with a 400 error. The parameter is useful for stopping audio * submission from a live microphone when a user simply walks away. Use `-1` for infinity. See [Inactivity @@ -745,31 +829,31 @@ class SpeechToTextV1 extends BaseService { * characters, though the maximum effective length for double-byte languages might be shorter. Keywords are * case-insensitive. * - * See [Keyword spotting](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#keyword_spotting). + * See [Keyword spotting](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-spotting#keyword-spotting). * @param {number} [params.keywordsThreshold] - A confidence value that is the lower bound for spotting a keyword. A * word is considered to match a keyword if its confidence is greater than or equal to the threshold. Specify a * probability between 0.0 and 1.0. If you specify a threshold, you must also specify one or more keywords. The * service performs no keyword spotting if you omit either parameter. See [Keyword - * spotting](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#keyword_spotting). + * spotting](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-spotting#keyword-spotting). * @param {number} [params.maxAlternatives] - The maximum number of alternative transcripts that the service is to * return. By default, the service returns a single transcript. If you specify a value of `0`, the service uses the * default value, `1`. See [Maximum - * alternatives](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#max_alternatives). + * alternatives](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-metadata#max-alternatives). 
* @param {number} [params.wordAlternativesThreshold] - A confidence value that is the lower bound for identifying a * hypothesis as a possible word alternative (also known as "Confusion Networks"). An alternative word is considered * if its confidence is greater than or equal to the threshold. Specify a probability between 0.0 and 1.0. By default, * the service computes no alternative words. See [Word - * alternatives](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#word_alternatives). + * alternatives](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-spotting#word-alternatives). * @param {boolean} [params.wordConfidence] - If `true`, the service returns a confidence measure in the range of 0.0 * to 1.0 for each word. By default, the service returns no word confidence scores. See [Word - * confidence](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#word_confidence). + * confidence](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-metadata#word-confidence). * @param {boolean} [params.timestamps] - If `true`, the service returns time alignment for each word. By default, no * timestamps are returned. See [Word - * timestamps](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#word_timestamps). + * timestamps](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-metadata#word-timestamps). * @param {boolean} [params.profanityFilter] - If `true`, the service filters profanity from all output except for * keyword results by replacing inappropriate words with a series of asterisks. Set the parameter to `false` to return - * results with no censoring. Applies to US English transcription only. See [Profanity - * filtering](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#profanity_filter). + * results with no censoring. Applies to US English and Japanese transcription only. See [Profanity + * filtering](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-formatting#profanity-filtering). * @param {boolean} [params.smartFormatting] - If `true`, the service converts dates, times, series of digits and * numbers, phone numbers, currency values, and internet addresses into more readable, conventional representations in * the final transcript of a recognition request. For US English, the service also converts certain keyword strings to @@ -777,24 +861,26 @@ class SpeechToTextV1 extends BaseService { * * **Note:** Applies to US English, Japanese, and Spanish transcription only. * - * See [Smart formatting](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#smart_formatting). + * See [Smart formatting](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-formatting#smart-formatting). * @param {boolean} [params.speakerLabels] - If `true`, the response includes labels that identify which words were * spoken by which participants in a multi-person exchange. By default, the service returns no speaker labels. Setting * `speaker_labels` to `true` forces the `timestamps` parameter to be `true`, regardless of whether you specify * `false` for the parameter. + * * For previous-generation models, can be used for US English, Australian English, German, Japanese, Korean, and + * Spanish (both broadband and narrowband models) and UK English (narrowband model) transcription only. + * * For next-generation models, can be used for English (Australian, UK, and US), German, and Spanish transcription + * only. 
* - * **Note:** Applies to US English, Australian English, German, Japanese, Korean, and Spanish (both broadband and - * narrowband models) and UK English (narrowband model) transcription only. - * - * See [Speaker labels](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#speaker_labels). + * Restrictions and limitations apply to the use of speaker labels for both types of models. See [Speaker + * labels](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-speaker-labels). * @param {string} [params.customizationId] - **Deprecated.** Use the `language_customization_id` parameter to specify * the customization ID (GUID) of a custom language model that is to be used with the recognition request. Do not * specify both parameters with a request. * @param {string} [params.grammarName] - The name of a grammar that is to be used with the recognition request. If * you specify a grammar, you must also use the `language_customization_id` parameter to specify the name of the * custom language model for which the grammar is defined. The service recognizes only strings that are recognized by - * the specified grammar; it does not recognize other custom words from the model's words resource. See - * [Grammars](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-input#grammars-input). + * the specified grammar; it does not recognize other custom words from the model's words resource. See [Using a + * grammar for speech recognition](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-grammarUse). * @param {boolean} [params.redaction] - If `true`, the service redacts, or masks, numeric data from final * transcripts. The feature redacts any number that has three or more consecutive digits by replacing each digit with * an `X` character. It is intended to redact sensitive numeric data, such as credit card numbers. By default, the @@ -807,14 +893,15 @@ class SpeechToTextV1 extends BaseService { * * **Note:** Applies to US English, Japanese, and Korean transcription only. * - * See [Numeric redaction](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#redaction). + * See [Numeric + * redaction](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-formatting#numeric-redaction). * @param {boolean} [params.processingMetrics] - If `true`, requests processing metrics about the service's * transcription of the input audio. The service returns processing metrics at the interval specified by the * `processing_metrics_interval` parameter. It also returns processing metrics for transcription events, for example, * for final and interim results. By default, the service returns no processing metrics. * * See [Processing - * metrics](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-metrics#processing_metrics). + * metrics](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-metrics#processing-metrics). * @param {number} [params.processingMetricsInterval] - Specifies the interval in real wall-clock seconds at which the * service is to return processing metrics. The parameter is ignored unless the `processing_metrics` parameter is set * to `true`. @@ -827,12 +914,12 @@ class SpeechToTextV1 extends BaseService { * of the audio, the service returns processing metrics only for transcription events. * * See [Processing - * metrics](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-metrics#processing_metrics). + * metrics](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-metrics#processing-metrics). 
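A sketch of the polling variant of asynchronous recognition (no callback URL), reusing the earlier client; the model, file name, and polling interval are illustrative.

```ts
// Create a job without a callback and poll for completion with checkJob.
async function transcribeByPolling(): Promise<void> {
  const { result: job } = await speechToText.createJob({
    audio: fs.createReadStream('lecture.wav'), // illustrative file name
    contentType: 'audio/wav',
    model: 'en-US_Multimedia',
  });

  let status = job.status;
  while (status !== 'completed' && status !== 'failed') {
    await new Promise((resolve) => setTimeout(resolve, 5000)); // poll every 5 seconds
    const { result } = await speechToText.checkJob({ id: job.id });
    status = result.status;
    if (status === 'completed') {
      console.log(JSON.stringify(result.results, null, 2));
    }
  }
}
```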
* @param {boolean} [params.audioMetrics] - If `true`, requests detailed information about the signal characteristics * of the input audio. The service returns audio metrics with the final transcription results. By default, the service * returns no audio metrics. * - * See [Audio metrics](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-metrics#audio_metrics). + * See [Audio metrics](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-metrics#audio-metrics). * @param {number} [params.endOfPhraseSilenceTime] - If `true`, specifies the duration of the pause interval at which * the service splits a transcript into multiple final results. If the service detects pauses or extended silence * before it reaches the end of the audio stream, its response can include multiple final results. Silence indicates a @@ -846,7 +933,7 @@ class SpeechToTextV1 extends BaseService { * The default pause interval for most languages is 0.8 seconds; the default for Chinese is 0.6 seconds. * * See [End of phrase silence - * time](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#silence_time). + * time](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-parsing#silence-time). * @param {boolean} [params.splitTranscriptAtPhraseEnd] - If `true`, directs the service to split the transcript into * multiple final results based on semantic features of the input, for example, at the conclusion of meaningful * phrases such as sentences. The service bases its understanding of semantic features on the base language model that @@ -854,7 +941,7 @@ class SpeechToTextV1 extends BaseService { * transcript. By default, the service splits transcripts based solely on the pause interval. * * See [Split transcript at phrase - * end](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#split_transcript). + * end](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-parsing#split-transcript). * @param {number} [params.speechDetectorSensitivity] - The sensitivity of speech activity detection that the service * is to perform. Use the parameter to suppress word insertions from music, coughing, and other non-speech events. The * service biases the audio it passes for speech recognition by evaluating the input audio against prior models of @@ -865,8 +952,8 @@ class SpeechToTextV1 extends BaseService { * * 0.5 (the default) provides a reasonable compromise for the level of sensitivity. * * 1.0 suppresses no audio (speech detection sensitivity is disabled). * - * The values increase on a monotonic curve. See [Speech Activity - * Detection](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-input#detection). + * The values increase on a monotonic curve. See [Speech detector + * sensitivity](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#detection-parameters-sensitivity). * @param {number} [params.backgroundAudioSuppression] - The level to which the service is to suppress background * audio based on its volume to prevent it from being transcribed as speech. Use the parameter to suppress side * conversations or background noise. @@ -876,8 +963,21 @@ class SpeechToTextV1 extends BaseService { * * 0.5 provides a reasonable level of audio suppression for general usage. * * 1.0 suppresses all audio (no audio is transcribed). * - * The values increase on a monotonic curve. See [Speech Activity - * Detection](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-input#detection). + * The values increase on a monotonic curve. 
See [Background audio + * suppression](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#detection-parameters-suppression). + * @param {boolean} [params.lowLatency] - If `true` for next-generation `Multimedia` and `Telephony` models that + * support low latency, directs the service to produce results even more quickly than it usually does. Next-generation + * models produce transcription results faster than previous-generation models. The `low_latency` parameter causes the + * models to produce results even more quickly, though the results might be less accurate when the parameter is used. + * + * **Note:** The parameter is beta functionality. It is not available for previous-generation `Broadband` and + * `Narrowband` models. It is available only for some next-generation models. + * + * * For a list of next-generation models that support low latency, see [Supported language + * models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-models-ng#models-ng-supported) for + * next-generation models. + * * For more information about the `low_latency` parameter, see [Low + * latency](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-interim#low-latency). * @param {OutgoingHttpHeaders} [params.headers] - Custom request headers * @returns {Promise>} */ @@ -920,7 +1020,8 @@ class SpeechToTextV1 extends BaseService { 'end_of_phrase_silence_time': _params.endOfPhraseSilenceTime, 'split_transcript_at_phrase_end': _params.splitTranscriptAtPhraseEnd, 'speech_detector_sensitivity': _params.speechDetectorSensitivity, - 'background_audio_suppression': _params.backgroundAudioSuppression + 'background_audio_suppression': _params.backgroundAudioSuppression, + 'low_latency': _params.lowLatency }; const sdkHeaders = getSdkHeaders(SpeechToTextV1.DEFAULT_SERVICE_NAME, 'v1', 'createJob'); @@ -1355,6 +1456,9 @@ class SpeechToTextV1 extends BaseService { * * The value that you assign is used for all recognition requests that use the model. You can override it for any * recognition request by specifying a customization weight for that request. + * + * See [Using customization + * weight](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-languageUse#weight). * @param {OutgoingHttpHeaders} [params.headers] - Custom request headers * @returns {Promise>} */ @@ -1460,7 +1564,7 @@ class SpeechToTextV1 extends BaseService { * requests for the model until the upgrade completes. * * **See also:** [Upgrading a custom language - * model](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-customUpgrade#upgradeLanguage). + * model](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-custom-upgrade#custom-upgrade-language). * * @param {Object} params - The parameters to send to the service. * @param {string} params.customizationId - The customization ID (GUID) of the custom language model that is to be @@ -2795,7 +2899,7 @@ class SpeechToTextV1 extends BaseService { * was not trained with a custom language model. * * **See also:** [Upgrading a custom acoustic - * model](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-customUpgrade#upgradeAcoustic). + * model](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-custom-upgrade#custom-upgrade-acoustic). * * @param {Object} params - The parameters to send to the service. 
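A short sketch of the acoustic-model upgrade that the documentation above links to, reusing the earlier client; the customization ID is a placeholder, and checking the model's status afterward is one way to watch the upgrade progress.

```ts
// Start an upgrade of a custom acoustic model to the latest base model version.
async function upgradeAcoustic(): Promise<void> {
  await speechToText.upgradeAcousticModel({
    customizationId: '{customization_id}', // placeholder GUID
  });

  const { result } = await speechToText.getAcousticModel({
    customizationId: '{customization_id}',
  });
  console.log(`Model status: ${result.status}`);
}
```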
* @param {string} params.customizationId - The customization ID (GUID) of the custom acoustic model that is to be @@ -2809,7 +2913,7 @@ class SpeechToTextV1 extends BaseService { * has been modified since it was last trained. Use this parameter only to force the upgrade of a custom acoustic * model that is trained with a custom language model, and only if you receive a 400 response code and the message `No * input data modified since last training`. See [Upgrading a custom acoustic - * model](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-customUpgrade#upgradeAcoustic). + * model](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-custom-upgrade#custom-upgrade-acoustic). * @param {OutgoingHttpHeaders} [params.headers] - Custom request headers * @returns {Promise>} */ @@ -2965,8 +3069,8 @@ class SpeechToTextV1 extends BaseService { * higher than the minimum required rate, the service down-samples the audio to the appropriate rate. If the sampling * rate of the audio is lower than the minimum required rate, the service labels the audio file as `invalid`. * - * **See also:** [Audio - * formats](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-audio-formats#audio-formats). + * **See also:** [Supported audio + * formats](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-audio-formats). * * ### Content types for archive-type resources * @@ -3296,15 +3400,21 @@ namespace SpeechToTextV1 { export enum ModelId { AR_AR_BROADBANDMODEL = 'ar-AR_BroadbandModel', AR_MS_BROADBANDMODEL = 'ar-MS_BroadbandModel', + AR_MS_TELEPHONY = 'ar-MS_Telephony', DE_DE_BROADBANDMODEL = 'de-DE_BroadbandModel', DE_DE_NARROWBANDMODEL = 'de-DE_NarrowbandModel', + DE_DE_TELEPHONY = 'de-DE_Telephony', EN_AU_BROADBANDMODEL = 'en-AU_BroadbandModel', EN_AU_NARROWBANDMODEL = 'en-AU_NarrowbandModel', + EN_AU_TELEPHONY = 'en-AU_Telephony', EN_GB_BROADBANDMODEL = 'en-GB_BroadbandModel', EN_GB_NARROWBANDMODEL = 'en-GB_NarrowbandModel', + EN_GB_TELEPHONY = 'en-GB_Telephony', EN_US_BROADBANDMODEL = 'en-US_BroadbandModel', + EN_US_MULTIMEDIA = 'en-US_Multimedia', EN_US_NARROWBANDMODEL = 'en-US_NarrowbandModel', EN_US_SHORTFORM_NARROWBANDMODEL = 'en-US_ShortForm_NarrowbandModel', + EN_US_TELEPHONY = 'en-US_Telephony', ES_AR_BROADBANDMODEL = 'es-AR_BroadbandModel', ES_AR_NARROWBANDMODEL = 'es-AR_NarrowbandModel', ES_CL_BROADBANDMODEL = 'es-CL_BroadbandModel', @@ -3313,16 +3423,20 @@ namespace SpeechToTextV1 { ES_CO_NARROWBANDMODEL = 'es-CO_NarrowbandModel', ES_ES_BROADBANDMODEL = 'es-ES_BroadbandModel', ES_ES_NARROWBANDMODEL = 'es-ES_NarrowbandModel', + ES_ES_TELEPHONY = 'es-ES_Telephony', ES_MX_BROADBANDMODEL = 'es-MX_BroadbandModel', ES_MX_NARROWBANDMODEL = 'es-MX_NarrowbandModel', ES_PE_BROADBANDMODEL = 'es-PE_BroadbandModel', ES_PE_NARROWBANDMODEL = 'es-PE_NarrowbandModel', FR_CA_BROADBANDMODEL = 'fr-CA_BroadbandModel', FR_CA_NARROWBANDMODEL = 'fr-CA_NarrowbandModel', + FR_CA_TELEPHONY = 'fr-CA_Telephony', FR_FR_BROADBANDMODEL = 'fr-FR_BroadbandModel', FR_FR_NARROWBANDMODEL = 'fr-FR_NarrowbandModel', + FR_FR_TELEPHONY = 'fr-FR_Telephony', IT_IT_BROADBANDMODEL = 'it-IT_BroadbandModel', IT_IT_NARROWBANDMODEL = 'it-IT_NarrowbandModel', + IT_IT_TELEPHONY = 'it-IT_Telephony', JA_JP_BROADBANDMODEL = 'ja-JP_BroadbandModel', JA_JP_NARROWBANDMODEL = 'ja-JP_NarrowbandModel', KO_KR_BROADBANDMODEL = 'ko-KR_BroadbandModel', @@ -3331,6 +3445,7 @@ namespace SpeechToTextV1 { NL_NL_NARROWBANDMODEL = 'nl-NL_NarrowbandModel', PT_BR_BROADBANDMODEL = 'pt-BR_BroadbandModel', PT_BR_NARROWBANDMODEL = 
'pt-BR_NarrowbandModel', + PT_BR_TELEPHONY = 'pt-BR_Telephony', ZH_CN_BROADBANDMODEL = 'zh-CN_BroadbandModel', ZH_CN_NARROWBANDMODEL = 'zh-CN_NarrowbandModel', } @@ -3346,14 +3461,15 @@ namespace SpeechToTextV1 { contentType?: RecognizeConstants.ContentType | string; /** The identifier of the model that is to be used for the recognition request. (**Note:** The model * `ar-AR_BroadbandModel` is deprecated; use `ar-MS_BroadbandModel` instead.) See [Languages and - * models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-models#models). + * models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-models) and [Next-generation languages + * and models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-models-ng). */ model?: RecognizeConstants.Model | string; /** The customization ID (GUID) of a custom language model that is to be used with the recognition request. The * base model of the specified custom language model must match the model specified with the `model` parameter. You * must make the request with credentials for the instance of the service that owns the custom model. By default, - * no custom language model is used. See [Custom - * models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-input#custom-input). + * no custom language model is used. See [Using a custom language model for speech + * recognition](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-languageUse). * * **Note:** Use this parameter instead of the deprecated `customization_id` parameter. */ @@ -3361,15 +3477,16 @@ namespace SpeechToTextV1 { /** The customization ID (GUID) of a custom acoustic model that is to be used with the recognition request. The * base model of the specified custom acoustic model must match the model specified with the `model` parameter. You * must make the request with credentials for the instance of the service that owns the custom model. By default, - * no custom acoustic model is used. See [Custom - * models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-input#custom-input). + * no custom acoustic model is used. See [Using a custom acoustic model for speech + * recognition](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-acousticUse). */ acousticCustomizationId?: string; /** The version of the specified base model that is to be used with the recognition request. Multiple versions * of a base model can exist when a model is updated for internal improvements. The parameter is intended primarily * for use with custom models that have been upgraded for a new base model. The default value depends on whether - * the parameter is used with or without a custom model. See [Base model - * version](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-input#version). + * the parameter is used with or without a custom model. See [Making speech recognition requests with upgraded + * custom + * models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-custom-upgrade-use#custom-upgrade-use-recognition). */ baseModelVersion?: string; /** If you specify the customization ID (GUID) of a custom language model with the recognition request, the @@ -3384,7 +3501,8 @@ namespace SpeechToTextV1 { * of OOV words from the custom model. Use caution when setting the weight: a higher value can improve the accuracy * of phrases from the custom model's domain, but it can negatively affect performance on non-domain phrases. 
* - * See [Custom models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-input#custom-input). + * See [Using customization + * weight](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-languageUse#weight). */ customizationWeight?: number; /** The time in seconds after which, if only silence (no speech) is detected in streaming audio, the connection @@ -3402,41 +3520,42 @@ namespace SpeechToTextV1 { * 1024 characters, though the maximum effective length for double-byte languages might be shorter. Keywords are * case-insensitive. * - * See [Keyword spotting](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#keyword_spotting). + * See [Keyword + * spotting](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-spotting#keyword-spotting). */ keywords?: string[]; /** A confidence value that is the lower bound for spotting a keyword. A word is considered to match a keyword * if its confidence is greater than or equal to the threshold. Specify a probability between 0.0 and 1.0. If you * specify a threshold, you must also specify one or more keywords. The service performs no keyword spotting if you * omit either parameter. See [Keyword - * spotting](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#keyword_spotting). + * spotting](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-spotting#keyword-spotting). */ keywordsThreshold?: number; /** The maximum number of alternative transcripts that the service is to return. By default, the service returns * a single transcript. If you specify a value of `0`, the service uses the default value, `1`. See [Maximum - * alternatives](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#max_alternatives). + * alternatives](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-metadata#max-alternatives). */ maxAlternatives?: number; /** A confidence value that is the lower bound for identifying a hypothesis as a possible word alternative (also * known as "Confusion Networks"). An alternative word is considered if its confidence is greater than or equal to * the threshold. Specify a probability between 0.0 and 1.0. By default, the service computes no alternative words. * See [Word - * alternatives](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#word_alternatives). + * alternatives](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-spotting#word-alternatives). */ wordAlternativesThreshold?: number; /** If `true`, the service returns a confidence measure in the range of 0.0 to 1.0 for each word. By default, * the service returns no word confidence scores. See [Word - * confidence](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#word_confidence). + * confidence](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-metadata#word-confidence). */ wordConfidence?: boolean; /** If `true`, the service returns time alignment for each word. By default, no timestamps are returned. See - * [Word timestamps](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#word_timestamps). + * [Word timestamps](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-metadata#word-timestamps). */ timestamps?: boolean; /** If `true`, the service filters profanity from all output except for keyword results by replacing * inappropriate words with a series of asterisks. Set the parameter to `false` to return results with no - * censoring. Applies to US English transcription only. 
See [Profanity - * filtering](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#profanity_filter). + * censoring. Applies to US English and Japanese transcription only. See [Profanity + * filtering](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-formatting#profanity-filtering). */ profanityFilter?: boolean; /** If `true`, the service converts dates, times, series of digits and numbers, phone numbers, currency values, @@ -3446,17 +3565,20 @@ namespace SpeechToTextV1 { * * **Note:** Applies to US English, Japanese, and Spanish transcription only. * - * See [Smart formatting](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#smart_formatting). + * See [Smart + * formatting](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-formatting#smart-formatting). */ smartFormatting?: boolean; /** If `true`, the response includes labels that identify which words were spoken by which participants in a * multi-person exchange. By default, the service returns no speaker labels. Setting `speaker_labels` to `true` * forces the `timestamps` parameter to be `true`, regardless of whether you specify `false` for the parameter. + * * For previous-generation models, can be used for US English, Australian English, German, Japanese, Korean, and + * Spanish (both broadband and narrowband models) and UK English (narrowband model) transcription only. + * * For next-generation models, can be used for English (Australian, UK, and US), German, and Spanish + * transcription only. * - * **Note:** Applies to US English, Australian English, German, Japanese, Korean, and Spanish (both broadband and - * narrowband models) and UK English (narrowband model) transcription only. - * - * See [Speaker labels](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#speaker_labels). + * Restrictions and limitations apply to the use of speaker labels for both types of models. See [Speaker + * labels](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-speaker-labels). */ speakerLabels?: boolean; /** **Deprecated.** Use the `language_customization_id` parameter to specify the customization ID (GUID) of a @@ -3467,8 +3589,8 @@ namespace SpeechToTextV1 { /** The name of a grammar that is to be used with the recognition request. If you specify a grammar, you must * also use the `language_customization_id` parameter to specify the name of the custom language model for which * the grammar is defined. The service recognizes only strings that are recognized by the specified grammar; it - * does not recognize other custom words from the model's words resource. See - * [Grammars](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-input#grammars-input). + * does not recognize other custom words from the model's words resource. See [Using a grammar for speech + * recognition](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-grammarUse). */ grammarName?: string; /** If `true`, the service redacts, or masks, numeric data from final transcripts. The feature redacts any @@ -3482,13 +3604,14 @@ namespace SpeechToTextV1 { * * **Note:** Applies to US English, Japanese, and Korean transcription only. * - * See [Numeric redaction](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#redaction). + * See [Numeric + * redaction](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-formatting#numeric-redaction). 
*/ redaction?: boolean; /** If `true`, requests detailed information about the signal characteristics of the input audio. The service * returns audio metrics with the final transcription results. By default, the service returns no audio metrics. * - * See [Audio metrics](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-metrics#audio_metrics). + * See [Audio metrics](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-metrics#audio-metrics). */ audioMetrics?: boolean; /** If `true`, specifies the duration of the pause interval at which the service splits a transcript into @@ -3504,7 +3627,7 @@ namespace SpeechToTextV1 { * The default pause interval for most languages is 0.8 seconds; the default for Chinese is 0.6 seconds. * * See [End of phrase silence - * time](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#silence_time). + * time](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-parsing#silence-time). */ endOfPhraseSilenceTime?: number; /** If `true`, directs the service to split the transcript into multiple final results based on semantic @@ -3514,7 +3637,7 @@ namespace SpeechToTextV1 { * splits transcripts based solely on the pause interval. * * See [Split transcript at phrase - * end](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#split_transcript). + * end](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-parsing#split-transcript). */ splitTranscriptAtPhraseEnd?: boolean; /** The sensitivity of speech activity detection that the service is to perform. Use the parameter to suppress @@ -3526,8 +3649,8 @@ namespace SpeechToTextV1 { * * 0.5 (the default) provides a reasonable compromise for the level of sensitivity. * * 1.0 suppresses no audio (speech detection sensitivity is disabled). * - * The values increase on a monotonic curve. See [Speech Activity - * Detection](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-input#detection). + * The values increase on a monotonic curve. See [Speech detector + * sensitivity](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#detection-parameters-sensitivity). */ speechDetectorSensitivity?: number; /** The level to which the service is to suppress background audio based on its volume to prevent it from being @@ -3538,10 +3661,25 @@ namespace SpeechToTextV1 { * * 0.5 provides a reasonable level of audio suppression for general usage. * * 1.0 suppresses all audio (no audio is transcribed). * - * The values increase on a monotonic curve. See [Speech Activity - * Detection](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-input#detection). + * The values increase on a monotonic curve. See [Background audio + * suppression](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#detection-parameters-suppression). */ backgroundAudioSuppression?: number; + /** If `true` for next-generation `Multimedia` and `Telephony` models that support low latency, directs the + * service to produce results even more quickly than it usually does. Next-generation models produce transcription + * results faster than previous-generation models. The `low_latency` parameter causes the models to produce results + * even more quickly, though the results might be less accurate when the parameter is used. + * + * **Note:** The parameter is beta functionality. It is not available for previous-generation `Broadband` and + * `Narrowband` models. It is available only for some next-generation models. 
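For orientation, a minimal usage sketch of the new `lowLatency` parameter, assuming an IAM API key and the next-generation `en-US_Telephony` model; the API key, service URL, and audio file name below are placeholders. The sketch checks the model's `supported_features.low_latency` flag (added in this change) before enabling the parameter.

// Illustrative sketch only; '{apikey}', '{url}', and 'audio-file.flac' are placeholders.
const fs = require('fs');
const SpeechToTextV1 = require('ibm-watson/speech-to-text/v1');
const { IamAuthenticator } = require('ibm-watson/auth');

const speechToText = new SpeechToTextV1({
  authenticator: new IamAuthenticator({ apikey: '{apikey}' }),
  serviceUrl: '{url}',
});

async function transcribeWithLowLatency() {
  // Check whether the chosen next-generation model supports the beta `low_latency` parameter.
  const { result: model } = await speechToText.getModel({ modelId: 'en-US_Telephony' });
  const lowLatency = !!(model.supported_features && model.supported_features.low_latency);

  const { result } = await speechToText.recognize({
    audio: fs.createReadStream('audio-file.flac'),
    contentType: 'audio/flac',
    model: 'en-US_Telephony',
    lowLatency, // beta; only enabled when the model reports support for it
    speakerLabels: true, // also forces `timestamps` to true
  });
  console.log(JSON.stringify(result, null, 2));
}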
+ * + * * For a list of next-generation models that support low latency, see [Supported language + * models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-models-ng#models-ng-supported) for + * next-generation models. + * * For more information about the `low_latency` parameter, see [Low + * latency](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-interim#low-latency). + */ + lowLatency?: boolean; headers?: OutgoingHttpHeaders; } @@ -3566,19 +3704,25 @@ namespace SpeechToTextV1 { AUDIO_WEBM_CODECS_OPUS = 'audio/webm;codecs=opus', AUDIO_WEBM_CODECS_VORBIS = 'audio/webm;codecs=vorbis', } - /** The identifier of the model that is to be used for the recognition request. (**Note:** The model `ar-AR_BroadbandModel` is deprecated; use `ar-MS_BroadbandModel` instead.) See [Languages and models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-models#models). */ + /** The identifier of the model that is to be used for the recognition request. (**Note:** The model `ar-AR_BroadbandModel` is deprecated; use `ar-MS_BroadbandModel` instead.) See [Languages and models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-models) and [Next-generation languages and models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-models-ng). */ export enum Model { AR_AR_BROADBANDMODEL = 'ar-AR_BroadbandModel', AR_MS_BROADBANDMODEL = 'ar-MS_BroadbandModel', + AR_MS_TELEPHONY = 'ar-MS_Telephony', DE_DE_BROADBANDMODEL = 'de-DE_BroadbandModel', DE_DE_NARROWBANDMODEL = 'de-DE_NarrowbandModel', + DE_DE_TELEPHONY = 'de-DE_Telephony', EN_AU_BROADBANDMODEL = 'en-AU_BroadbandModel', EN_AU_NARROWBANDMODEL = 'en-AU_NarrowbandModel', + EN_AU_TELEPHONY = 'en-AU_Telephony', EN_GB_BROADBANDMODEL = 'en-GB_BroadbandModel', EN_GB_NARROWBANDMODEL = 'en-GB_NarrowbandModel', + EN_GB_TELEPHONY = 'en-GB_Telephony', EN_US_BROADBANDMODEL = 'en-US_BroadbandModel', + EN_US_MULTIMEDIA = 'en-US_Multimedia', EN_US_NARROWBANDMODEL = 'en-US_NarrowbandModel', EN_US_SHORTFORM_NARROWBANDMODEL = 'en-US_ShortForm_NarrowbandModel', + EN_US_TELEPHONY = 'en-US_Telephony', ES_AR_BROADBANDMODEL = 'es-AR_BroadbandModel', ES_AR_NARROWBANDMODEL = 'es-AR_NarrowbandModel', ES_CL_BROADBANDMODEL = 'es-CL_BroadbandModel', @@ -3587,16 +3731,20 @@ namespace SpeechToTextV1 { ES_CO_NARROWBANDMODEL = 'es-CO_NarrowbandModel', ES_ES_BROADBANDMODEL = 'es-ES_BroadbandModel', ES_ES_NARROWBANDMODEL = 'es-ES_NarrowbandModel', + ES_ES_TELEPHONY = 'es-ES_Telephony', ES_MX_BROADBANDMODEL = 'es-MX_BroadbandModel', ES_MX_NARROWBANDMODEL = 'es-MX_NarrowbandModel', ES_PE_BROADBANDMODEL = 'es-PE_BroadbandModel', ES_PE_NARROWBANDMODEL = 'es-PE_NarrowbandModel', FR_CA_BROADBANDMODEL = 'fr-CA_BroadbandModel', FR_CA_NARROWBANDMODEL = 'fr-CA_NarrowbandModel', + FR_CA_TELEPHONY = 'fr-CA_Telephony', FR_FR_BROADBANDMODEL = 'fr-FR_BroadbandModel', FR_FR_NARROWBANDMODEL = 'fr-FR_NarrowbandModel', + FR_FR_TELEPHONY = 'fr-FR_Telephony', IT_IT_BROADBANDMODEL = 'it-IT_BroadbandModel', IT_IT_NARROWBANDMODEL = 'it-IT_NarrowbandModel', + IT_IT_TELEPHONY = 'it-IT_Telephony', JA_JP_BROADBANDMODEL = 'ja-JP_BroadbandModel', JA_JP_NARROWBANDMODEL = 'ja-JP_NarrowbandModel', KO_KR_BROADBANDMODEL = 'ko-KR_BroadbandModel', @@ -3605,6 +3753,7 @@ namespace SpeechToTextV1 { NL_NL_NARROWBANDMODEL = 'nl-NL_NarrowbandModel', PT_BR_BROADBANDMODEL = 'pt-BR_BroadbandModel', PT_BR_NARROWBANDMODEL = 'pt-BR_NarrowbandModel', + PT_BR_TELEPHONY = 'pt-BR_Telephony', ZH_CN_BROADBANDMODEL = 'zh-CN_BroadbandModel', ZH_CN_NARROWBANDMODEL = 
'zh-CN_NarrowbandModel', } @@ -3643,7 +3792,8 @@ namespace SpeechToTextV1 { contentType?: CreateJobConstants.ContentType | string; /** The identifier of the model that is to be used for the recognition request. (**Note:** The model * `ar-AR_BroadbandModel` is deprecated; use `ar-MS_BroadbandModel` instead.) See [Languages and - * models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-models#models). + * models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-models) and [Next-generation languages + * and models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-models-ng). */ model?: CreateJobConstants.Model | string; /** A URL to which callback notifications are to be sent. The URL must already be successfully allowlisted by @@ -3685,8 +3835,8 @@ namespace SpeechToTextV1 { /** The customization ID (GUID) of a custom language model that is to be used with the recognition request. The * base model of the specified custom language model must match the model specified with the `model` parameter. You * must make the request with credentials for the instance of the service that owns the custom model. By default, - * no custom language model is used. See [Custom - * models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-input#custom-input). + * no custom language model is used. See [Using a custom language model for speech + * recognition](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-languageUse). * * **Note:** Use this parameter instead of the deprecated `customization_id` parameter. */ @@ -3694,15 +3844,16 @@ namespace SpeechToTextV1 { /** The customization ID (GUID) of a custom acoustic model that is to be used with the recognition request. The * base model of the specified custom acoustic model must match the model specified with the `model` parameter. You * must make the request with credentials for the instance of the service that owns the custom model. By default, - * no custom acoustic model is used. See [Custom - * models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-input#custom-input). + * no custom acoustic model is used. See [Using a custom acoustic model for speech + * recognition](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-acousticUse). */ acousticCustomizationId?: string; /** The version of the specified base model that is to be used with the recognition request. Multiple versions * of a base model can exist when a model is updated for internal improvements. The parameter is intended primarily * for use with custom models that have been upgraded for a new base model. The default value depends on whether - * the parameter is used with or without a custom model. See [Base model - * version](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-input#version). + * the parameter is used with or without a custom model. See [Making speech recognition requests with upgraded + * custom + * models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-custom-upgrade-use#custom-upgrade-use-recognition). */ baseModelVersion?: string; /** If you specify the customization ID (GUID) of a custom language model with the recognition request, the @@ -3717,7 +3868,8 @@ namespace SpeechToTextV1 { * of OOV words from the custom model. Use caution when setting the weight: a higher value can improve the accuracy * of phrases from the custom model's domain, but it can negatively affect performance on non-domain phrases. 
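As a companion sketch, this is one way the customization parameters described above might be combined in an asynchronous `createJob` request; the customization GUID and audio file name are placeholders, and the simple polling loop stands in for callback-based job notifications.

// Illustrative sketch; '{customization-guid}' and 'audio-file.mp3' are placeholders.
const fs = require('fs');

async function submitCustomizedJob(speechToText) {
  const { result: job } = await speechToText.createJob({
    audio: fs.createReadStream('audio-file.mp3'),
    contentType: 'audio/mp3',
    model: 'en-US_BroadbandModel',
    languageCustomizationId: '{customization-guid}',
    customizationWeight: 0.5, // bias recognition toward the custom model's domain phrases
    smartFormatting: true,
  });

  // Poll the asynchronous job until it finishes.
  let status = job.status;
  while (status !== 'completed' && status !== 'failed') {
    await new Promise(resolve => setTimeout(resolve, 5000));
    const { result } = await speechToText.checkJob({ id: job.id });
    status = result.status;
  }
  return status;
}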
* - * See [Custom models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-input#custom-input). + * See [Using customization + * weight](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-languageUse#weight). */ customizationWeight?: number; /** The time in seconds after which, if only silence (no speech) is detected in streaming audio, the connection @@ -3735,41 +3887,42 @@ namespace SpeechToTextV1 { * 1024 characters, though the maximum effective length for double-byte languages might be shorter. Keywords are * case-insensitive. * - * See [Keyword spotting](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#keyword_spotting). + * See [Keyword + * spotting](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-spotting#keyword-spotting). */ keywords?: string[]; /** A confidence value that is the lower bound for spotting a keyword. A word is considered to match a keyword * if its confidence is greater than or equal to the threshold. Specify a probability between 0.0 and 1.0. If you * specify a threshold, you must also specify one or more keywords. The service performs no keyword spotting if you * omit either parameter. See [Keyword - * spotting](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#keyword_spotting). + * spotting](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-spotting#keyword-spotting). */ keywordsThreshold?: number; /** The maximum number of alternative transcripts that the service is to return. By default, the service returns * a single transcript. If you specify a value of `0`, the service uses the default value, `1`. See [Maximum - * alternatives](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#max_alternatives). + * alternatives](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-metadata#max-alternatives). */ maxAlternatives?: number; /** A confidence value that is the lower bound for identifying a hypothesis as a possible word alternative (also * known as "Confusion Networks"). An alternative word is considered if its confidence is greater than or equal to * the threshold. Specify a probability between 0.0 and 1.0. By default, the service computes no alternative words. * See [Word - * alternatives](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#word_alternatives). + * alternatives](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-spotting#word-alternatives). */ wordAlternativesThreshold?: number; /** If `true`, the service returns a confidence measure in the range of 0.0 to 1.0 for each word. By default, * the service returns no word confidence scores. See [Word - * confidence](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#word_confidence). + * confidence](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-metadata#word-confidence). */ wordConfidence?: boolean; /** If `true`, the service returns time alignment for each word. By default, no timestamps are returned. See - * [Word timestamps](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#word_timestamps). + * [Word timestamps](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-metadata#word-timestamps). */ timestamps?: boolean; /** If `true`, the service filters profanity from all output except for keyword results by replacing * inappropriate words with a series of asterisks. Set the parameter to `false` to return results with no - * censoring. Applies to US English transcription only. 
See [Profanity - * filtering](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#profanity_filter). + * censoring. Applies to US English and Japanese transcription only. See [Profanity + * filtering](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-formatting#profanity-filtering). */ profanityFilter?: boolean; /** If `true`, the service converts dates, times, series of digits and numbers, phone numbers, currency values, @@ -3779,17 +3932,20 @@ namespace SpeechToTextV1 { * * **Note:** Applies to US English, Japanese, and Spanish transcription only. * - * See [Smart formatting](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#smart_formatting). + * See [Smart + * formatting](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-formatting#smart-formatting). */ smartFormatting?: boolean; /** If `true`, the response includes labels that identify which words were spoken by which participants in a * multi-person exchange. By default, the service returns no speaker labels. Setting `speaker_labels` to `true` * forces the `timestamps` parameter to be `true`, regardless of whether you specify `false` for the parameter. + * * For previous-generation models, can be used for US English, Australian English, German, Japanese, Korean, and + * Spanish (both broadband and narrowband models) and UK English (narrowband model) transcription only. + * * For next-generation models, can be used for English (Australian, UK, and US), German, and Spanish + * transcription only. * - * **Note:** Applies to US English, Australian English, German, Japanese, Korean, and Spanish (both broadband and - * narrowband models) and UK English (narrowband model) transcription only. - * - * See [Speaker labels](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#speaker_labels). + * Restrictions and limitations apply to the use of speaker labels for both types of models. See [Speaker + * labels](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-speaker-labels). */ speakerLabels?: boolean; /** **Deprecated.** Use the `language_customization_id` parameter to specify the customization ID (GUID) of a @@ -3800,8 +3956,8 @@ namespace SpeechToTextV1 { /** The name of a grammar that is to be used with the recognition request. If you specify a grammar, you must * also use the `language_customization_id` parameter to specify the name of the custom language model for which * the grammar is defined. The service recognizes only strings that are recognized by the specified grammar; it - * does not recognize other custom words from the model's words resource. See - * [Grammars](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-input#grammars-input). + * does not recognize other custom words from the model's words resource. See [Using a grammar for speech + * recognition](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-grammarUse). */ grammarName?: string; /** If `true`, the service redacts, or masks, numeric data from final transcripts. The feature redacts any @@ -3815,7 +3971,8 @@ namespace SpeechToTextV1 { * * **Note:** Applies to US English, Japanese, and Korean transcription only. * - * See [Numeric redaction](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#redaction). + * See [Numeric + * redaction](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-formatting#numeric-redaction). 
*/ redaction?: boolean; /** If `true`, requests processing metrics about the service's transcription of the input audio. The service @@ -3824,7 +3981,7 @@ namespace SpeechToTextV1 { * service returns no processing metrics. * * See [Processing - * metrics](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-metrics#processing_metrics). + * metrics](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-metrics#processing-metrics). */ processingMetrics?: boolean; /** Specifies the interval in real wall-clock seconds at which the service is to return processing metrics. The @@ -3838,13 +3995,13 @@ namespace SpeechToTextV1 { * duration of the audio, the service returns processing metrics only for transcription events. * * See [Processing - * metrics](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-metrics#processing_metrics). + * metrics](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-metrics#processing-metrics). */ processingMetricsInterval?: number; /** If `true`, requests detailed information about the signal characteristics of the input audio. The service * returns audio metrics with the final transcription results. By default, the service returns no audio metrics. * - * See [Audio metrics](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-metrics#audio_metrics). + * See [Audio metrics](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-metrics#audio-metrics). */ audioMetrics?: boolean; /** If `true`, specifies the duration of the pause interval at which the service splits a transcript into @@ -3860,7 +4017,7 @@ namespace SpeechToTextV1 { * The default pause interval for most languages is 0.8 seconds; the default for Chinese is 0.6 seconds. * * See [End of phrase silence - * time](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#silence_time). + * time](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-parsing#silence-time). */ endOfPhraseSilenceTime?: number; /** If `true`, directs the service to split the transcript into multiple final results based on semantic @@ -3870,7 +4027,7 @@ namespace SpeechToTextV1 { * splits transcripts based solely on the pause interval. * * See [Split transcript at phrase - * end](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-output#split_transcript). + * end](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-parsing#split-transcript). */ splitTranscriptAtPhraseEnd?: boolean; /** The sensitivity of speech activity detection that the service is to perform. Use the parameter to suppress @@ -3882,8 +4039,8 @@ namespace SpeechToTextV1 { * * 0.5 (the default) provides a reasonable compromise for the level of sensitivity. * * 1.0 suppresses no audio (speech detection sensitivity is disabled). * - * The values increase on a monotonic curve. See [Speech Activity - * Detection](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-input#detection). + * The values increase on a monotonic curve. See [Speech detector + * sensitivity](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#detection-parameters-sensitivity). */ speechDetectorSensitivity?: number; /** The level to which the service is to suppress background audio based on its volume to prevent it from being @@ -3894,10 +4051,25 @@ namespace SpeechToTextV1 { * * 0.5 provides a reasonable level of audio suppression for general usage. * * 1.0 suppresses all audio (no audio is transcribed). * - * The values increase on a monotonic curve. 
See [Speech Activity - * Detection](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-input#detection). + * The values increase on a monotonic curve. See [Background audio + * suppression](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#detection-parameters-suppression). */ backgroundAudioSuppression?: number; + /** If `true` for next-generation `Multimedia` and `Telephony` models that support low latency, directs the + * service to produce results even more quickly than it usually does. Next-generation models produce transcription + * results faster than previous-generation models. The `low_latency` parameter causes the models to produce results + * even more quickly, though the results might be less accurate when the parameter is used. + * + * **Note:** The parameter is beta functionality. It is not available for previous-generation `Broadband` and + * `Narrowband` models. It is available only for some next-generation models. + * + * * For a list of next-generation models that support low latency, see [Supported language + * models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-models-ng#models-ng-supported) for + * next-generation models. + * * For more information about the `low_latency` parameter, see [Low + * latency](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-interim#low-latency). + */ + lowLatency?: boolean; headers?: OutgoingHttpHeaders; } @@ -3922,19 +4094,25 @@ namespace SpeechToTextV1 { AUDIO_WEBM_CODECS_OPUS = 'audio/webm;codecs=opus', AUDIO_WEBM_CODECS_VORBIS = 'audio/webm;codecs=vorbis', } - /** The identifier of the model that is to be used for the recognition request. (**Note:** The model `ar-AR_BroadbandModel` is deprecated; use `ar-MS_BroadbandModel` instead.) See [Languages and models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-models#models). */ + /** The identifier of the model that is to be used for the recognition request. (**Note:** The model `ar-AR_BroadbandModel` is deprecated; use `ar-MS_BroadbandModel` instead.) See [Languages and models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-models) and [Next-generation languages and models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-models-ng). 
*/ export enum Model { AR_AR_BROADBANDMODEL = 'ar-AR_BroadbandModel', AR_MS_BROADBANDMODEL = 'ar-MS_BroadbandModel', + AR_MS_TELEPHONY = 'ar-MS_Telephony', DE_DE_BROADBANDMODEL = 'de-DE_BroadbandModel', DE_DE_NARROWBANDMODEL = 'de-DE_NarrowbandModel', + DE_DE_TELEPHONY = 'de-DE_Telephony', EN_AU_BROADBANDMODEL = 'en-AU_BroadbandModel', EN_AU_NARROWBANDMODEL = 'en-AU_NarrowbandModel', + EN_AU_TELEPHONY = 'en-AU_Telephony', EN_GB_BROADBANDMODEL = 'en-GB_BroadbandModel', EN_GB_NARROWBANDMODEL = 'en-GB_NarrowbandModel', + EN_GB_TELEPHONY = 'en-GB_Telephony', EN_US_BROADBANDMODEL = 'en-US_BroadbandModel', + EN_US_MULTIMEDIA = 'en-US_Multimedia', EN_US_NARROWBANDMODEL = 'en-US_NarrowbandModel', EN_US_SHORTFORM_NARROWBANDMODEL = 'en-US_ShortForm_NarrowbandModel', + EN_US_TELEPHONY = 'en-US_Telephony', ES_AR_BROADBANDMODEL = 'es-AR_BroadbandModel', ES_AR_NARROWBANDMODEL = 'es-AR_NarrowbandModel', ES_CL_BROADBANDMODEL = 'es-CL_BroadbandModel', @@ -3943,16 +4121,20 @@ namespace SpeechToTextV1 { ES_CO_NARROWBANDMODEL = 'es-CO_NarrowbandModel', ES_ES_BROADBANDMODEL = 'es-ES_BroadbandModel', ES_ES_NARROWBANDMODEL = 'es-ES_NarrowbandModel', + ES_ES_TELEPHONY = 'es-ES_Telephony', ES_MX_BROADBANDMODEL = 'es-MX_BroadbandModel', ES_MX_NARROWBANDMODEL = 'es-MX_NarrowbandModel', ES_PE_BROADBANDMODEL = 'es-PE_BroadbandModel', ES_PE_NARROWBANDMODEL = 'es-PE_NarrowbandModel', FR_CA_BROADBANDMODEL = 'fr-CA_BroadbandModel', FR_CA_NARROWBANDMODEL = 'fr-CA_NarrowbandModel', + FR_CA_TELEPHONY = 'fr-CA_Telephony', FR_FR_BROADBANDMODEL = 'fr-FR_BroadbandModel', FR_FR_NARROWBANDMODEL = 'fr-FR_NarrowbandModel', + FR_FR_TELEPHONY = 'fr-FR_Telephony', IT_IT_BROADBANDMODEL = 'it-IT_BroadbandModel', IT_IT_NARROWBANDMODEL = 'it-IT_NarrowbandModel', + IT_IT_TELEPHONY = 'it-IT_Telephony', JA_JP_BROADBANDMODEL = 'ja-JP_BroadbandModel', JA_JP_NARROWBANDMODEL = 'ja-JP_NarrowbandModel', KO_KR_BROADBANDMODEL = 'ko-KR_BroadbandModel', @@ -3961,6 +4143,7 @@ namespace SpeechToTextV1 { NL_NL_NARROWBANDMODEL = 'nl-NL_NarrowbandModel', PT_BR_BROADBANDMODEL = 'pt-BR_BroadbandModel', PT_BR_NARROWBANDMODEL = 'pt-BR_NarrowbandModel', + PT_BR_TELEPHONY = 'pt-BR_Telephony', ZH_CN_BROADBANDMODEL = 'zh-CN_BroadbandModel', ZH_CN_NARROWBANDMODEL = 'zh-CN_NarrowbandModel', } @@ -4159,6 +4342,9 @@ namespace SpeechToTextV1 { * * The value that you assign is used for all recognition requests that use the model. You can override it for any * recognition request by specifying a customization weight for that request. + * + * See [Using customization + * weight](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-languageUse#weight). */ customizationWeight?: number; headers?: OutgoingHttpHeaders; @@ -4629,7 +4815,7 @@ namespace SpeechToTextV1 { * was last trained. Use this parameter only to force the upgrade of a custom acoustic model that is trained with a * custom language model, and only if you receive a 400 response code and the message `No input data modified since * last training`. See [Upgrading a custom acoustic - * model](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-customUpgrade#upgradeAcoustic). + * model](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-custom-upgrade#custom-upgrade-acoustic). */ force?: boolean; headers?: OutgoingHttpHeaders; @@ -5340,8 +5526,10 @@ namespace SpeechToTextV1 { export interface SpeechRecognitionAlternative { /** A transcription of the audio. */ transcript: string; - /** A score that indicates the service's confidence in the transcript in the range of 0.0 to 1.0. 
A confidence - * score is returned only for the best alternative and only with results marked as final. + /** A score that indicates the service's confidence in the transcript in the range of 0.0 to 1.0. For speech + * recognition with previous-generation models, a confidence score is returned only for the best alternative and + * only with results marked as final. For speech recognition with next-generation models, a confidence score is + * never returned. */ confidence?: number; /** Time alignments for each word from the transcript as a list of lists. Each inner list consists of three @@ -5443,6 +5631,11 @@ namespace SpeechToTextV1 { * (narrowband model only). Speaker labels are not supported for any other models. */ speaker_labels: boolean; + /** Indicates whether the `low_latency` parameter can be used with a next-generation language model. The field + * is returned only for next-generation models. Previous-generation models do not support the `low_latency` + * parameter. + */ + low_latency?: boolean; } /** The response from training of a custom language or custom acoustic model. */ diff --git a/test/integration/text-to-speech.test.js b/test/integration/text-to-speech.test.js index 75fbb1d3de..6b9bf69ff8 100644 --- a/test/integration/text-to-speech.test.js +++ b/test/integration/text-to-speech.test.js @@ -2,6 +2,8 @@ const { IamAuthenticator } = require('../../dist/auth'); const TextToSpeechV1 = require('../../dist/text-to-speech/v1'); +const fs = require('fs'); +const path = require('path'); const wav = require('wav'); const authHelper = require('../resources/auth_helper.js'); const describe = authHelper.describe; // this runs describe.skip if there is no auth.js file :) @@ -94,6 +96,66 @@ describe('text to speech_integration', () => { customizationId = result.customization_id; }); + describe('custom prompts', () => { + const promptId = 'Hello'; + + it('should addCustomPrompt()', async () => { + expect(customizationId).toBeTruthy(); + + const params = { + customizationId, + promptId, + metadata: { + prompt_text: 'Hello, how are you today?', + }, + file: fs.createReadStream(path.join(__dirname, '../resources/tts_audio.wav')), + filename: 'tts_audio.wav', + }; + + const res = await textToSpeech.addCustomPrompt(params); + const { result } = res || {}; + expect(result.prompt_id).toBe('Hello'); + }); + + it('should listCustomPrompts()', async () => { + expect(customizationId).toBeTruthy(); + + const params = { + customizationId, + }; + + const res = await textToSpeech.listCustomPrompts(params); + const { result } = res || {}; + expect(result.prompts.length).toBeTruthy(); + }); + + it('should getCustomPrompt()', async () => { + expect(customizationId).toBeTruthy(); + + const params = { + customizationId, + promptId, + }; + + const res = await textToSpeech.getCustomPrompt(params); + const { result } = res || {}; + expect(result.prompt_id).toBe('Hello'); + }); + + it('should deleteCustomPrompt()', async () => { + expect(customizationId).toBeTruthy(); + + const params = { + customizationId, + promptId, + }; + + const res = await textToSpeech.deleteCustomPrompt(params); + const { result } = res || {}; + expect(result).toBeDefined(); + }); + }); + it('should listCustomModels() with language', async () => { const params = { language: 'en-GB', @@ -220,4 +282,52 @@ describe('text to speech_integration', () => { expect(result).toBeDefined(); }); }); + + describe('speaker models', () => { + let speakerId; + + it('should createSpeakerModel()', async () => { + const params = { + speakerName: 'Angelo', 
+ audio: fs.createReadStream(path.join(__dirname, '../resources/tts_audio.wav')), + }; + + const res = await textToSpeech.createSpeakerModel(params); + const { result } = res || {}; + expect(result.speaker_id).toBeDefined(); + speakerId = result.speaker_id; + }); + + it('should listSpeakerModels()', async () => { + expect(speakerId).toBeTruthy(); + + const res = await textToSpeech.listSpeakerModels(); + const { result } = res || {}; + expect(result.speakers.length).toBeTruthy(); + }); + + it('should getSpeakerModel()', async () => { + expect(speakerId).toBeTruthy(); + + const params = { + speakerId, + }; + + const res = await textToSpeech.getSpeakerModel(params); + const { result } = res || {}; + expect(result.customizations).toBeDefined(); + }); + + it('should deleteSpeakerModel()', async () => { + expect(speakerId).toBeTruthy(); + + const params = { + speakerId, + }; + + const res = await textToSpeech.deleteSpeakerModel(params); + const { result } = res || {}; + expect(result).toBeDefined(); + }); + }); }); diff --git a/test/resources/tts_audio.wav b/test/resources/tts_audio.wav new file mode 100644 index 0000000000..ba4760649e Binary files /dev/null and b/test/resources/tts_audio.wav differ diff --git a/test/unit/speech-to-text.v1.test.js b/test/unit/speech-to-text.v1.test.js index d5dd4e147b..984aa96b05 100644 --- a/test/unit/speech-to-text.v1.test.js +++ b/test/unit/speech-to-text.v1.test.js @@ -1,5 +1,5 @@ /** - * (C) Copyright IBM Corp. 2018, 2020. + * (C) Copyright IBM Corp. 2018, 2021. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -253,6 +253,7 @@ describe('SpeechToTextV1', () => { const splitTranscriptAtPhraseEnd = true; const speechDetectorSensitivity = 36.0; const backgroundAudioSuppression = 36.0; + const lowLatency = true; const params = { audio: audio, contentType: contentType, @@ -279,6 +280,7 @@ describe('SpeechToTextV1', () => { splitTranscriptAtPhraseEnd: splitTranscriptAtPhraseEnd, speechDetectorSensitivity: speechDetectorSensitivity, backgroundAudioSuppression: backgroundAudioSuppression, + lowLatency: lowLatency, }; const recognizeResult = speechToTextService.recognize(params); @@ -320,6 +322,7 @@ describe('SpeechToTextV1', () => { expect(options.qs['split_transcript_at_phrase_end']).toEqual(splitTranscriptAtPhraseEnd); expect(options.qs['speech_detector_sensitivity']).toEqual(speechDetectorSensitivity); expect(options.qs['background_audio_suppression']).toEqual(backgroundAudioSuppression); + expect(options.qs['low_latency']).toEqual(lowLatency); }); test('should prioritize user-given headers', () => { @@ -538,6 +541,7 @@ describe('SpeechToTextV1', () => { const splitTranscriptAtPhraseEnd = true; const speechDetectorSensitivity = 36.0; const backgroundAudioSuppression = 36.0; + const lowLatency = true; const params = { audio: audio, contentType: contentType, @@ -570,6 +574,7 @@ describe('SpeechToTextV1', () => { splitTranscriptAtPhraseEnd: splitTranscriptAtPhraseEnd, speechDetectorSensitivity: speechDetectorSensitivity, backgroundAudioSuppression: backgroundAudioSuppression, + lowLatency: lowLatency, }; const createJobResult = speechToTextService.createJob(params); @@ -617,6 +622,7 @@ describe('SpeechToTextV1', () => { expect(options.qs['split_transcript_at_phrase_end']).toEqual(splitTranscriptAtPhraseEnd); expect(options.qs['speech_detector_sensitivity']).toEqual(speechDetectorSensitivity); expect(options.qs['background_audio_suppression']).toEqual(backgroundAudioSuppression); 
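// The SDK serializes camelCase parameters to snake_case query values, so the new
// `lowLatency` option surfaces as the `low_latency` query parameter asserted next.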
+ expect(options.qs['low_latency']).toEqual(lowLatency); }); test('should prioritize user-given headers', () => { diff --git a/test/unit/text-to-speech.v1.test.js b/test/unit/text-to-speech.v1.test.js index 4f30145972..4bb04b2b09 100644 --- a/test/unit/text-to-speech.v1.test.js +++ b/test/unit/text-to-speech.v1.test.js @@ -1,5 +1,5 @@ /** - * (C) Copyright IBM Corp. 2018, 2020. + * (C) Copyright IBM Corp. 2018, 2021. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -1103,6 +1103,582 @@ describe('TextToSpeechV1', () => { }); }); }); + describe('listCustomPrompts', () => { + describe('positive tests', () => { + test('should pass the right params to createRequest', () => { + // Construct the params object for operation listCustomPrompts + const customizationId = 'testString'; + const params = { + customizationId: customizationId, + }; + + const listCustomPromptsResult = textToSpeechService.listCustomPrompts(params); + + // all methods should return a Promise + expectToBePromise(listCustomPromptsResult); + + // assert that create request was called + expect(createRequestMock).toHaveBeenCalledTimes(1); + + const options = getOptions(createRequestMock); + + checkUrlAndMethod(options, '/v1/customizations/{customization_id}/prompts', 'GET'); + const expectedAccept = 'application/json'; + const expectedContentType = undefined; + checkMediaHeaders(createRequestMock, expectedAccept, expectedContentType); + expect(options.path['customization_id']).toEqual(customizationId); + }); + + test('should prioritize user-given headers', () => { + // parameters + const customizationId = 'testString'; + const userAccept = 'fake/accept'; + const userContentType = 'fake/contentType'; + const params = { + customizationId, + headers: { + Accept: userAccept, + 'Content-Type': userContentType, + }, + }; + + textToSpeechService.listCustomPrompts(params); + checkMediaHeaders(createRequestMock, userAccept, userContentType); + }); + }); + + describe('negative tests', () => { + test('should enforce required parameters', async done => { + let err; + try { + await textToSpeechService.listCustomPrompts({}); + } catch (e) { + err = e; + } + + expect(err.message).toMatch(/Missing required parameters/); + done(); + }); + + test('should reject promise when required params are not given', done => { + const listCustomPromptsPromise = textToSpeechService.listCustomPrompts(); + expectToBePromise(listCustomPromptsPromise); + + listCustomPromptsPromise.catch(err => { + expect(err.message).toMatch(/Missing required parameters/); + done(); + }); + }); + }); + }); + describe('addCustomPrompt', () => { + describe('positive tests', () => { + // Request models needed by this operation. 
+ + // PromptMetadata + const promptMetadataModel = { + prompt_text: 'testString', + speaker_id: 'testString', + }; + + test('should pass the right params to createRequest', () => { + // Construct the params object for operation addCustomPrompt + const customizationId = 'testString'; + const promptId = 'testString'; + const metadata = promptMetadataModel; + const file = Buffer.from('This is a mock file.'); + const filename = 'testString'; + const params = { + customizationId: customizationId, + promptId: promptId, + metadata: metadata, + file: file, + filename: filename, + }; + + const addCustomPromptResult = textToSpeechService.addCustomPrompt(params); + + // all methods should return a Promise + expectToBePromise(addCustomPromptResult); + + // assert that create request was called + expect(createRequestMock).toHaveBeenCalledTimes(1); + + const options = getOptions(createRequestMock); + + checkUrlAndMethod( + options, + '/v1/customizations/{customization_id}/prompts/{prompt_id}', + 'POST' + ); + const expectedAccept = 'application/json'; + const expectedContentType = 'multipart/form-data'; + checkMediaHeaders(createRequestMock, expectedAccept, expectedContentType); + expect(options.formData['metadata']).toEqual(metadata); + expect(options.formData['file'].data).toEqual(file); + expect(options.formData['file'].filename).toEqual(filename); + expect(options.formData['file'].contentType).toEqual('audio/wav'); + expect(options.path['customization_id']).toEqual(customizationId); + expect(options.path['prompt_id']).toEqual(promptId); + }); + + test('should prioritize user-given headers', () => { + // parameters + const customizationId = 'testString'; + const promptId = 'testString'; + const metadata = promptMetadataModel; + const file = Buffer.from('This is a mock file.'); + const filename = 'testString'; + const userAccept = 'fake/accept'; + const userContentType = 'fake/contentType'; + const params = { + customizationId, + promptId, + metadata, + file, + filename, + headers: { + Accept: userAccept, + 'Content-Type': userContentType, + }, + }; + + textToSpeechService.addCustomPrompt(params); + checkMediaHeaders(createRequestMock, userAccept, userContentType); + }); + }); + + describe('negative tests', () => { + test('should enforce required parameters', async done => { + let err; + try { + await textToSpeechService.addCustomPrompt({}); + } catch (e) { + err = e; + } + + expect(err.message).toMatch(/Missing required parameters/); + done(); + }); + + test('should reject promise when required params are not given', done => { + const addCustomPromptPromise = textToSpeechService.addCustomPrompt(); + expectToBePromise(addCustomPromptPromise); + + addCustomPromptPromise.catch(err => { + expect(err.message).toMatch(/Missing required parameters/); + done(); + }); + }); + }); + }); + describe('getCustomPrompt', () => { + describe('positive tests', () => { + test('should pass the right params to createRequest', () => { + // Construct the params object for operation getCustomPrompt + const customizationId = 'testString'; + const promptId = 'testString'; + const params = { + customizationId: customizationId, + promptId: promptId, + }; + + const getCustomPromptResult = textToSpeechService.getCustomPrompt(params); + + // all methods should return a Promise + expectToBePromise(getCustomPromptResult); + + // assert that create request was called + expect(createRequestMock).toHaveBeenCalledTimes(1); + + const options = getOptions(createRequestMock); + + checkUrlAndMethod( + options, + 
'/v1/customizations/{customization_id}/prompts/{prompt_id}', + 'GET' + ); + const expectedAccept = 'application/json'; + const expectedContentType = undefined; + checkMediaHeaders(createRequestMock, expectedAccept, expectedContentType); + expect(options.path['customization_id']).toEqual(customizationId); + expect(options.path['prompt_id']).toEqual(promptId); + }); + + test('should prioritize user-given headers', () => { + // parameters + const customizationId = 'testString'; + const promptId = 'testString'; + const userAccept = 'fake/accept'; + const userContentType = 'fake/contentType'; + const params = { + customizationId, + promptId, + headers: { + Accept: userAccept, + 'Content-Type': userContentType, + }, + }; + + textToSpeechService.getCustomPrompt(params); + checkMediaHeaders(createRequestMock, userAccept, userContentType); + }); + }); + + describe('negative tests', () => { + test('should enforce required parameters', async done => { + let err; + try { + await textToSpeechService.getCustomPrompt({}); + } catch (e) { + err = e; + } + + expect(err.message).toMatch(/Missing required parameters/); + done(); + }); + + test('should reject promise when required params are not given', done => { + const getCustomPromptPromise = textToSpeechService.getCustomPrompt(); + expectToBePromise(getCustomPromptPromise); + + getCustomPromptPromise.catch(err => { + expect(err.message).toMatch(/Missing required parameters/); + done(); + }); + }); + }); + }); + describe('deleteCustomPrompt', () => { + describe('positive tests', () => { + test('should pass the right params to createRequest', () => { + // Construct the params object for operation deleteCustomPrompt + const customizationId = 'testString'; + const promptId = 'testString'; + const params = { + customizationId: customizationId, + promptId: promptId, + }; + + const deleteCustomPromptResult = textToSpeechService.deleteCustomPrompt(params); + + // all methods should return a Promise + expectToBePromise(deleteCustomPromptResult); + + // assert that create request was called + expect(createRequestMock).toHaveBeenCalledTimes(1); + + const options = getOptions(createRequestMock); + + checkUrlAndMethod( + options, + '/v1/customizations/{customization_id}/prompts/{prompt_id}', + 'DELETE' + ); + const expectedAccept = undefined; + const expectedContentType = undefined; + checkMediaHeaders(createRequestMock, expectedAccept, expectedContentType); + expect(options.path['customization_id']).toEqual(customizationId); + expect(options.path['prompt_id']).toEqual(promptId); + }); + + test('should prioritize user-given headers', () => { + // parameters + const customizationId = 'testString'; + const promptId = 'testString'; + const userAccept = 'fake/accept'; + const userContentType = 'fake/contentType'; + const params = { + customizationId, + promptId, + headers: { + Accept: userAccept, + 'Content-Type': userContentType, + }, + }; + + textToSpeechService.deleteCustomPrompt(params); + checkMediaHeaders(createRequestMock, userAccept, userContentType); + }); + }); + + describe('negative tests', () => { + test('should enforce required parameters', async done => { + let err; + try { + await textToSpeechService.deleteCustomPrompt({}); + } catch (e) { + err = e; + } + + expect(err.message).toMatch(/Missing required parameters/); + done(); + }); + + test('should reject promise when required params are not given', done => { + const deleteCustomPromptPromise = textToSpeechService.deleteCustomPrompt(); + expectToBePromise(deleteCustomPromptPromise); + + 
deleteCustomPromptPromise.catch(err => { + expect(err.message).toMatch(/Missing required parameters/); + done(); + }); + }); + }); + }); + describe('listSpeakerModels', () => { + describe('positive tests', () => { + test('should pass the right params to createRequest', () => { + // Construct the params object for operation listSpeakerModels + const params = {}; + + const listSpeakerModelsResult = textToSpeechService.listSpeakerModels(params); + + // all methods should return a Promise + expectToBePromise(listSpeakerModelsResult); + + // assert that create request was called + expect(createRequestMock).toHaveBeenCalledTimes(1); + + const options = getOptions(createRequestMock); + + checkUrlAndMethod(options, '/v1/speakers', 'GET'); + const expectedAccept = 'application/json'; + const expectedContentType = undefined; + checkMediaHeaders(createRequestMock, expectedAccept, expectedContentType); + }); + + test('should prioritize user-given headers', () => { + // parameters + const userAccept = 'fake/accept'; + const userContentType = 'fake/contentType'; + const params = { + headers: { + Accept: userAccept, + 'Content-Type': userContentType, + }, + }; + + textToSpeechService.listSpeakerModels(params); + checkMediaHeaders(createRequestMock, userAccept, userContentType); + }); + + test('should not have any problems when no parameters are passed in', () => { + // invoke the method with no parameters + textToSpeechService.listSpeakerModels({}); + checkForSuccessfulExecution(createRequestMock); + }); + }); + }); + describe('createSpeakerModel', () => { + describe('positive tests', () => { + test('should pass the right params to createRequest', () => { + // Construct the params object for operation createSpeakerModel + const speakerName = 'testString'; + const audio = Buffer.from('This is a mock file.'); + const params = { + speakerName: speakerName, + audio: audio, + }; + + const createSpeakerModelResult = textToSpeechService.createSpeakerModel(params); + + // all methods should return a Promise + expectToBePromise(createSpeakerModelResult); + + // assert that create request was called + expect(createRequestMock).toHaveBeenCalledTimes(1); + + const options = getOptions(createRequestMock); + + checkUrlAndMethod(options, '/v1/speakers', 'POST'); + const expectedAccept = 'application/json'; + const expectedContentType = 'audio/wav'; + checkMediaHeaders(createRequestMock, expectedAccept, expectedContentType); + expect(options.body).toEqual(audio); + expect(options.qs['speaker_name']).toEqual(speakerName); + }); + + test('should prioritize user-given headers', () => { + // parameters + const speakerName = 'testString'; + const audio = Buffer.from('This is a mock file.'); + const userAccept = 'fake/accept'; + const userContentType = 'fake/contentType'; + const params = { + speakerName, + audio, + headers: { + Accept: userAccept, + 'Content-Type': userContentType, + }, + }; + + textToSpeechService.createSpeakerModel(params); + checkMediaHeaders(createRequestMock, userAccept, userContentType); + }); + }); + + describe('negative tests', () => { + test('should enforce required parameters', async done => { + let err; + try { + await textToSpeechService.createSpeakerModel({}); + } catch (e) { + err = e; + } + + expect(err.message).toMatch(/Missing required parameters/); + done(); + }); + + test('should reject promise when required params are not given', done => { + const createSpeakerModelPromise = textToSpeechService.createSpeakerModel(); + expectToBePromise(createSpeakerModelPromise); + + 
createSpeakerModelPromise.catch(err => { + expect(err.message).toMatch(/Missing required parameters/); + done(); + }); + }); + }); + }); + describe('getSpeakerModel', () => { + describe('positive tests', () => { + test('should pass the right params to createRequest', () => { + // Construct the params object for operation getSpeakerModel + const speakerId = 'testString'; + const params = { + speakerId: speakerId, + }; + + const getSpeakerModelResult = textToSpeechService.getSpeakerModel(params); + + // all methods should return a Promise + expectToBePromise(getSpeakerModelResult); + + // assert that create request was called + expect(createRequestMock).toHaveBeenCalledTimes(1); + + const options = getOptions(createRequestMock); + + checkUrlAndMethod(options, '/v1/speakers/{speaker_id}', 'GET'); + const expectedAccept = 'application/json'; + const expectedContentType = undefined; + checkMediaHeaders(createRequestMock, expectedAccept, expectedContentType); + expect(options.path['speaker_id']).toEqual(speakerId); + }); + + test('should prioritize user-given headers', () => { + // parameters + const speakerId = 'testString'; + const userAccept = 'fake/accept'; + const userContentType = 'fake/contentType'; + const params = { + speakerId, + headers: { + Accept: userAccept, + 'Content-Type': userContentType, + }, + }; + + textToSpeechService.getSpeakerModel(params); + checkMediaHeaders(createRequestMock, userAccept, userContentType); + }); + }); + + describe('negative tests', () => { + test('should enforce required parameters', async done => { + let err; + try { + await textToSpeechService.getSpeakerModel({}); + } catch (e) { + err = e; + } + + expect(err.message).toMatch(/Missing required parameters/); + done(); + }); + + test('should reject promise when required params are not given', done => { + const getSpeakerModelPromise = textToSpeechService.getSpeakerModel(); + expectToBePromise(getSpeakerModelPromise); + + getSpeakerModelPromise.catch(err => { + expect(err.message).toMatch(/Missing required parameters/); + done(); + }); + }); + }); + }); + describe('deleteSpeakerModel', () => { + describe('positive tests', () => { + test('should pass the right params to createRequest', () => { + // Construct the params object for operation deleteSpeakerModel + const speakerId = 'testString'; + const params = { + speakerId: speakerId, + }; + + const deleteSpeakerModelResult = textToSpeechService.deleteSpeakerModel(params); + + // all methods should return a Promise + expectToBePromise(deleteSpeakerModelResult); + + // assert that create request was called + expect(createRequestMock).toHaveBeenCalledTimes(1); + + const options = getOptions(createRequestMock); + + checkUrlAndMethod(options, '/v1/speakers/{speaker_id}', 'DELETE'); + const expectedAccept = undefined; + const expectedContentType = undefined; + checkMediaHeaders(createRequestMock, expectedAccept, expectedContentType); + expect(options.path['speaker_id']).toEqual(speakerId); + }); + + test('should prioritize user-given headers', () => { + // parameters + const speakerId = 'testString'; + const userAccept = 'fake/accept'; + const userContentType = 'fake/contentType'; + const params = { + speakerId, + headers: { + Accept: userAccept, + 'Content-Type': userContentType, + }, + }; + + textToSpeechService.deleteSpeakerModel(params); + checkMediaHeaders(createRequestMock, userAccept, userContentType); + }); + }); + + describe('negative tests', () => { + test('should enforce required parameters', async done => { + let err; + try { + await 
textToSpeechService.deleteSpeakerModel({}); + } catch (e) { + err = e; + } + + expect(err.message).toMatch(/Missing required parameters/); + done(); + }); + + test('should reject promise when required params are not given', done => { + const deleteSpeakerModelPromise = textToSpeechService.deleteSpeakerModel(); + expectToBePromise(deleteSpeakerModelPromise); + + deleteSpeakerModelPromise.catch(err => { + expect(err.message).toMatch(/Missing required parameters/); + done(); + }); + }); + }); + }); describe('deleteUserData', () => { describe('positive tests', () => { test('should pass the right params to createRequest', () => { diff --git a/text-to-speech/v1-generated.ts b/text-to-speech/v1-generated.ts index d08401c0da..854265f1ae 100644 --- a/text-to-speech/v1-generated.ts +++ b/text-to-speech/v1-generated.ts @@ -1,5 +1,5 @@ /** - * (C) Copyright IBM Corp. 2017, 2020. + * (C) Copyright IBM Corp. 2017, 2021. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,9 +15,9 @@ */ /** - * IBM OpenAPI SDK Code Generator Version: 99-SNAPSHOT-be3b4618-20201221-123327 + * IBM OpenAPI SDK Code Generator Version: 99-SNAPSHOT-902c9336-20210507-162723 */ - + import * as extend from 'extend'; import { IncomingHttpHeaders, OutgoingHttpHeaders } from 'http'; @@ -40,6 +40,10 @@ import { getSdkHeaders } from '../lib/common'; * translation is based on the SSML phoneme format for representing a word. You can specify a phonetic translation in * standard International Phonetic Alphabet (IPA) representation or in the proprietary IBM Symbolic Phonetic * Representation (SPR). The Arabic, Chinese, Dutch, Australian English, and Korean languages support only IPA. + * + * The service also offers a Tune by Example feature that lets you define custom prompts. You can also define speaker + * models to improve the quality of your custom prompts. The service support custom prompts only for US English custom + * models and voices. */ class TextToSpeechV1 extends BaseService { @@ -480,9 +484,9 @@ class TextToSpeechV1 extends BaseService { * List custom models. * * Lists metadata such as the name and description for all custom models that are owned by an instance of the service. - * Specify a language to list the custom models for that language only. To see the words in addition to the metadata - * for a specific custom model, use the **List a custom model** method. You must use credentials for the instance of - * the service that owns a model to list information about it. + * Specify a language to list the custom models for that language only. To see the words and prompts in addition to + * the metadata for a specific custom model, use the **Get a custom model** method. You must use credentials for the + * instance of the service that owns a model to list information about it. * * **See also:** [Querying all custom * models](https://cloud.ibm.com/docs/text-to-speech?topic=text-to-speech-customModels#cuModelsQueryAll). @@ -597,8 +601,9 @@ class TextToSpeechV1 extends BaseService { * Get a custom model. * * Gets all information about a specified custom model. In addition to metadata such as the name and description of - * the custom model, the output includes the words and their translations as defined in the model. To see just the - * metadata for a model, use the **List custom models** method. 
+ * the custom model, the output includes the words and their translations that are defined for the model, as well as + * any prompts that are defined for the model. To see just the metadata for a model, use the **List custom models** + * method. * * **See also:** [Querying a custom * model](https://cloud.ibm.com/docs/text-to-speech?topic=text-to-speech-customModels#cuModelsQuery). @@ -987,6 +992,544 @@ class TextToSpeechV1 extends BaseService { return this.createRequest(parameters); }; + /************************* + * customPrompts + ************************/ + + /** + * List custom prompts. + * + * Lists information about all custom prompts that are defined for a custom model. The information includes the prompt + * ID, prompt text, status, and optional speaker ID for each prompt of the custom model. You must use credentials for + * the instance of the service that owns the custom model. The same information about all of the prompts for a custom + * model is also provided by the **Get a custom model** method. That method provides complete details about a + * specified custom model, including its language, owner, custom words, and more. + * + * **Beta:** Custom prompts are beta functionality that is supported only for use with US English custom models and + * voices. + * + * **See also:** [Listing custom + * prompts](https://cloud.ibm.com/docs/text-to-speech?topic=text-to-speech-tbe-custom-prompts#tbe-custom-prompts-list). + * + * @param {Object} params - The parameters to send to the service. + * @param {string} params.customizationId - The customization ID (GUID) of the custom model. You must make the request + * with credentials for the instance of the service that owns the custom model. + * @param {OutgoingHttpHeaders} [params.headers] - Custom request headers + * @returns {Promise>} + */ + public listCustomPrompts(params: TextToSpeechV1.ListCustomPromptsParams): Promise> { + const _params = Object.assign({}, params); + const requiredParams = ['customizationId']; + + const missingParams = getMissingParams(_params, requiredParams); + if (missingParams) { + return Promise.reject(missingParams); + } + + const path = { + 'customization_id': _params.customizationId + }; + + const sdkHeaders = getSdkHeaders(TextToSpeechV1.DEFAULT_SERVICE_NAME, 'v1', 'listCustomPrompts'); + + const parameters = { + options: { + url: '/v1/customizations/{customization_id}/prompts', + method: 'GET', + path, + }, + defaultOptions: extend(true, {}, this.baseOptions, { + headers: extend(true, sdkHeaders, { + 'Accept': 'application/json', + }, _params.headers), + }), + }; + + return this.createRequest(parameters); + }; + + /** + * Add a custom prompt. + * + * Adds a custom prompt to a custom model. A prompt is defined by the text that is to be spoken, the audio for that + * text, a unique user-specified ID for the prompt, and an optional speaker ID. The information is used to generate + * prosodic data that is not visible to the user. This data is used by the service to produce the synthesized audio + * upon request. You must use credentials for the instance of the service that owns a custom model to add a prompt to + * it. You can add a maximum of 1000 custom prompts to a single custom model. + * + * You are recommended to assign meaningful values for prompt IDs. For example, use `goodbye` to identify a prompt + * that speaks a farewell message. Prompt IDs must be unique within a given custom model. You cannot define two + * prompts with the same name for the same custom model. 
If you provide the ID of an existing prompt, the previously + * uploaded prompt is replaced by the new information. The existing prompt is reprocessed by using the new text and + * audio and, if provided, new speaker model, and the prosody data associated with the prompt is updated. + * + * The quality of a prompt is undefined if the language of a prompt does not match the language of its custom model. + * This is consistent with any text or SSML that is specified for a speech synthesis request. The service makes a + * best-effort attempt to render the specified text for the prompt; it does not validate that the language of the text + * matches the language of the model. + * + * Adding a prompt is an asynchronous operation. Although it accepts less audio than speaker enrollment, the service + * must align the audio with the provided text. The time that it takes to process a prompt depends on the prompt + * itself. The processing time for a reasonably sized prompt generally matches the length of the audio (for example, + * it takes 20 seconds to process a 20-second prompt). + * + * For shorter prompts, you can wait for a reasonable amount of time and then check the status of the prompt with the + * **Get a custom prompt** method. For longer prompts, consider using that method to poll the service every few + * seconds to determine when the prompt becomes available. No prompt can be used for speech synthesis if it is in the + * `processing` or `failed` state. Only prompts that are in the `available` state can be used for speech synthesis. + * + * When it processes a request, the service attempts to align the text and the audio that are provided for the prompt. + * The text that is passed with a prompt must match the spoken audio as closely as possible. Optimally, the text and + * audio match exactly. The service does its best to align the specified text with the audio, and it can often + * compensate for mismatches between the two. But if the service cannot effectively align the text and the audio, + * possibly because the magnitude of mismatches between the two is too great, processing of the prompt fails. + * + * ### Evaluating a prompt + * + * Always listen to and evaluate a prompt to determine its quality before using it in production. To evaluate a + * prompt, include only the single prompt in a speech synthesis request by using the following SSML extension, in this + * case for a prompt whose ID is `goodbye`: + * + * `` + * + * In some cases, you might need to rerecord and resubmit a prompt as many as five times to address the following + * possible problems: + * * The service might fail to detect a mismatch between the prompt’s text and audio. The longer the prompt, the + * greater the chance for misalignment between its text and audio. Therefore, multiple shorter prompts are preferable + * to a single long prompt. + * * The text of a prompt might include a word that the service does not recognize. In this case, you can create a + * custom word and pronunciation pair to tell the service how to pronounce the word. You must then re-create the + * prompt. + * * The quality of the input audio might be insufficient or the service’s processing of the audio might fail to + * detect the intended prosody. Submitting new audio for the prompt can correct these issues. 
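// A sketch of the evaluation step described above: synthesize only the single prompt by referencing its
// ID from SSML and listen to the result before using it in production. It assumes an authenticated
// textToSpeechService client (constructed as in the earlier sketch) and placeholder IDs; the
// <ibm:prompt> element follows the Tune by Example documentation, so confirm the exact markup there.
import * as fs from 'fs';

textToSpeechService
  .synthesize({
    text: '<ibm:prompt id="goodbye"/>', // evaluate the prompt whose ID is `goodbye`
    voice: 'en-US_MichaelV3Voice', // prompts are supported only for US English voices
    customizationId: '{customizationId}', // the custom model that contains the prompt
    accept: 'audio/wav',
  })
  .then(res => {
    // In Node, the result is a readable audio stream that can be written to a file for review.
    (res.result as NodeJS.ReadableStream).pipe(fs.createWriteStream('goodbye-evaluation.wav'));
  })
  .catch(err => console.warn(err));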
+ * + * If a prompt that is created without a speaker ID does not adequately reflect the intended prosody, enrolling the + * speaker and providing a speaker ID for the prompt is one recommended means of potentially improving the quality of + * the prompt. This is especially important for shorter prompts such as "good-bye" or "thank you," where less audio + * data makes it more difficult to match the prosody of the speaker. + * + * **Beta:** Custom prompts are beta functionality that is supported only for use with US English custom models and + * voices. + * + * **See also:** + * * [Add a custom + * prompt](https://cloud.ibm.com/docs/text-to-speech?topic=text-to-speech-tbe-create#tbe-create-add-prompt) + * * [Evaluate a custom + * prompt](https://cloud.ibm.com/docs/text-to-speech?topic=text-to-speech-tbe-create#tbe-create-evaluate-prompt) + * * [Rules for creating custom + * prompts](https://cloud.ibm.com/docs/text-to-speech?topic=text-to-speech-tbe-rules#tbe-rules-prompts). + * + * @param {Object} params - The parameters to send to the service. + * @param {string} params.customizationId - The customization ID (GUID) of the custom model. You must make the request + * with credentials for the instance of the service that owns the custom model. + * @param {string} params.promptId - The identifier of the prompt that is to be added to the custom model: + * * Include a maximum of 49 characters in the ID. + * * Include only alphanumeric characters and `_` (underscores) in the ID. + * * Do not include XML sensitive characters (double quotes, single quotes, ampersands, angle brackets, and slashes) + * in the ID. + * * To add a new prompt, the ID must be unique for the specified custom model. Otherwise, the new information for the + * prompt overwrites the existing prompt that has that ID. + * @param {PromptMetadata} params.metadata - Information about the prompt that is to be added to a custom model. The + * following example of a `PromptMetadata` object includes both the required prompt text and an optional speaker model + * ID: + * + * `{ "prompt_text": "Thank you and good-bye!", "speaker_id": "823068b2-ed4e-11ea-b6e0-7b6456aa95cc" }`. + * @param {NodeJS.ReadableStream|Buffer} params.file - An audio file that speaks the text of the prompt with + * intonation and prosody that matches how you would like the prompt to be spoken. + * * The prompt audio must be in WAV format and must have a minimum sampling rate of 16 kHz. The service accepts audio + * with higher sampling rates. The service transcodes all audio to 16 kHz before processing it. + * * The length of the prompt audio is limited to 30 seconds. + * @param {string} params.filename - The filename for file. 
+ * @param {OutgoingHttpHeaders} [params.headers] - Custom request headers + * @returns {Promise>} + */ + public addCustomPrompt(params: TextToSpeechV1.AddCustomPromptParams): Promise> { + const _params = Object.assign({}, params); + const requiredParams = ['customizationId', 'promptId', 'metadata', 'file', 'filename']; + + const missingParams = getMissingParams(_params, requiredParams); + if (missingParams) { + return Promise.reject(missingParams); + } + + const formData = { + 'metadata': _params.metadata, + 'file': { + data: _params.file, + filename: _params.filename, + contentType: 'audio/wav' + } + }; + + const path = { + 'customization_id': _params.customizationId, + 'prompt_id': _params.promptId + }; + + const sdkHeaders = getSdkHeaders(TextToSpeechV1.DEFAULT_SERVICE_NAME, 'v1', 'addCustomPrompt'); + + const parameters = { + options: { + url: '/v1/customizations/{customization_id}/prompts/{prompt_id}', + method: 'POST', + path, + formData + }, + defaultOptions: extend(true, {}, this.baseOptions, { + headers: extend(true, sdkHeaders, { + 'Accept': 'application/json', + 'Content-Type': 'multipart/form-data', + }, _params.headers), + }), + }; + + return this.createRequest(parameters); + }; + + /** + * Get a custom prompt. + * + * Gets information about a specified custom prompt for a specified custom model. The information includes the prompt + * ID, prompt text, status, and optional speaker ID for each prompt of the custom model. You must use credentials for + * the instance of the service that owns the custom model. + * + * **Beta:** Custom prompts are beta functionality that is supported only for use with US English custom models and + * voices. + * + * **See also:** [Listing custom + * prompts](https://cloud.ibm.com/docs/text-to-speech?topic=text-to-speech-tbe-custom-prompts#tbe-custom-prompts-list). + * + * @param {Object} params - The parameters to send to the service. + * @param {string} params.customizationId - The customization ID (GUID) of the custom model. You must make the request + * with credentials for the instance of the service that owns the custom model. + * @param {string} params.promptId - The identifier (name) of the prompt. + * @param {OutgoingHttpHeaders} [params.headers] - Custom request headers + * @returns {Promise>} + */ + public getCustomPrompt(params: TextToSpeechV1.GetCustomPromptParams): Promise> { + const _params = Object.assign({}, params); + const requiredParams = ['customizationId', 'promptId']; + + const missingParams = getMissingParams(_params, requiredParams); + if (missingParams) { + return Promise.reject(missingParams); + } + + const path = { + 'customization_id': _params.customizationId, + 'prompt_id': _params.promptId + }; + + const sdkHeaders = getSdkHeaders(TextToSpeechV1.DEFAULT_SERVICE_NAME, 'v1', 'getCustomPrompt'); + + const parameters = { + options: { + url: '/v1/customizations/{customization_id}/prompts/{prompt_id}', + method: 'GET', + path, + }, + defaultOptions: extend(true, {}, this.baseOptions, { + headers: extend(true, sdkHeaders, { + 'Accept': 'application/json', + }, _params.headers), + }), + }; + + return this.createRequest(parameters); + }; + + /** + * Delete a custom prompt. + * + * Deletes an existing custom prompt from a custom model. The service deletes the prompt with the specified ID. You + * must use credentials for the instance of the service that owns the custom model from which the prompt is to be + * deleted. 
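// A sketch of adding a prompt and polling until processing completes, per the guidance above. It assumes
// an authenticated textToSpeechService client; the customization ID, speaker ID, audio file, and prompt
// text are placeholders, and speaker_id can be omitted if no speaker model has been created.
import * as fs from 'fs';

async function addAndAwaitPrompt(): Promise<void> {
  await textToSpeechService.addCustomPrompt({
    customizationId: '{customizationId}',
    promptId: 'goodbye',
    metadata: {
      prompt_text: 'Thank you and good-bye!',
      speaker_id: '{speakerId}', // optional speaker model to improve prosody
    },
    file: fs.createReadStream('./goodbye.wav'),
    filename: 'goodbye.wav',
  });

  // Adding a prompt is asynchronous; poll every few seconds until the prompt leaves `processing`.
  let status = 'processing';
  while (status === 'processing') {
    await new Promise<void>(resolve => setTimeout(resolve, 5000));
    const res = await textToSpeechService.getCustomPrompt({
      customizationId: '{customizationId}',
      promptId: 'goodbye',
    });
    status = res.result.status;
  }
  // `available` means the prompt can be used for synthesis; `failed` includes an error field.
  console.log(`Prompt is ${status}`);
}

addAndAwaitPrompt().catch(err => console.warn(err));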
+ * + * **Caution:** Deleting a custom prompt elicits a 400 response code from synthesis requests that attempt to use the + * prompt. Make sure that you do not attempt to use a deleted prompt in a production application. + * + * **Beta:** Custom prompts are beta functionality that is supported only for use with US English custom models and + * voices. + * + * **See also:** [Deleting a custom + * prompt](https://cloud.ibm.com/docs/text-to-speech?topic=text-to-speech-tbe-custom-prompts#tbe-custom-prompts-delete). + * + * @param {Object} params - The parameters to send to the service. + * @param {string} params.customizationId - The customization ID (GUID) of the custom model. You must make the request + * with credentials for the instance of the service that owns the custom model. + * @param {string} params.promptId - The identifier (name) of the prompt that is to be deleted. + * @param {OutgoingHttpHeaders} [params.headers] - Custom request headers + * @returns {Promise>} + */ + public deleteCustomPrompt(params: TextToSpeechV1.DeleteCustomPromptParams): Promise> { + const _params = Object.assign({}, params); + const requiredParams = ['customizationId', 'promptId']; + + const missingParams = getMissingParams(_params, requiredParams); + if (missingParams) { + return Promise.reject(missingParams); + } + + const path = { + 'customization_id': _params.customizationId, + 'prompt_id': _params.promptId + }; + + const sdkHeaders = getSdkHeaders(TextToSpeechV1.DEFAULT_SERVICE_NAME, 'v1', 'deleteCustomPrompt'); + + const parameters = { + options: { + url: '/v1/customizations/{customization_id}/prompts/{prompt_id}', + method: 'DELETE', + path, + }, + defaultOptions: extend(true, {}, this.baseOptions, { + headers: extend(true, sdkHeaders, { + }, _params.headers), + }), + }; + + return this.createRequest(parameters); + }; + + /************************* + * speakerModels + ************************/ + + /** + * List speaker models. + * + * Lists information about all speaker models that are defined for a service instance. The information includes the + * speaker ID and speaker name of each defined speaker. You must use credentials for the instance of a service to list + * its speakers. + * + * **Beta:** Speaker models and the custom prompts with which they are used are beta functionality that is supported + * only for use with US English custom models and voices. + * + * **See also:** [Listing speaker + * models](https://cloud.ibm.com/docs/text-to-speech?topic=text-to-speech-tbe-speaker-models#tbe-speaker-models-list). + * + * @param {Object} [params] - The parameters to send to the service. + * @param {OutgoingHttpHeaders} [params.headers] - Custom request headers + * @returns {Promise>} + */ + public listSpeakerModels(params?: TextToSpeechV1.ListSpeakerModelsParams): Promise> { + const _params = Object.assign({}, params); + + const sdkHeaders = getSdkHeaders(TextToSpeechV1.DEFAULT_SERVICE_NAME, 'v1', 'listSpeakerModels'); + + const parameters = { + options: { + url: '/v1/speakers', + method: 'GET', + }, + defaultOptions: extend(true, {}, this.baseOptions, { + headers: extend(true, sdkHeaders, { + 'Accept': 'application/json', + }, _params.headers), + }), + }; + + return this.createRequest(parameters); + }; + + /** + * Create a speaker model. + * + * Creates a new speaker model, which is an optional enrollment token for users who are to add prompts to custom + * models. A speaker model contains information about a user's voice. 
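// A sketch that removes a prompt that is no longer needed and then lists the speaker models defined for
// the service instance. It assumes an authenticated textToSpeechService client and placeholder IDs; per
// the caution above, make sure no production synthesis request still references the deleted prompt.
async function deletePromptAndListSpeakers(): Promise<void> {
  await textToSpeechService.deleteCustomPrompt({
    customizationId: '{customizationId}',
    promptId: 'goodbye',
  });

  // listSpeakerModels takes no required parameters and returns the speakers for the whole instance.
  const res = await textToSpeechService.listSpeakerModels();
  res.result.speakers.forEach(speaker => {
    console.log(`${speaker.name}: ${speaker.speaker_id}`);
  });
}

deletePromptAndListSpeakers().catch(err => console.warn(err));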
The service extracts this information from a WAV + * audio sample that you pass as the body of the request. Associating a speaker model with a prompt is optional, but + * the information that is extracted from the speaker model helps the service learn about the speaker's voice. + * + * A speaker model can make an appreciable difference in the quality of prompts, especially short prompts with + * relatively little audio, that are associated with that speaker. A speaker model can help the service produce a + * prompt with more confidence; the lack of a speaker model can potentially compromise the quality of a prompt. + * + * The gender of the speaker who creates a speaker model does not need to match the gender of a voice that is used + * with prompts that are associated with that speaker model. For example, a speaker model that is created by a male + * speaker can be associated with prompts that are spoken by female voices. + * + * You create a speaker model for a given instance of the service. The new speaker model is owned by the service + * instance whose credentials are used to create it. That same speaker can then be used to create prompts for all + * custom models within that service instance. No language is associated with a speaker model, but each custom model + * has a single specified language. You can add prompts only to US English models. + * + * You specify a name for the speaker when you create it. The name must be unique among all speaker names for the + * owning service instance. To re-create a speaker model for an existing speaker name, you must first delete the + * existing speaker model that has that name. + * + * Speaker enrollment is a synchronous operation. Although it accepts more audio data than a prompt, the process of + * adding a speaker is very fast. The service simply extracts information about the speaker’s voice from the audio. + * Unlike prompts, speaker models neither need nor accept a transcription of the audio. When the call returns, the + * audio is fully processed and the speaker enrollment is complete. + * + * The service returns a speaker ID in its response to the request. A speaker ID is a globally unique identifier (GUID) that you use + * to identify the speaker in subsequent requests to the service. + * + * **Beta:** Speaker models and the custom prompts with which they are used are beta functionality that is supported + * only for use with US English custom models and voices. + * + * **See also:** + * * [Create a speaker + * model](https://cloud.ibm.com/docs/text-to-speech?topic=text-to-speech-tbe-create#tbe-create-speaker-model) + * * [Rules for creating speaker + * models](https://cloud.ibm.com/docs/text-to-speech?topic=text-to-speech-tbe-rules#tbe-rules-speakers). + * + * @param {Object} params - The parameters to send to the service. + * @param {string} params.speakerName - The name of the speaker that is to be added to the service instance. + * * Include a maximum of 49 characters in the name. + * * Include only alphanumeric characters and `_` (underscores) in the name. + * * Do not include XML sensitive characters (double quotes, single quotes, ampersands, angle brackets, and slashes) + * in the name. + * * Do not use the name of an existing speaker that is already defined for the service instance. + * @param {NodeJS.ReadableStream|Buffer} params.audio - An enrollment audio file that contains a sample of the + * speaker’s voice. + * * The enrollment audio must be in WAV format and must have a minimum sampling rate of 16 kHz.
The service accepts + * audio with higher sampling rates. It transcodes all audio to 16 kHz before processing it. + * * The length of the enrollment audio is limited to 1 minute. Speaking one or two paragraphs of text that include + * five to ten sentences is recommended. + * @param {OutgoingHttpHeaders} [params.headers] - Custom request headers + * @returns {Promise>} + */ + public createSpeakerModel(params: TextToSpeechV1.CreateSpeakerModelParams): Promise> { + const _params = Object.assign({}, params); + const requiredParams = ['speakerName', 'audio']; + + const missingParams = getMissingParams(_params, requiredParams); + if (missingParams) { + return Promise.reject(missingParams); + } + + const body = _params.audio; + const query = { + 'speaker_name': _params.speakerName + }; + + const sdkHeaders = getSdkHeaders(TextToSpeechV1.DEFAULT_SERVICE_NAME, 'v1', 'createSpeakerModel'); + + const parameters = { + options: { + url: '/v1/speakers', + method: 'POST', + body, + qs: query, + }, + defaultOptions: extend(true, {}, this.baseOptions, { + headers: extend(true, sdkHeaders, { + 'Accept': 'application/json', + 'Content-Type': 'audio/wav', + }, _params.headers), + }), + }; + + return this.createRequest(parameters); + }; + + /** + * Get a speaker model. + * + * Gets information about all prompts that are defined by a specified speaker for all custom models that are owned by + * a service instance. The information is grouped by the customization IDs of the custom models. For each custom + * model, the information lists information about each prompt that is defined for that custom model by the speaker. + * You must use credentials for the instance of the service that owns a speaker model to list its prompts. + * + * **Beta:** Speaker models and the custom prompts with which they are used are beta functionality that is supported + * only for use with US English custom models and voices. + * + * **See also:** [Listing the custom prompts for a speaker + * model](https://cloud.ibm.com/docs/text-to-speech?topic=text-to-speech-tbe-speaker-models#tbe-speaker-models-list-prompts). + * + * @param {Object} params - The parameters to send to the service. + * @param {string} params.speakerId - The speaker ID (GUID) of the speaker model. You must make the request with + * service credentials for the instance of the service that owns the speaker model. + * @param {OutgoingHttpHeaders} [params.headers] - Custom request headers + * @returns {Promise>} + */ + public getSpeakerModel(params: TextToSpeechV1.GetSpeakerModelParams): Promise> { + const _params = Object.assign({}, params); + const requiredParams = ['speakerId']; + + const missingParams = getMissingParams(_params, requiredParams); + if (missingParams) { + return Promise.reject(missingParams); + } + + const path = { + 'speaker_id': _params.speakerId + }; + + const sdkHeaders = getSdkHeaders(TextToSpeechV1.DEFAULT_SERVICE_NAME, 'v1', 'getSpeakerModel'); + + const parameters = { + options: { + url: '/v1/speakers/{speaker_id}', + method: 'GET', + path, + }, + defaultOptions: extend(true, {}, this.baseOptions, { + headers: extend(true, sdkHeaders, { + 'Accept': 'application/json', + }, _params.headers), + }), + }; + + return this.createRequest(parameters); + }; + + /** + * Delete a speaker model. + * + * Deletes an existing speaker model from the service instance. The service deletes the enrolled speaker with the + * specified speaker ID. You must use credentials for the instance of the service that owns a speaker model to delete + * the speaker. 
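// A sketch of enrolling a speaker and then listing the prompts that the speaker has defined across custom
// models. It assumes an authenticated textToSpeechService client; the speaker name and enrollment audio
// are placeholders. Enrollment is synchronous, so the returned speaker ID is usable immediately.
import * as fs from 'fs';

async function enrollSpeaker(): Promise<void> {
  const createRes = await textToSpeechService.createSpeakerModel({
    speakerName: 'agent_rachel', // alphanumeric characters and underscores only, up to 49 characters
    audio: fs.createReadStream('./rachel-enrollment.wav'), // WAV, 16 kHz or higher, at most 1 minute
  });
  const speakerId = createRes.result.speaker_id;
  console.log(`Created speaker model ${speakerId}`);

  // The result groups the speaker's prompts by the customization ID of each custom model.
  const getRes = await textToSpeechService.getSpeakerModel({ speakerId });
  getRes.result.customizations.forEach(model => {
    model.prompts.forEach(p => {
      console.log(`${model.customization_id}/${p.prompt_id}: ${p.status}`);
    });
  });

  // When the speaker is no longer needed, it can be removed; existing prompts keep their prosodic data.
  // await textToSpeechService.deleteSpeakerModel({ speakerId });
}

enrollSpeaker().catch(err => console.warn(err));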
+ * + * Any prompts that are associated with the deleted speaker are not affected by the speaker's deletion. The prosodic + * data that defines the quality of a prompt is established when the prompt is created. A prompt is static and remains + * unaffected by deletion of its associated speaker. However, the prompt cannot be resubmitted or updated with its + * original speaker once that speaker is deleted. + * + * **Beta:** Speaker models and the custom prompts with which they are used are beta functionality that is supported + * only for use with US English custom models and voices. + * + * **See also:** [Deleting a speaker + * model](https://cloud.ibm.com/docs/text-to-speech?topic=text-to-speech-tbe-speaker-models#tbe-speaker-models-delete). + * + * @param {Object} params - The parameters to send to the service. + * @param {string} params.speakerId - The speaker ID (GUID) of the speaker model. You must make the request with + * service credentials for the instance of the service that owns the speaker model. + * @param {OutgoingHttpHeaders} [params.headers] - Custom request headers + * @returns {Promise>} + */ + public deleteSpeakerModel(params: TextToSpeechV1.DeleteSpeakerModelParams): Promise> { + const _params = Object.assign({}, params); + const requiredParams = ['speakerId']; + + const missingParams = getMissingParams(_params, requiredParams); + if (missingParams) { + return Promise.reject(missingParams); + } + + const path = { + 'speaker_id': _params.speakerId + }; + + const sdkHeaders = getSdkHeaders(TextToSpeechV1.DEFAULT_SERVICE_NAME, 'v1', 'deleteSpeakerModel'); + + const parameters = { + options: { + url: '/v1/speakers/{speaker_id}', + method: 'DELETE', + path, + }, + defaultOptions: extend(true, {}, this.baseOptions, { + headers: extend(true, sdkHeaders, { + }, _params.headers), + }), + }; + + return this.createRequest(parameters); + }; + /************************* * userData ************************/ @@ -1127,6 +1670,7 @@ namespace TextToSpeechV1 { ES_LA_SOFIAV3VOICE = 'es-LA_SofiaV3Voice', ES_US_SOFIAVOICE = 'es-US_SofiaVoice', ES_US_SOFIAV3VOICE = 'es-US_SofiaV3Voice', + FR_CA_LOUISEV3VOICE = 'fr-CA_LouiseV3Voice', FR_FR_NICOLASV3VOICE = 'fr-FR_NicolasV3Voice', FR_FR_RENEEVOICE = 'fr-FR_ReneeVoice', FR_FR_RENEEV3VOICE = 'fr-FR_ReneeV3Voice', @@ -1221,6 +1765,7 @@ namespace TextToSpeechV1 { ES_LA_SOFIAV3VOICE = 'es-LA_SofiaV3Voice', ES_US_SOFIAVOICE = 'es-US_SofiaVoice', ES_US_SOFIAV3VOICE = 'es-US_SofiaV3Voice', + FR_CA_LOUISEV3VOICE = 'fr-CA_LouiseV3Voice', FR_FR_NICOLASV3VOICE = 'fr-FR_NicolasV3Voice', FR_FR_RENEEVOICE = 'fr-FR_ReneeVoice', FR_FR_RENEEV3VOICE = 'fr-FR_ReneeV3Voice', @@ -1300,6 +1845,7 @@ namespace TextToSpeechV1 { ES_LA_SOFIAV3VOICE = 'es-LA_SofiaV3Voice', ES_US_SOFIAVOICE = 'es-US_SofiaVoice', ES_US_SOFIAV3VOICE = 'es-US_SofiaV3Voice', + FR_CA_LOUISEV3VOICE = 'fr-CA_LouiseV3Voice', FR_FR_NICOLASV3VOICE = 'fr-FR_NicolasV3Voice', FR_FR_RENEEVOICE = 'fr-FR_ReneeVoice', FR_FR_RENEEV3VOICE = 'fr-FR_ReneeV3Voice', @@ -1352,6 +1898,7 @@ namespace TextToSpeechV1 { ES_ES = 'es-ES', ES_LA = 'es-LA', ES_US = 'es-US', + FR_CA = 'fr-CA', FR_FR = 'fr-FR', IT_IT = 'it-IT', JA_JP = 'ja-JP', @@ -1383,6 +1930,7 @@ namespace TextToSpeechV1 { ES_ES = 'es-ES', ES_LA = 'es-LA', ES_US = 'es-US', + FR_CA = 'fr-CA', FR_FR = 'fr-FR', IT_IT = 'it-IT', JA_JP = 'ja-JP', @@ -1524,6 +2072,113 @@ namespace TextToSpeechV1 { headers?: OutgoingHttpHeaders; } + /** Parameters for the `listCustomPrompts` operation. 
*/ + export interface ListCustomPromptsParams { + /** The customization ID (GUID) of the custom model. You must make the request with credentials for the instance + * of the service that owns the custom model. + */ + customizationId: string; + headers?: OutgoingHttpHeaders; + } + + /** Parameters for the `addCustomPrompt` operation. */ + export interface AddCustomPromptParams { + /** The customization ID (GUID) of the custom model. You must make the request with credentials for the instance + * of the service that owns the custom model. + */ + customizationId: string; + /** The identifier of the prompt that is to be added to the custom model: + * * Include a maximum of 49 characters in the ID. + * * Include only alphanumeric characters and `_` (underscores) in the ID. + * * Do not include XML sensitive characters (double quotes, single quotes, ampersands, angle brackets, and + * slashes) in the ID. + * * To add a new prompt, the ID must be unique for the specified custom model. Otherwise, the new information for + * the prompt overwrites the existing prompt that has that ID. + */ + promptId: string; + /** Information about the prompt that is to be added to a custom model. The following example of a + * `PromptMetadata` object includes both the required prompt text and an optional speaker model ID: + * + * `{ "prompt_text": "Thank you and good-bye!", "speaker_id": "823068b2-ed4e-11ea-b6e0-7b6456aa95cc" }`. + */ + metadata: PromptMetadata; + /** An audio file that speaks the text of the prompt with intonation and prosody that matches how you would like + * the prompt to be spoken. + * * The prompt audio must be in WAV format and must have a minimum sampling rate of 16 kHz. The service accepts + * audio with higher sampling rates. The service transcodes all audio to 16 kHz before processing it. + * * The length of the prompt audio is limited to 30 seconds. + */ + file: NodeJS.ReadableStream|Buffer; + /** The filename for file. */ + filename: string; + headers?: OutgoingHttpHeaders; + } + + /** Parameters for the `getCustomPrompt` operation. */ + export interface GetCustomPromptParams { + /** The customization ID (GUID) of the custom model. You must make the request with credentials for the instance + * of the service that owns the custom model. + */ + customizationId: string; + /** The identifier (name) of the prompt. */ + promptId: string; + headers?: OutgoingHttpHeaders; + } + + /** Parameters for the `deleteCustomPrompt` operation. */ + export interface DeleteCustomPromptParams { + /** The customization ID (GUID) of the custom model. You must make the request with credentials for the instance + * of the service that owns the custom model. + */ + customizationId: string; + /** The identifier (name) of the prompt that is to be deleted. */ + promptId: string; + headers?: OutgoingHttpHeaders; + } + + /** Parameters for the `listSpeakerModels` operation. */ + export interface ListSpeakerModelsParams { + headers?: OutgoingHttpHeaders; + } + + /** Parameters for the `createSpeakerModel` operation. */ + export interface CreateSpeakerModelParams { + /** The name of the speaker that is to be added to the service instance. + * * Include a maximum of 49 characters in the name. + * * Include only alphanumeric characters and `_` (underscores) in the name. + * * Do not include XML sensitive characters (double quotes, single quotes, ampersands, angle brackets, and + * slashes) in the name. + * * Do not use the name of an existing speaker that is already defined for the service instance. 
+ */ + speakerName: string; + /** An enrollment audio file that contains a sample of the speaker’s voice. + * * The enrollment audio must be in WAV format and must have a minimum sampling rate of 16 kHz. The service + * accepts audio with higher sampling rates. It transcodes all audio to 16 kHz before processing it. + * * The length of the enrollment audio is limited to 1 minute. Speaking one or two paragraphs of text that include + * five to ten sentences is recommended. + */ + audio: NodeJS.ReadableStream|Buffer; + headers?: OutgoingHttpHeaders; + } + + /** Parameters for the `getSpeakerModel` operation. */ + export interface GetSpeakerModelParams { + /** The speaker ID (GUID) of the speaker model. You must make the request with service credentials for the + * instance of the service that owns the speaker model. + */ + speakerId: string; + headers?: OutgoingHttpHeaders; + } + + /** Parameters for the `deleteSpeakerModel` operation. */ + export interface DeleteSpeakerModelParams { + /** The speaker ID (GUID) of the speaker model. You must make the request with service credentials for the + * instance of the service that owns the speaker model. + */ + speakerId: string; + headers?: OutgoingHttpHeaders; + } + /** Parameters for the `deleteUserData` operation. */ export interface DeleteUserDataParams { /** The customer ID for which all data is to be deleted. */ @@ -1559,11 +2214,15 @@ namespace TextToSpeechV1 { /** The description of the custom model. */ description?: string; /** An array of `Word` objects that lists the words and their translations from the custom model. The words are - * listed in alphabetical order, with uppercase letters listed before lowercase letters. The array is empty if the - * custom model contains no words. This field is returned only by the **Get a voice** method and only when you - * specify the customization ID of a custom model. + * listed in alphabetical order, with uppercase letters listed before lowercase letters. The array is empty if no + * words are defined for the custom model. This field is returned only by the **Get a custom model** method. */ words?: Word[]; + /** An array of `Prompt` objects that provides information about the prompts that are defined for the specified + * custom model. The array is empty if no prompts are defined for the custom model. This field is returned only by + * the **Get a custom model** method. + */ + prompts?: Prompt[]; } /** Information about existing custom models. */ @@ -1575,6 +2234,52 @@ namespace TextToSpeechV1 { customizations: CustomModel[]; } + /** Information about a custom prompt. */ + export interface Prompt { + /** The user-specified text of the prompt. */ + prompt: string; + /** The user-specified identifier (name) of the prompt. */ + prompt_id: string; + /** The status of the prompt: + * * `processing`: The service received the request to add the prompt and is analyzing the validity of the prompt. + * * `available`: The service successfully validated the prompt, which is now ready for use in a speech synthesis + * request. + * * `failed`: The service's validation of the prompt failed. The status of the prompt includes an `error` field + * that describes the reason for the failure. + */ + status: string; + /** If the status of the prompt is `failed`, an error message that describes the reason for the failure. The + * field is omitted if no error occurred. + */ + error?: string; + /** The speaker ID (GUID) of the speaker for which the prompt was defined. 
The field is omitted if no speaker ID + * was specified. */ + speaker_id?: string; + } + + /** Information about the prompt that is to be added to a custom model. The following example of a `PromptMetadata` object includes both the required prompt text and an optional speaker model ID: `{ "prompt_text": "Thank you and good-bye!", "speaker_id": "823068b2-ed4e-11ea-b6e0-7b6456aa95cc" }`. */ + export interface PromptMetadata { + /** The required written text of the spoken prompt. The length of a prompt's text is limited to a few sentences. + * Speaking one or two sentences of text is the recommended limit. A prompt cannot contain more than 1000 + * characters of text. Escape any XML control characters (double quotes, single quotes, ampersands, angle brackets, + * and slashes) that appear in the text of the prompt. + */ + prompt_text: string; + /** The optional speaker ID (GUID) of a previously defined speaker model that is to be associated with the + * prompt. + */ + speaker_id?: string; + } + + /** Information about the custom prompts that are defined for a custom model. */ + export interface Prompts { + /** An array of `Prompt` objects that provides information about the prompts that are defined for the specified + * custom model. The array is empty if no prompts are defined for the custom model. + */ + prompts: Prompt[]; + } + /** The pronunciation of the specified text. */ export interface Pronunciation { /** The pronunciation of the specified text in the requested voice and format. If a custom model is specified, @@ -1583,6 +2288,67 @@ namespace TextToSpeechV1 { pronunciation: string; } + /** Information about a speaker model. */ + export interface Speaker { + /** The speaker ID (GUID) of the speaker. */ + speaker_id: string; + /** The user-defined name of the speaker. */ + name: string; + } + + /** A custom model for which the speaker has defined prompts. */ + export interface SpeakerCustomModel { + /** The customization ID (GUID) of a custom model for which the speaker has defined one or more prompts. */ + customization_id: string; + /** An array of `SpeakerPrompt` objects that provides information about each prompt that the user has defined + * for the custom model. + */ + prompts: SpeakerPrompt[]; + } + + /** Custom models for which the speaker has defined prompts. */ + export interface SpeakerCustomModels { + /** An array of `SpeakerCustomModel` objects. Each object provides information about the prompts that are + * defined for a specified speaker in the custom models that are owned by a specified service instance. The array + * is empty if no prompts are defined for the speaker. + */ + customizations: SpeakerCustomModel[]; + } + + /** The speaker ID of the speaker model. */ + export interface SpeakerModel { + /** The speaker ID (GUID) of the speaker model. */ + speaker_id: string; + } + + /** A prompt that a speaker has defined for a custom model. */ + export interface SpeakerPrompt { + /** The user-specified text of the prompt. */ + prompt: string; + /** The user-specified identifier (name) of the prompt. */ + prompt_id: string; + /** The status of the prompt: + * * `processing`: The service received the request to add the prompt and is analyzing the validity of the prompt. + * * `available`: The service successfully validated the prompt, which is now ready for use in a speech synthesis + * request. + * * `failed`: The service's validation of the prompt failed. The status of the prompt includes an `error` field + * that describes the reason for the failure.
+ */ + status: string; + /** If the status of the prompt is `failed`, an error message that describes the reason for the failure. The + * field is omitted if no error occurred. + */ + error?: string; + } + + /** Information about all speaker models for the service instance. */ + export interface Speakers { + /** An array of `Speaker` objects that provides information about the speakers for the service instance. The + * array is empty if the service instance has no speakers. + */ + speakers: Speaker[]; + } + /** Additional service features that are supported with the voice. */ export interface SupportedFeatures { /** If `true`, the voice can be customized; if `false`, the voice cannot be customized. (Same as