Skip to content

Commit 917ef20

Browse files
committed
fix(Speech to Text): Refactor the websocket method to better match the other SDKs
1 parent 92ce90f commit 917ef20

File tree

3 files changed

+35
-33
lines changed

3 files changed

+35
-33
lines changed

examples/speech_to_text_v1.rb

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,8 @@ def on_data(data:)
7272
File.open(Dir.getwd + "/resources/speech.wav") do |audio_file|
7373
speech_to_text.recognize_using_websocket(
7474
audio: audio_file,
75-
recognize_callback: mycallback
75+
recognize_callback: mycallback,
76+
content_type: "audio/wav"
7677
).start
7778
end
7879

@@ -83,7 +84,8 @@ def on_data(data:)
8384
chunk_data: true, # Tell the websocket object that audio will be given in chunks
8485
recognize_callback: mycallback,
8586
interim_results: true,
86-
inactivity_timeout: 3
87+
inactivity_timeout: 3,
88+
content_type: "audio/wav"
8789
)
8890
audio_file = File.open(Dir.getwd + "/resources/speech.wav")
8991
Thread.new do

lib/ibm_watson/speech_to_text_v1.rb

Lines changed: 26 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -426,17 +426,17 @@ def recognize(audio:, content_type:, model: nil, customization_id: nil, acoustic
426426
end
427427

428428
##
429-
# @!method recognize_using_websocket(audio: nil,chunk_data: false,content_type: "audio/l16; rate=44100",model: "en-US_BroadbandModel",recognize_callback: nil,customization_id: nil,acoustic_customization_id: nil,customization_weight: nil,version: nil,inactivity_timeout: 30,interim_results: false,keywords: nil,keywords_threshold: nil,max_alternatives: 1,word_alternatives_threshold: nil,word_confidence: false,timestamps: false,profanity_filter: nil,smart_formatting: false,speaker_labels: nil)
429+
# @!method recognize_using_websocket(content_type:,recognize_callback:,audio: nil,chunk_data: false,model: nil,customization_id: nil,acoustic_customization_id: nil,customization_weight: nil,base_model_version: nil,inactivity_timeout: nil,interim_results: nil,keywords: nil,keywords_threshold: nil,max_alternatives: nil,word_alternatives_threshold: nil,word_confidence: nil,timestamps: nil,profanity_filter: nil,smart_formatting: nil,speaker_labels: nil)
430430
# Sends audio for speech recognition using web sockets.
431+
# @param content_type [String] The type of the input: audio/basic, audio/flac, audio/l16, audio/mp3, audio/mpeg, audio/mulaw, audio/ogg, audio/ogg;codecs=opus, audio/ogg;codecs=vorbis, audio/wav, audio/webm, audio/webm;codecs=opus, audio/webm;codecs=vorbis, or multipart/form-data.
432+
# @param recognize_callback [RecognizeCallback] The instance handling events returned from the service.
431433
# @param audio [IO] Audio to transcribe in the format specified by the `content_type` parameter.
432434
# @param chunk_data [Boolean] If true, then the WebSocketClient will expect to receive data in chunks rather than as a single audio file
433-
# @param content_type [String] The type of the input: audio/basic, audio/flac, audio/l16, audio/mp3, audio/mpeg, audio/mulaw, audio/ogg, audio/ogg;codecs=opus, audio/ogg;codecs=vorbis, audio/wav, audio/webm, audio/webm;codecs=opus, audio/webm;codecs=vorbis, or multipart/form-data.
434435
# @param model [String] The identifier of the model to be used for the recognition request.
435-
# @param recognize_callback [RecognizeCallback] The instance handling events returned from the service.
436436
# @param customization_id [String] The GUID of a custom language model that is to be used with the request. The base model of the specified custom language model must match the model specified with the `model` parameter. You must make the request with service credentials created for the instance of the service that owns the custom model. By default, no custom language model is used.
437437
# @param acoustic_customization_id [String] The GUID of a custom acoustic model that is to be used with the request. The base model of the specified custom acoustic model must match the model specified with the `model` parameter. You must make the request with service credentials created for the instance of the service that owns the custom model. By default, no custom acoustic model is used.
438438
# @param customization_weight [Float] If you specify a `customization_id` with the request, you can use the `customization_weight` parameter to tell the service how much weight to give to words from the custom language model compared to those from the base model for speech recognition. Specify a value between 0.0 and 1.0. Unless a different customization weight was specified for the custom model when it was trained, the default value is 0.3. A customization weight that you specify overrides a weight that was specified when the custom model was trained. The default value yields the best performance in general. Assign a higher value if your audio makes frequent use of OOV words from the custom model. Use caution when setting the weight: a higher value can improve the accuracy of phrases from the custom model's domain, but it can negatively affect performance on non-domain phrases.
439-
# @param version [String] The version of the specified base `model` that is to be used for speech recognition. Multiple versions of a base model can exist when a model is updated for internal improvements. The parameter is intended primarily for use with custom models that have been upgraded for a new base model. The default value depends on whether the parameter is used with or without a custom model. For more information, see [Base model version](https://console.bluemix.net/docs/services/speech-to-text/input.html#version).
439+
# @param base_model_version [String] The version of the specified base `model` that is to be used for speech recognition. Multiple versions of a base model can exist when a model is updated for internal improvements. The parameter is intended primarily for use with custom models that have been upgraded for a new base model. The default value depends on whether the parameter is used with or without a custom model. For more information, see [Base model version](https://console.bluemix.net/docs/services/speech-to-text/input.html#version).
440440
# @param inactivity_timeout [Integer] The time in seconds after which, if only silence (no speech) is detected in submitted audio, the connection is closed with a 400 error. Useful for stopping audio submission from a live microphone when a user simply walks away. Use `-1` for infinity.
441441
# @param interim_results [Boolean] Send back non-final previews of each "sentence" as it is being processed. These results are ignored in text mode.
442442
# @param keywords [Array<String>] Array of keyword strings to spot in the audio. Each keyword string can include one or more tokens. Keywords are spotted only in the final hypothesis, not in interim results. If you specify any keywords, you must also specify a keywords threshold. Omit the parameter or specify an empty array if you do not need to spot keywords.
@@ -450,25 +450,25 @@ def recognize(audio:, content_type:, model: nil, customization_id: nil, acoustic
450450
# @param speaker_labels [Boolean] Indicates whether labels that identify which words were spoken by which participants in a multi-person exchange are to be included in the response. The default is `false`; no speaker labels are returned. Setting `speaker_labels` to `true` forces the `timestamps` parameter to be `true`, regardless of whether you specify `false` for the parameter. To determine whether a language model supports speaker labels, use the `GET /v1/models` method and check that the attribute `speaker_labels` is set to `true`. You can also refer to [Speaker labels](https://console.bluemix.net/docs/services/speech-to-text/output.html#speaker_labels).
451451
# @return [WebSocketClient] Returns a new WebSocketClient object
452452
def recognize_using_websocket(
453+
content_type:,
454+
recognize_callback:,
453455
audio: nil,
454456
chunk_data: false,
455-
content_type: "audio/l16; rate=44100",
456-
model: "en-US_BroadbandModel",
457-
recognize_callback: nil,
457+
model: nil,
458458
customization_id: nil,
459459
acoustic_customization_id: nil,
460460
customization_weight: nil,
461-
version: nil,
462-
inactivity_timeout: 30,
463-
interim_results: false,
461+
base_model_version: nil,
462+
inactivity_timeout: nil,
463+
interim_results: nil,
464464
keywords: nil,
465465
keywords_threshold: nil,
466-
max_alternatives: 1,
466+
max_alternatives: nil,
467467
word_alternatives_threshold: nil,
468-
word_confidence: false,
469-
timestamps: false,
468+
word_confidence: nil,
469+
timestamps: nil,
470470
profanity_filter: nil,
471-
smart_formatting: false,
471+
smart_formatting: nil,
472472
speaker_labels: nil
473473
)
474474
raise ArgumentError("Audio must be provided") if audio.nil? && !chunk_data
@@ -489,7 +489,7 @@ def recognize_using_websocket(
489489
"customization_id" => customization_id,
490490
"acoustic_customization_id" => acoustic_customization_id,
491491
"customization_weight" => customization_weight,
492-
"version" => version
492+
"base_model_version" => base_model_version
493493
}
494494
params.delete_if { |_, v| v.nil? }
495495
url += "/v1/recognize?" + HTTP::URI.form_encode(params)
@@ -514,25 +514,25 @@ def recognize_using_websocket(
514514
# :nocov:
515515
# @deprecated This method will be removed in the next major release. Use {#recognize_using_websocket} instead.
516516
def recognize_with_websocket(
517+
content_type:,
518+
recognize_callback:,
517519
audio: nil,
518520
chunk_data: false,
519-
content_type: "audio/l16; rate=44100",
520-
model: "en-US_BroadbandModel",
521-
recognize_callback: nil,
521+
model: nil,
522522
customization_id: nil,
523523
acoustic_customization_id: nil,
524524
customization_weight: nil,
525-
version: nil,
526-
inactivity_timeout: 30,
527-
interim_results: false,
525+
base_model_version: nil,
526+
inactivity_timeout: nil,
527+
interim_results: nil,
528528
keywords: nil,
529529
keywords_threshold: nil,
530-
max_alternatives: 1,
530+
max_alternatives: nil,
531531
word_alternatives_threshold: nil,
532-
word_confidence: false,
533-
timestamps: false,
532+
word_confidence: nil,
533+
timestamps: nil,
534534
profanity_filter: nil,
535-
smart_formatting: false,
535+
smart_formatting: nil,
536536
speaker_labels: nil
537537
)
538538
Kernel.warn("[DEPRECATION] `recognize_with_websocket` is deprecated and will be removed in the next major release. Please use `recognize_using_websocket` instead.")
@@ -545,7 +545,7 @@ def recognize_with_websocket(
545545
customization_id: customization_id,
546546
acoustic_customization_id: acoustic_customization_id,
547547
customization_weight: customization_weight,
548-
version: version,
548+
base_model_version: base_model_version,
549549
inactivity_timeout: inactivity_timeout,
550550
interim_results: interim_results,
551551
keywords: keywords,

test/integration/test_speech_to_text_v1.rb

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -138,7 +138,7 @@ def test_recognize_websocket_as_chunks
138138
timestamps: true,
139139
max_alternatives: 2,
140140
word_alternatives_threshold: 0.5,
141-
model: "en-US_BroadbandModel"
141+
content_type: "audio/wav"
142142
)
143143
Thread.new do
144144
until audio_file.eof?
@@ -162,7 +162,7 @@ def test_recognize_websocket
162162
timestamps: true,
163163
max_alternatives: 2,
164164
word_alternatives_threshold: 0.5,
165-
model: "en-US_BroadbandModel"
165+
content_type: "audio/wav"
166166
)
167167
thr = Thread.new { speech.start }
168168
thr.join
@@ -180,7 +180,7 @@ def test_inactivity_timeout_using_websocket
180180
timestamps: true,
181181
max_alternatives: 2,
182182
word_alternatives_threshold: 0.5,
183-
model: "en-US_BroadbandModel"
183+
content_type: "audio/wav"
184184
)
185185
thr = Thread.new { speech.start }
186186
thr.join
@@ -198,7 +198,7 @@ def test_broken_audio_using_websocket
198198
timestamps: true,
199199
max_alternatives: 2,
200200
word_alternatives_threshold: 0.5,
201-
model: "en-US_BroadbandModel"
201+
content_type: "audio/wav"
202202
)
203203
thr = Thread.new { speech.start }
204204
thr.join
@@ -225,7 +225,7 @@ def test_invalid_auth_using_websocket
225225
timestamps: true,
226226
max_alternatives: 2,
227227
word_alternatives_threshold: 0.5,
228-
model: "en-US_BroadbandModel"
228+
content_type: "audio/wav"
229229
)
230230
thr = Thread.new { speech.start }
231231
thr.join

0 commit comments

Comments
 (0)