Skip to content

Commit f9dea80

Browse files
committed
feat(Text to Speech): Synthesize using web sockets
1 parent f2c1acb commit f9dea80

File tree

10 files changed

+490
-2
lines changed

10 files changed

+490
-2
lines changed

examples/speaker_text_to_speech.py

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
# You need to install pyaudio to run this example
2+
# pip install pyaudio
3+
4+
# In this example, a websocket connection is opened with a text
5+
# passed in the request. When the service responds with the synthesized
6+
# audio, pyaudio plays it in blocking mode
7+
8+
from watson_developer_cloud import TextToSpeechV1
9+
from watson_developer_cloud.websocket import SynthesizeCallback
10+
import pyaudio
11+
12+
# If service instance provides API key authentication
13+
service = TextToSpeechV1(
14+
## url is optional, and defaults to the URL below. Use the correct URL for your region.
15+
url='https://stream.watsonplatform.net/text-to-speech/api',
16+
iam_apikey='your_apikey')
17+
18+
# service = TextToSpeechV1(
19+
# ## url is optional, and defaults to the URL below. Use the correct URL for your region.
20+
# # url='https://stream.watsonplatform.net/text-to-speech/api',
21+
# username='YOUR SERVICE USERNAME',
22+
# password='YOUR SERVICE PASSWORD')
23+
24+
class Play(object):
    """
    Wrapper to play the audio in a blocking mode.

    Holds the PyAudio handle and a single output stream. The stream is
    configured from the attributes set in ``__init__`` (sample format,
    channel count, sample rate and frames per buffer).
    """

    def __init__(self):
        self.format = pyaudio.paInt16
        self.channels = 1
        self.rate = 22050
        self.chunk = 1024
        # Both are created lazily by start_streaming().
        self.pyaudio = None
        self.stream = None

    def start_streaming(self):
        """Create the PyAudio handle, open the output stream and start it."""
        self.pyaudio = pyaudio.PyAudio()
        self.stream = self._open_stream()
        self._start_stream()

    def _open_stream(self):
        """Open (but do not start) an output stream with the configured settings."""
        stream = self.pyaudio.open(
            format=self.format,
            channels=self.channels,
            rate=self.rate,
            output=True,
            frames_per_buffer=self.chunk,
            start=False
        )
        return stream

    def _start_stream(self):
        """Start the previously opened stream."""
        self.stream.start_stream()

    def write_stream(self, audio_stream):
        """Write one chunk of audio data to the output stream."""
        self.stream.write(audio_stream)

    def complete_playing(self):
        """
        Stop and close the stream and release the PyAudio handle.

        Safe to call when start_streaming() never ran (for example when the
        connection closes before it was established) and safe to call twice:
        anything that was never created, or was already released, is skipped.
        """
        if self.stream is not None:
            self.stream.stop_stream()
            self.stream.close()
            self.stream = None
        if self.pyaudio is not None:
            self.pyaudio.terminate()
            self.pyaudio = None
62+
63+
class MySynthesizeCallback(SynthesizeCallback):
    """
    Callback that plays the synthesized audio through the speaker.

    Each audio chunk received from the service is forwarded to a ``Play``
    instance, which writes it to an audio output stream.
    """

    def __init__(self):
        SynthesizeCallback.__init__(self)
        self.play = Play()

    def on_connected(self):
        # print() call form works on both Python 2 and Python 3; the
        # statement form is a SyntaxError on Python 3.
        print('Opening stream to play')
        self.play.start_streaming()

    def on_error(self, error):
        print('Error received: {}'.format(error))

    def on_timing_information(self, timing_information):
        print(timing_information)

    def on_audio_stream(self, audio_stream):
        self.play.write_stream(audio_stream)

    def on_close(self):
        print('Completed synthesizing')
        self.play.complete_playing()
84+
85+
testCallback = MySynthesizeCallback()
86+
87+
# An example SSML text
88+
SSML_sorry_text = """<speak version=\"1.0\">
89+
<emphasis> I am sorry, I know how it feels.</emphasis>
90+
</speak>"""
91+
92+
# Another example of SSML text
93+
SSML_text = """
94+
<speak>
95+
I have been assigned to handle your order status request.
96+
<express-as type=\"Apology\">
97+
I am sorry to inform you that the items you requested are backordered.
98+
We apologize for the inconvenience.
99+
</express-as>
100+
<express-as type=\"Uncertainty\">
101+
We don't know when the items will become available. Maybe next week,
102+
but we are not sure at this time.
103+
</express-as>
104+
<express-as type=\"GoodNews\">
105+
But because we want you to be a satisfied customer, we are giving you
106+
a 50% discount on your order!
107+
</express-as>
108+
</speak>"""
109+
110+
service.synthesize_using_websocket(SSML_text,
111+
testCallback,
112+
accept='audio/wav',
113+
voice="en-US_AllisonVoice"
114+
)

examples/text_to_speech_v1.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import json
44
from os.path import join, dirname
55
from watson_developer_cloud import TextToSpeechV1
6+
from watson_developer_cloud.websocket import SynthesizeCallback
67

78
# If service instance provides API key authentication
89
# service = TextToSpeechV1(
@@ -63,3 +64,36 @@
6364

6465
# response = service.delete_voice_model('YOUR CUSTOMIZATION ID').get_result()
6566
# print(response)
67+
68+
# Synthesize using websocket. Note: The service accepts one request per connection
69+
file_path = join(dirname(__file__), "../resources/dog.wav")
70+
class MySynthesizeCallback(SynthesizeCallback):
    """
    Callback that saves the synthesized audio to ``file_path``.

    The output file is opened when the callback is constructed and closed
    when the connection closes.
    """

    def __init__(self):
        SynthesizeCallback.__init__(self)
        # Truncate instead of append: appending a second synthesized WAV
        # stream to an existing file would leave an invalid audio file.
        # All chunks of one request still land in the same open handle.
        self.fd = open(file_path, "wb")

    def on_connected(self):
        print("Connection was successful")

    def on_error(self, error):
        print("Error received: {}".format(error))

    def on_content_type(self, content_type):
        print("Content type: {}".format(content_type))

    def on_timing_information(self, timing_information):
        print(timing_information)

    def on_audio_stream(self, audio_stream):
        self.fd.write(audio_stream)

    def on_close(self):
        # Guard so a repeated close notification does not fail.
        if not self.fd.closed:
            self.fd.close()
        print("Done synthesizing. Closing the connection")
93+
94+
myCallback = MySynthesizeCallback()
95+
service.synthesize_using_websocket("I like to pet dogs",
96+
myCallback,
97+
accept='audio/wav',
98+
voice="en-US_AllisonVoice"
99+
)

requirements-dev.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,4 +17,4 @@ Sphinx>=1.3.1
1717
bumpversion>=0.5.3
1818

1919
# Web sockets
20-
websocket-client==0.47.0
20+
websocket-client==0.52.0

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
requests>=2.0,<3.0
22
python_dateutil>=2.5.3
3-
websocket-client==0.47.0
3+
websocket-client==0.52.0

test/integration/test_text_to_speech_v1.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# coding: utf-8
22
import unittest
33
import watson_developer_cloud
4+
from watson_developer_cloud.websocket import SynthesizeCallback
45
import pytest
56
import os
67

@@ -67,3 +68,34 @@ def test_custom_words(self):
6768
self.text_to_speech.delete_word(customization_id, 'ACLs')
6869
word = self.text_to_speech.get_word(customization_id, 'MACLs').get_result()
6970
assert word['translation'] == 'mackles'
71+
72+
def test_synthesize_using_websocket(self):
    """
    Synthesize a sentence over the websocket interface and check that
    audio was written to disk without any error being reported.
    """
    # Named file_name rather than `file` to avoid shadowing the builtin.
    file_name = "tongue_twister.wav"

    class MySynthesizeCallback(SynthesizeCallback):
        def __init__(self):
            SynthesizeCallback.__init__(self)
            self.fd = None
            self.error = None

        def on_connected(self):
            # "wb" so a stale file from an earlier failed run cannot make
            # the size assertion below pass vacuously.
            self.fd = open(file_name, "wb")

        def on_error(self, error):
            self.error = error

        def on_audio_stream(self, audio_stream):
            self.fd.write(audio_stream)

        def on_close(self):
            # fd is still None if the connection was never established.
            if self.fd is not None:
                self.fd.close()

    test_callback = MySynthesizeCallback()
    try:
        self.text_to_speech.synthesize_using_websocket(
            "She sells seashells by the seashore",
            test_callback,
            accept='audio/wav',
            voice="en-GB_KateVoice"
        )
        assert test_callback.error is None
        assert test_callback.fd is not None
        assert os.stat(file_name).st_size > 0
    finally:
        # Always clean up, even when an assertion above fails.
        if os.path.exists(file_name):
            os.remove(file_name)

watson_developer_cloud/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,5 +31,6 @@
3131
from .discovery_v1 import DiscoveryV1
3232
from .version import __version__
3333
from .speech_to_text_v1_adapter import SpeechToTextV1Adapter as SpeechToTextV1
34+
from .text_to_speech_adapter_v1 import TextToSpeechV1Adapter as TextToSpeechV1
3435
from .visual_recognition_v3_adapter import VisualRecognitionV3Adapter as VisualRecognitionV3
3536
from .discovery_v1_adapter import DiscoveryV1Adapter as DiscoveryV1
Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
2+
# coding: utf-8
3+
4+
# Copyright 2018 IBM All Rights Reserved.
5+
#
6+
# Licensed under the Apache License, Version 2.0 (the "License");
7+
# you may not use this file except in compliance with the License.
8+
# You may obtain a copy of the License at
9+
#
10+
# http://www.apache.org/licenses/LICENSE-2.0
11+
#
12+
# Unless required by applicable law or agreed to in writing, software
13+
# distributed under the License is distributed on an "AS IS" BASIS,
14+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
# See the License for the specific language governing permissions and
16+
# limitations under the License.
17+
18+
from watson_developer_cloud.websocket import SynthesizeCallback, SynthesizeListener
19+
import base64
20+
from .text_to_speech_v1 import TextToSpeechV1
21+
from .watson_service import _remove_null_values
22+
try:
23+
from urllib.parse import urlencode
24+
except ImportError:
25+
from urllib import urlencode
26+
27+
BEARER = 'Bearer'
28+
29+
class TextToSpeechV1Adapter(TextToSpeechV1):
    """Adds websocket-based synthesis on top of ``TextToSpeechV1``."""

    def synthesize_using_websocket(self,
                                   text,
                                   synthesize_callback,
                                   accept=None,
                                   voice=None,
                                   timings=None,
                                   customization_id=None,
                                   http_proxy_host=None,
                                   http_proxy_port=None,
                                   **kwargs):
        """
        Synthesizes text to spoken audio using web sockets. It supports the use of
        the SSML <mark> element to identify the location of user-specified markers in the audio.
        It can also return timing information for all strings of the input text.
        Note: The service processes one request per connection.

        :param str text: Provides the text that is to be synthesized. The client can pass plain
        text or text that is annotated with the Speech Synthesis Markup Language (SSML). For more
        information, see [Specifying input text](https://console.bluemix.net/docs/services/text-to-speech/http.html#input).
        SSML input can also include the <mark> element;
        see [Specifying an SSML mark](https://console.bluemix.net/docs/services/text-to-speech/word-timing.html#mark).
        The client can pass a maximum of 5 KB of text with the request.
        :param SynthesizeCallback synthesize_callback: The callback method for the websocket.
        :param str accept: Specifies the requested format (MIME type) of the audio. For more information, see [Specifying
        an audio format](https://console.bluemix.net/docs/services/text-to-speech/http.html#format). In addition to the
        supported specifications, you can use */* to specify the default audio format, audio/ogg;codecs=opus.
        :param str voice: The voice to use for synthesis.
        :param list[str] timings: Specifies that the service is to return word timing information for all strings of the
        input text. The service returns the start and end time of each string of the input. Specify words as the lone element
        of the array to request word timings. Specify an empty array or omit the parameter to receive no word timings. For
        more information, see [Obtaining word timings](https://console.bluemix.net/docs/services/text-to-speech/word-timing.html#timing).
        Not supported for Japanese input text.
        :param str customization_id: Specifies the globally unique identifier (GUID) for a custom voice model that is to be used for the
        synthesis. A custom voice model is guaranteed to work only if it matches the language of the voice that is used for the synthesis.
        If you include a customization ID, you must call the method with the service credentials of the custom model's owner. Omit the
        parameter to use the specified voice with no customization. For more information, see [Understanding customization]
        (https://console.bluemix.net/docs/services/text-to-speech/custom-intro.html#customIntro).
        :param str http_proxy_host: http proxy host name.
        :param str http_proxy_port: http proxy port. If not set, set to 80.
        :param dict headers: A `dict` containing the request headers
        :raises ValueError: If `text` or `synthesize_callback` is None.
        :raises Exception: If `synthesize_callback` is not a `SynthesizeCallback` instance.

        This method has no return value; everything the service sends back is
        handed to `synthesize_callback`.
        """
        if text is None:
            raise ValueError('text must be provided')
        if synthesize_callback is None:
            raise ValueError('synthesize_callback must be provided')
        if not isinstance(synthesize_callback, SynthesizeCallback):
            raise Exception(
                'Callback is not a derived class of SynthesizeCallback')

        # Start from a copy of the default headers so per-call headers never
        # leak back into the service instance.
        headers = {}
        if self.default_headers is not None:
            headers = self.default_headers.copy()
        # Tolerate an explicit headers=None instead of failing in dict.update.
        extra_headers = kwargs.get('headers')
        if extra_headers:
            headers.update(extra_headers)

        if self.token_manager:
            access_token = self.token_manager.get_token()
            headers['Authorization'] = '{0} {1}'.format(BEARER, access_token)
        else:
            # No token manager: fall back to HTTP Basic auth built from the
            # username and password stored on the service instance.
            authstring = "{0}:{1}".format(self.username, self.password)
            base64_authorization = base64.b64encode(authstring.encode('utf-8')).decode('utf-8')
            headers['Authorization'] = 'Basic {0}'.format(base64_authorization)

        # Swap only the leading scheme for its websocket counterpart; an
        # unanchored replace could also rewrite text later in the URL.
        url = self.url
        if url.startswith('https:'):
            url = 'wss:' + url[len('https:'):]
        elif url.startswith('http:'):
            url = 'ws:' + url[len('http:'):]

        # voice and customization_id travel in the query string ...
        params = {
            'voice': voice,
            'customization_id': customization_id,
        }
        params = _remove_null_values(params)
        url += '/v1/synthesize?{0}'.format(urlencode(params))

        # ... while the text and output options are handed to the listener.
        options = {
            'text': text,
            'accept': accept,
            'timings': timings
        }
        options = _remove_null_values(options)

        SynthesizeListener(options,
                           synthesize_callback,
                           url,
                           headers,
                           http_proxy_host,
                           http_proxy_port,
                           self.verify)

watson_developer_cloud/websocket/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,3 +17,5 @@
1717
from .recognize_abstract_callback import RecognizeCallback
1818
from .recognize_listener import RecognizeListener
1919
from .audio_source import AudioSource
20+
from .synthesize_callback import SynthesizeCallback
21+
from .synthesize_listener import SynthesizeListener

0 commit comments

Comments
 (0)