feat(Text to Speech): Synthesize using web sockets

ehdsouza · ehdsouza · commit f9dea80e72f7 · 2018-11-28T17:11:49.000-05:00
diff --git a/examples/speaker_text_to_speech.py b/examples/speaker_text_to_speech.py
@@ -0,0 +1,114 @@
+# You need to install pyaudio to run this example
+# pip install pyaudio
+
+# In this example, a websocket connection is opened with the text
+# passed in the request. When the service responds with the synthesized
+# audio, pyaudio plays it in blocking mode
+
+from watson_developer_cloud import TextToSpeechV1
+from watson_developer_cloud.websocket import SynthesizeCallback
+import pyaudio
+
+# If service instance provides API key authentication
+service = TextToSpeechV1(
+    # url is optional and defaults to the URL below; use the correct URL for your region.
+    iam_apikey='your_apikey',
+    url='https://stream.watsonplatform.net/text-to-speech/api')
+
+# service = TextToSpeechV1(
+#     ## url is optional, and defaults to the URL below. Use the correct URL for your region.
+#     # url='https://stream.watsonplatform.net/text-to-speech/api',
+#     username='YOUR SERVICE USERNAME',
+#     password='YOUR SERVICE PASSWORD')
+
+class Play(object):
+    """Blocking PyAudio output wrapper: 16-bit mono samples at 22050 Hz"""
+    def __init__(self):
+        self.format = pyaudio.paInt16
+        self.channels = 1
+        self.rate = 22050
+        self.chunk = 1024
+        self.pyaudio = None
+        self.stream = None
+
+    def start_streaming(self):
+        self.pyaudio = pyaudio.PyAudio()
+        self.stream = self._open_stream()
+        self._start_stream()
+
+    def _open_stream(self):
+        return self.pyaudio.open(
+            format=self.format,
+            channels=self.channels,
+            rate=self.rate,
+            output=True,
+            frames_per_buffer=self.chunk,
+            start=False)  # started explicitly by _start_stream()
+
+    def _start_stream(self):
+        self.stream.start_stream()
+
+    def write_stream(self, audio_stream):
+        """Write one chunk of audio to the output stream"""
+        self.stream.write(audio_stream)
+
+    def complete_playing(self):
+        """Release the stream and PyAudio; a no-op for handles never opened"""
+        if self.stream is not None:
+            self.stream.stop_stream()
+            self.stream.close()
+        if self.pyaudio is not None:
+            self.pyaudio.terminate()
+
+class MySynthesizeCallback(SynthesizeCallback):  # plays each audio chunk through Play
+    def __init__(self):
+        SynthesizeCallback.__init__(self)
+        self.play = Play()
+
+    def on_connected(self):
+        print('Opening stream to play')  # print() call form runs on Python 2 and 3
+        self.play.start_streaming()
+
+    def on_error(self, error):
+        print('Error received: {}'.format(error))
+
+    def on_timing_information(self, timing_information):
+        print(timing_information)
+
+    def on_audio_stream(self, audio_stream):
+        self.play.write_stream(audio_stream)
+
+    def on_close(self):
+        print('Completed synthesizing')
+        self.play.complete_playing()
+
+# A short SSML sample (not used by the request below)
+SSML_sorry_text = """<speak version=\"1.0\">
+        <emphasis> I am sorry, I know how it feels.</emphasis>
+        </speak>"""
+
+# A longer SSML sample using <express-as>; this is the text that is synthesized
+SSML_text = """
+   <speak>
+        I have been assigned to handle your order status request.
+       <express-as type=\"Apology\">
+        I am sorry to inform you that the items you requested are backordered.
+        We apologize for the inconvenience.
+       </express-as>
+      <express-as type=\"Uncertainty\">
+        We don't know when the items will become available. Maybe next week,
+        but we are not sure at this time.
+      </express-as>
+      <express-as type=\"GoodNews\">
+        But because we want you to be a satisfied customer, we are giving you
+        a 50% discount on your order!
+      </express-as>
+   </speak>"""
+
+# Audio chunks are handed to the callback, which writes them to the Play stream
+testCallback = MySynthesizeCallback()
+service.synthesize_using_websocket(
+    SSML_text,
+    testCallback,
+    accept='audio/wav',
+    voice="en-US_AllisonVoice")
diff --git a/examples/text_to_speech_v1.py b/examples/text_to_speech_v1.py
@@ -3,6 +3,7 @@
 import json
 from os.path import join, dirname
 from watson_developer_cloud import TextToSpeechV1
+from watson_developer_cloud.websocket import SynthesizeCallback
 
 # If service instance provides API key authentication
 # service = TextToSpeechV1(
@@ -63,3 +64,36 @@
 
 # response = service.delete_voice_model('YOUR CUSTOMIZATION ID').get_result()
 # print(response)
+
+# Synthesize using websocket. Note: The service accepts one request per connection
+file_path = join(dirname(__file__), "../resources/dog.wav")
+class MySynthesizeCallback(SynthesizeCallback):  # saves the synthesized audio to file_path
+    def __init__(self):
+        SynthesizeCallback.__init__(self)
+        self.fd = open(file_path, "wb")  # "wb": start a fresh file, never append to an old one
+
+    def on_connected(self):
+        print("Connection was successful")
+
+    def on_error(self, error):
+        print("Error received: {}".format(error))
+
+    def on_content_type(self, content_type):
+        print("Content type: {}".format(content_type))
+
+    def on_timing_information(self, timing_information):
+        print(timing_information)
+
+    def on_audio_stream(self, audio_stream):
+        self.fd.write(audio_stream)
+
+    def on_close(self):
+        self.fd.close()
+        print("Done synthesizing. Closing the connection")
+
+myCallback = MySynthesizeCallback()
+service.synthesize_using_websocket(
+    "I like to pet dogs",
+    myCallback,
+    accept='audio/wav',
+    voice="en-US_AllisonVoice")
diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -17,4 +17,4 @@ Sphinx>=1.3.1
 bumpversion>=0.5.3
 
 # Web sockets
-websocket-client==0.47.0
+websocket-client==0.52.0
diff --git a/requirements.txt b/requirements.txt
@@ -1,3 +1,3 @@
 requests>=2.0,<3.0
 python_dateutil>=2.5.3
-websocket-client==0.47.0
+websocket-client==0.52.0
diff --git a/test/integration/test_text_to_speech_v1.py b/test/integration/test_text_to_speech_v1.py
@@ -1,6 +1,7 @@
 # coding: utf-8
 import unittest
 import watson_developer_cloud
+from watson_developer_cloud.websocket import SynthesizeCallback
 import pytest
 import os
 
@@ -67,3 +68,34 @@ def test_custom_words(self):
         self.text_to_speech.delete_word(customization_id, 'ACLs')
         word = self.text_to_speech.get_word(customization_id, 'MACLs').get_result()
         assert word['translation'] == 'mackles'
+
+    def test_synthesize_using_websocket(self):
+        file_name = "tongue_twister.wav"  # not `file`: that shadows the Python 2 builtin
+        class MySynthesizeCallback(SynthesizeCallback):
+            def __init__(self):
+                SynthesizeCallback.__init__(self)
+                self.fd = self.error = None
+
+            def on_connected(self):
+                self.fd = open(file_name, "wb")  # overwrite any file left by a failed run
+
+            def on_error(self, error):
+                self.error = error
+
+            def on_audio_stream(self, audio_stream):
+                self.fd.write(audio_stream)
+
+            def on_close(self):
+                self.fd.close()
+
+        testCallback = MySynthesizeCallback()
+        self.text_to_speech.synthesize_using_websocket(
+            "She sells seashells by the seashore", testCallback,
+            accept='audio/wav', voice="en-GB_KateVoice")
+        try:
+            assert testCallback.error is None
+            assert testCallback.fd is not None
+            assert os.stat(file_name).st_size > 0
+        finally:
+            if os.path.exists(file_name):
+                os.remove(file_name)
diff --git a/watson_developer_cloud/__init__.py b/watson_developer_cloud/__init__.py
@@ -31,5 +31,6 @@
 from .discovery_v1 import DiscoveryV1
 from .version import __version__
 from .speech_to_text_v1_adapter import SpeechToTextV1Adapter as SpeechToTextV1
+from .text_to_speech_adapter_v1 import TextToSpeechV1Adapter as TextToSpeechV1
 from .visual_recognition_v3_adapter import VisualRecognitionV3Adapter as VisualRecognitionV3
 from .discovery_v1_adapter import DiscoveryV1Adapter as DiscoveryV1
diff --git a/watson_developer_cloud/text_to_speech_adapter_v1.py b/watson_developer_cloud/text_to_speech_adapter_v1.py
@@ -0,0 +1,116 @@
+
+# coding: utf-8
+
+# Copyright 2018 IBM All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from watson_developer_cloud.websocket import SynthesizeCallback, SynthesizeListener
+import base64
+from .text_to_speech_v1 import TextToSpeechV1
+from .watson_service import _remove_null_values
+try:
+    from urllib.parse import urlencode
+except ImportError:
+    from urllib import urlencode
+
+BEARER = 'Bearer'
+
+class TextToSpeechV1Adapter(TextToSpeechV1):
+    def synthesize_using_websocket(self,
+                                   text,
+                                   synthesize_callback,
+                                   accept=None,
+                                   voice=None,
+                                   timings=None,
+                                   customization_id=None,
+                                   http_proxy_host=None,
+                                   http_proxy_port=None,
+                                   **kwargs):
+        """
+        Synthesizes text to spoken audio using web sockets. It supports the use of
+        the SSML <mark> element to identify the location of user-specified markers in the audio.
+        It can also return timing information for all strings of the input text.
+        Note: The service processes one request per connection.
+
+        :param str text: Provides the text that is to be synthesized. The client can pass plain
+        text or text that is annotated with the Speech Synthesis Markup Language (SSML). For more
+        information, see [Specifying input text](https://console.bluemix.net/docs/services/text-to-speech/http.html#input).
+        SSML input can also include the <mark> element;
+        see [Specifying an SSML mark](https://console.bluemix.net/docs/services/text-to-speech/word-timing.html#mark).
+        The client can pass a maximum of 5 KB of text with the request.
+        :param SynthesizeCallback synthesize_callback: The callback method for the websocket.
+        :param str accept: Specifies the requested format (MIME type) of the audio. For more information, see [Specifying
+        an audio format](https://console.bluemix.net/docs/services/text-to-speech/http.html#format). In addition to the
+        supported specifications, you can use */* to specify the default audio format, audio/ogg;codecs=opus.
+        :param str voice: The voice to use for synthesis.
+        :param list[str] timings: Specifies that the service is to return word timing information for all strings of the
+        input text. The service returns the start and end time of each string of the input. Specify words as the lone element
+        of the array to request word timings. Specify an empty array or omit the parameter to receive no word timings. For
+        more information, see [Obtaining word timings](https://console.bluemix.net/docs/services/text-to-speech/word-timing.html#timing).
+        Not supported for Japanese input text.
+        :param str customization_id: Specifies the globally unique identifier (GUID) for a custom voice model that is to be used for the
+        synthesis. A custom voice model is guaranteed to work only if it matches the language of the voice that is used for the synthesis.
+        If you include a customization ID, you must call the method with the service credentials of the custom model's owner. Omit the
+        parameter to use the specified voice with no customization. For more information, see [Understanding customization]
+        (https://console.bluemix.net/docs/services/text-to-speech/custom-intro.html#customIntro).
+        :param str http_proxy_host: http proxy host name.
+        :param str http_proxy_port: http proxy port. If not set, set to 80.
+        :param dict headers: A `dict` containing the request headers
+        :return: None. `synthesize_callback` is handed to a `SynthesizeListener` together with the request.
+        :raises ValueError: if `text` or `synthesize_callback` is None.
+        :raises TypeError: if `synthesize_callback` is not a `SynthesizeCallback` instance.
+        """
+        if text is None:
+            raise ValueError('text must be provided')
+        if synthesize_callback is None:
+            raise ValueError('synthesize_callback must be provided')
+        if not isinstance(synthesize_callback, SynthesizeCallback):
+            raise TypeError('Callback is not a derived class of SynthesizeCallback')
+
+        headers = {}
+        if self.default_headers is not None:
+            headers = self.default_headers.copy()
+        if kwargs.get('headers') is not None:  # an explicit headers=None is ignored
+            headers.update(kwargs['headers'])
+
+        if self.token_manager:  # bearer token when a token manager is set, else HTTP Basic
+            access_token = self.token_manager.get_token()
+            headers['Authorization'] = '{0} {1}'.format(BEARER, access_token)
+        else:
+            authstring = "{0}:{1}".format(self.username, self.password)
+            base64_authorization = base64.b64encode(authstring.encode('utf-8')).decode('utf-8')
+            headers['Authorization'] = 'Basic {0}'.format(base64_authorization)
+
+        url = self.url.replace('https:', 'wss:')  # same host and path, websocket scheme
+        params = {
+            'voice': voice,
+            'customization_id': customization_id,
+        }
+        params = _remove_null_values(params)
+        url += '/v1/synthesize?{0}'.format(urlencode(params))
+
+        options = {
+            'text': text,
+            'accept': accept,
+            'timings': timings
+        }
+        options = _remove_null_values(options)
+
+        SynthesizeListener(options,
+                           synthesize_callback,
+                           url,
+                           headers,
+                           http_proxy_host,
+                           http_proxy_port,
+                           self.verify)
diff --git a/watson_developer_cloud/websocket/__init__.py b/watson_developer_cloud/websocket/__init__.py
@@ -17,3 +17,5 @@
 from .recognize_abstract_callback import RecognizeCallback
 from .recognize_listener import RecognizeListener
 from .audio_source import AudioSource
+from .synthesize_callback import SynthesizeCallback
+from .synthesize_listener import SynthesizeListener
diff --git a/watson_developer_cloud/websocket/synthesize_callback.py b/watson_developer_cloud/websocket/synthesize_callback.py
diff --git a/watson_developer_cloud/websocket/synthesize_listener.py b/watson_developer_cloud/websocket/synthesize_listener.py