In [7]:
import os
import azure.cognitiveservices.speech as speechsdk

speech_config = speechsdk.SpeechConfig(subscription=os.environ.get('AZURE_SPEECH_KEY'), region=os.environ.get('AZURE_SPEECH_REGION'))
audio_config = speechsdk.audio.AudioOutputConfig(use_default_speaker=True)

# The language of the voice that speaks.
speech_config.speech_synthesis_language='zh-CN'
speech_config.speech_synthesis_voice_name='zh-CN-XiaohanNeural'

speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config)

text = "今天天气真不错，ChatGPT真好用"

speech_synthesizer.speak_text_async(text)

<azure.cognitiveservices.speech.ResultFuture at 0x7fcb38630700>

## 使用男声

In [11]:
speech_config.speech_synthesis_voice_name='zh-CN-YunfengNeural'
speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config)
speech_synthesizer.speak_text_async(text)

<azure.cognitiveservices.speech.ResultFuture at 0x7fcb38630af0>

## 自定义语气和角色

In [18]:
ssml = """<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis"
       xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="zh-CN">
    <voice name="zh-CN-YunyeNeural">
        儿子看见母亲走了过来，说到：
        <mstts:express-as role="Boy" style="cheerful">
            “妈妈，我想要买个新玩具”
        </mstts:express-as>
    </voice>
    <voice name="zh-CN-XiaomoNeural">
        母亲放下包，说：
        <mstts:express-as role="SeniorFemale" style="angry">
            “我看你长得像个玩具。”
        </mstts:express-as>
    </voice>
</speak>"""

speech_synthesis_result = speech_synthesizer.speak_ssml_async(ssml).get()

In [22]:
ssml = """<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis"
       xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="en-US">
    <voice name="en-US-JennyNeural">
        <mstts:express-as style="excited">
            That'd be just amazing!
        </mstts:express-as>
        <mstts:express-as style="friendly">
            What's next?
        </mstts:express-as>
    </voice>
</speak>"""

speech_synthesis_result = speech_synthesizer.speak_ssml_async(ssml).get()

## 输出到文件

In [23]:
audio_config = speechsdk.audio.AudioOutputConfig(filename="./data/tts.wav")

speech_config.speech_synthesis_language='zh-CN'
speech_config.speech_synthesis_voice_name='zh-CN-XiaohanNeural'

speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config)

text = "今天天气真不错，ChatGPT真好用"

speech_synthesizer.speak_text_async(text)


<azure.cognitiveservices.speech.ResultFuture at 0x7fcb5874cc70>

## 输出成MP3

In [28]:
speech_config.set_speech_synthesis_output_format(speechsdk.SpeechSynthesisOutputFormat.Audio48Khz192KBitRateMonoMp3)

speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=None)
result = speech_synthesizer.speak_text_async(text).get()
stream =speechsdk.AudioDataStream(result)

stream.save_to_wav_file("./data/tts.mp3")

## PaddlePaddle语音合成

In [55]:
from paddlespeech.cli.tts.infer import TTSExecutor

tts_executor = TTSExecutor()

text = "今天天气十分不错，百度也能做语音合成。"
output_file = "./data/paddlespeech.wav"
tts_executor(text=text, output=output_file)


[32m[2023-04-08 20:58:44,592] [    INFO][0m - Already cached /Users/xuwenhao/.paddlenlp/models/bert-base-chinese/bert-base-chinese-vocab.txt[0m
[32m[2023-04-08 20:58:44,609] [    INFO][0m - tokenizer config file saved in /Users/xuwenhao/.paddlenlp/models/bert-base-chinese/tokenizer_config.json[0m
[32m[2023-04-08 20:58:44,610] [    INFO][0m - Special tokens file saved in /Users/xuwenhao/.paddlenlp/models/bert-base-chinese/special_tokens_map.json[0m


'/Users/xuwenhao/Codebase/personal/geektime-ai-course/data/paddlespeech.wav'

In [53]:
import wave
import pyaudio


def play_wav_audio(wav_file):
    # open the wave file
    wf = wave.open(wav_file, 'rb')

    # instantiate PyAudio
    p = pyaudio.PyAudio()

    # open a stream
    stream = p.open(format=p.get_format_from_width(wf.getsampwidth()),
                    channels=wf.getnchannels(),
                    rate=wf.getframerate(),
                    output=True)

    # read data from the wave file and play it
    data = wf.readframes(1024)
    while data:
        stream.write(data)
        data = wf.readframes(1024)

    # close the stream and terminate PyAudio
    stream.stop_stream()
    stream.close()
    p.terminate()

play_wav_audio(output_file)

## 英文内容丢失

In [69]:
tts_executor = TTSExecutor()

text = "今天天气十分不错，PaddleSpeech也能做语音合成。"
output_file = "./data/paddlespeech_missing.wav"
tts_executor(text=text, output=output_file)

play_wav_audio(output_file)

[32m[2023-04-08 21:24:59,936] [    INFO][0m - Already cached /Users/xuwenhao/.paddlenlp/models/bert-base-chinese/bert-base-chinese-vocab.txt[0m
[32m[2023-04-08 21:24:59,954] [    INFO][0m - tokenizer config file saved in /Users/xuwenhao/.paddlenlp/models/bert-base-chinese/tokenizer_config.json[0m
[32m[2023-04-08 21:24:59,955] [    INFO][0m - Special tokens file saved in /Users/xuwenhao/.paddlenlp/models/bert-base-chinese/special_tokens_map.json[0m


'/Users/xuwenhao/Codebase/personal/geektime-ai-course/data/paddlespeech_missing.wav'

## 选用合适模型，提供中英文混合效果

In [71]:
tts_executor = TTSExecutor()

text = "早上好, how are you? 百度Paddle Speech一样能做中英文混合的语音合成。"
output_file = "./data/paddlespeech_mix.wav"
tts_executor(text=text, output=output_file, 
             am="fastspeech2_mix", voc="hifigan_csmsc", 
             lang="mix", spk_id=174)

play_wav_audio(output_file)

[32m[2023-04-08 21:30:09,670] [    INFO][0m - Already cached /Users/xuwenhao/.paddlenlp/models/bert-base-chinese/bert-base-chinese-vocab.txt[0m
[32m[2023-04-08 21:30:09,695] [    INFO][0m - tokenizer config file saved in /Users/xuwenhao/.paddlenlp/models/bert-base-chinese/tokenizer_config.json[0m
[32m[2023-04-08 21:30:09,698] [    INFO][0m - Special tokens file saved in /Users/xuwenhao/.paddlenlp/models/bert-base-chinese/special_tokens_map.json[0m


In [72]:
tts_executor = TTSExecutor()

text = "早上好, how are you? 百度Paddle Speech一样能做中英文混合的语音合成。"
output_file = "./data/paddlespeech_mix_another.wav"
tts_executor(text=text, output=output_file, 
             am="fastspeech2_mix", voc="hifigan_csmsc", 
             lang="mix", spk_id=175)

play_wav_audio(output_file)

[32m[2023-04-08 21:34:34,947] [    INFO][0m - Already cached /Users/xuwenhao/.paddlenlp/models/bert-base-chinese/bert-base-chinese-vocab.txt[0m
[32m[2023-04-08 21:34:34,961] [    INFO][0m - tokenizer config file saved in /Users/xuwenhao/.paddlenlp/models/bert-base-chinese/tokenizer_config.json[0m
[32m[2023-04-08 21:34:34,962] [    INFO][0m - Special tokens file saved in /Users/xuwenhao/.paddlenlp/models/bert-base-chinese/special_tokens_map.json[0m
