In [1]:
#!/bin/python3
# 语音合成
import websocket
import datetime
import hashlib
import base64
import hmac
import json
from urllib.parse import urlencode
import ssl
from wsgiref.handlers import format_date_time
from datetime import datetime
from time import mktime
import _thread as thread
import os
# from integrationxuF import pcm2wav 
import wave
# Frame status markers; none of them is referenced by the code visible in this cell.
STATUS_FIRST_FRAME = 0  # marker for the first frame
STATUS_CONTINUE_FRAME = 1  # marker for an intermediate frame
STATUS_LAST_FRAME = 2  # marker for the last frame


class TTS():
    """Text-to-speech (TTS) client for the iFlytek online synthesis WebSocket API.

    An instance holds the credentials, the request payload and the output
    path. ``run()`` opens the WebSocket, sends the text and appends every
    received audio chunk to ``OutFile``.
    """

    def __init__(self, APPID, APIKey, APISecret, Text, OutFile, wavFile=None):
        """Store the credentials and build the request payload.

        Args:
            APPID: Application id sent in the ``common`` section.
            APIKey: API key embedded in the authorization parameter.
            APISecret: Secret used as the HMAC-SHA256 signing key.
            Text: Text to synthesize; sent UTF-8 encoded, then base64 encoded.
            OutFile: Path the received audio is appended to.
            wavFile: Optional target path for ``towav()``; when ``None`` a
                ``demo.wav`` next to ``OutFile`` is used.
        """
        self.APPID = APPID
        self.APIKey = APIKey
        self.APISecret = APISecret
        self.Text = Text
        # Common parameters (common)
        self.CommonArgs = {"app_id": self.APPID}
        self.OutFile = OutFile
        self.wavFile = wavFile
        # Business parameters (business); more options are listed in the
        # official documentation:
        #   aue: "raw" = uncompressed pcm, "lame" = mp3 (aue=lame needs sfl=1)
        #   auf: "audio/L16;rate=8000" = 8K audio, "audio/L16;rate=16000" = 16K audio
        #   vcn: speaker name; speakers are added in the vendor console
        # https://www.xfyun.cn/doc/tts/online_tts/API.html
        # The original literal listed "auf" twice with the same value; the
        # duplicate key is dropped, the resulting dict is unchanged.
        self.BusinessArgs = {
            "aue": "lame",
            "sfl": 1,
            "auf": "audio/L16;rate=16000",
            "vcn": "aisxping",
            "tte": "utf8",
        }
        self.Data = {
            "status": 2,
            "text": base64.b64encode(self.Text.encode('utf-8')).decode('utf-8'),
        }

    def run(self):
        """Open the WebSocket, run the synthesis and block until it closes."""
        websocket.enableTrace(False)
        wsUrl = self.create_url()
        ws = websocket.WebSocketApp(wsUrl,
                                    on_message=self.on_message,
                                    on_error=self.on_error,
                                    on_close=self.on_close)
        ws.on_open = self.on_open
        # NOTE(review): CERT_NONE disables TLS certificate verification; kept
        # as in the original, but consider enabling verification.
        ws.run_forever(sslopt={"cert_reqs": ssl.CERT_NONE})
        # self.towav()  # only needed when the requested format is pcm; mp3 is requested here
        print("-"*10,"运行结束","-"*10)

    def towav(self):
        """Convert the pcm file at ``OutFile`` to wav.

        When ``wavFile`` was not given, the result is written to ``demo.wav``
        in the same directory as ``OutFile``.
        """
        if self.wavFile is None:
            self.wavFile = os.path.join(os.path.dirname(str(self.OutFile)), "demo.wav")
        self.pcm2wav(self.OutFile, self.wavFile)

    def pcm2wav(self, pcmfile, wavfile, channels=1, rate=16000):
        """Wrap raw 16-bit pcm data in a wav container.

        Args:
            pcmfile: Path of the raw pcm input.
            wavfile: Path of the wav file to create.
            channels: Number of audio channels written to the header.
            rate: Sample rate in Hz written to the header.
        """
        with open(pcmfile, 'rb') as fp:
            pcmdata = fp.read()
        with wave.open(wavfile, 'wb') as wav:
            wav.setnchannels(channels)
            wav.setsampwidth(16 // 8)  # 16-bit samples -> 2 bytes per sample
            wav.setframerate(rate)
            wav.writeframes(pcmdata)

    def create_url(self):
        """Return the WebSocket URL including the signed authentication query.

        The signature is an HMAC-SHA256 (keyed with ``APISecret``) over the
        host line, the RFC 1123 date line and the request line.
        """
        url = 'wss://tts-api.xfyun.cn/v2/tts'
        # NOTE(review): the signed host differs from the host in the URL; kept
        # exactly as in the original - confirm against the API documentation.
        host = "ws-api.xfyun.cn"
        # RFC 1123 formatted timestamp
        date = format_date_time(mktime(datetime.now().timetuple()))

        # String that gets signed
        signature_origin = "host: " + host + "\n"
        signature_origin += "date: " + date + "\n"
        signature_origin += "GET " + "/v2/tts " + "HTTP/1.1"
        signature_sha = hmac.new(self.APISecret.encode('utf-8'),
                                 signature_origin.encode('utf-8'),
                                 digestmod=hashlib.sha256).digest()
        signature_sha = base64.b64encode(signature_sha).decode(encoding='utf-8')

        authorization_origin = "api_key=\"%s\", algorithm=\"%s\", headers=\"%s\", signature=\"%s\"" % (
            self.APIKey, "hmac-sha256", "host date request-line", signature_sha)
        authorization = base64.b64encode(authorization_origin.encode('utf-8')).decode(encoding='utf-8')
        # Authentication parameters appended to the URL as the query string
        v = {
            "authorization": authorization,
            "date": date,
            "host": host
        }
        return url + '?' + urlencode(v)

    def on_message(self, ws, message):
        """Handle one response frame: append its audio and close on the last frame.

        The result code is checked before the payload is touched, so an error
        frame that carries no ``data`` is reported with its real code and
        message instead of surfacing as a parse exception.
        """
        try:
            message = json.loads(message)
            code = message["code"]
            sid = message.get("sid")
            if code != 0:
                errMsg = message.get("message")
                print("sid:%s call error:%s code is:%s" % (sid, errMsg, code))
                # Nothing more to receive after an error; release run_forever().
                ws.close()
                return

            data = message.get("data") or {}
            audio = data.get("audio")
            if audio:
                with open(self.OutFile, 'ab') as f:
                    f.write(base64.b64decode(audio))
            # Close only after the chunk of the final frame has been written.
            if data.get("status") == 2:
                print("ws is closed")
                ws.close()
        except Exception as e:
            # Callback boundary: report and keep the connection loop alive.
            print("receive msg,but parse exception:", e)

    def on_error(self, ws, error):
        """Report a WebSocket error."""
        print("### error:", error)

    def on_close(self, ws, close_status_code=None, close_msg=None):
        """Report that the connection closed.

        The two optional arguments let the handler be called either with the
        socket only or with the additional close status and message.
        """
        print("### closed ###")

    def on_open(self, ws):
        """Send the synthesis request from a worker thread once connected."""
        def run(*args):
            # Drop output left over from a previous run BEFORE sending the
            # request, so chunks written by on_message cannot be deleted.
            try:
                os.remove(self.OutFile)
            except FileNotFoundError:
                pass
            d = json.dumps({
                "common": self.CommonArgs,
                "business": self.BusinessArgs,
                "data": self.Data,
            })
            print("------>开始发送文本数据")
            ws.send(d)

        thread.start_new_thread(run, ())


if __name__ == "__main__":
    # Fill in valid credentials here to run the demo; they are obtained from
    # the vendor console.
    # The two trailing full stops in the text add a short pause; without them
    # the last character is hard to hear.
    out = '/root/code/mp3/demo.mp3'
    credentials = {
        "APPID": '923663a4',
        "APISecret": 'NDJmMTY4M2NmYmNjZmIzOWY2Zjk1NzZj',
        "APIKey": 'fa7f12a7540c645b83fba27006bbffbb',
    }
    tts = TTS(Text="下午的一个语音合成示例。。", OutFile=out, **credentials)
    tts.run()
   


---------- 运行结束 ----------


In [7]:
# -*- coding:utf-8 -*-
#
#   author: iflytek
#
#  本demo测试时运行的环境为：Windows + Python3.7
#  本demo测试成功运行时所安装的第三方库及其版本如下：
#   cffi==1.12.3
#   gevent==1.4.0
#   greenlet==0.4.15
#   pycparser==2.19
#   six==1.12.0
#   websocket==0.2.1
#   websocket-client==0.56.0
#   合成小语种需要传输小语种文本、使用小语种发音人vcn、tte=unicode以及修改文本编码方式
#  错误码链接：https://www.xfyun.cn/document/error-code （code返回错误码时必看）
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
import websocket
import datetime
import hashlib
import base64
import hmac
import json
from urllib.parse import urlencode
import time
import ssl
from wsgiref.handlers import format_date_time
from datetime import datetime
from time import mktime
import _thread as thread
import os


# Frame status markers; none of them is referenced by the code visible in this cell.
STATUS_FIRST_FRAME = 0  # marker for the first frame
STATUS_CONTINUE_FRAME = 1  # marker for an intermediate frame
STATUS_LAST_FRAME = 2  # marker for the last frame


class Ws_Param(object):
    """Request parameters and URL signing for the TTS WebSocket endpoint."""

    def __init__(self, APPID, APIKey, APISecret, Text):
        """Keep the credentials and prepare the three request sections.

        Args:
            APPID: Application id placed in the ``common`` section.
            APIKey: API key embedded in the authorization parameter.
            APISecret: Secret used as the HMAC-SHA256 signing key.
            Text: Text to synthesize; UTF-8 encoded, then base64 encoded.
        """
        self.APPID = APPID
        self.APIKey = APIKey
        self.APISecret = APISecret
        self.Text = Text

        # Common parameters (common)
        self.CommonArgs = {"app_id": self.APPID}
        # Business parameters (business); further options are listed on the
        # vendor site. Another speaker used before: "x4_enus_gavin_assist".
        self.BusinessArgs = {
            "aue": "lame",
            "sfl": 1,
            "auf": "audio/L16;rate=16000",
            "vcn": "x4_enus_luna_assist",
            "tte": "utf8",
        }
        # Minor languages need tte=unicode and UTF-16LE encoded text instead
        # (per the vendor demo notes); only the UTF-8 variant is used here.
        encoded_text = base64.b64encode(self.Text.encode('utf-8'))
        self.Data = {"status": 2, "text": encoded_text.decode("UTF8")}

    def create_url(self):
        """Build the endpoint URL with the signed authentication query string."""
        endpoint = 'wss://tts-api.xfyun.cn/v2/tts'
        signed_host = "ws-api.xfyun.cn"
        # RFC 1123 formatted timestamp
        current = datetime.now()
        date = format_date_time(mktime(current.timetuple()))

        # Text covered by the signature: host line, date line, request line
        to_sign = "\n".join([
            "host: " + signed_host,
            "date: " + date,
            "GET /v2/tts HTTP/1.1",
        ])
        digest = hmac.new(
            self.APISecret.encode('utf-8'),
            to_sign.encode('utf-8'),
            digestmod=hashlib.sha256,
        ).digest()
        signature = base64.b64encode(digest).decode(encoding='utf-8')

        auth_plain = 'api_key="{}", algorithm="{}", headers="{}", signature="{}"'.format(
            self.APIKey, "hmac-sha256", "host date request-line", signature)
        auth_encoded = base64.b64encode(auth_plain.encode('utf-8')).decode(encoding='utf-8')

        # Authentication parameters, in the order they appear in the query
        query = urlencode({
            "authorization": auth_encoded,
            "date": date,
            "host": signed_host,
        })
        return endpoint + '?' + query

def on_message(ws, message, out_path='/root/code/mp3/demo_forbt2.mp3'):
    """Handle one response frame: append its audio and close on the last frame.

    The result code is checked before the payload is read, so an error frame
    that carries no ``data`` is reported with its real code and message
    instead of surfacing as a parse exception.

    Args:
        ws: The WebSocket connection; ``close()`` is called on it after the
            final frame or after an error frame.
        message: JSON text of the response frame.
        out_path: File the decoded audio is appended to. Defaults to the path
            that was previously hard-coded.
    """
    try:
        message = json.loads(message)
        code = message["code"]
        sid = message.get("sid")
        print(message)
        if code != 0:
            errMsg = message.get("message")
            print("sid:%s call error:%s code is:%s" % (sid, errMsg, code))
            # Nothing more to receive after an error; release run_forever().
            ws.close()
            return

        data = message.get("data") or {}
        audio = data.get("audio")
        if audio:
            with open(out_path, 'ab') as f:
                f.write(base64.b64decode(audio))
                print("!!write!!")
        # Close only after the chunk of the final frame has been written.
        if data.get("status") == 2:
            print("ws is closed")
            ws.close()
    except Exception as e:
        # Callback boundary: report and keep the connection loop alive.
        print("receive msg,but parse exception:", e)



# 收到websocket错误的处理
def on_error(ws, error):
    """Print the given WebSocket error behind a fixed marker."""
    marker = "### error:"
    print(marker, error)


# 收到websocket关闭的处理
def on_close(ws, close_status_code=None, close_msg=None):
    """Report that the WebSocket connection closed.

    Args:
        ws: The connection that closed.
        close_status_code: Optional close status; accepted so the handler
            also works when the caller passes the close details.
        close_msg: Optional close message, accepted for the same reason.
    """
    print("### closed ###")


# 收到websocket连接建立的处理
def on_open(ws):
    """Send the synthesis request from a new thread once the socket is open.

    The request sections are read from the module-level ``wsParam`` object.
    """
    def _send_request(*args):
        request = {
            "common": wsParam.CommonArgs,
            "business": wsParam.BusinessArgs,
            "data": wsParam.Data,
        }
        payload = json.dumps(request)
        print("------>开始发送文本数据")
        ws.send(payload)

    thread.start_new_thread(_send_request, ())


if __name__ == "__main__":
    # Fill in valid credentials here to run the demo.
    # NOTE(review): the credentials are hard-coded in the source; consider
    # loading them from the environment or a config file instead.
    wsParam = Ws_Param(APPID='923663a4', APISecret='NDJmMTY4M2NmYmNjZmIzOWY2Zjk1NzZj',
                       APIKey='fa7f12a7540c645b83fba27006bbffbb',
                       Text='We present GazeHOI, the first interaction dataset that simultaneously incorporates 3D modeling of gaze, hand, and object interactions. Here are some examples.')
    # Alternative narration texts from earlier runs, kept for reference. Every
    # line is a comment now: the continuation lines of the second text used to
    # be uncommented, which made the script a syntax error.
    #    Text = 'In the pre-diffusion phase, we decouple the gaze-guided condition into two components: Gaze-guided Spatial-Temporal Feature Encoding and Gaze-guided Goal Pose Generation, to extract the spatial-temporal feature c1 and the goal hand pose c2.')
    #    Text="In this paper, we present Gaze-guided hand-object interaction synthesis: benchmark and method. Gaze is rich with intent information.  See an example of assembly furniture.  we observe that examining the consistent relationships among gaze, hand, and object can enhance our understanding of human activities in fine-grained scenarios. Besides, gaze can benefit VR and Human-Robot Interaction. Existing work primarily focused on extracting intent from gaze, Mogaze and GIMO  bridge the gap from gaze to intent and further to motion. However, they lack exploration into the hand and detailed interactions within fine-grained environments.  Therefore, we introduce a novel task: gaze-guided hand-object interaction synthesis. Addressing this task presents us with three significant challenges. First, there is a lack of suitable datasets. Existing HOI datasets focus on hands and objects solely, overlooking the corresponding gazes. So we Collect the first interaction dataset that simultaneously incorporates 3D modeling of gaze, hand, and object interactions. Challenge 2, no existing method for extracting the gaze feature for interaction scenarios. To solve this, we Decouple the gaze-guided conditions into spatial-temporal feature encoding and Goal Pose Generation using the Gaze-contact Consistency Value to extract features in different levels of information granularity. Challenge 3, The common approach for hand-object interaction is to deconstruct the task into different stages, to ensure consistency across each stage of the process, which, in turn, places new demands on precise and controllable generation. therefore, we present a hierarchical framework centered on GHO-Diffusion and introduce the Spherical Gaussian constraint in the diffusion denoising step and Contact Consistency in the post-diffusion. Next, I will introduce our dataset, GazeHOI. Here are examples of some sequences.
    #    For data collection, we use ZCAM E2 for hand tracking, Optitrack px13W for object tracking, Realsense D455 and pupil core for ego and gaze tracking. For data annotation, we utilize MediaPipe to extract 2D hand keypoints from each view, followed by triangulation and MANO model fitting. We scanned a set of objects and attached 3mm markers to them to obtain the object pose from OptiTrack raw data using the ICP algorithm. Combined with gaze from ego view, we obtain a set of fully annotated data. For gaze-guided hand-object interaction synthesis, given gazes and initial hand-object pose, we hope to synthesize the complete hand-object motions. Next, I will show our pipeline. In pre-diffusion, we decouple gaze-guided condition into gaze-guided Spatial-Temporal Feature Encoding and Gaze-guided Goal Pose Generation. especially, in Gaze-guided Goal Pose Generation, we explore the relationship between gaze map and contact map. Finally, we get spatial-temporal feature c1 and the goal hand pose c2. In GHO-diffusion, we design a stacked diffusion. first， conditioned on c1 and object geometry and initial pose, we can get object motions through object diffusion.  The object motion provides the goal pose frame index f, combined with goal hand pose c2 and initial hand pose, we can get hand motions from initial to reach the object through hand diffusion. In the sampling stage, we apply Spherical Gaussian Constraint to improve fine-grained goal pose alignment. In post-diffusion, we leverage the contact consistency to optimize the hand pose while moving together with the object. Here are some visualization results about our method and baseline. MDM means an end-to-end method, ours without goal pose means directly generation hand motion conditioned on object motion. There are some fail cases. gaze can provide insight for object translations, but it is hard to predict the rotation. In the assembly task, both our method and the baseline currently fail to achieve satisfactory results.
    #    Thank you for watching.")
    websocket.enableTrace(False)
    wsUrl = wsParam.create_url()
    ws = websocket.WebSocketApp(wsUrl, on_message=on_message, on_error=on_error, on_close=on_close)
    ws.on_open = on_open
    # NOTE(review): CERT_NONE disables TLS certificate verification; kept as in
    # the original, but consider enabling verification.
    ws.run_forever(sslopt={"cert_reqs": ssl.CERT_NONE})


------>开始发送文本数据
{'code': 0, 'message': 'success', 'sid': 'tts000ef578@hu18f57b6ad8705e0902', 'data': {'audio': '//NoxAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA//NoxAAVQxZtv0EQAkGlG5JLbbKdG2mBprTtaIMRlvvf9FYplzkP/bdU52//pdnbqQ/em57ovZ2Ts6otSL2e+vZ++7Sd2uT0VyVuRr8/vTzDDkbM+4Ve1H1xUnfXcoWaKhGIQIAAoDhCItGoeYDYEphvldx5iZeMDAZmd+Rac9a1c7FUA6l5hPEtGEEEHI52FooRWmO9uP5kNoEz1Rbo/iMZO6hGAS9Vm17Mtp62cPy2S1nIZ4jECAAsEwieqad+koJJE4Hpk+2sWYvEoGvfdu2s7daH+35d//NoxHZHnBYMDZ7QAD9WIT1BzKKO5RW+44U2uSCDKeQy6Wap73YnT4yrCP08koIapJPU+VwVVjEuo6emmcbOdqM40lTkbtTdS3ftyzd2RSGLS3Vzm8ZRFNW7/Z+HJRXpKO5P6r0kgnrdXV3kprwAwyhoduu29VvnWV/ajEYpsb1Lylzu36sur/OX4YvQ5KI/ahifi8muSu3GakzvCU7npjH5BG6arL6tPejF+U147YzjlBKpXmo=', 'status': 1, 'ced': '134'}}
!!write!!
{'code': 0, 'message': 'suc