In [None]:
import datetime
import os
import re
from xml.sax.saxutils import escape, quoteattr

import dotenv
import gradio as gr
import requests

# Load settings via python-dotenv; the functions below read the Azure endpoint
# URLs and API keys with os.getenv.
dotenv.load_dotenv()


# STT (speech → text) request function
def request_stt(audio_path, timeout=30):
    """Transcribe a Korean audio file with the Azure Speech REST endpoint.

    Args:
        audio_path: Path of the audio file whose raw bytes are uploaded.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The ``DisplayText`` field of the JSON response, or ``None`` when no
        path is given, the request fails, the status code is not 200, or the
        field is absent from the response.
    """
    if not audio_path:
        return None

    endpoint = f"{os.getenv('AZURE_SPEECH_ENDPOINT_URL')}/speech/recognition/conversation/cognitiveservices/v1?language=ko-KR"

    # Read the key from the environment (the same variable request_tts uses)
    # instead of embedding the secret in source.
    headers = {"Ocp-Apim-Subscription-Key": f"{os.getenv('AZURE_SPEECH_API_KEY')}"}

    with open(audio_path, "rb") as audio_file:
        audio_data = audio_file.read()

    try:
        response = requests.post(
            endpoint, headers=headers, data=audio_data, timeout=timeout
        )
    except requests.RequestException as exc:
        print(f"Error: {exc}")
        return None

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return None

    # Use .get so a response without DisplayText yields None, not a KeyError.
    return response.json().get("DisplayText")


# TTS (text → speech) request function
def request_tts(text, voice="ko-KR-SunHiNeural", timeout=30):
    """Synthesize speech for *text* and save it as a WAV file.

    Args:
        text: Text to speak. ``None`` or blank text is rejected locally.
        voice: Voice name placed in the SSML ``<voice name=...>`` attribute.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The name of the written file (relative path,
        ``tts_result_<timestamp>.wav``), or ``None`` when there is nothing to
        speak, the request fails, or the status code is not 200.
    """
    if text is None or not str(text).strip():
        return None

    endpoint = f"{os.getenv('AZURE_SPEECH_ENDPOINT_URL')}/cognitiveservices/v1"
    headers = {
        "Ocp-Apim-Subscription-Key": f"{os.getenv('AZURE_SPEECH_API_KEY')}",
        "X-Microsoft-OutputFormat": "riff-8khz-16bit-mono-pcm",
        "Content-Type": "application/ssml+xml",
    }
    # Build the SSML request body. The text and voice are XML-escaped so that
    # characters such as '&' or '<' cannot produce malformed SSML.
    body = f"""
        <speak version='1.0' xml:lang='ko-KR'>
            <voice xml:lang='ko-KR' xml:gender='Female' name={quoteattr(str(voice))}>
                {escape(str(text))}
            </voice>
        </speak>
    """
    try:
        # Encode explicitly: depending on the installed HTTP stack a str body
        # may be encoded as Latin-1, which cannot represent Korean text.
        response = requests.post(
            endpoint, headers=headers, data=body.encode("utf-8"), timeout=timeout
        )
    except requests.RequestException as exc:
        print(f"Error: {exc}")
        return None

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return None

    # Microsecond resolution keeps two results produced within the same
    # second from overwriting each other.
    now = datetime.datetime.now().strftime("%Y%m%d_%H%M%S_%f")

    filename = f"tts_result_{now}.wav"
    with open(filename, "wb") as audio_file:
        audio_file.write(response.content)
    return filename


# GPT API request function
def request_gpt(text, timeout=60):
    """Send *text* as a single user message to the chat-completions deployment.

    Args:
        text: Prompt text. ``None`` or an empty string is rejected locally.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The content of the first choice's message, or ``None`` when there is
        no prompt, the request fails, or the status code is not 200.
    """
    if not text:
        return None

    # The deployment name appears both in the URL and in the request body.
    deployment = "fimtrus-gpt-41"
    endpoint = f"{os.getenv('AZURE_OPENAI_ENDPOINT_URL')}/openai/deployments/{deployment}/chat/completions?api-version=2025-01-01-preview"
    headers = {"Authorization": f"Bearer {os.getenv('AZURE_OPENAI_API_KEY')}"}
    body = {
        "messages": [{"role": "user", "content": text}],
        "max_completion_tokens": 800,
        "temperature": 1,
        "top_p": 1,
        "frequency_penalty": 0,
        "presence_penalty": 0,
        "model": deployment,
    }
    try:
        response = requests.post(
            endpoint, headers=headers, json=body, timeout=timeout
        )
    except requests.RequestException as exc:
        print(f"Error: {exc}")
        return None

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return None

    response_json = response.json()
    return response_json["choices"][0]["message"]["content"]


# Gradio UI definition
with gr.Blocks() as demo:
    # Voice names offered as the choices of the voice dropdown
    voice_list = [
        "ko-KR-SunHiNeural",
        "ko-KR-InJoonNeural",
        "ko-KR-HyunsuMultilingualNeural",
        "ko-KR-BongJinNeural",
        "ko-KR-GookMinNeural",
        "ko-KR-HyunsuNeural",
        "ko-KR-JiMinNeural",
        "ko-KR-SeoHyeonNeural",
        "ko-KR-SoonBokNeural",
        "ko-KR-YuJinNeural",
    ]

    # Run STT whenever the audio input changes
    def change_audio(audio_path):
        """Return the transcript for *audio_path*, or None when no audio is set."""
        return None if audio_path is None else request_stt(audio_path)

    # Runs when the TTS button is clicked
    def click_send_tts(text):
        """Synthesize *text* with the default voice and return request_tts's result."""
        return request_tts(text)

    # Runs when the GPT send button is clicked
    def send_gpt(text, histories):
        """Ask GPT, record the exchange in the history, and synthesize the reply.

        Args:
            text: User prompt forwarded to request_gpt.
            histories: List of ``{"role", "content"}`` message dicts; it is
                extended in place.

        Returns:
            A ``(histories, filename)`` tuple. When request_gpt yields no
            reply the history is returned unchanged and the filename is None.
        """
        content = request_gpt(text)
        if content is None:
            # No reply: do not store a None message or synthesize speech from it.
            return histories, None

        # Append the user / assistant messages to the conversation history
        histories.append({"role": "user", "content": text})
        histories.append({"role": "assistant", "content": content})
        # NOTE: speech synthesis should move to the chatbot event.
        filename = request_tts(content)
        return histories, filename

    def change_prompt_audio(audio_path):
        """Transcribe the recorded question; yields None when nothing was recorded."""
        transcript = None
        if audio_path is not None:
            transcript = request_stt(audio_path)
        return transcript

    def change_prompt_text(text, histories):
        """Send the prompt text to GPT and append the exchange to the history.

        Args:
            text: Prompt text; ``None`` or an empty string is ignored.
            histories: List of ``{"role", "content"}`` message dicts; it is
                extended in place.

        Returns:
            The history list. It is returned unchanged when there is no
            prompt or when request_gpt yields no reply.
        """
        if not text:
            return histories

        content = request_gpt(text)
        if content is None:
            # A failed request must not add a None message to the chat.
            return histories

        # Append the user / assistant messages to the conversation history
        histories.append({"role": "user", "content": text})
        histories.append({"role": "assistant", "content": content})

        return histories

    def change_chatbot(histories, voice):
        """Synthesize speech for the most recent chat message.

        Args:
            histories: List of message dicts; only the last item's
                ``"content"`` is read.
            voice: Voice name forwarded to request_tts.

        Returns:
            The result of request_tts, or ``None`` when the history is empty,
            the last content is not a string, or nothing is left after
            cleaning.
        """
        if not histories:
            return None

        content = histories[-1]["content"]
        if not isinstance(content, str):
            return None

        # Keep only Hangul syllables, ASCII letters and digits, whitespace and
        # the characters "!", "." and ","; everything else is removed.
        pattern = r"[^가-힣a-zA-Z0-9\s!.,]"
        cleaned_content = re.sub(pattern, "", content)
        if not cleaned_content.strip():
            return None

        return request_tts(cleaned_content, voice)

    def change_voice(voice):
        """Echo the selected voice name back unchanged."""
        selected_voice = voice
        return selected_voice

    # UI layout
    with gr.Row():
        # Left: chatbot, prompt inputs, GPT speech output
        with gr.Column(scale=3):
            chatbot = gr.Chatbot(type="messages")
            with gr.Row():
                with gr.Column(scale=1):
                    prompt_audio = gr.Audio(
                        sources="microphone", label="질문", type="filepath"
                    )

                with gr.Column(scale=3):
                    # Not editable by the user; the event wiring below fills it
                    # with the transcript of prompt_audio.
                    prompt_textbox = gr.Textbox(
                        label="메시지 입력",
                        placeholder="여기에 메시지를 입력하세요.",
                        interactive=False,
                    )

                # with gr.Column(scale=1):
                #     send_gpt_button = gr.Button("전송")

            voice_dropdown = gr.Dropdown(
                choices=voice_list,
                label="음성 선택",
                value="ko-KR-SunHiNeural",
                type="value",
            )

            gpt_audio = gr.Audio(
                label="GPT 음성 출력", type="filepath", interactive=False, autoplay=True
            )
        # Right: STT, TTS
        with gr.Column(scale=1):
            # STT section
            with gr.Column():
                gr.Markdown("### STT")
                input_audio = gr.Audio(
                    sources="microphone", type="filepath", label="마이크 입력"
                )
                output_text = gr.Textbox(
                    label="음성 인식 결과",
                    placeholder="여기에 음성 인식 결과가 표시됩니다.",
                    interactive=False,
                )
            # TTS section
            with gr.Column():
                gr.Markdown("### TTS")
                tts_textbox = gr.Textbox(
                    label="텍스트 입력", placeholder="텍스트를 입력하세요."
                )
                send_tts_button = gr.Button("음성으로 변환")
                output_tts_audio = gr.Audio(
                    label="음성 출력", type="filepath", interactive=False, autoplay=True
                )

    # Event wiring
    # STT panel: transcribe the recording into the result textbox
    input_audio.change(change_audio, inputs=[input_audio], outputs=[output_text])
    # TTS panel: synthesize the typed text into the audio player
    send_tts_button.click(
        click_send_tts, inputs=[tts_textbox], outputs=[output_tts_audio]
    )
    # send_gpt_button.click(send_gpt, inputs=[prompt_textbox, chatbot], outputs=[chatbot, gpt_audio])

    # Chat pipeline, step 1: recorded question -> transcript in the prompt textbox
    prompt_audio.input(
        change_prompt_audio, inputs=[prompt_audio], outputs=[prompt_textbox]
    )

    # Step 2: prompt text change -> GPT reply appended to the chatbot history
    prompt_textbox.change(
        change_prompt_text, inputs=[prompt_textbox, chatbot], outputs=[chatbot]
    )

    # Step 3: chatbot change -> latest message spoken with the selected voice
    chatbot.change(
        change_chatbot, inputs=[chatbot, voice_dropdown], outputs=[gpt_audio]
    )

    # change_voice returns its input, so this writes the selection back to the dropdown
    voice_dropdown.change(
        change_voice, inputs=[voice_dropdown], outputs=[voice_dropdown]
    )

# Launch the Gradio app
demo.launch()

* Running on local URL:  http://127.0.0.1:7880

To create a public link, set `share=True` in `launch()`.


