In [2]:
import os
from xml.sax.saxutils import escape as xml_escape

import gradio as gr
import requests

In [1]:
# Service credentials are read from the environment so that real keys never
# have to be saved inside the notebook. '***' is only a placeholder that is
# used when the corresponding variable is unset or empty.
# SPEECH_API_KEY is sent as the 'Ocp-Apim-Subscription-Key' header of the
# speech requests; GPT_API_KEY is sent as the 'API-Key' header of the chat request.
SPEECH_API_KEY = os.environ.get('SPEECH_API_KEY') or '***'
GPT_API_KEY = os.environ.get('GPT_API_KEY') or '***'

def request_stt(file_path):
    """Transcribe a Korean WAV recording with the Azure Speech-to-Text REST API.

    Args:
        file_path: Path of the WAV file to upload. A falsy value (``None`` or
            ``''``) means there is nothing to transcribe.

    Returns:
        The recognized text, or ``None`` when no file was given, the service
        answers with a non-200 status, or the answer contains no recognized
        text.

    Raises:
        OSError: If the audio file cannot be read.
        requests.RequestException: If the HTTP request fails or times out.
    """
    if not file_path:
        return None

    ENDPOINT = 'https://koreacentral.stt.speech.microsoft.com/speech/recognition/conversation/cognitiveservices/v1'
    # `language` and `format` are query-string options, not HTTP headers, so
    # they are sent only as query parameters.
    params = {'language': 'ko-KR', 'format': 'detailed'}
    headers = {
        'Ocp-Apim-Subscription-Key': SPEECH_API_KEY,
        'Content-Type': 'audio/wav',
    }
    with open(file_path, 'rb') as f:
        audio = f.read()

    # A timeout keeps the UI callback from hanging forever on a stalled connection.
    response = requests.post(ENDPOINT, params=params, headers=headers, data=audio, timeout=30)
    if response.status_code != 200:
        return None

    result = response.json()
    if 'DisplayText' in result:
        return result['DisplayText']
    # A 200 answer may lack 'DisplayText' (nothing recognized); fall back to the
    # first detailed candidate if one is present instead of raising KeyError.
    candidates = result.get('NBest') or [{}]
    return candidates[0].get('Display')


def request_tts(text, speed, voice):
    """Synthesize speech for `text` with the Azure Text-to-Speech REST API.

    Args:
        text: Text to speak. Falsy text, or text starting with '⚠️' (the
            prefix used for error messages shown in the chat), is skipped.
        speed: Value placed in the SSML ``<prosody rate='...'>`` attribute.
        voice: Value placed in the SSML ``<voice name='...'>`` attribute.

    Returns:
        The path of the written MP3 file ('./resources/tmp.mp3'), or ``None``
        when the text is skipped or the service answers with a non-200 status.

    Raises:
        requests.RequestException: If the HTTP request fails or times out.
        OSError: If the audio file cannot be written.
    """
    if not text or text.startswith('⚠️'):
        return None

    ENDPOINT = 'https://koreacentral.tts.speech.microsoft.com/cognitiveservices/v1'
    headers = {
        'Ocp-Apim-Subscription-Key': SPEECH_API_KEY,
        'Content-Type': 'application/ssml+xml',
        'X-Microsoft-OutputFormat': 'audio-16khz-128kbitrate-mono-mp3',
    }
    # Escape XML metacharacters so text such as "a < b & c" cannot break the
    # SSML document.
    ssml = f"""
    <speak version='1.0' xml:lang='ko-KR'>
        <voice xml:lang='ko-KR' xml:gender='Female' name='{voice}'>
            <prosody rate='{speed}'>
                {xml_escape(text)}
            </prosody>
        </voice>
    </speak>
    """

    # Send UTF-8 bytes explicitly rather than relying on how the HTTP stack
    # encodes a str body; the timeout keeps the UI callback from hanging forever.
    response = requests.post(ENDPOINT, headers=headers, data=ssml.encode('utf-8'), timeout=30)
    if response.status_code != 200:
        return None

    file_path = './resources/tmp.mp3'
    # The output directory may not exist yet on a fresh checkout.
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    with open(file_path, 'wb') as f:
        f.write(response.content)
    return file_path


def request_gpt(history):
    """Ask the Azure OpenAI chat deployment to answer the latest message.

    The conversation is sent after a fixed Korean system instruction that asks
    for answers suitable for reading aloud (no emoji or markdown).

    Args:
        history: Conversation as a list of dicts with 'role' and 'content'
            keys. The list is modified in place.

    Returns:
        The same `history` object:
        - unchanged when it is empty (or falsy);
        - with its last entry removed when that entry has empty content;
        - otherwise with one assistant entry appended: the model's message on
          HTTP 200, or a message starting with '⚠️' when the request fails.
    """
    if not history:
        return history
    if not history[-1]['content']:
        history.pop()
        return history

    ENDPOINT = 'https://fimtrus-openai3.openai.azure.com/openai/deployments/gpt-4o/chat/completions?api-version=2024-10-21'
    SYSTEM_INSTRUCTION = {
        'role': 'system',
        'content': '당신은 유용한 정보를 제공할 수 있는 AI 음성 비서입니다. '
                   '이모지나 마크다운 등을 사용하지 말고, 소리내서 발음할 수 있도록 답하세요.'
    }
    headers = {
        'Content-Type': 'application/json',
        'API-Key': GPT_API_KEY
    }
    # Forward only the role/content pair of each entry: entries coming back
    # from the chat UI may carry extra UI-only keys that are not part of the
    # chat-completions message schema.
    messages = [{'role': entry['role'], 'content': entry['content']} for entry in history]
    json_data = {
        'messages': [SYSTEM_INSTRUCTION] + messages,
        'temperature': 0.7,
        'top_p': 0.9,
        'max_tokens': 800
    }

    try:
        # The timeout keeps the UI callback from hanging forever.
        response = requests.post(ENDPOINT, headers=headers, json=json_data, timeout=60)
    except requests.RequestException as error:
        # Report network failures the same way HTTP errors are reported below,
        # instead of crashing the UI callback.
        history.append({
            'role': 'assistant',
            'content': f'⚠️ 문제가 발생했습니다: {error}'
        })
        return history

    if response.status_code == 200:
        response_message = response.json()['choices'][0]['message']
    else:
        response_message = {
            'role': 'assistant',
            'content': f'⚠️ 문제가 발생했습니다: [{response.status_code}] {response.reason}'
        }
    history.append(response_message)
    return history

In [86]:
# Initial (empty) conversation passed to the Chatbot component as its starting value.
history = list()

def create_user_message(message, history):
    """Record `message` as a user turn at the end of `history`.

    The list is extended in place and the same list object is returned.
    """
    user_turn = dict(role='user', content=message)
    history.append(user_turn)
    return history


def get_last_message_content(history, *args):
    """Return a tuple of the last message's content followed by `args`.

    When `history` is empty (or falsy), a fixed Korean sample sentence is used
    as the content instead.
    """
    if history:
        content = history[-1]['content']
    else:
        content = '테스트 문장입니다.'
    return (content, *args)


# Voice names offered in the voice dropdown.
VOICE_CHOICES = [
    'ko-KR-SunHiNeural',
    'ko-KR-InJoonNeural',
    'ko-KR-HyunsuMultilingualNeural',
    'ko-KR-BongJinNeural',
    'ko-KR-GookMinNeural',
    'ko-KR-HyunsuNeural',
    'ko-KR-JiMinNeural',
    'ko-KR-SeoHyeonNeural',
    'ko-KR-SoonBokNeural',
    'ko-KR-YuJinNeural',
]

with gr.Blocks() as demo:
    gr.Markdown('# 🧠 STT-GPT-TTS')

    # Chat transcript column (scale=3) next to the speech-settings column.
    with gr.Row():
        with gr.Column(scale=3):
            chatbot = gr.Chatbot(history, type='messages')

        with gr.Column():
            voice_dropdown = gr.Dropdown(label='🗣️ 음성 선택', choices=VOICE_CHOICES, interactive=True)
            speed_slider = gr.Slider(
                label='⏩ 발화 속도', minimum=0.1, maximum=2.0, value=1.0, interactive=True
            )
            tts_output = gr.Audio(
                label='합성 음성', type='filepath', interactive=False, autoplay=True
            )

    text_input = gr.Textbox(
        placeholder='메시지를 입력하거나 음성을 녹음해보세요.', show_label=False, submit_btn=True
    )
    mic_input = gr.Microphone(show_label=False, type='filepath')

    # One handler, bound once, for every event that (re)synthesizes speech.
    # It receives (history, speed, voice) and speaks the last message's content.
    speak_last_message = lambda *args: request_tts(*get_last_message_content(*args))
    tts_inputs = [chatbot, speed_slider, voice_dropdown]
    tts_outputs = [tts_output]

    # Submit pipeline: add the user turn -> get the reply -> clear both inputs
    # -> speak the last message.
    user_turn_added = text_input.submit(
        create_user_message, inputs=[text_input, chatbot], outputs=[chatbot]
    )
    reply_added = user_turn_added.then(request_gpt, inputs=[chatbot], outputs=[chatbot])
    inputs_cleared = reply_added.then(lambda: (None, None), outputs=[text_input, mic_input])
    inputs_cleared.then(speak_last_message, inputs=tts_inputs, outputs=tts_outputs)

    # A microphone recording is transcribed into the text box.
    mic_input.input(request_stt, inputs=[mic_input], outputs=[text_input])

    # Changing the speed or the voice re-synthesizes the last message.
    speed_slider.change(speak_last_message, inputs=tts_inputs, outputs=tts_outputs)
    voice_dropdown.change(speak_last_message, inputs=tts_inputs, outputs=tts_outputs)


demo.launch()

* Running on local URL:  http://127.0.0.1:7919

To create a public link, set `share=True` in `launch()`.


