In [1]:
!pip install SpeechRecognition



In [2]:
!pip install pygame



In [3]:
import pygame

pygame 2.6.1 (SDL 2.28.4, Python 3.11.11)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [4]:
import gradio as gr
import speech_recognition as sr
import os
import io
import openai
import anthropic
import asyncio
import re
import pygame
import threading
import queue
import time

from google import genai
from google.genai import types
from dotenv import load_dotenv
from io import BytesIO
from pydub import AudioSegment
from pydub.playback import play
from gtts import gTTS
from IPython.display import Audio


In [5]:
class Talker:
    """Speaks streamed text aloud, sentence by sentence, on a background thread.

    Fragments passed to ``play`` are accumulated in ``buffer``. Whenever the
    buffer contains sentence-ending punctuation (``.``, ``!`` or ``?``) everything
    up to the last such mark is put on ``queue``. A daemon worker thread takes
    queued text, synthesizes speech and plays it. ``_run_openai`` is the worker
    started by ``__init__``; ``_run_gTTS`` is an alternative worker that is not
    wired in by default.
    """

    def __init__(self):
        """Create the queue and buffer, then start the playback worker thread."""
        self.lang = 'ru'  # language code read by the gTTS worker
        self.queue = queue.Queue()
        self.pattern = re.compile(r"[.!?]+")
        self.buffer = ''
        self.thread = threading.Thread(target=self._run_openai, daemon=True)
        self.thread.start()

    def _detect_language_by_first_letter(self, text):
        """Set ``self.lang`` to 'ru' or 'en' from the first non-blank character.

        Leaves ``self.lang`` unchanged when the fragment is blank or starts with
        a character that is neither Cyrillic nor a basic Latin letter.
        """
        stripped = text.strip()
        if not stripped:
            # Streamed fragments are often pure whitespace ("\n", " ");
            # indexing [0] on them would raise IndexError.
            return

        first_char = stripped[0]
        if 'А' <= first_char <= 'я' or first_char in 'Ёё':
            self.lang = 'ru'
        elif 'A' <= first_char <= 'Z' or 'a' <= first_char <= 'z':
            self.lang = 'en'

    def play(self, text):
        """Feed one streamed fragment; queue complete sentences for playback.

        Args:
            text: The next text fragment. A falsy value (``None`` or ``''``)
                means "no more text for now" and flushes whatever is buffered.
        """
        if text:
            self._detect_language_by_first_letter(text)
            self.buffer += text
            parts = list(self.pattern.finditer(self.buffer))

            if parts:
                # Queue everything up to the last sentence terminator and keep
                # the unfinished remainder for the next fragment.
                ind = parts[-1].end()
                self.queue.put(self.buffer[:ind])
                self.buffer = self.buffer[ind:]

        elif self.buffer:
            # Flush: hand the tail over exactly once. Clearing the buffer stops
            # the same text from being spoken again on the next flush or being
            # prepended to the next answer.
            pending, self.buffer = self.buffer, ''
            if pending.strip():
                # Whitespace-only tails carry nothing to synthesize.
                self.queue.put(pending)

    def _run_openai(self):
        """Worker loop: synthesize queued text with OpenAI TTS and play it.

        Runs until ``None`` is received from the queue.
        """
        while True:
            text = self.queue.get()
            if text is None:
                break

            response = openai.audio.speech.create(
                model="tts-1",
                voice="alloy",
                input=text
            )

            audio_stream = BytesIO(response.content)
            audio = AudioSegment.from_file(audio_stream, format="mp3")
            # `play` here is the module-level pydub function, not the method above.
            play(audio)

    def _run_gTTS(self):
        """Alternative worker loop: synthesize with gTTS and play via pygame.

        Uses ``self.lang`` as it is when the text is taken from the queue.
        Runs until ``None`` is received from the queue.
        """
        pygame.mixer.init()
        while True:
            text = self.queue.get()
            if text is None:
                break  # For clean shutdown

            # Generate TTS audio
            tts = gTTS(text=text, lang=self.lang)
            audio_buffer = io.BytesIO()
            tts.write_to_fp(audio_buffer)
            audio_buffer.seek(0)

            # Play audio
            pygame.mixer.music.load(audio_buffer)
            pygame.mixer.music.play()

            # Poll until the mixer reports that playback has finished.
            while pygame.mixer.music.get_busy():
                pygame.time.wait(100)

    def stop(self):
        """Ask the worker to exit (``None`` sentinel) and wait for it to finish."""
        self.queue.put(None)
        self.thread.join()

# Module-level speaker used by the stream_* functions; constructing it starts
# the background playback thread.
talker = Talker()
# Manual smoke test with a Russian sample phrase - uncomment to hear it:
# talker.play('Расскажи-ка нам об индукции!')


In [6]:
def transcribe(audio, lang):
    """Convert a recorded audio file to text with Google speech recognition.

    Args:
        audio: Path of the recorded audio file; a falsy value means no recording.
        lang: Recognition language tag, e.g. 'en-US' or 'ru-RU'.

    Returns:
        The recognized text, a localized apology message when recognition
        fails for any reason, or '' when there is no audio.
    """
    if not audio:
        return ''

    try:
        recognizer = sr.Recognizer()
        with sr.AudioFile(audio) as source:
            recording = recognizer.record(source)
            recognized = recognizer.recognize_google(recording, language=lang)
        print(recognized)
    except Exception as err:
        # Any failure (unreadable file, service error, unintelligible speech)
        # is reported to the user as text in the requested language.
        if lang == 'en-US':
            recognized = "Sorry, failed to recognize"
        else:
            recognized = 'Извините, речь не распознана.'
        print(err)

    return recognized or ''


# Load API keys from a .env file; override=True lets values from .env replace
# variables that are already set in the process environment.
load_dotenv(override=True)
openai_api_key = os.getenv('OPENAI_API_KEY')

# Report only whether the key was found. Cell output is saved with the notebook,
# so no part of the key is echoed.
if openai_api_key:
    print('OPENAI_API_KEY loaded')
else:
    print('Failed to load OPENAI_API_KEY')


# System prompt (Russian): "You are a helpful technical assistant for engineers.
# You give clear, concise answers. If you don't know the answer, you say so."
SYSTEM_MSG = 'Ты полезный технический помощник для инженеров. Ты даёшь чёткие краткие ответы. Если ты не знаешь ответ, то так и говоришь, что не знаешь.'

# One client per backend; gpt and claude read their keys from the environment.
gpt = openai.OpenAI()
claude = anthropic.Anthropic()
gemini = genai.Client(api_key=os.getenv('GOOGLE_API_KEY'))
# A single chat session created once for the whole notebook.
gemini_chat = gemini.chats.create(
    model='gemini-2.5-flash',
    config=types.GenerateContentConfig(system_instruction=SYSTEM_MSG)
)
# openai.OpenAI falls back to OPENAI_API_KEY when api_key is None, which would
# send the OpenAI key to api.deepseek.com if DEEPSEEK_API_KEY is not set.
# A placeholder makes such requests fail with an auth error instead.
deepseek = openai.OpenAI(
    api_key=os.getenv('DEEPSEEK_API_KEY') or 'DEEPSEEK_API_KEY-not-set',
    base_url="https://api.deepseek.com"
)
# Local Ollama server through its OpenAI-compatible endpoint.
ollama = openai.OpenAI(api_key='ollama', base_url="http://localhost:11434/v1")


def stream_gpt(history):
    """Stream a gpt-4o-mini reply, speaking it and growing the chat history.

    Args:
        history: List of {'role', 'content'} messages ending with the user's
            turn. It is modified in place: an assistant message is appended
            and updated as text arrives.

    Yields:
        The same ``history`` list after each streamed chunk.
    """
    request_messages = [{'role': 'system', 'content': SYSTEM_MSG}, *history]
    chunks = gpt.chat.completions.create(
        model='gpt-4o-mini',
        messages=request_messages,
        stream=True,
    )

    reply = ''
    history.append({'role': 'assistant', 'content': reply})
    for chunk in chunks:
        piece = chunk.choices[0].delta.content
        # The piece is passed to the speaker as-is, including None.
        talker.play(piece)
        reply += piece or ''
        history[-1] = {'role': 'assistant', 'content': reply}
        yield history


def stream_claude(history):
    """Stream a Claude reply, speaking it and growing the chat history.

    Args:
        history: List of {'role', 'content'} messages ending with the user's
            turn. It is modified in place: an assistant message is appended
            and updated as text arrives.

    Yields:
        The same ``history`` list after each streamed text delta.
    """
    response = claude.messages.stream(
        model='claude-3-haiku-20240307',
        max_tokens=500,
        system=SYSTEM_MSG,
        messages=history
    )

    result = ''
    history += [{'role': 'assistant', 'content': result}]
    text = None
    with response as stream:
        for text in stream.text_stream:
            result += text or ''
            talker.play(text)
            history[-1] = {'role': 'assistant', 'content': result}
            yield history

    # The speaker only flushes its buffered tail when it receives an empty
    # fragment. If the last delta carried text, no such fragment was sent, so
    # signal end-of-reply explicitly; otherwise the final words are never spoken.
    if text:
        talker.play(None)


def stream_gemini(history):
    """Stream a Gemini reply, speaking it and growing the chat history.

    Only the latest user message is sent; earlier turns are not taken from
    ``history`` but from the module-level ``gemini_chat`` session.
    NOTE(review): that session is created once and nothing visible here resets
    it - confirm this is intended when the UI chat is cleared.

    Args:
        history: List of {'role', 'content'} messages ending with the user's
            turn. It is modified in place: an assistant message is appended
            and updated as text arrives.

    Yields:
        The same ``history`` list after each streamed chunk.
    """
    user_msg = history[-1]['content']
    response = gemini_chat.send_message_stream(user_msg)
    result = ''
    history += [{'role': 'assistant', 'content': result}]

    # Debug output: last entry the chat session currently holds.
    chat_history = gemini_chat.get_history()
    if len(chat_history) > 0:
        print(chat_history[-1])

    chunk_text = None
    for chunk in response:
        chunk_text = chunk.text  # read once per chunk
        result += chunk_text or ''
        talker.play(chunk_text)
        history[-1] = {'role': 'assistant', 'content': result}
        yield history

    # The speaker only flushes its buffered tail when it receives an empty
    # fragment. If the last chunk carried text, no such fragment was sent, so
    # signal end-of-reply explicitly; otherwise the final words are never spoken.
    if chunk_text:
        talker.play(None)


def stream_deepseek(history):
    """Stream a deepseek-chat reply, speaking it and growing the chat history.

    Args:
        history: List of {'role', 'content'} messages ending with the user's
            turn. It is modified in place: an assistant message is appended
            and updated as text arrives.

    Yields:
        The same ``history`` list after each streamed chunk.
    """
    system_turn = {'role': 'system', 'content': SYSTEM_MSG}
    completion = deepseek.chat.completions.create(
        model='deepseek-chat',
        messages=[system_turn] + history,
        stream=True,
    )

    answer = ''
    history.append({'role': 'assistant', 'content': answer})
    for part in completion:
        fragment = part.choices[0].delta.content
        # The fragment is passed to the speaker as-is, including None.
        talker.play(fragment)
        answer += fragment or ''
        history[-1] = {'role': 'assistant', 'content': answer}
        yield history


def stream_ollama(history):
    """Stream a llama3.2 reply from Ollama, speaking it and growing the history.

    Args:
        history: List of {'role', 'content'} messages ending with the user's
            turn. It is modified in place: an assistant message is appended
            and updated as text arrives.

    Yields:
        The same ``history`` list after each streamed chunk.
    """
    prompt = [{'role': 'system', 'content': SYSTEM_MSG}]
    prompt.extend(history)
    events = ollama.chat.completions.create(model='llama3.2', messages=prompt, stream=True)

    spoken = ''
    history.append({'role': 'assistant', 'content': spoken})
    for event in events:
        token = event.choices[0].delta.content
        # The token is passed to the speaker as-is, including None.
        talker.play(token)
        spoken += token or ''
        history[-1] = {'role': 'assistant', 'content': spoken}
        yield history
        

def ask_ai_model(model, history):
    """Route the conversation to the selected backend and stream its reply.

    Args:
        model: Backend name as offered in the UI dropdown
            ('gpt', 'Claude', 'Gemini', 'Deepseek' or 'ollama').
        history: Chat messages; only the 'role' and 'content' keys of each
            entry are forwarded, on a fresh list.

    Yields:
        The updated message list after each streamed chunk. For an unknown
        model name, a single list ending with a "not implemented" reply.
    """
    clean_history = []
    for entry in history:
        clean_history.append({'role': entry['role'], 'content': entry['content']})

    if model == 'gpt':
        yield from stream_gpt(clean_history)
    elif model == 'Claude':
        yield from stream_claude(clean_history)
    elif model == 'Gemini':
        yield from stream_gemini(clean_history)
    elif model == 'Deepseek':
        yield from stream_deepseek(clean_history)
    elif model == 'ollama':
        yield from stream_ollama(clean_history)
    else:
        notice = f'{model}: not implemented yet.'
        clean_history.append({'role': 'assistant', 'content': notice})
        yield clean_history


def entry_submit(message, history):
    """Record the submitted text as a user turn.

    Args:
        message: Text taken from the entry box.
        history: Current chat messages; the new user turn is appended in place.

    Returns:
        A pair of ('', history): the empty string clears the entry box and the
        list is the updated chat.
    """
    user_turn = {'role': 'user', 'content': message}
    history.append(user_turn)
    return "", history


# Chat UI: message list, text/voice input, model and language pickers, Clear button.
with gr.Blocks() as ui:
    with gr.Row():
        chatbot = gr.Chatbot(label="Chat", height=300, type="messages")
    with gr.Row():
        entry = gr.Textbox(label="Chat with our AI Assistant", value="что такое фазовые конденсаторы")
        audio_entry = gr.Audio(label="Voice input", sources="microphone", type="filepath")
    with gr.Row():
        model = gr.Dropdown(label = "Choose a model", value='gpt', choices=['gpt', 'Claude', 'Gemini', 'ollama', 'Deepseek'])
        language = gr.Dropdown(label="Choose a language for speech recognition", value='ru-RU', choices=['en-US', 'ru-RU'])
    with gr.Row():
        clear_button = gr.Button("Clear")

    # Submitting text first records the user turn and clears the box,
    # then streams the selected model's reply into the chat.
    entry.submit(entry_submit, inputs=[entry, chatbot], outputs=[entry, chatbot]).then(
        ask_ai_model, inputs=[model, chatbot], outputs=[chatbot])

    # A finished recording is transcribed into the text box; the user submits it from there.
    audio_entry.input(transcribe, inputs=[audio_entry, language], outputs=[entry])

    # Reset the text box to '' and the chat to an empty message list
    # (the chatbot uses type="messages", so its value is a list, not a string).
    clear_button.click(lambda: ('', []), inputs=None, outputs=[entry, chatbot], queue=False)


ui.launch(inbrowser=False)

sk-proj
* Running on local URL:  http://127.0.0.1:7862

To create a public link, set `share=True` in `launch()`.




Input #0, wav, from '/var/folders/ny/bjrptr9d4f9c3mmxj10zzcl40000gp/T/tmpec17akyq.wav':
  Duration: 00:00:10.82, bitrate: 384 kb/s
  Stream #0:0: Audio: pcm_s16le ([1][0][0][0] / 0x0001), 24000 Hz, 1 channels, s16, 384 kb/s
  10.70 M-A:  0.000 fd=   0 aq=    0KB vq=    0KB sq=    0B 




Input #0, wav, from '/var/folders/ny/bjrptr9d4f9c3mmxj10zzcl40000gp/T/tmp7u1i8qh9.wav':
  Duration: 00:00:09.02, bitrate: 384 kb/s
  Stream #0:0: Audio: pcm_s16le ([1][0][0][0] / 0x0001), 24000 Hz, 1 channels, s16, 384 kb/s
   8.92 M-A:  0.000 fd=   0 aq=    0KB vq=    0KB sq=    0B 




Input #0, wav, from '/var/folders/ny/bjrptr9d4f9c3mmxj10zzcl40000gp/T/tmp283wbem2.wav':
  Duration: 00:00:04.27, bitrate: 384 kb/s
  Stream #0:0: Audio: pcm_s16le ([1][0][0][0] / 0x0001), 24000 Hz, 1 channels, s16, 384 kb/s
   4.24 M-A:  0.000 fd=   0 aq=    0KB vq=    0KB sq=    0B 


