In [None]:
# Install the Kinyarwanda Text-to-Speech (TTS) package that the TTS cells import.
# `-e` installs it from the local Drive directory in editable (development) mode,
# so changes to the package source take effect without reinstalling.
# NOTE(review): assumes Google Drive is already mounted at /content/drive — confirm a mount step runs first.
!pip install -e /content/drive/MyDrive/Kin-Assistant/Inference/


# Handling monotonic_align dependency issues

In [None]:
# Install specific version of NumPy to ensure compatibility with monotonic_align
# This prevents version conflicts that can break the alignment module
!pip install --force-reinstall "numpy<2.1.0,>1.26.0"

# Install Cython which is required for compiling the monotonic_align C extension
# The --force-install flag ensures any existing installation is overwritten
!pip install --force-install Cython

In [None]:
# Make sure both Cython and NumPy are present before the extension is built.
# Cython is needed to compile the C extension; NumPy provides the array
# operations required by the TTS model.
# No version is requested here, so an already-installed copy that satisfies
# the request is left as it is.
!pip install Cython numpy

In [None]:
# Change directory to the monotonic_align module location.
# %cd (rather than !cd) is used so the new working directory persists for the commands below.
%cd /content/drive/MyDrive/Kin-Assistant/Inference/monotonic_align

# Create a nested monotonic_align/ directory (relative to the directory entered above)
# if it doesn't exist. This directory will contain the compiled C extension.
!mkdir -p monotonic_align

# Compile the C extension using Cython.
# The --inplace flag builds the extension in the source directory.
# This is required for the duration alignment algorithm in the TTS system.
!python setup.py build_ext --inplace

In [None]:
# Change directory to the monotonic_align module (same location as the build step,
# repeated so this cell also works when run on its own).
%cd /content/drive/MyDrive/Kin-Assistant/Inference/monotonic_align

# Install the monotonic_align package from the current directory in development mode.
# This makes the compiled module importable by name.
# The -e flag (editable mode) enables changes without reinstallation.
!pip install -e .

In [None]:
# Smoke test: confirm that monotonic_align was installed and can be imported.
# An ImportError here means the build/install cells above need attention
# before continuing with the TTS system.
import monotonic_align
print("Successfully imported!")

# STT Setup


In [None]:
# Upgrade pip to the latest version to avoid compatibility issues.
# --no-cache-dir ensures a clean installation without using cached packages.
%pip install --no-cache-dir --upgrade pip

# Install Cython, which is required for the Speech-to-Text (STT) compilation.
%pip install --no-cache-dir cython

# Install all STT dependencies from the requirements file.
# This includes packages necessary for the NVIDIA NeMo framework.
%pip install --no-cache-dir -r /content/drive/MyDrive/Kin-Assistant/stt/requirements.txt

# Install system packages (commented out, uncomment if needed).
# This would install the system dependencies listed in packages.txt.
# Note: apt-get is a shell command, so it needs the `!` prefix (there is no %apt-get magic).
# !apt-get update && cat /content/drive/MyDrive/Kin-Assistant/stt/packages.txt | xargs apt-get install -y

# Install a specific version of Google Protobuf.
# Version 4.21.1 is pinned for compatibility with the NeMo models.
%pip install protobuf==4.21.1

In [None]:
# Install system-level audio processing dependencies for the STT pipeline:
# - sox: Sound eXchange, a command-line utility for audio processing
# - libsox-fmt-all: additional format libraries so sox can handle various audio formats
# `apt-get update` runs first so the package index is current; `-y` answers prompts automatically.
!apt-get update && apt-get install -y sox libsox-fmt-all

In [None]:
# Authenticate with Hugging Face Hub using the stored token.
# This allows downloading models from Hugging Face repositories.
from google.colab import userdata
from huggingface_hub import login

# Retrieve the Hugging Face token from Colab secrets (stored under the name 'HF_TOKEN'),
# so the token itself never appears in the notebook source.
hf_token = userdata.get('HF_TOKEN')

# Log in to Hugging Face Hub with the token.
# add_to_git_credential=True allows git operations with Hugging Face repositories.
login(token=hf_token, add_to_git_credential=True)

In [None]:
# Install Gradio, a Python library for building web UIs around ML models.
# It provides the web interface of the voice assistant, through which the
# STT and TTS components are used interactively.
!pip install gradio

In [None]:
### Handle numpy version compatibility issues
# Install a specific version of NumPy (1.24.3) that is known to work with the STT model.
# This prevents "RuntimeError: Numpy is not available" errors that can occur with certain versions.
# NOTE(review): this pin contradicts the earlier "numpy<2.1.0,>1.26.0" pin used for monotonic_align —
# confirm which version the final environment should end up with.
!pip install numpy==1.24.3

In [None]:
### Main Function of my system.
from __future__ import print_function, division

import gradio as gr
import os
import re
import torch
import torchaudio
from typing import Tuple
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import time

# -------------------- TTS Setup (Your Existing Code) --------------------
from kinyatts.tts.commons import intersperse
from kinyatts.tts.utils import get_hparams_from_file, load_checkpoint
from kinyatts.tts.models import SynthesizerTrn
from kinyatts.tts.text import text_to_sequence
from kinyatts.tts.text.symbols import symbols

import os
import nemo.collections.asr as nemo_asr
from pydub import AudioSegment
import pyaudioconvert as pac
import numpy as np
import time as time
import uuid

# Load the pretrained Kinyarwanda speech-to-text model through NeMo's from_pretrained.
# The resulting hf_model is the module-level object used by Transcriber.transcribe.
hf_model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained(model_name="mbazaNLP/Kinyarwanda_nemo_stt_conformer_model")

# TTS global state: (device, tts_model, tts_hps, louder_vol).
# Starts as four Nones and is filled in by kinya_tts_setup().
inference_engine = (None, None, None, None)

# Create the working directories: recorded input audio goes to /content/sounds,
# generated speech goes to /content/outputs.
os.makedirs("/content/sounds", exist_ok=True)
os.makedirs("/content/outputs", exist_ok=True)

def kinya_tts_setup():
    """Build the Kinyarwanda TTS model and publish it via ``inference_engine``.

    The function selects ``cuda:0`` when CUDA is available (CPU otherwise),
    reads the hyper-parameters from the JSON config, instantiates
    ``SynthesizerTrn``, switches it to eval mode and loads the checkpoint.
    The module-level ``inference_engine`` tuple is then replaced with
    ``(device, tts_model, tts_hps, louder_vol)``.
    """
    global inference_engine

    config_file = '/content/drive/MyDrive/Kin-Assistant/Inference/kinyatts/ms_ktjw_istft_vits2_base.json'
    checkpoint_file = '/content/drive/MyDrive/Kin-Assistant/TTS_MODEL_ms_ktjw_istft_vits2_base_1M.pt'

    device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
    tts_hps = get_hparams_from_file(config_file)

    # The posterior encoder input width depends on whether the config asks for
    # mel spectrograms (fixed 80 channels) or linear ones (filter_length // 2 + 1).
    mel_requested = (
        "use_mel_posterior_encoder" in tts_hps.model.keys()
        and tts_hps.model.use_mel_posterior_encoder == True
    )
    if not mel_requested:
        print("Using lin posterior encoder for VITS1")
        posterior_channels = tts_hps.data.filter_length // 2 + 1
        tts_hps.data.use_mel_posterior_encoder = False
    else:
        print("Using mel posterior encoder for VITS2")
        posterior_channels = 80  # vits2
        tts_hps.data.use_mel_posterior_encoder = True

    tts_model = SynthesizerTrn(
        len(symbols),
        posterior_channels,
        tts_hps.train.segment_size // tts_hps.data.hop_length,
        n_speakers=tts_hps.data.n_speakers,  # > 0 for multi speaker
        **tts_hps.model
    ).to(device)
    tts_model.eval()
    # Third argument is None, as in the original call — presumably the optimizer slot; confirm.
    load_checkpoint(checkpoint_file, tts_model, None)

    # Amplitude gain of 3.0 applied to every generated clip by kinya_tts.
    louder_vol = torchaudio.transforms.Vol(gain=3.0, gain_type="amplitude")

    inference_engine = (device, tts_model, tts_hps, louder_vol)

    print('TTS API engine ready!', flush=True)

# -------------------- Transcriber Setup (Your Existing Code) --------------------
class Transcriber:
    """Save raw audio bytes as a WAV file, convert it, and transcribe it.

    Constructing an instance runs the whole pipeline immediately; the result
    is available afterwards as ``self.transcription``.
    """

    def __init__(self, audio_bytes : bytes) -> None:
        self.audio_bytes = audio_bytes
        self.save_audio()                       # writes the WAV file and sets self.file_id
        self.convert_wav_to_16bit_mono()        # converts that file in place
        self.transcription = self.transcribe()  # model output, or an error string

    def _wav_path(self):
        """Return the path of the WAV file that belongs to ``self.file_id``."""
        return f"/content/sounds/sound-{self.file_id}.wav"

    def save_audio(self):
        """Write ``self.audio_bytes`` to ``/content/sounds/sound-<file_id>.wav``.

        ``file_id`` is the number of entries currently in ``/content/sounds/``.
        """
        self.file_id = len(os.listdir('/content/sounds/'))
        with open(self._wav_path(), "wb") as wav_out:
            wav_out.write(self.audio_bytes)

    def convert_wav_to_16bit_mono(self):
        """Convert the saved file in place; return False if the file is missing."""
        try:
            target = self._wav_path()
            # Same path for input and output: the file is overwritten.
            pac.convert_wav_to_16bit_mono(target, target)
        except FileNotFoundError:
            return False
        return True

    def transcribe(self):
        """Return the model's first result for the saved file, or an error string."""
        try:
            return hf_model.transcribe([self._wav_path()])[0]
        except FileNotFoundError:
            return "Unable to transcribe audio!"


In [None]:
# -------------------- Text-to-Speech Conversation Logic --------------------

def get_text(text, hps):
    """
    Turn input text into the ID tensor consumed by the TTS model.

    Args:
        text (str): Input text in Kinyarwanda
        hps: Model hyperparameters; only ``hps.data.add_blank`` is read here

    Returns:
        torch.LongTensor: The IDs produced by ``text_to_sequence``, with a 0
        interspersed between them when ``hps.data.add_blank`` is truthy
    """
    token_ids = text_to_sequence(text)
    # Blank (0) tokens are only inserted when the model configuration asks for them.
    token_ids = intersperse(token_ids, 0) if hps.data.add_blank else token_ids
    return torch.LongTensor(token_ids)

def kinya_tts(inputstr, output_folder='/content/outputs') -> Tuple[str, float]:
    """
    Generate Kinyarwanda speech from text and save it as a WAV file.

    This function:
    1. Removes bracket characters from the input text
    2. Converts the text to an ID tensor (see get_text)
    3. Runs inference using the TTS model
    4. Amplifies the audio and saves it under a unique file name

    Args:
        inputstr (str): Input text in Kinyarwanda
        output_folder (str): Directory to save the generated audio; it is
            created if it does not exist yet

    Returns:
        Tuple[str, float]: Path to the generated audio file and its duration in seconds

    Raises:
        RuntimeError: If kinya_tts_setup() has not populated inference_engine yet
    """
    (device, tts_model, tts_hps, louder_vol) = inference_engine

    # inference_engine starts out as four Nones; fail with a clear message
    # instead of an AttributeError on None further down.
    if tts_model is None:
        raise RuntimeError("TTS engine is not initialised; run kinya_tts_setup() before calling kinya_tts().")

    # Remove round, square and curly brackets from the text
    fltstr = re.sub(r"[\[\](){}]", "", inputstr)

    # Convert text to the model's input IDs
    stn_tst = get_text(fltstr, tts_hps)

    # speed < 1 yields length_scale > 1 (presumably slightly slower speech — confirm with the model)
    speed = 0.97

    with torch.no_grad():  # Disable gradient calculation for inference
        x_tst = stn_tst.to(device).unsqueeze(0)  # Add batch dimension
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)

        # noise_scale / noise_scale_w / length_scale are passed straight to the
        # model's infer(); [0][0, 0] selects one 1-D waveform from its first output.
        audio = tts_model.infer(
            x_tst,
            x_tst_lengths,
            noise_scale=.667,
            noise_scale_w=0.8,
            length_scale=1/speed
        )[0][0, 0].data.cpu().float()

    # Duration in seconds = number of samples / sampling rate
    audio_seconds = audio.size(0) / tts_hps.data.sampling_rate

    # Increase volume; unsqueeze adds the leading dimension expected when saving
    audio = louder_vol(audio.unsqueeze(0))

    # A custom output_folder may not exist yet (only the default is created at startup)
    os.makedirs(output_folder, exist_ok=True)

    # Unique file name (UUID) so earlier outputs are never overwritten
    unique_id = uuid.uuid4().hex
    output_wav_file = f"{output_folder}/output_{unique_id}.wav"

    torchaudio.save(output_wav_file, audio, tts_hps.data.sampling_rate)

    return output_wav_file, audio_seconds

In [None]:
### NLP logic to return and answer questions.
import random
import string

# Intent table: intent name -> {"patterns": [...], "responses": [...]}.
# - "patterns" are phrases that match_intent looks for (as substrings) in the normalized user input.
# - "responses" are candidate replies; get_response picks one at random.
# - "default" has no patterns: it is the fallback intent returned when nothing matches.
INTENTS = {
    "greeting": {
        "patterns": ["muraho", "hello", "salut", "mwiriwe", "wiriweho"],
        "responses": [
            "Muraho neza! Nishimiye kukubona hano. Ufite ikibazo ushaka kumbaza?",
            "Mwiriwe neza! Umeze neza?",
            "Wiriweho neza! Nishimiye kuganira nawe."
        ]
    },
    "ask_owner": {
        "patterns": ["uri nde?", "wakozwe nande?", "uturuka he?"],
        "responses": [
            "Ndi gacurabwenge umuhanga w'abanyarwanda"
        ]
    },
    "ask_news": {
        "patterns": ["amakuru", "amakuru yawe", "amakuru y'umunsi", "amakuru y'icyumweru", "amakuru ya mugitondo", "amakuru ya nijoro"],
        "responses": [
            "Amakuru ni meza cyane, urakoze kubaza!",
            "Umunsi wagenze neza, ndashimira Imana. Wowe se?",
            "Icyumweru cyagenze neza rwose, wowe ho?",
            "Mugitondo wagenze neza cyane! Wowe uko byagenze?",
            "Nijoro wagenze neza cyane, ndabashimiye!",
        ]
    },
    "gratitude": {
        "patterns": ["urakoze", "murakoze", "murakoze cyane", "ndashimira", "turabashimira"],
        "responses": [
            "Turagushimiye cyane nawe!",
            "Murakoze cyane! Imana ibahe umugisha.",
            "Twishimiye ko dukorana neza. Urakoze cyane!"
        ]
    },
    "help_request": {
        "patterns": ["wamfasha", "mfite ikibazo", "ndifuza ubufasha", "ushobora kungira inama"],
        "responses": [
            "Yego rwose, mbwira icyo ukeneye!",
            "Nditeguye kugufasha. Mbwira ikibazo cyawe.",
            "Nta kibazo, reka tuganire. Ukeneye iki?"
        ]
    },
    "apology": {
        "patterns": ["mbabarira", "ndababarira", "ndasaba imbabazi"],
        "responses": [
            "Birumvikana! Twese tugira ibyo dukosamo.",
            "Imbabazi zawe zirakiriwe. Duhitemo gutera imbere.",
            "Ntacyo bibaye, humura."
        ]
    },
    "farewell": {
        "patterns": ["murabeho", "tuzabonana", "imana iguhe umugisha"],
        "responses": [
            "Murabeho neza! Imana iguhe imigisha myinshi.",
            "Tuzabonana ubutaha, Imana ikomeze ikurinde!",
            "Murabeho! Wihangane kandi ukomeze utsinde."
        ]
    },
    "default": {
        "patterns": [],
        "responses": [
            "Mbabarira, sinabyumvise neza. Wansobanurira neza icyo ushaka?",
            "Ndagusabye kongera usobanura neza.",
            "Ndashaka kukumva neza, mbwira witonze."
        ]
    }
}

# Additional direct questions and answers
QA = {
    "Rwanda Coding Academy iherereye he?": "Iherereye mu Karere ka Nyabihu, mu Ntara y'Iburengerazuba.",
    "Umurwa mukuru w'u Rwanda ni uwuhe?": "Ni Kigali.",
    "Ni inde Perezida w'u Rwanda?": "Ni Paul Kagame.",
    "Ikirere cy'uyu munsi kimeze gite?": "Sinabasha kukubwira ikirere nyacyo, ariko ushobora kugenzura kuri meteo.gov.rw.",
    "Uburebure bwa Mount Kigali ni bungana iki?": "Bufite uburebure bwa metero zirenga 1,800.",
    "U Rwanda rufite intara zingana iki?": "Rufite intara 5: Intara y'Amajyaruguru, Amajyepfo, Iburengerazuba, Iburasirazuba, n'Umujyi wa Kigali."
}

# Imported once at cell level (it used to sit inside the loop body);
# match_with_keywords relies on this name being available as a global.
from itertools import combinations


def normalize_text(text):
    """Lowercase ``text``, drop ASCII punctuation and collapse whitespace.

    Defined before the KEYWORD_MAP loop because that loop calls it; with the
    definition placed after the loop the cell raised NameError on a fresh kernel.
    Whitespace is collapsed after the punctuation is removed, so "uri nde ?"
    and "uri nde?" normalize to the same string.
    """
    text = text.lower().translate(str.maketrans('', '', string.punctuation))
    return " ".join(text.split())


# Keyword index for QA: every 2- and 3-word combination of a question's
# normalized words (sorted, space-joined) maps to the list of questions
# containing that combination.
KEYWORD_MAP = {}
for question in QA:
    words = normalize_text(question).split()
    for n in (2, 3):
        # combinations() yields nothing when there are fewer than n words
        for combo in combinations(words, n):
            key = " ".join(sorted(combo))  # Sort to ensure a consistent key
            KEYWORD_MAP.setdefault(key, []).append(question)

def match_intent(normalized_input):
    """Return the first intent that has a pattern contained in ``normalized_input``.

    Intents are tried in the order they appear in INTENTS; each pattern is
    normalized with normalize_text and tested as a substring. "default" is
    returned when no pattern matches.
    """
    for intent_name, spec in INTENTS.items():
        if any(normalize_text(phrase) in normalized_input for phrase in spec["patterns"]):
            return intent_name
    return "default"

def match_with_keywords(normalized_input):
    """Find the QA question that best matches ``normalized_input``.

    An exact match against a normalized QA question wins outright. Otherwise
    every 3-word and 2-word combination of the input words is looked up in
    KEYWORD_MAP; each hit adds 3 (triples) or 1 (pairs) to the score of the
    questions listed there. The highest-scoring question is returned, the
    earliest-scored one on ties, or None when nothing scored.
    """
    exact = next((q for q in QA if normalize_text(q) == normalized_input), None)
    if exact is not None:
        return exact

    tokens = normalized_input.split()
    scores = {}
    # Triples are processed before pairs so that insertion order (and with it
    # tie-breaking) favours the more specific matches.
    for size, weight in ((3, 3), (2, 1)):
        for group in combinations(tokens, size):
            lookup_key = " ".join(sorted(group))
            for candidate in KEYWORD_MAP.get(lookup_key, ()):
                scores[candidate] = scores.get(candidate, 0) + weight

    if not scores:
        return None
    return max(scores, key=scores.get)

def get_response(user_input):
    """Answer ``user_input`` from the QA table, else with a random intent reply.

    The input is normalized once. If match_with_keywords finds a QA question,
    its stored answer is returned; otherwise match_intent selects an intent
    (possibly "default") and one of its responses is chosen at random.
    """
    cleaned = normalize_text(user_input)

    qa_key = match_with_keywords(cleaned)
    if not qa_key:
        # No QA hit: fall back to the intent table.
        return random.choice(INTENTS[match_intent(cleaned)]["responses"])
    return QA[qa_key]

In [None]:
def full_conversation(audio_file):
    """Run one voice exchange: audio file -> transcription -> reply -> speech.

    Args:
        audio_file: Path to the recorded/uploaded audio; may be None or empty
            when no audio was provided.

    Returns:
        tuple: ``(text, wav_path)``, the transcribed user text and the path of
        the synthesized reply. When there is no usable audio file the result
        is ``("No audio detected, please try again.", None)``, so the shape
        always matches the interface's two outputs.
    """
    # Validate BEFORE opening: open() would raise on None or a missing path,
    # which made the original check (placed inside the `with`) unreachable.
    if not audio_file or not os.path.isfile(audio_file):
        return "No audio detected, please try again.", None

    with open(audio_file, "rb") as f:
        audio_bytes = f.read()

    # Step 1: Transcribe the audio
    transcriber = Transcriber(audio_bytes)
    transcription = transcriber.transcription
    # Transcriber.transcribe() returns a plain string on failure, which has no `.text`.
    user_text = getattr(transcription, "text", transcription)
    print(f"User: {user_text}")

    # Step 2: Get the chatbot response
    response_text = get_response(user_text)
    print(f"Assistant Response: {response_text}")

    # Step 3: Convert chatbot response to speech (the duration is not needed here)
    wav_file, _ = kinya_tts(response_text)

    # Step 4: Return transcription and audio response
    return user_text, wav_file

In [None]:
# Must run before kinya_tts() is used: it fills the module-level inference_engine tuple.
kinya_tts_setup()  # Initialize TTS engine once at startup

In [None]:
## Gradio interface for the voice assistant (speech in -> transcription + spoken reply)

# Only offer example clips that actually exist. On a fresh runtime
# /content/sounds is empty until the first recording has been saved, so the
# interface should not be built around files that are not there.
example_clips = [
    [clip_path]
    for clip_path in ("/content/sounds/sound-0.wav", "/content/sounds/sound-1.wav")
    if os.path.isfile(clip_path)
]

conversation_demo = gr.Interface(
    fn=full_conversation,
    inputs=gr.Audio(type="filepath", label="Upload Kinyarwanda Audio"),
    outputs=[
        gr.Textbox(label="Transcribed User Text"),
        gr.Audio(label="Assistant Voice Response", autoplay=True)
    ],
    title="Kinyarwanda Voice Assistant",
    description="Speak in Kinyarwanda, get a transcription + voice reply generated by AI!",
    examples=example_clips or None  # None (the default) hides the examples section
)

# Launch (debug=True keeps the cell running so errors and logs stay visible)
conversation_demo.launch(debug=True)

# AI as the middleware: using the GPT API (via an API key) to answer questions

In [None]:
## Install the OpenAI Python client used by call_chatgpt (--quiet reduces pip's output)
!pip install --quiet openai

[0m

In [None]:
## ChatGPT Function Key
from google.colab import userdata
from openai import OpenAI

# Module-level OpenAI client used by call_chatgpt; the API key is read from Colab secrets.
client = OpenAI(api_key=userdata.get("OPENAI_API_KEY"))

def call_chatgpt(prompt: str) -> str:
    """Send ``prompt`` to GPT as a single user message and return the trimmed reply.

    Any ``Exception`` raised while calling the API or reading the reply is
    printed and replaced by a fixed Kinyarwanda fallback sentence, so callers
    always receive a string.
    """
    fallback_reply = "Simbashije kubona igisubizo ongera gerageza mukanya"
    request = dict(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=150,
        temperature=0.3,
    )
    try:
        completion = client.chat.completions.create(**request)
        reply = completion.choices[0].message.content
        return reply.strip()
    except Exception as err:
        # Log the error and return a safe fallback
        print(f"[ChatGPT API error] {err}")
        return fallback_reply

In [None]:
### NLP Logic manual with also CHATGPT enabled.
import os
import openai
import random
import string
from itertools import combinations


# Intent table: intent name -> {"patterns": [...], "responses": [...]}.
# - "patterns" are phrases that match_intent looks for (as substrings) in the normalized user input.
# - "responses" are candidate replies; get_response picks one at random.
# - "default" has no patterns: match_intent returns it when nothing matches.
INTENTS = {
    "greeting": {
        "patterns": ["muraho", "hello", "salut", "mwiriwe", "wiriweho"],
        "responses": [
            "Muraho neza! Nishimiye kukubona hano. Ufite ikibazo ushaka kumbaza?",
            "Mwiriwe neza! Umeze neza?",
            "Wiriweho neza! Nishimiye kuganira nawe."
        ]
    },
    "ask_owner": {
        "patterns": ["uri nde?", "wakozwe nande?", "uturuka he?"],
        "responses": [
            "Ndi gacurabwenge umuhanga w'abanyarwanda"
        ]
    },
    "ask_news": {
        "patterns": ["amakuru", "amakuru yawe", "amakuru y'umunsi", "amakuru y'icyumweru", "amakuru ya mugitondo", "amakuru ya nijoro"],
        "responses": [
            "Amakuru ni meza cyane, urakoze kubaza!",
            "Umunsi wagenze neza, ndashimira Imana. Wowe se?",
            "Icyumweru cyagenze neza rwose, wowe ho?",
            "Mugitondo wagenze neza cyane! Wowe uko byagenze?",
            "Nijoro wagenze neza cyane, ndabashimiye!",
        ]
    },
    "gratitude": {
        "patterns": ["urakoze", "murakoze", "murakoze cyane", "ndashimira", "turabashimira"],
        "responses": [
            "Turagushimiye cyane nawe!",
            "Murakoze cyane! Imana ibahe umugisha.",
            "Twishimiye ko dukorana neza. Urakoze cyane!"
        ]
    },
    "help_request": {
        "patterns": ["wamfasha", "mfite ikibazo", "ndifuza ubufasha", "ushobora kungira inama"],
        "responses": [
            "Yego rwose, mbwira icyo ukeneye!",
            "Nditeguye kugufasha. Mbwira ikibazo cyawe.",
            "Nta kibazo, reka tuganire. Ukeneye iki?"
        ]
    },
    "apology": {
        "patterns": ["mbabarira", "ndababarira", "ndasaba imbabazi"],
        "responses": [
            "Birumvikana! Twese tugira ibyo dukosamo.",
            "Imbabazi zawe zirakiriwe. Duhitemo gutera imbere.",
            "Ntacyo bibaye, humura."
        ]
    },
    "farewell": {
        "patterns": ["murabeho", "tuzabonana", "imana iguhe umugisha"],
        "responses": [
            "Murabeho neza! Imana iguhe imigisha myinshi.",
            "Tuzabonana ubutaha, Imana ikomeze ikurinde!",
            "Murabeho! Wihangane kandi ukomeze utsinde."
        ]
    },
    "default": {
        "patterns": [],
        "responses": [
            "Mbabarira, sinabyumvise neza. Wansobanurira neza icyo ushaka?",
            "Ndagusabye kongera usobanura neza.",
            "Ndashaka kukumva neza, mbwira witonze."
        ]
    }
}

# Additional direct questions and answers (question text -> fixed answer).
# NOTE(review): the get_response defined in this cell does not consult QA — it goes
# from intent matching straight to ChatGPT. Confirm whether that is intended.
QA = {
    "Rwanda Coding Academy iherereye he?": "Iherereye mu Karere ka Nyabihu, mu Ntara y'Iburengerazuba.",
    "Umurwa mukuru w'u Rwanda ni uwuhe?": "Ni Kigali.",
    "Ni inde Perezida w'u Rwanda?": "Ni Paul Kagame.",
    "Ikirere cy'uyu munsi kimeze gite?": "Sinabasha kukubwira ikirere nyacyo, ariko ushobora kugenzura kuri meteo.gov.rw.",
    "Uburebure bwa Mount Kigali ni bungana iki?": "Bufite uburebure bwa metero zirenga 1,800.",
    "U Rwanda rufite intara zingana iki?": "Rufite intara 5: Intara y'Amajyaruguru, Amajyepfo, Iburengerazuba, Iburasirazuba, n'Umujyi wa Kigali."
}

# Keyword mappings for QA. Reset to an empty dict here and not populated in this cell.
KEYWORD_MAP = {}

def normalize_text(text):
    """Lowercase ``text``, drop ASCII punctuation and collapse whitespace.

    Whitespace is collapsed after the punctuation is removed. Stripping first
    (as before) left a trailing space for input such as "uri nde ?" and double
    spaces for "a - b", so phrases that differ only in punctuation spacing
    did not compare equal.
    """
    text = text.lower().translate(str.maketrans('', '', string.punctuation))
    return " ".join(text.split())

def match_intent(normalized_input):
    """Return the name of the first intent whose pattern occurs in the input.

    INTENTS is scanned in definition order; every pattern is passed through
    normalize_text and checked as a substring of ``normalized_input``.
    Falls back to "default" when no pattern is found.
    """
    for intent_name, spec in INTENTS.items():
        normalized_patterns = (normalize_text(phrase) for phrase in spec["patterns"])
        if any(candidate in normalized_input for candidate in normalized_patterns):
            return intent_name
    return "default"

def get_response(user_input: str) -> str:
    """Reply from the intent table when possible, otherwise ask ChatGPT.

    The normalized input is matched against INTENTS; a non-default intent
    yields one of its responses chosen at random. For the "default" intent the
    original (un-normalized) ``user_input`` is forwarded to call_chatgpt.
    """
    cleaned = normalize_text(user_input)
    intent = match_intent(cleaned)

    if intent == "default":
        # Nothing matched locally: delegate to ChatGPT with the raw user text.
        return call_chatgpt(user_input)

    return random.choice(INTENTS[intent]["responses"])

In [None]:
def full_conversation(audio_file):
    """Run one voice exchange: audio file -> transcription -> reply -> speech.

    Args:
        audio_file: Path to the recorded/uploaded audio; may be None or empty
            when no audio was provided.

    Returns:
        tuple: ``(text, wav_path)``, the transcribed user text and the path of
        the synthesized reply. When there is no usable audio file the result
        is ``("No audio detected, please try again.", None)``, so the shape
        always matches the interface's two outputs.
    """
    # Validate BEFORE opening: open() would raise on None or a missing path,
    # which made the original check (placed inside the `with`) unreachable.
    if not audio_file or not os.path.isfile(audio_file):
        return "No audio detected, please try again.", None

    with open(audio_file, "rb") as f:
        audio_bytes = f.read()

    # Step 1: Transcribe the audio
    transcriber = Transcriber(audio_bytes)
    transcription = transcriber.transcription
    # Transcriber.transcribe() returns a plain string on failure, which has no `.text`.
    user_text = getattr(transcription, "text", transcription)
    print(f"User: {user_text}")

    # Step 2: Get the chatbot response
    response_text = get_response(user_text)
    print(f"Assistant Response: {response_text}")

    # Step 3: Convert chatbot response to speech (the duration is not needed here)
    wav_file, _ = kinya_tts(response_text)

    # Step 4: Return transcription and audio response
    return user_text, wav_file

In [None]:
# Must run before kinya_tts() is used: it fills the module-level inference_engine tuple.
kinya_tts_setup()  # Initialize TTS engine once at startup

Using mel posterior encoder for VITS2
Multi-stream iSTFT VITS2
TTS API engine ready!


In [None]:
## Gradio interface for the voice assistant (speech in -> transcription + spoken reply)

# Only offer example clips that actually exist. On a fresh runtime
# /content/sounds is empty until the first recording has been saved, so the
# interface should not be built around files that are not there.
example_clips = [
    [clip_path]
    for clip_path in ("/content/sounds/sound-0.wav", "/content/sounds/sound-1.wav")
    if os.path.isfile(clip_path)
]

conversation_demo = gr.Interface(
    fn=full_conversation,
    inputs=gr.Audio(type="filepath", label="Upload Kinyarwanda Audio"),
    outputs=[
        gr.Textbox(label="Transcribed User Text"),
        gr.Audio(label="Assistant Voice Response", autoplay=True)
    ],
    title="Kinyarwanda Voice Assistant",
    description="Speak in Kinyarwanda, get a transcription + voice reply generated by AI!",
    examples=example_clips or None  # None (the default) hides the examples section
)

# Launch (debug=True keeps the cell running so errors and logs stay visible)
conversation_demo.launch(debug=True)

It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://c1f65cef94afa1c877.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Transcribing:   0%|          | 0/1 [00:00<?, ?it/s][NeMo W 2025-04-28 05:10:03 nemo_logging:405] CTC decoding strategy 'greedy' is slower than 'greedy_batch', which implements the same exact interface. Consider changing your strategy to 'greedy_batch' for a free performance improvement.
Transcribing: 100%|██████████| 1/1 [00:00<00:00,  2.00it/s]


User: afurika ifite ibihugu bingahe
Assistant Response: Afurika ifite ibihugu 54 byigenga. Ibi bihugu bigize umugabane wa kabiri mu bunini no mu bwinshi bw'abaturage ku isi.


Transcribing: 100%|██████████| 1/1 [00:00<00:00, 21.65it/s]


User: umwami wambere wu rwanda nuwube
Assistant Response: Umwami wa mbere wa Rwanda yari Yuhi I Gahima, umwe mu bami ba kera b'ibihugu by'Abanyarwanda. Hariho urutonde rw'abami b'u Rwanda, kandi Yuhi I Gahima ni umwe mu bami ba mbere bazwi mu mateka y'u Rwanda. Nyuma ye, hari hagiyeho abandi bami benshi kugeza ku mwami wa nyuma, Kigeli V Ndahindurwa, wahagaritse ingoma mu 1961 ubwo u Rwanda rwahindukaga repubulika.
Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://c1f65cef94afa1c877.gradio.live


