In [None]:
!pip install pydub gradio librosa

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting gradio
  Downloading gradio-5.31.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.10.1 (from gradio)
  Downloading gradio_client-1.10.1-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.11-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.6 (from gradio

In [None]:
import gradio as gr
from pydub import AudioSegment
import tempfile
import os
import librosa
import tensorflow as tf
import numpy as np
import soundfile as sf
import matplotlib.pyplot as plt
import librosa.display
import time

In [None]:
# Path of the most recently processed clip. Written by process_audio and read
# by apply_augmentations; None until a recording has been processed.
CURRENT_AUDIO = None


def process_audio(audio_path):
    """Cut a recording down to its first second and store it as a WAV file.

    The first 1000 ms of the recording are kept; a shorter recording is
    extended with silence up to 1000 ms. The result is exported to a
    per-process file in the system temp directory and its path is remembered
    in the module-level ``CURRENT_AUDIO``.

    Args:
        audio_path: Path of the recorded audio file.

    Returns:
        Path of the exported WAV file.

    Raises:
        gr.Error: If ``audio_path`` is empty or does not exist.
    """
    global CURRENT_AUDIO

    recording_available = bool(audio_path) and os.path.exists(audio_path)
    if not recording_available:
        raise gr.Error("Сначала сделайте запись!")

    clip = AudioSegment.from_file(audio_path)[:1000]

    # pydub measures segment length in milliseconds.
    missing_ms = 1000 - len(clip)
    if missing_ms > 0:
        padding = AudioSegment.silent(missing_ms).set_frame_rate(clip.frame_rate)
        clip = clip + padding

    output_name = f"base_{os.getpid()}.wav"
    output_path = os.path.join(tempfile.gettempdir(), output_name)
    clip.export(output_path, format="wav")

    CURRENT_AUDIO = output_path
    return output_path

In [None]:
def add_noise(waveform, noise_level=0.02):
    """Add zero-mean Gaussian noise to a waveform.

    Args:
        waveform: Tensor holding the audio samples.
        noise_level: Standard deviation of the generated noise.

    Returns:
        ``waveform`` plus a freshly sampled noise tensor of the same shape.
    """
    gaussian = tf.random.normal(
        tf.shape(waveform),
        mean=0.0,
        stddev=noise_level,
    )
    return waveform + gaussian

def adjust_volume(waveform, volume_factor=1.0):
    """Scale the amplitude of a waveform.

    Args:
        waveform: Audio samples (anything that supports ``*`` with a number).
        volume_factor: Multiplier applied to every sample; 1.0 leaves the
            signal unchanged.

    Returns:
        The scaled waveform.
    """
    scaled = waveform * volume_factor
    return scaled

def apply_time_shift(waveform, shift_seconds=0.0, sample_rate=44100):
    """Circularly shift a waveform along its first axis.

    Args:
        waveform: Tensor holding the audio samples.
        shift_seconds: Shift in seconds; converted to a whole number of
            samples (truncated toward zero).
        sample_rate: Samples per second used for that conversion.

    Returns:
        The rolled tensor; samples pushed past one end re-enter at the other.
    """
    offset = int(shift_seconds * sample_rate)
    return tf.roll(waveform, shift=offset, axis=0)

def stretch_signal(waveform, stretch_rate):
    """Resample a waveform by ``stretch_rate`` while keeping its length.

    The signal is linearly interpolated onto ``int(len / stretch_rate)``
    points, then zero-padded at the end or truncated so the result has the
    original number of samples.

    Args:
        waveform: Tensor holding the audio samples.
        stretch_rate: Resampling factor; values above 1 produce fewer points
            (followed by zero padding), values below 1 produce more points
            (followed by truncation).

    Returns:
        A float32 tensor with the same static shape as ``waveform``.
    """
    def _resample_keep_length(samples, rate):
        # NumPy-side worker executed through tf.numpy_function.
        samples = np.squeeze(samples)

        n_samples = samples.shape[0]
        n_resampled = int(n_samples / rate)

        src_positions = np.linspace(0, n_samples - 1, n_samples)
        dst_positions = np.linspace(0, n_samples - 1, n_resampled)
        resampled = np.interp(dst_positions, src_positions, samples).astype(np.float32)

        deficit = n_samples - resampled.shape[0]
        if deficit > 0:
            return np.pad(resampled, (0, deficit))
        return resampled[:n_samples]

    result = tf.numpy_function(
        func=_resample_keep_length,
        inp=[waveform, stretch_rate],
        Tout=tf.float32,
    )

    # numpy_function drops static shape information; restore it.
    result.set_shape(waveform.shape)
    return result

In [None]:
def apply_augmentations(noise_level, volume_factor, shift_seconds, stretch_rate):
    """Apply the selected augmentations to the processed clip and save the result.

    Reads the file referenced by the module-level ``CURRENT_AUDIO``, averages
    multi-channel audio down to mono, then applies (in this order) volume
    scaling, time shift, additive noise and stretching. Each step is skipped
    when its parameter is at its neutral value.

    Args:
        noise_level: Standard deviation of the added noise; 0 disables it.
        volume_factor: Amplitude multiplier; 1.0 disables it.
        shift_seconds: Circular time shift in seconds; 0 disables it.
        stretch_rate: Stretch factor; 1.0 disables it.

    Returns:
        Path of the float WAV file holding the augmented audio.

    Raises:
        gr.Error: If no processed clip is available yet.
    """
    if not CURRENT_AUDIO or not os.path.exists(CURRENT_AUDIO):
        raise gr.Error("Сначала сделайте и обработайте аудиозапись!")

    waveform, sr = sf.read(CURRENT_AUDIO)
    if waveform.ndim > 1:
        # Multi-channel input: average the channels into a mono signal.
        waveform = np.mean(waveform, axis=1)
    waveform = tf.convert_to_tensor(waveform, dtype=tf.float32)

    if volume_factor != 1.0:
        waveform = adjust_volume(waveform, volume_factor)
    if shift_seconds != 0:
        # Convert seconds to samples with the file's real sample rate; the
        # helper's 44100 Hz default would mis-scale the shift for any other rate.
        waveform = apply_time_shift(waveform, shift_seconds, sample_rate=sr)
    if noise_level > 0:
        waveform = add_noise(waveform, noise_level)
    if stretch_rate != 1.0:
        waveform = stretch_signal(waveform, stretch_rate)

    waveform_np = waveform.numpy()

    temp_path = os.path.join(tempfile.gettempdir(), f"augmented_{os.getpid()}.wav")
    sf.write(
        temp_path,
        waveform_np.astype(np.float32),
        sr,
        subtype='FLOAT'
    )

    return temp_path

In [None]:
def plot_waveform(audio, sr):
    """Build a time-domain plot of an audio signal.

    Args:
        audio: Audio samples.
        sr: Sample rate of ``audio``.

    Returns:
        The matplotlib figure containing the waveform.
    """
    figure, axes = plt.subplots(figsize=(10, 4))
    librosa.display.waveshow(audio, sr=sr, ax=axes)

    axes.set_title('Waveform')
    axes.set_xlabel('Time (s)')
    axes.set_ylabel('Amplitude')

    # Release the figure from pyplot's registry; the object itself is returned.
    plt.close(figure)
    return figure

def plot_spectrogram(audio, sr):
    """Build a dB-scaled STFT magnitude spectrogram on a log-frequency axis.

    Args:
        audio: Audio samples.
        sr: Sample rate of ``audio``.

    Returns:
        The matplotlib figure containing the spectrogram and its colorbar.
    """
    magnitude = np.abs(librosa.stft(audio))
    # Decibels relative to the loudest bin.
    magnitude_db = librosa.amplitude_to_db(magnitude, ref=np.max)

    figure, axes = plt.subplots(figsize=(10, 4))
    mesh = librosa.display.specshow(
        magnitude_db,
        sr=sr,
        x_axis='time',
        y_axis='log',
        ax=axes,
    )
    figure.colorbar(mesh, ax=axes, format='%+2.0f dB')

    axes.set_title('Spectrogram')
    axes.set_xlabel('Time (s)')
    axes.set_ylabel('Frequency (Hz)')

    plt.close(figure)
    return figure

def plot_mel_spectrogram(audio, sr):
    """Build a dB-scaled mel spectrogram (128 bands, capped at 8000 Hz).

    Args:
        audio: Audio samples.
        sr: Sample rate of ``audio``.

    Returns:
        The matplotlib figure containing the mel spectrogram and its colorbar.
    """
    mel_power = librosa.feature.melspectrogram(
        y=audio,
        sr=sr,
        n_mels=128,
        fmax=8000,
    )
    # Decibels relative to the strongest band.
    mel_db = librosa.power_to_db(mel_power, ref=np.max)

    figure, axes = plt.subplots(figsize=(10, 4))
    mesh = librosa.display.specshow(
        mel_db,
        sr=sr,
        x_axis='time',
        y_axis='mel',
        fmax=8000,
        cmap='magma',
        ax=axes,
    )
    figure.colorbar(mesh, ax=axes, format='%+2.0f dB')

    axes.set_title('Mel Spectrogram')
    axes.set_xlabel('Time (s)')
    axes.set_ylabel('Frequency (Hz)')

    plt.close(figure)
    return figure

def plot_mfcc(audio, sr):
    """Build a plot of the first 13 MFCCs over time.

    Args:
        audio: Audio samples.
        sr: Sample rate of ``audio``.

    Returns:
        The matplotlib figure containing the MFCC image and its colorbar.
    """
    coefficients = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)

    figure, axes = plt.subplots(figsize=(10, 4))
    mesh = librosa.display.specshow(coefficients, sr=sr, x_axis='time', ax=axes)
    figure.colorbar(mesh, ax=axes)

    axes.set_title('MFCC')
    axes.set_xlabel('Time (s)')

    plt.close(figure)
    return figure

In [None]:
def generate_visuals(audio_path, augmented_path):
    """Render all comparison plots for the original and augmented audio.

    Both signals are loaded at their native sample rate as mono, peak
    normalized, and plotted as waveform, spectrogram, mel spectrogram and MFCC.

    Args:
        audio_path: Path of the original audio, or a dict carrying it under
            the ``"name"`` key.
        augmented_path: Path of the augmented audio, in the same two forms.

    Returns:
        A dict mapping the module-level Gradio components (``vis_container``
        and the eight plot slots) to their new values.

    Raises:
        gr.Error: If either path is missing or does not exist on disk.
    """
    def _as_path(value):
        # Accept either a plain path or a dict carrying the path under "name".
        return value["name"] if isinstance(value, dict) else value

    audio_path = _as_path(audio_path)
    augmented_path = _as_path(augmented_path)

    paths = (audio_path, augmented_path)
    if not all(paths) or not all(os.path.exists(p) for p in paths):
        raise gr.Error("Сначала сделайте запись и ее аугментированную версию!")

    orig_audio, orig_sr = librosa.load(str(audio_path), sr=None, mono=True)
    aug_audio, aug_sr = librosa.load(str(augmented_path), sr=None, mono=True)

    orig_audio = librosa.util.normalize(orig_audio)
    aug_audio = librosa.util.normalize(aug_audio)

    plotters = (plot_waveform, plot_spectrogram, plot_mel_spectrogram, plot_mfcc)
    orig_slots = (orig_wave, orig_spec, orig_mel, orig_mfcc)
    aug_slots = (aug_wave, aug_spec, aug_mel, aug_mfcc)

    updates = {vis_container: gr.update(visible=True)}
    for slot, plotter in zip(orig_slots, plotters):
        updates[slot] = plotter(orig_audio, orig_sr)
    for slot, plotter in zip(aug_slots, plotters):
        updates[slot] = plotter(aug_audio, aug_sr)
    return updates

In [1]:
# Class labels. predict_command indexes this list with the argmax of the model
# output, so the order must stay exactly as is.
COMMANDS = [
    'bed', 'bird', 'cat', 'dog', 'down',
    'eight', 'five', 'four', 'go', 'happy',
    'house', 'left', 'marvin', 'nine', 'no',
    'off', 'on', 'one', 'right', 'seven',
    'sheila', 'six', 'stop', 'three', 'tree',
    'two', 'up', 'wow', 'yes', 'zero',
]

# Sample rate (Hz) the audio is resampled to before feature extraction; the
# feature functions also use it as the fixed input length in samples.
TARGET_SR = 16000

# Root directory holding one sub-directory of saved models per feature type.
MODEL_BASE_DIR = 'drive/MyDrive/Models/Models'

# Feature representation name -> sub-directory of MODEL_BASE_DIR.
MODEL_SUBDIRS = {
    'Mel Spectrogram': 'MEL',
    'Spectrogram': 'SPECT',
    'MFCC': 'MFCC',
}

In [None]:
def list_model_files(directory: str):
    """Return the names of the Keras model files found directly in a directory.

    Args:
        directory: Directory to scan (not recursive).

    Returns:
        Alphabetically sorted list of entry names ending in ``.keras`` or
        ``.h5``. ``os.listdir`` yields entries in arbitrary order, so the
        result is sorted to keep it stable between calls and machines.

    Raises:
        OSError: Propagated from ``os.listdir`` if the directory cannot be read.
    """
    return sorted(
        entry for entry in os.listdir(directory)
        if entry.endswith(('.keras', '.h5'))
    )

def load_model(path: str):
    """Load the Keras model saved at ``path`` and return it."""
    keras_model = tf.keras.models.load_model(path)
    return keras_model

In [None]:
def get_spectrogram(waveform: np.ndarray):
    """Compute an STFT magnitude spectrogram of fixed-length audio.

    The waveform is cut to at most ``TARGET_SR`` samples and zero-padded at
    the end up to exactly ``TARGET_SR`` samples before the transform.

    Args:
        waveform: 1-D audio samples.

    Returns:
        Magnitude of the STFT (frame length 255, step 128) with a trailing
        axis of size 1 appended.
    """
    clipped = waveform[:TARGET_SR]
    pad = tf.zeros([TARGET_SR] - tf.shape(clipped), dtype=tf.float32)
    # Cast so the samples share the float32 dtype of the padding.
    clipped = tf.cast(clipped, dtype=tf.float32)
    fixed_length = tf.concat([clipped, pad], 0)

    magnitudes = tf.abs(
        tf.signal.stft(fixed_length, frame_length=255, frame_step=128)
    )
    return magnitudes[..., tf.newaxis]

def get_mel_spectrogram(waveform: np.ndarray):
    """Compute a log-mel spectrogram of fixed-length audio.

    The waveform is cut to at most ``TARGET_SR`` samples and zero-padded at
    the end up to exactly ``TARGET_SR`` samples. An STFT (frame length 400,
    step 160) is projected onto 80 mel bins spanning 80-7600 Hz and the
    natural log is taken.

    Args:
        waveform: 1-D audio samples; any numeric dtype (cast to float32).

    Returns:
        The log-mel spectrogram with a trailing axis of size 1 appended.
    """
    # Cast first, as get_spectrogram does: tf.concat needs matching dtypes and
    # the padding is float32, so a float64 waveform would otherwise fail.
    waveform = tf.cast(waveform[:TARGET_SR], dtype=tf.float32)
    zero_padding = tf.zeros([TARGET_SR] - tf.shape(waveform), dtype=tf.float32)
    waveform = tf.concat([waveform, zero_padding], 0)

    spectrogram = tf.signal.stft(waveform, frame_length=400, frame_step=160)
    spectrogram = tf.abs(spectrogram)

    num_mel_bins = 80
    linear_to_mel = tf.signal.linear_to_mel_weight_matrix(
        num_mel_bins, spectrogram.shape[-1], TARGET_SR, 80.0, 7600.0
    )
    mel_spectrogram = tf.tensordot(spectrogram, linear_to_mel, 1)
    # Small offset keeps log() finite for silent frames.
    mel_spectrogram = tf.math.log(mel_spectrogram + 1e-6)[..., tf.newaxis]
    return mel_spectrogram

def get_mfcc(waveform: np.ndarray):
    """Compute the first 13 MFCCs of fixed-length audio.

    The waveform is cut to at most ``TARGET_SR`` samples and zero-padded at
    the end up to exactly ``TARGET_SR`` samples. A log-mel spectrogram
    (STFT frame length 400, step 160; 80 mel bins over 80-7600 Hz) is
    converted to MFCCs and the first 13 coefficients are kept.

    Args:
        waveform: 1-D audio samples; any numeric dtype (cast to float32).

    Returns:
        The MFCCs with a trailing axis of size 1 appended.
    """
    # Cast first, as get_spectrogram does: tf.concat needs matching dtypes and
    # the padding is float32, so a float64 waveform would otherwise fail.
    waveform = tf.cast(waveform[:TARGET_SR], dtype=tf.float32)
    zero_padding = tf.zeros([TARGET_SR] - tf.shape(waveform), dtype=tf.float32)
    waveform = tf.concat([waveform, zero_padding], 0)

    stft = tf.signal.stft(waveform, frame_length=400, frame_step=160)
    spectrogram = tf.abs(stft)

    num_mel_bins = 80
    linear_to_mel = tf.signal.linear_to_mel_weight_matrix(
        num_mel_bins, spectrogram.shape[-1], TARGET_SR, 80.0, 7600.0
    )
    mel = tf.tensordot(spectrogram, linear_to_mel, 1)
    # Small offset keeps log() finite for silent frames.
    log_mel = tf.math.log(mel + 1e-6)

    mfccs = tf.signal.mfccs_from_log_mel_spectrograms(log_mel)[..., :13]
    return mfccs[..., tf.newaxis]

In [None]:
# Feature extractor for each representation name. predict_command looks the
# extractor up by name, the UI lists these keys as dropdown choices, and the
# same names key MODEL_SUBDIRS.
FEATURE_FUNCS = {
    'Spectrogram': get_spectrogram,
    'Mel Spectrogram': get_mel_spectrogram,
    'MFCC': get_mfcc
}

In [None]:
def predict_command(audio_path, model_path, feature_method):
    """Classify an audio file with a saved model.

    The audio is loaded at its native rate, resampled to ``TARGET_SR``,
    turned into features with the extractor registered under
    ``feature_method`` and passed through the model loaded from
    ``model_path``.

    Args:
        audio_path: Path of the audio file to classify.
        model_path: Path of the saved Keras model.
        feature_method: Key of ``FEATURE_FUNCS`` selecting the extractor.

    Returns:
        Tuple ``(command, probability, latency)``: the predicted label from
        ``COMMANDS``, its softmax probability, and the seconds spent on
        feature extraction plus the forward pass (model loading excluded).
    """
    audio, sr = librosa.load(audio_path, sr=None)

    audio = librosa.resample(audio, orig_sr=sr, target_sr=TARGET_SR)

    model = load_model(model_path)

    # perf_counter is monotonic and high resolution; time.time() can jump
    # with system clock adjustments and distort short measurements.
    start_time = time.perf_counter()
    features = FEATURE_FUNCS[feature_method](audio)
    features = tf.expand_dims(features, 0)  # add the batch axis

    # NOTE(review): softmax assumes the model outputs raw logits - confirm.
    logits = model(features, training=False)
    probs = tf.nn.softmax(logits, axis=-1).numpy()[0]
    latency = time.perf_counter() - start_time

    idx = np.argmax(probs)
    return COMMANDS[idx], probs[idx], latency

In [2]:
def get_models_for_feature(feature_type):
    """List the saved model files available for a feature representation.

    Args:
        feature_type: Key of ``MODEL_SUBDIRS``. An unknown key maps to an
            empty sub-directory name, i.e. ``MODEL_BASE_DIR`` itself.

    Returns:
        Alphabetically sorted names of the ``.keras`` / ``.h5`` files in the
        matching model directory, or an empty list (after printing a warning)
        when that directory does not exist.
    """
    subdir = MODEL_SUBDIRS.get(feature_type, '')
    model_dir = os.path.join(MODEL_BASE_DIR, subdir)

    # isdir also rejects a path that exists but is a regular file, which
    # would make os.listdir raise.
    if not os.path.isdir(model_dir):
        print(f"Warning: Model directory {model_dir} not found!")
        return []

    # os.listdir order is arbitrary; sort so the dropdown and its default
    # (first) entry are stable between runs.
    return sorted(
        name for name in os.listdir(model_dir)
        if name.endswith(('.keras', '.h5'))
    )

In [3]:
def update_model_list(feature_type):
    """Build the dropdown update for the models of a feature representation.

    Args:
        feature_type: Feature representation name passed on to
            ``get_models_for_feature``.

    Returns:
        A ``gr.update`` whose choices are the available model files and whose
        value is the first of them, or ``None`` when there are none.
    """
    available = get_models_for_feature(feature_type)
    default_choice = available[0] if available else None
    return gr.update(choices=available, value=default_choice)

In [None]:
# HTML banner (centered title and subtitle) that the UI passes to gr.Markdown
# as the first element of the page.
app_name = """
<div style="text-align: center; margin-bottom: 20px;">
    <h1 style="font-size: 2.5em; color: #f97316; margin-bottom: 10px;">Audio Command Recognition</h1>
    <p style="font-size: 1.2em; color: #f97316;">Приложение для обработки и классификации голосовых команд</p>
</div>
"""

In [None]:
def create_commands_table(commands=None, n_columns=3):
    """Lay a list of command names out as table rows, filled column by column.

    Items run down the first column, then the second, and so on. Every row
    has exactly ``n_columns`` cells; unused cells hold empty strings.

    Args:
        commands: Iterable of labels to lay out. Defaults to the module-level
            ``COMMANDS``.
        n_columns: Number of table columns (at least 1).

    Returns:
        List of row tuples, each of length ``n_columns``; an empty list when
        there are no commands.

    Raises:
        ValueError: If ``n_columns`` is smaller than 1.
    """
    if n_columns < 1:
        raise ValueError("n_columns must be at least 1")

    items = list(COMMANDS if commands is None else commands)
    if not items:
        return []

    # Ceiling division: rows needed to fit every item into n_columns columns.
    n_rows = (len(items) + n_columns - 1) // n_columns

    # Pad up front so every column (including trailing ones that would
    # otherwise be missing) has exactly n_rows cells.
    items.extend([""] * (n_rows * n_columns - len(items)))
    columns = [
        items[start:start + n_rows]
        for start in range(0, n_rows * n_columns, n_rows)
    ]

    return list(zip(*columns))

In [None]:
def classify_handler(audio_path, model_file, feature_method):
    """Gradio callback: classify the augmented clip and format the result.

    Args:
        audio_path: Path of the augmented audio file.
        model_file: File name of the model chosen in the dropdown.
        feature_method: Feature representation name; selects both the model
            sub-directory and the feature extractor.

    Returns:
        Tuple of three strings: predicted command, confidence as a percentage,
        and latency in milliseconds.

    Raises:
        gr.Error: If there is no augmented audio or no model has been chosen.
    """
    if not audio_path or not os.path.exists(audio_path):
        raise gr.Error("Сначала создайте аугментированную запись!")

    # The dropdown value is None when no model files were found; joining None
    # into a path would raise a TypeError instead of a readable message.
    if not model_file:
        raise gr.Error("Сначала выберите модель!")

    model_dir = os.path.join(MODEL_BASE_DIR, MODEL_SUBDIRS[feature_method])
    model_path = os.path.join(model_dir, model_file)

    command, confidence, latency = predict_command(audio_path, model_path, feature_method)

    return (
        command,
        f"{confidence * 100:.1f}%",
        f"{latency * 1000:.1f} мс"
    )

In [None]:
# Gradio UI. Components are laid out in creation order: banner and command
# table, recording, augmentation controls, visualizations, classification.
with gr.Blocks() as demo:

    # Info block: banner plus a read-only table of the recognizable commands
    gr.Markdown(app_name)

    # NOTE(review): no CSS for the "compact-table" class is defined in this
    # cell - confirm it is supplied elsewhere or drop the class.
    commands_table = gr.Dataframe(
        value=create_commands_table(),
        headers=["", "", ""],
        row_count=(len(COMMANDS) + 2) // 3,
        col_count=3,
        interactive=False,
        datatype=["str", "str", "str"],
        elem_classes=["compact-table"],
    )


    # Recording block
    gr.Markdown("""
    <h2 style='color: #f97316;'>Запись</h2>
    """)

    audio_input = gr.Audio(sources=["microphone"], type="filepath", label="Запишите аудио")

    # process_audio trims/pads the recording and stores its path in
    # CURRENT_AUDIO; the result is shown in an output-only player.
    gr.Button("Обработать").click(
        fn=process_audio,
        inputs=audio_input,
        outputs=gr.Audio(label="Обработанная запись", interactive=False)
    )



    # Augmentation block
    gr.Markdown("""
    <h2 style='color: #f97316;'>Аугментации</h2>
    """)

    with gr.Row():
        with gr.Column():
            volume_slider = gr.Slider(0.1, 3.0, value=1.0, label="Громкость")
            shift_slider = gr.Slider(-0.5, 0.5, value=0.0, label="Сдвиг (сек)")
        with gr.Column():
            noise_slider = gr.Slider(0.0, 0.1, value=0.0, label="Уровень шума")
            stretch_slider = gr.Slider(0.5, 2.0, value=1.0, label="Растяжение")

    apply_aug = gr.Button("Аугментировать")
    aug_output = gr.Audio(label="Результат аугментаций", type="filepath", interactive=False)
    # The input order must match apply_augmentations' parameter order:
    # (noise_level, volume_factor, shift_seconds, stretch_rate).
    apply_aug.click(
        apply_augmentations,
        inputs=[noise_slider, volume_slider, shift_slider, stretch_slider],
        outputs=aug_output
    )



    # Visualization block
    gr.Markdown("""
    <h2 style='color: #f97316;'>Визуализации</h2>
    """)
    show_vis_btn = gr.Button("Показать визуализации")

    # Hidden until generate_visuals returns an update that makes it visible.
    with gr.Column(visible=False) as vis_container:
        with gr.Row():
            gr.Markdown("### Оригинал")
            gr.Markdown("### Аугментированный")

        with gr.Row():
            with gr.Column():
                orig_wave = gr.Plot()
                orig_spec = gr.Plot()
                orig_mel = gr.Plot()
                orig_mfcc = gr.Plot()
            with gr.Column():
                aug_wave = gr.Plot()
                aug_spec = gr.Plot()
                aug_mel = gr.Plot()
                aug_mfcc = gr.Plot()

    # NOTE(review): the "original" plots are drawn from the raw microphone
    # recording (audio_input), not from the trimmed clip that process_audio
    # produces and the augmentations start from - confirm this is intended.
    show_vis_btn.click(
        fn=generate_visuals,
        inputs=[audio_input, aug_output],
        outputs=[vis_container, orig_wave, orig_spec, orig_mel, orig_mfcc, aug_wave, aug_spec, aug_mel, aug_mfcc]
    )



    # Classification block
    gr.Markdown("""
    <h2 style='color: #f97316;'>Классификация</h2>
    """)

    with gr.Row():
        feature_dropdown = gr.Dropdown(
            choices=list(FEATURE_FUNCS.keys()),
            value="Mel Spectrogram",
            label="Выберите представление данных"
        )
        model_dropdown = gr.Dropdown(
            label="Выберите модель",
            interactive=True
        )

    # Refresh the model list whenever another feature representation is chosen.
    feature_dropdown.change(
        fn=update_model_list,
        inputs=feature_dropdown,
        outputs=model_dropdown
    )

    # Fill the model list once on page load, using the value the feature
    # dropdown was constructed with.
    demo.load(
        fn=lambda: update_model_list(feature_dropdown.value),
        outputs=model_dropdown
    )

    classify_btn = gr.Button("Классифицировать")

    with gr.Row():
        command_output = gr.Textbox(label="Предсказание")
        confidence_output = gr.Textbox(label="Уверенность")
        latency_output = gr.Textbox(label="Скорость")

    # Classification runs on the augmented clip, not on the raw recording.
    classify_btn.click(
        fn=classify_handler,
        inputs=[aug_output, model_dropdown, feature_dropdown],
        outputs=[command_output, confidence_output, latency_output]
    )



# debug=True keeps the cell running so errors and logs stay visible.
demo.launch(debug=True)

It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://78691b3745baf88e5c.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
