<a href="https://colab.research.google.com/github/xpdlaldam/nlp/blob/master/Hugging%20Face/audio_analysis/speech_recognition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Libraries

In [None]:
# Install/upgrade the packages this notebook uses: datasets, gradio, evaluate and
# transformers (with its sentencepiece extra).
!pip install -U datasets gradio evaluate transformers[sentencepiece]

In [None]:
pip install -U datasets

In [None]:
from transformers import pipeline
from datasets import load_dataset
from transformers.pipelines.pt_utils import KeyDataset
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import torch
from tqdm.auto import tqdm
import gradio as gr

In [None]:
# Load the pretrained CTC model and its matching processor from one checkpoint.
checkpoint = "facebook/wav2vec2-base-960h"
model = Wav2Vec2ForCTC.from_pretrained(checkpoint)
processor = Wav2Vec2Processor.from_pretrained(checkpoint)

# Load the "clean" configuration (validation split) of the dummy LibriSpeech
# dataset that supplies the audio used for transcription.
ds = load_dataset(
    "patrickvonplaten/librispeech_asr_dummy",
    "clean",
    split="validation",
)

In [None]:
# Load the test split of the "asr" configuration of the "superb" dataset.
dataset = load_dataset(
    "superb",
    name="asr",
    split="test",
)

In [None]:
from huggingface_hub import list_datasets
# Print the `id` of every entry yielded by list_datasets(). The loop variable
# is deliberately not called `dataset`, a name this notebook also uses for
# loaded datasets.
print([entry.id for entry in list_datasets()])

In [None]:
# Load the training split of the "ko-KR" configuration of PolyAI/minds14.
minds = load_dataset(
    "PolyAI/minds14",
    name="ko-KR",
    split="train",
)
# Bare expression so the notebook displays the dataset summary.
minds

In [None]:
# Load the training split of the "en-GB" configuration of PolyAI/minds14.
minds = load_dataset(
    "PolyAI/minds14",
    name="en-GB",
    split="train",
)
# Bare expression so the notebook displays the dataset summary.
minds

In [None]:
# Keep a handle on the "intent_class" feature's int -> str converter.
id2label = minds.features["intent_class"].int2str

# Look up the intent name of the first example.
first_intent = minds[0]["intent_class"]
id2label(first_intent)

In [None]:
# Stream data instead of downloading the full dataset
dataset = load_dataset("ymoslem/EUbookshop-Speech-Irish", split="train", streaming=True)

# Take the first 5 samples. `take(5)` walks a single pass over the stream;
# the previous `next(iter(dataset))` built a fresh iterator on every loop
# iteration and therefore returned the first sample five times.
small_sample = list(dataset.take(5))
small_sample

In [None]:
# Rebind the int -> str converter of the "intent_class" feature and show the
# intent name of the first example.
intent_feature = minds.features["intent_class"]
id2label = intent_feature.int2str
id2label(minds[0]["intent_class"])

In [None]:
# Display the values of the "intent_class" column.
minds["intent_class"]

In [None]:
def generate_audio(index: int = 0):
    """Return one example of the global `minds` dataset for display in `gr.Audio`.

    Args:
        index: Position of the example inside `minds`. Defaults to 0, which
            reproduces the previous hard-coded behavior.

    Returns:
        A pair `((sampling_rate, samples), label)`: `sampling_rate` is in Hz,
        `samples` is the array holding the sound as numbers, and `label` is
        the intent name produced by the global `id2label` converter.
    """
    example = minds[index]
    audio = example["audio"]
    # Sample rate first, data second.
    waveform = (audio["sampling_rate"], audio["array"])
    return waveform, id2label(example["intent_class"])

In [None]:
# Build a one-column page holding a single audio player for the sample
# returned by generate_audio(); the intent name becomes the player's label.
with gr.Blocks() as demo:
    with gr.Column():
        audio, label = generate_audio()
        output = gr.Audio(audio, label=label)

# Launch the app in debug mode.
demo.launch(debug=True)

In [None]:
# Fetch the first sample's audio once and reuse it instead of indexing the
# dataset separately for every field.
sample = ds[0]["audio"]

# Tokenize. Passing the sampling rate lets the processor check it against the
# rate it was configured for instead of silently assuming they match.
input_values = processor(
    sample["array"],
    sampling_rate=sample["sampling_rate"],
    return_tensors="pt",
    padding="longest",
).input_values  # Batch size 1

# Retrieve logits. This is inference only, so disable gradient tracking to
# avoid building an autograd graph that is never used.
with torch.no_grad():
    logits = model(input_values).logits

# Take argmax over the vocabulary dimension and decode the ids to text
predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.batch_decode(predicted_ids)

In [None]:
import gradio as gr
from datasets import load_dataset, Audio

# Audio feature object used to decode the audio entries taken from the stream.
audio_decoder = Audio()

# Open the dataset in streaming mode and keep one shared iterator over it, so
# successive next() calls move forward through the stream.
streaming_data = load_dataset(
    "ymoslem/EUbookshop-Speech-Irish",
    split="train",
    streaming=True,
)
stream_iter = iter(streaming_data)

# Function to generate audio and label from the stream
def generate_audio():
    """Fetch the next streamed sample and return it for the Gradio outputs.

    Returns:
        A pair `((sampling_rate, audio_array), label)`. The inner tuple is
        ordered sample rate first, data second, which is the tuple format
        `gr.Audio` accepts for numpy audio. `label` is the English
        translation, or a placeholder string when none is present.

    Raises:
        StopIteration: If the shared stream iterator is exhausted.
    """
    # Get the next sample from the shared stream iterator
    example = next(stream_iter)

    # Decode the audio
    audio = audio_decoder.decode_example(example["audio"])
    audio_array = audio["array"]
    sampling_rate = audio["sampling_rate"]

    # Get English translation as label
    # NOTE(review): assumes a nested "translation" -> "en" field — confirm
    # against the dataset's actual column names.
    label = example.get("translation", {}).get("en", "No English translation available")

    # Sample rate must come first; the reversed order is not a valid value
    # for the audio component.
    return (sampling_rate, audio_array), label

# Build the Gradio UI
with gr.Blocks() as demo:
    with gr.Column():
        # Output widgets: the audio player and the text box for its label
        audio_component = gr.Audio(label="Irish Speech Audio", type="numpy")
        label_component = gr.Textbox(label="English Translation")

        def update():
            """Click handler: return the next (audio, label) pair."""
            return generate_audio()

        # Each click fetches the next sample and fills both widgets
        next_button = gr.Button("Get Next Sample")
        next_button.click(
            fn=update,
            inputs=[],
            outputs=[audio_component, label_component],
        )

# Launch the app in debug mode.
demo.launch(debug=True)
