<table align="left">
  <td>
    <a href="https://colab.research.google.com/github/ufidon/nlp/blob/main/asrs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>
  </td>
  <td>
    <a target="_blank" href="https://kaggle.com/kernels/welcome?src=https://github.com/ufidon/nlp/blob/main/asrs.ipynb"><img src="https://kaggle.com/static/images/open-in-kaggle.svg" alt="Open In Kaggle"/></a>
  </td>
</table>
<br>

# [Speech Recognition & Synthesis Applications](https://huggingface.co/learn/audio-course/chapter7/introduction)
- From [HuggingFace Audio Course](https://huggingface.co/learn/audio-course)

- 📖 [Unit 4: Build A Music Genre Classifier](https://huggingface.co/learn/audio-course/chapter4)

In [None]:
# ①. Pre-trained models and datasets for audio classification
# install the latest transformers
!pip install git+https://github.com/huggingface/transformers

# Keyword spotting (KWS) is the task of identifying a keyword in a spoken utterance. 

In [None]:
# 1.1. Fetch the "train" split of the "en-AU" configuration of MINDS-14
from datasets import load_dataset

minds = load_dataset(
    path="PolyAI/minds14",
    name="en-AU",
    split="train",
)

In [None]:
# 1.2. Build an audio-classification pipeline around the MINDS-14 checkpoint
from transformers import pipeline

classifier = pipeline(task="audio-classification", model="anton-l/xtreme_s_xlsr_300m_minds14")

In [None]:
# 1.3. Run the classifier on the audio of the first MINDS-14 example
first_example = minds[0]
classifier(first_example["audio"])

In [None]:
# 2. Speech Commands
# Stream the validation split of Speech Commands ("v0.02" configuration)
# and keep only the first example it yields.
speech_commands = load_dataset(
    path="speech_commands",
    name="v0.02",
    split="validation",
    streaming=True,
)
sample = next(iter(speech_commands))

In [None]:
# Build an audio-classification pipeline for speech commands
classifier = pipeline(
    task="audio-classification",
    model="MIT/ast-finetuned-speech-commands-v2",
)

# A copy of the audio dict is handed to the pipeline; presumably this keeps
# `sample["audio"]` intact for the playback cell that follows — TODO confirm.
audio_input = sample["audio"].copy()
classifier(audio_input)

In [None]:
# Listen to the clip to check the prediction by ear
from IPython.display import Audio

clip = sample["audio"]
Audio(data=clip["array"], rate=clip["sampling_rate"])

In [None]:
# 3. Language Identification
# FLEURS (Few-shot Learning Evaluation of Universal Representations of Speech)
# is a dataset for evaluating speech recognition systems in 102 languages.

# Stream the validation split of the "all" configuration and take the
# first example it yields.
fleurs = load_dataset(
    path="google/fleurs",
    name="all",
    split="validation",
    streaming=True,
)
sample = next(iter(fleurs))

In [None]:
# Build an audio-classification pipeline for language identification
classifier = pipeline(
    task="audio-classification",
    model="sanchit-gandhi/whisper-medium-fleurs-lang-id",
)

In [None]:
# Run the language-identification classifier on the FLEURS example
fleurs_audio = sample["audio"]
classifier(fleurs_audio)

In [None]:
# 4. Zero-Shot Audio Classification
# Stream the training split of the ESC-50 environmental sound dataset and
# keep the raw waveform of the first clip.
dataset = load_dataset(path="ashraq/esc50", split="train", streaming=True)
first_clip = next(iter(dataset))
audio_sample = first_clip["audio"]["array"]

# Candidate labels, written as free text, for the model to choose from.
candidate_labels = ["Sound of a dog", "Sound of vacuum cleaner"]

# Score the clip against every candidate label to find the label
# that is most similar to the audio input.
classifier = pipeline(
    task="zero-shot-audio-classification",
    model="laion/clap-htsat-unfused",
)
classifier(audio_sample, candidate_labels=candidate_labels)

In [None]:
# Confirm the result by listening to the clip.
# The playback rate is read from the dataset rather than hard-coded, so the
# clip is played at the rate it was decoded at (a wrong rate changes both
# speed and pitch). `audio_sample` is the first clip yielded by `dataset`,
# so the first example of a fresh iterator carries the matching rate.
esc_sampling_rate = next(iter(dataset))["audio"]["sampling_rate"]
Audio(audio_sample, rate=esc_sampling_rate)

In [None]:
# ② Fine-tuning a model for music classification


In [None]:
# ③ Build a demo with Gradio


- 📖 [Unit 7: Putting It All Together](https://huggingface.co/learn/audio-course/chapter7)

- 📖 [Unit 3: Transformer Architectures For Audio](https://huggingface.co/learn/audio-course/chapter3)