# ASR (Automatic Speech Recognition) Module

## Install Dependencies

In [1]:
pip install --upgrade --no-cache-dir openai-whisper


Note: you may need to restart the kernel to use updated packages.


In [6]:
pip install sounddevice wave transformers sentence-transformers librosa soundfile noisereduce

Collecting noisereduce
  Downloading noisereduce-3.0.3-py3-none-any.whl.metadata (14 kB)
Collecting matplotlib (from noisereduce)
  Downloading matplotlib-3.10.1-cp311-cp311-macosx_10_12_x86_64.whl.metadata (11 kB)
Collecting contourpy>=1.0.1 (from matplotlib->noisereduce)
  Downloading contourpy-1.3.1-cp311-cp311-macosx_10_9_x86_64.whl.metadata (5.4 kB)
Collecting cycler>=0.10 (from matplotlib->noisereduce)
  Downloading cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib->noisereduce)
  Downloading fonttools-4.56.0-cp311-cp311-macosx_10_9_x86_64.whl.metadata (101 kB)
Collecting kiwisolver>=1.3.1 (from matplotlib->noisereduce)
  Downloading kiwisolver-1.4.8-cp311-cp311-macosx_10_9_x86_64.whl.metadata (6.2 kB)
Collecting pyparsing>=2.3.1 (from matplotlib->noisereduce)
  Downloading pyparsing-3.2.1-py3-none-any.whl.metadata (5.0 kB)
Downloading noisereduce-3.0.3-py3-none-any.whl (22 kB)
Downloading matplotlib-3.10.1-cp311-cp311-macosx_10_12_x86

## ASR

In [7]:
import whisper
import faiss
import torch
import sounddevice as sd
import numpy as np
import wave
import librosa
import noisereduce as nr
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
import soundfile as sf


  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# Load Whisper model (using 'small' for efficiency)
asr_model = whisper.load_model("small")

# Load embedding model for memory storage
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Initialize FAISS index for vector storage.
# The index dimension is read from the loaded model instead of being hard-coded,
# so swapping the embedding model cannot leave the index with a mismatched size.
embedding_dim = embedding_model.get_sentence_embedding_dimension()
index = faiss.IndexFlatL2(embedding_dim)


In [9]:
def record_audio(filename, duration=10, samplerate=16000):
    """
    Capture mono microphone input and save it as a 16-bit PCM WAV file.

    The call blocks until the whole recording has been captured.

    Args:
        filename (str): Destination path of the WAV file.
        duration (int): Length of the recording in seconds.
        samplerate (int): Sampling rate used for capture and for the WAV header.
    """
    print("Recording...")
    total_frames = int(duration * samplerate)
    samples = sd.rec(total_frames, samplerate=samplerate, channels=1, dtype=np.int16)
    sd.wait()  # block until the recording has completed
    print("Recording finished.")

    with wave.open(filename, "wb") as wav_out:
        # Header fields mirror the capture settings above:
        # 1 channel, 2 bytes per sample (int16), frame count filled in on write.
        wav_out.setparams((1, 2, samplerate, 0, "NONE", "not compressed"))
        wav_out.writeframes(samples.tobytes())


In [10]:
def reduce_noise(input_audio, output_audio):
    """
    Denoise an audio file and write the result to a new file.

    Args:
        input_audio (str): Path of the noisy source recording.
        output_audio (str): Path where the denoised audio is written.
    Returns:
        str: ``output_audio``, returned unchanged so calls can be chained.
    """
    # sr=None is passed so the loader reports the rate it read from the file,
    # and that same rate is reused for denoising and for writing the output.
    waveform, native_rate = librosa.load(input_audio, sr=None)
    denoised = nr.reduce_noise(y=waveform, sr=native_rate)
    sf.write(output_audio, denoised, native_rate)
    return output_audio


In [11]:
def transcribe_audio(audio_path):
    """
    Denoise a recording and run Whisper ASR on the cleaned result.

    The denoised intermediate is written to "cleaned_audio.wav" in the
    current working directory.

    Args:
        audio_path (str): Path of the recording to transcribe.
    Returns:
        str: The "text" entry of the Whisper transcription result.
    """
    denoised_path = reduce_noise(audio_path, "cleaned_audio.wav")
    return asr_model.transcribe(denoised_path)["text"]

def store_transcription_in_memory(text):
    """
    Converts transcribed text into an embedding and stores it in FAISS.

    Empty or whitespace-only text is skipped so that no meaningless vector
    is added to the index.

    Args:
        text (str): Transcribed speech text.
    """
    # Guard: nothing useful can be embedded from missing/blank text.
    if not text or not text.strip():
        print("Nothing to store: transcription is empty.")
        return

    embedding = embedding_model.encode([text])
    # The index is fed a C-contiguous float32 array, one row per text.
    embedding = np.ascontiguousarray(embedding, dtype=np.float32)
    index.add(embedding)  # Store in FAISS
    print("Stored in Memory Module:", text)


In [13]:
# End-to-end test: record from the microphone, transcribe, and store the result.
audio_file = "recorded_audio.wav"
record_audio(audio_file)
transcription = transcribe_audio(audio_file)
if transcription:
    store_transcription_in_memory(transcription)
else:
    # Make the "nothing recognised" case visible instead of ending silently.
    print("No speech was transcribed; nothing stored.")


Recording...
Recording finished.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Stored in Memory Module: What did you do last week?你上周做了什么?Good morning. 早上好.
