In [1]:
pip install torch torchaudio transformers sounddevice numpy scipy

Note: you may need to restart the kernel to use updated packages.


In [2]:
import torch
import torchaudio
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import sounddevice as sd
import numpy as np
from scipy.io.wavfile import write
import re
import time

# --- Audio capture settings -------------------------------------------------
sample_rate = 16000                       # Hz; the sample rate expected by the model
duration = 5                              # length of one recorded chunk, in seconds
block_size = int(duration * sample_rate)  # samples contained in one chunk

# --- Telugu ASR checkpoint: processor and CTC model --------------------------
model_name = "Harveenchadha/vakyansh-wav2vec2-telugu-tem-100"
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)

def process_audio(audio_np):
    """
    Normalize a waveform and transcribe it with the module-level ASR model.

    Parameters
    ----------
    audio_np : array-like
        Sequence of audio samples. It is handed to the module-level
        ``processor`` with ``sampling_rate=sample_rate``.

    Returns
    -------
    str
        The first decoded transcription from greedy (argmax) decoding of the
        model logits, with every ``<s>`` marker removed. An empty input
        returns ``""`` without invoking the processor or the model.
    """
    audio_np = np.asarray(audio_np)

    # Nothing to transcribe: mean/std of an empty array are NaN and the model
    # cannot run on zero samples, so short-circuit instead of failing later.
    if audio_np.size == 0:
        return ""

    # Zero-mean the signal, and scale to unit variance unless the signal is
    # constant (std == 0), in which case dividing would produce NaN/inf.
    centered = audio_np - audio_np.mean()
    spread = audio_np.std()
    if spread != 0:
        audio_np = centered / spread
    else:
        audio_np = centered

    # Process the audio with the ASR model; no gradients are needed for inference.
    input_values = processor(audio_np, return_tensors="pt", sampling_rate=sample_rate).input_values
    with torch.no_grad():
        logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)[0]

    # Strip the literal "<s>" markers left in the decoded text.
    return transcription.replace("<s>", "")

def main(pause_seconds=2):
    """
    Record fixed-length microphone chunks in a loop and print each transcript.

    Each iteration records ``block_size`` samples on one channel at
    ``sample_rate``, passes the first channel to ``process_audio`` and prints
    the result, then sleeps before the next recording. The loop runs until
    the user interrupts it with Ctrl+C.

    Parameters
    ----------
    pause_seconds : int or float, optional
        Time to sleep between the end of one transcription and the start of
        the next recording. Defaults to 2.
    """
    print("Starting real-time transcription. Press Ctrl+C to stop.")
    try:
        while True:
            # Notify user to start speaking; the announced length follows
            # the configured chunk duration instead of a hard-coded number.
            print(f"\n[DEBUG] READY: Speak now for {duration} seconds...")
            # Record audio for 'duration' seconds
            recording = sd.rec(block_size, samplerate=sample_rate, channels=1)
            sd.wait()  # Wait until recording is finished

            print("[DEBUG] STOPPED: Finished recording. Processing transcript...")
            # Take the single recorded channel as a 1-D float32 array
            audio_np = recording[:, 0].astype(np.float32)

            # Process audio and get transcription
            transcription = process_audio(audio_np)
            print("[DEBUG] TRANSCRIPTION:", transcription)

            # Pause before starting the next recording
            print(f"[DEBUG] Waiting for {pause_seconds} seconds before the next recording...")
            time.sleep(pause_seconds)
    except KeyboardInterrupt:
        # Make sure a recording that was in progress when Ctrl+C arrived is halted.
        sd.stop()
        print("\n[DEBUG] Terminating transcription. Goodbye!")

# Entry guard: start the transcription loop only when this code runs as the
# main module (as it does when the cell is executed), not when it is imported.
if __name__ == "__main__":
    main()

Starting real-time transcription. Press Ctrl+C to stop.

[DEBUG] READY: Speak now for 5 seconds...
[DEBUG] STOPPED: Finished recording. Processing transcript...
[DEBUG] TRANSCRIPTION: ఎలా ఉన్నారు 
[DEBUG] Waiting for 2 seconds before the next recording...

[DEBUG] READY: Speak now for 5 seconds...
[DEBUG] STOPPED: Finished recording. Processing transcript...
[DEBUG] TRANSCRIPTION: బాగున్నారా 
[DEBUG] Waiting for 2 seconds before the next recording...

[DEBUG] READY: Speak now for 5 seconds...
[DEBUG] STOPPED: Finished recording. Processing transcript...
[DEBUG] TRANSCRIPTION: కింగ్ కోహిలీ్ 
[DEBUG] Waiting for 2 seconds before the next recording...

[DEBUG] READY: Speak now for 5 seconds...
[DEBUG] STOPPED: Finished recording. Processing transcript...
[DEBUG] TRANSCRIPTION: ఫీల్ 
[DEBUG] Waiting for 2 seconds before the next recording...

[DEBUG] READY: Speak now for 5 seconds...
[DEBUG] STOPPED: Finished recording. Processing transcript...
[DEBUG] TRANSCRIPTION: మంచిగా ఉన్నా
[DEBUG] W