In [None]:
# This cell installs all required libraries for the project.
# TensorFlow and TensorFlow Hub are used for loading and running
# pre-trained deep learning models such as YAMNet.
# Librosa and SoundFile are used for audio loading and processing.
# OpenAI Whisper is used to convert human speech in audio into text.
!pip install -q tensorflow tensorflow-hub librosa soundfile openai-whisper
#Installs all required libraries for audio processing, deep learning, and speech recognition.

In [None]:
# This cell imports all required libraries into the program.
# TensorFlow and TensorFlow Hub are used for running pre-trained models.
# Librosa, SoundFile, and NumPy are used for audio loading and processing.
# Counter is used to count dominant sound events.
# Whisper is used for converting human speech in audio into text.
import csv
from collections import Counter

import librosa
import numpy as np
import soundfile as sf
import tensorflow as tf
import tensorflow_hub as hub
import whisper
#Imports the libraries used for audio processing, deep learning, and speech recognition.


In [None]:
# This cell loads the pre-trained YAMNet model for environmental sound detection,
# reads the list of sound class names shipped with the model, and loads the
# Whisper model used for converting speech audio into text.
yamnet_model = hub.load("https://tfhub.dev/google/yamnet/1")
class_map_path = yamnet_model.class_map_path().numpy().decode("utf-8")

# The class map is a CSV file: the first row is a header and the third column
# holds the class name. Names that contain commas are quoted, so splitting each
# line on "," truncates them and leaves a stray quote (labels such as '"Chicken').
# The csv module handles the quoting correctly.
with open(class_map_path, newline="") as class_map_file:
    reader = csv.reader(class_map_file)
    next(reader, None)  # skip the header row
    # Blank lines are skipped; row order defines the class index.
    class_names = [row[2] for row in reader if row]

whisper_model = whisper.load_model("tiny")

In [None]:
# This cell allows the user to either upload an audio file or record audio live using the microphone.
# The recorded or uploaded audio is saved and used as input for further processing.

# Import required modules for file upload, JavaScript-based audio recording, and audio decoding
from google.colab import files, output
from IPython.display import Javascript
from base64 import b64decode

# Hard upper limit, in seconds, for a single live recording.
MAX_SEC = 10

# JavaScript that is injected into the cell output to record audio in the
# browser. It defines `record(maxTime)`, which renders Start / Pause / Resume /
# Stop buttons with a countdown label and returns a promise that
#   - resolves with the recording encoded as a base64 data URL once the
#     recorder has stopped (Stop button or `maxTime` seconds elapsed), or
#   - rejects if the microphone cannot be opened or the data cannot be read,
#     so the Python caller gets an error instead of waiting forever.
# This is a plain (non-f) string: nothing is interpolated from Python, the
# time limit is passed as the argument of `record(...)`.
RECORD_JS = """
const b2text = blob => new Promise((resolve, reject) => {
  const reader = new FileReader();
  reader.onloadend = () => resolve(reader.result);
  reader.onerror = () => reject(reader.error);
  reader.readAsDataURL(blob);
});

var record = maxTime => new Promise((resolve, reject) => {
  // Container
  const container = document.createElement("div");
  container.style.marginTop = "8px";
  container.style.fontSize = "14px";

  // Countdown text
  const label = document.createElement("div");
  label.style.fontWeight = "bold";
  label.innerText = "Ready.";
  container.appendChild(label);

  // Buttons
  const startBtn = document.createElement("button");
  startBtn.textContent = "Start";
  startBtn.style.marginRight = "4px";

  const pauseBtn = document.createElement("button");
  pauseBtn.textContent = "Pause";
  pauseBtn.disabled = true;
  pauseBtn.style.marginRight = "4px";

  const resumeBtn = document.createElement("button");
  resumeBtn.textContent = "Resume";
  resumeBtn.disabled = true;
  resumeBtn.style.marginRight = "4px";

  const stopBtn = document.createElement("button");
  stopBtn.textContent = "Stop";
  stopBtn.disabled = true;

  container.appendChild(startBtn);
  container.appendChild(pauseBtn);
  container.appendChild(resumeBtn);
  container.appendChild(stopBtn);

  document.querySelector("#output-area").appendChild(container);

  let stream = null;
  let recorder = null;
  let chunks = [];
  let elapsed = 0;
  let intervalId = null;

  function updateLabel() {
    const remaining = Math.max(0, Math.round(maxTime - elapsed));
    label.innerText = "Recording... " + remaining + " s left";
  }

  function clearTimer() {
    if (intervalId !== null) {
      clearInterval(intervalId);
      intervalId = null;
    }
  }

  function startTimer() {
    clearTimer();
    intervalId = setInterval(() => {
      elapsed += 1;
      updateLabel();
      if (elapsed >= maxTime) {
        stopRecording();
      }
    }, 1000);
  }

  function stopRecording() {
    clearTimer();
    if (recorder && recorder.state !== "inactive") {
      recorder.stop();
    }
    if (stream) {
      stream.getTracks().forEach(t => t.stop());
    }
    startBtn.disabled = true;
    pauseBtn.disabled = true;
    resumeBtn.disabled = true;
    stopBtn.disabled = true;
  }

  startBtn.onclick = async () => {
    startBtn.disabled = true;
    label.innerText = "Requesting microphone access...";

    try {
      stream = await navigator.mediaDevices.getUserMedia({ audio: true });
    } catch (err) {
      // Permission denied or no microphone: report the failure to the caller
      // instead of leaving the promise pending.
      label.innerText = "Microphone unavailable: " + err;
      reject(err);
      return;
    }

    recorder = new MediaRecorder(stream);
    chunks = [];

    recorder.ondataavailable = e => chunks.push(e.data);

    recorder.onstop = async () => {
      try {
        const blob = new Blob(chunks);
        const text = await b2text(blob);
        label.innerText = "Recording finished.";
        resolve(text);
      } catch (err) {
        reject(err);
      }
    };

    recorder.start();
    // Pause and Stop only become available once a recorder actually exists.
    pauseBtn.disabled = false;
    stopBtn.disabled = false;
    elapsed = 0;
    updateLabel();
    startTimer();
  };

  pauseBtn.onclick = () => {
    if (recorder && recorder.state === "recording") {
      recorder.pause();
      pauseBtn.disabled = true;
      resumeBtn.disabled = false;
      label.innerText = "Paused at " + Math.round(elapsed) + " s";
      clearTimer();
    }
  };

  resumeBtn.onclick = () => {
    if (recorder && recorder.state === "paused") {
      recorder.resume();
      pauseBtn.disabled = false;
      resumeBtn.disabled = true;
      updateLabel();
      startTimer();
    }
  };

  stopBtn.onclick = () => {
    stopRecording();
  };
});
"""

# Trigger the browser-based recorder and save the captured audio to a file.
def record_audio(sec=5, fname="recorded.wav"):
    """Record audio through the browser and write it to ``fname``.

    Args:
        sec: Requested maximum recording length in seconds. Values above
            ``MAX_SEC`` are clamped to ``MAX_SEC``.
        fname: Path of the file the recording is written to.

    Returns:
        ``fname``, the path the audio was written to.

    Raises:
        RuntimeError: If the recorder does not return a base64 data URL.

    Note:
        The decoded bytes are written exactly as received from the browser;
        no format conversion happens here, so the file extension alone does
        not determine the actual audio container.
    """
    sec = min(sec, MAX_SEC)
    print(f"Use the buttons below. Max {sec} seconds.")
    display(Javascript(RECORD_JS))
    data_url = output.eval_js(f"record({sec})")  # sec in seconds

    # Expected form: "data:<mime>;base64,<payload>". Fail with a clear message
    # rather than an IndexError/AttributeError when that is not what came back.
    if not isinstance(data_url, str) or "," not in data_url:
        raise RuntimeError("The browser recorder did not return any audio data.")
    payload = data_url.split(",")[1]

    with open(fname, "wb") as f:
        f.write(b64decode(payload))
    print("Done recording.")
    return fname

# ---- MAIN CHOICE CELL ----
# Ask the user whether to upload an existing audio file or record a new one.
# Either way the resulting file path ends up in `audio_path`.
choice = input("Enter 1 for local audio, 2 for live recording: ").strip()

if choice == "1":
    # Option 1: upload an existing audio file from the local system.
    print("Upload a local audio file...")
    uploaded = files.upload()
    if not uploaded:
        # Nothing came back (presumably the upload dialog was dismissed);
        # without this check the lookup below fails with a bare IndexError.
        raise ValueError("No file was uploaded. Please run the cell again.")
    # Only the first uploaded file is used.
    audio_path = next(iter(uploaded))
    print("Using local file:", audio_path)

elif choice == "2":
    # Option 2: record audio in real time using the system microphone.
    audio_path = record_audio(sec=MAX_SEC, fname="live_audio.wav")
    print("Using recorded file:", audio_path)

else:
    # Anything other than "1" or "2" is rejected.
    raise ValueError("Invalid choice. Please run the cell again and enter 1 or 2.")
2

Use the buttons below. Max 60 seconds.


<IPython.core.display.Javascript object>

In [None]:
# This cell loads the selected audio file (uploaded or recorded) and converts
# it into a TensorFlow tensor so it can be processed by deep learning models.

# Sample rate the audio is resampled to on load; YAMNet takes mono 16 kHz input.
YAMNET_SAMPLE_RATE = 16000

# y: 1-D float waveform, sr: the sample rate actually used (YAMNET_SAMPLE_RATE).
y, sr = librosa.load(audio_path, sr=YAMNET_SAMPLE_RATE, mono=True)
waveform = tf.convert_to_tensor(y, dtype=tf.float32)


  y, sr = librosa.load(audio_path, sr=16000)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


In [None]:
# Run YAMNet on the audio waveform and, for every audio frame, keep the name
# of the sound class with the highest score.

# The model call returns the class scores together with embeddings and a
# spectrogram; only the scores are used in this cell.
scores, embeddings, spectrogram = yamnet_model(waveform)

# Best-scoring class index per frame (argmax along axis 1), then its name.
frame_top_classes = tf.math.argmax(scores, axis=1).numpy()
frame_event_labels = [class_names[class_idx] for class_idx in frame_top_classes]

# Show the labels of the first 20 frames as a quick sanity check.
print("Sample frame-level events:", frame_event_labels[:20])

Sample frame-level events: ['Speech', 'Music', 'Music', 'Music', 'Music', 'Music', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech']


In [None]:
# Identify the dominant sound events: every label that is the top class in at
# least MIN_FRAMES frames. If no label reaches that count, fall back to the
# single class with the highest score averaged over all frames.
# (Counter is imported in the imports cell at the top of the notebook.)
event_counter = Counter(frame_event_labels)

# Minimum number of frames a label must win to count as dominant
# (kept small so that short sounds are still picked up).
MIN_FRAMES = 3

dominant_events = [
    label for label, n_frames in event_counter.items() if n_frames >= MIN_FRAMES
]

# Fallback: strongest event over the whole clip.
if not dominant_events:
    mean_scores = scores.numpy().mean(axis=0)
    dominant_events = [class_names[mean_scores.argmax()]]

print("Dominant events:", dominant_events)

Dominant events: ['Speech', 'Music', '"Chicken']


In [None]:
# Mapping from detected sound-event labels to the English sentence used for
# that event in the generated caption. Labels without an entry get a generic
# sentence from the caption function.
EVENT_TO_SENTENCE = {
    # Animals
    "Cat": "A cat is meowing.",
    "Dog": "A dog is barking.",
    "Bird": "Birds are chirping.",

    # Musical instruments
    "Guitar": "A guitar is being played.",
    "Piano": "A piano is playing.",
    "Drum": "Drums are being played.",
    "Violin": "A violin is playing.",

    # Vehicles
    "Vehicle horn": "A vehicle horn is sounding.",
    "Car horn": "A car horn is honking.",
    "Train horn": "A train horn is sounding.",
    # Previously a copy of the "Train horn" sentence; a train is not a horn.
    "Train": "A train is passing by.",
    "Engine": "An engine sound is heard.",

    # Human
    "Speech": "A person is speaking.",
    "Singing": "A person is singing.",

    # Environment
    "Rain": "Rain is falling.",
    "Wind": "Wind is blowing.",
    "Footsteps": "People are walking.",

    # Full display names that contain commas, as produced when the class map
    # is parsed as proper CSV.
    # NOTE(review): confirm these spellings against the YAMNet class map file.
    "Violin, fiddle": "A violin is playing.",
    "Vehicle horn, car horn, honking": "A vehicle horn is sounding.",
    "Walk, footsteps": "People are walking.",
}

In [None]:
# This cell defines a function to detect whether human speech is present in
# the audio, based on the dominant sound events and the model's class scores.

def detect_speech(scores, class_names, dominant_events, threshold=0.5):
    """Return True when the clip is judged to contain human speech.

    Speech is reported only if both conditions hold:
      1. ``"Speech"`` is one of ``dominant_events``;
      2. the score of at least one speech-related class, averaged over the
         first axis of ``scores``, is greater than ``threshold``.

    Args:
        scores: Two-dimensional scores indexed as [frame, class]. Either an
            object with a ``.numpy()`` method (e.g. a TensorFlow tensor) or
            anything ``numpy.asarray`` accepts.
        class_names: List of class names; position ``i`` names column ``i``
            of ``scores``.
        dominant_events: Labels already selected as dominant for the clip.
        threshold: Minimum mean score (exclusive) for a speech class.

    Returns:
        bool: True if speech is detected, otherwise False.
    """
    # Speech must be a dominant event
    if "Speech" not in dominant_events:
        return False

    frame_scores = scores.numpy() if hasattr(scores, "numpy") else np.asarray(scores)
    if frame_scores.size == 0:
        # No frames: averaging would only produce NaN (and a warning).
        return False

    mean_scores = frame_scores.mean(axis=0)

    # "Narration, monologue" is the full display name obtained when the class
    # map is parsed as CSV; the short "Narration" is kept for compatibility.
    speech_classes = ("Speech", "Conversation", "Narration", "Narration, monologue")
    for cls in speech_classes:
        if cls in class_names and mean_scores[class_names.index(cls)] > threshold:
            return True

    return False

In [None]:
# This cell defines the caption generation function that converts dominant
# sound events into human-readable text (one sentence per distinct event).

def hierarchical_graphac_caption(dominant_events):
    """Build a caption from a list of sound-event labels.

    Each label is turned into a sentence: the entry from ``EVENT_TO_SENTENCE``
    when one exists, otherwise ``"The sound of <label in lower case> is heard."``.
    Sentences are joined with single spaces in the order the labels appear.
    A sentence is emitted only once even if several labels map to it, so the
    caption does not repeat itself.

    Args:
        dominant_events: Iterable of event label strings.

    Returns:
        str: The caption; an empty string when ``dominant_events`` is empty.
    """
    sentences = []
    for event in dominant_events:
        if event in EVENT_TO_SENTENCE:
            sentence = EVENT_TO_SENTENCE[event]
        else:
            sentence = f"The sound of {event.lower()} is heard."
        # Keep first-occurrence order while dropping repeated sentences.
        if sentence not in sentences:
            sentences.append(sentence)
    return " ".join(sentences)

In [None]:
# This cell defines the final decision logic: when speech is detected the
# caption combines the speech transcript with the sound-based caption,
# otherwise only the sound-based caption is returned.

def final_caption(audio_path, scores, dominant_events, class_names):
    """Produce the final caption for one audio clip.

    Args:
        audio_path: Path of the audio file handed to the Whisper model.
        scores: Class scores forwarded to ``detect_speech``.
        dominant_events: Dominant event labels for the clip.
        class_names: Class names forwarded to ``detect_speech``.

    Returns:
        str: ``'A person says: "<text>" Background sounds include: <caption>'``
        when speech is detected and the transcript is non-empty; otherwise the
        sound-based caption from ``hierarchical_graphac_caption``.
    """
    background_caption = hierarchical_graphac_caption(dominant_events)

    if not detect_speech(scores, class_names, dominant_events):
        return background_caption

    # task="translate" asks Whisper to translate the speech into English text.
    result = whisper_model.transcribe(audio_path, task="translate")
    speech_text = result["text"].strip()
    if not speech_text:
        return background_caption

    # A transcript that already ends with sentence punctuation must not get a
    # second full stop after the closing quote (that produced `."."`).
    closing = '"' if speech_text.endswith((".", "!", "?")) else '".'
    return (
        f'A person says: "{speech_text}{closing} '
        f"Background sounds include: {background_caption}"
    )

In [None]:
# Run the complete audio captioning pipeline on the selected audio and print
# the generated caption.
caption = final_caption(
    audio_path=audio_path,
    scores=scores,
    dominant_events=dominant_events,
    class_names=class_names,
)

# Heading and caption on separate lines.
print("Generated Caption:", caption, sep="\n")



Generated Caption:
A person says: "Vanakkam, my name is Vivek. I am doing ASR research.". Background sounds include: A person is speaking. The sound of cough is heard.
