In [3]:
import pandas as pd

# Read the dataset CSV into a DataFrame
file_path = 'data/indicvoices.csv'
data = pd.read_csv(file_path)

# Report the summed 'duration' column divided by 3600
# (i.e. hours, assuming the column holds seconds -- TODO confirm the unit);
# otherwise state that the column is absent.
if 'duration' not in data.columns:
    print("The 'duration' column does not exist in the dataset.")
else:
    total_duration = data['duration'].sum()
    print(f"Total Duration: {total_duration/3600} hours")

Total Duration: 454.16105503472215 hours


In [1]:
# get the number of wav files in data/audio

import os
audio_dir = 'data/audio'
# Count the directory entries whose name ends with the '.wav' suffix
# (the suffix comparison is case-sensitive).
num_wav_files = sum(1 for entry in os.listdir(audio_dir) if entry.endswith('.wav'))
print(f"Number of WAV files in '{audio_dir}': {num_wav_files}")

Number of WAV files in 'data/audio': 222331


In [4]:
# Number of rows currently held in `data`.
# NOTE(review): `data` is not created in this cell; it relies on the cell that
# loads 'data/indicvoices.csv' having been run first. The tagging cell further
# down rebinds `data` to a plain list of CSV rows, so the value printed here
# depends on which cell ran last -- confirm the intended execution order.
num_rows = len(data)
print(f"Number of rows in the CSV file: {num_rows}")

Number of rows in the CSV file: 222331


In [5]:
import pandas as pd
import os

CSV_PATH = "data/indicvoices.csv"

# Load the CSV only when it is present on disk; otherwise stop right away.
if os.path.exists(CSV_PATH):
    print("üîπ Loading CSV...")
    df = pd.read_csv(CSV_PATH)
else:
    raise FileNotFoundError(f"{CSV_PATH} not found")

# The batch column is derived from 'audio', so that column is mandatory.
if not ("audio" in df.columns):
    raise ValueError("‚ùå 'audio' column not found in CSV")

def extract_batch(audio_name):
    """
    Extract the batch identifier from an audio file name.

    Expected name layout:
        <lang>-<batch>-<file>-<segment>.wav

    Parameters
    ----------
    audio_name : str
        Audio file name. Non-string values (for example the NaN that
        pandas uses for an empty CSV cell) are tolerated.

    Returns
    -------
    str or None
        The text between the first and second "-" of the name, or None
        when the value is not a string or contains no "-".
    """
    # Handle non-string input explicitly instead of relying on a blanket
    # `except Exception`, which would also hide unrelated errors.
    if not isinstance(audio_name, str):
        return None

    # Only the first two separators matter, so stop splitting after them.
    parts = audio_name.split("-", 2)
    if len(parts) < 2:
        return None
    return parts[1]

print("üîπ Extracting batch column...")
# Derive one batch value per row from the row's audio file name.
audio_names = df["audio"]
df["batch"] = audio_names.apply(extract_batch)

# Sanity check: report how many rows ended up without a batch value.
missing = df["batch"].isnull().sum()
if missing > 0:
    print(f"‚ö†Ô∏è Warning: {missing} rows could not extract batch")

# Persist the frame (now including 'batch') back to the same CSV path.
print("üîπ Writing updated CSV...")
df.to_csv(CSV_PATH, index=False)

print("‚úÖ Batch column added successfully")

üîπ Loading CSV...
üîπ Extracting batch column...
üîπ Writing updated CSV...
‚úÖ Batch column added successfully


In [None]:
import csv
import re
from tqdm import tqdm

# Lookup table from a raw annotation token (the text found between <...> or
# [...] in a transcript) to the "@"-prefixed tag that replaces it.
# Keys are lowercase: replace_wrapped_tokens lowercases the transcript before
# looking tokens up, and the lookup is an exact string match.
# Several raw tokens intentionally share one tag (e.g. "hum" / "humming").
RAW_TO_NOISE_TAG = {

    # -------------------------
    # Generic / fallback
    # -------------------------
    "noise": "@noise",
    "persistent-noise-start": "@noise-start",
    "persistent-noise-end": "@noise-end",

    # -------------------------
    # Background environments / chatter
    # -------------------------
    "tv": "@background-tv",
    "music": "@background-music",
    "tones": "@background-music",
    "tone": "@background-music",
    "trill": "@background-music",

    "baby": "@background-chatter",
    "child": "@background-chatter",
    "children": "@background-chatter",
    "talking": "@background-chatter",
    "child_talking": "@background-chatter",
    "children_talking": "@background-chatter",
    "baby_talking": "@background-chatter",
    "whispering": "@whispers",

    # -------------------------
    # Background human sounds
    # -------------------------
    "baby_crying": "@background-crying",
    "child_crying": "@background-crying",
    "child_whining": "@background-crying",

    "child_laughing": "@background-laughter",
    "laughter": "@background-laughter",

    "child_yelling": "@background-yelling",
    "children_yelling": "@background-yelling",
    "yelling": "@background-yelling",

    "singing": "@background-singing",
    "whistling": "@background-whistling",
    "hum": "@background-humming",
    "humming": "@background-humming",
    "sigh": "@background-sighing",

    # -------------------------
    # Animals
    # -------------------------
    "animal": "@animal-sounds",
    "barking": "@animal-sounds",
    "meow": "@animal-sounds",

    "bird_squawk": "@bird-sounds",
    "squawking": "@bird-sounds",

    # -------------------------
    # Vehicles / machines
    # -------------------------
    "motorcycle": "@vehicle-noise",
    "printer": "@mechanical-noise",
    "typewriter": "@typing",
    "phone_vibrating": "@mechanical-noise",

    # -------------------------
    # Discrete background noises
    # -------------------------
    "dishes": "@mechanical-noise",
    "door": "@mechanical-noise",

    "footsteps": "@footsteps",
    "click": "@click",
    "clicking": "@click",
    "clink": "@clinking",
    "clinking": "@clinking",
    "clanking": "@clanking",
    "clanging": "@clanging",
    "tapping": "@tapping",
    "scratching": "@scratching",
    "squeak": "@squeak",
    "thumping": "@thumping",
    "pounding": "@pounding",
    "screeching": "@screeching",
    "rattling": "@rattling",
    "rustling": "@rustling",
    # NOTE(review): "popping" shares the "@pounding" tag while the neighbouring
    # sounds each get their own tag -- confirm this merge is intentional.
    "popping": "@pounding",
    "smack": "@smack",

    "static": "@static",
    "hiss": "@hiss",

    # -------------------------
    # Signals / alerts
    # -------------------------
    "beep": "@beep",
    "bell": "@bell",
    "buzz": "@buzz",
    "buzzer": "@buzz",
    "ringing": "@ringing",
    "phone_ringing": "@phone-ringing",
    "horn": "@horn",
    "siren": "@siren",
    "chiming": "@chiming",

    # -------------------------
    # Speaker disfluencies
    # -------------------------
    "uhh": "@uhh",
    "umm": "@umm",
    "hmm": "@hmm",
    "uh-huh": "@uh-huh",
    "tsk": "@tsk",
    "stammers": "@stammering",

    # -------------------------
    # Speaker-produced sounds (non-persistent)
    # -------------------------
    "breathing": "@breathing",
    "inhaling": "@inhaling",
    "sniffing": "@sniffing",
    "sniffle": "@sniffing",
    "nose_blowing": "@nose-blowing",
    "cough": "@cough",
    "sneezing": "@sneezing",
    "throat_clearing": "@throat-clearing",
    "yawning": "@yawning",
    "swallowing": "@eating",
    "snoring": "@snoring",
    "wheezing": "@wheezing",
    "gasp": "@breathing",

    "groan": "@groan",
    "ugh": "@ugh",

    # -------------------------
    # Other
    # -------------------------
    "unintelligible": "@unintelligible"
}


# Match <token> or [token].
# Groups: 1 = opening wrapper, 2 = token text, 3 = closing wrapper.
# The token class excludes every wrapper character, so a stray "<" or "["
# earlier in the text cannot swallow a later, well-formed token
# (previously "x < y [noise]" matched "< y [noise]" as one unknown token and
# the real [noise] was never tagged).
WRAPPED_TOKEN_REGEX = re.compile(r'(<|\[)([^<>\[\]]+)(>|\])')

def replace_wrapped_tokens(text, mapping):
    """
    Replace wrapped tokens like <word> or [word] with their mapped tag.

    The whole transcript is lowercased BEFORE matching, so the returned
    text is lowercase as well. When a token is found in ``mapping`` the
    wrapper (< > or [ ]) is REMOVED and only the mapped tag is emitted;
    tokens that are not in ``mapping`` are left in place, wrapper included.

    Parameters
    ----------
    text : str
        Transcript text. Falsy values (None, "") are returned unchanged.
    mapping : dict
        Lowercase token text -> replacement tag. Lookup is an exact match
        on the text between the wrappers (no whitespace trimming).

    Returns
    -------
    str
        The lowercased transcript with known tokens replaced, or ``text``
        itself when it is falsy.
    """
    if not text:
        return text

    # Normalize transcript text to lowercase so lookups match the
    # lowercase mapping keys.
    text = text.lower()

    def replacer(match):
        # Fall back to the untouched match (wrapper included) when the
        # token has no mapping.
        return mapping.get(match.group(2), match.group(0))

    return WRAPPED_TOKEN_REGEX.sub(replacer, text)



def fetch_data(file_path):
    """
    Read a UTF-8 CSV file into a header row and a list of data rows.

    Parameters
    ----------
    file_path : str
        Path of the CSV file to read.

    Returns
    -------
    tuple(list, list)
        ``(header, data)`` where ``header`` is the first CSV row and
        ``data`` holds every following row as a list of strings.
        An empty file yields ``([], [])``.

    Notes
    -----
    This function never raises: any error is printed and whatever was
    read up to that point is returned (``([], [])`` when the file could
    not be opened at all).
    """
    data = []
    header = []
    try:
        # newline="" leaves line endings to the csv module; without it,
        # "\r\n" inside quoted fields is silently rewritten to "\n".
        with open(file_path, mode="r", encoding="utf-8", newline="") as file:
            reader = csv.reader(file)
            # Default of [] keeps an empty file from raising StopIteration.
            header = next(reader, [])
            for row in reader:
                data.append(row)
    except Exception as e:
        print(f"Error reading file: {e}")
    return header, data


# -------------------------
# Main processing
# -------------------------

path = "data/indicvoices.csv"
header, data = fetch_data(path)

# Look up the positions of the two transcript columns; an absent column
# name is reported as a RuntimeError.
try:
    verbatim_idx = header.index("unsanitized_verbatim")
    normalized_idx = header.index("unsanitized_normalized")
except ValueError as e:
    raise RuntimeError(f"Missing required column: {e}")

# Rewrite both transcript cells of every row in place (verbatim first,
# then normalized), showing a progress bar while doing so.
for row in tqdm(data, desc="Tagging transcripts", unit="rows"):
    for col_idx in (verbatim_idx, normalized_idx):
        row[col_idx] = replace_wrapped_tokens(row[col_idx], RAW_TO_NOISE_TAG)

# Write the header followed by the tagged rows to a new CSV file.
output_path = "data/indicvoices_rsml_ready.csv"
with open(output_path, "w", encoding="utf-8", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(header)
    for row in data:
        writer.writerow(row)

print(f"‚úÖ Tagging complete. Output saved to: {output_path}")


Tagging transcripts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 222331/222331 [00:00<00:00, 326982.43rows/s]


‚úÖ Tagging complete. Output saved to: data/indicvoices_tagged.csv
