# bhAI STT Model Benchmarking

Benchmarks multiple Speech-to-Text models against human-reviewed ground truth.

**Workflow:**
1. Clone repo & install deps
2. Upload audio zip (one file, ~30 seconds)
3. Run all models on GPU
4. Compare WER/CER across models
5. Download results

## 1. Setup

In [None]:
# Clone the repo
import os

REPO_URL = "https://github.com/sundar911/bhAI_voicebot.git"
BRANCH = "main"

if not os.path.exists("bhAI_voicebot"):
    !git clone --branch {BRANCH} {REPO_URL}

%cd bhAI_voicebot

In [None]:
# Install dependencies
!pip install -e ".[benchmarking]" -q
!pip install onnxruntime-gpu huggingface_hub -q

# HuggingFace token — paste yours below
HF_TOKEN = ""  # <-- PASTE YOUR HUGGINGFACE TOKEN HERE

!huggingface-cli login --token {HF_TOKEN}

## 2. Upload Audio Files

Run the cell below — it will open a file picker. Select `sharepoint_audio.zip` from your machine.

This zip contains only the 86 question audio files that have ground truth transcriptions (no answer files).

In [None]:
import os
from pathlib import Path
from google.colab import files

# Upload the audio zip
print("Select sharepoint_audio.zip from your machine...")
uploaded = files.upload()

# Unzip into data/sharepoint_sync/
zip_name = list(uploaded.keys())[0]
!mkdir -p data/sharepoint_sync
!unzip -o {zip_name} -d data/sharepoint_sync/

# Show what we got
for domain in ["helpdesk", "hr_admin", "production"]:
    d = Path(f"data/sharepoint_sync/{domain}")
    count = len(list(d.glob("*"))) if d.exists() else 0
    print(f"  {domain}: {count} files")

## 3. Verify Environment

In [None]:
# Report whether a CUDA GPU is visible and how many audio files each domain
# directory under data/sharepoint_sync/ contains.
import torch

print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    # The device-properties attribute is `total_memory` (in bytes);
    # `total_mem` does not exist and raised AttributeError here.
    print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
else:
    print("WARNING: No GPU detected. Go to Runtime > Change runtime type > T4 GPU")

# Count uploaded audio files per domain (0 when the directory is missing)
from pathlib import Path
for domain in ["helpdesk", "hr_admin", "production"]:
    d = Path(f"data/sharepoint_sync/{domain}")
    count = len(list(d.glob("*"))) if d.exists() else 0
    print(f"  {domain}: {count} files")

In [None]:
# Load the ground-truth transcriptions and summarise them per domain.
import sys
from collections import Counter

# Make the repo root importable so `benchmarking` resolves as a package
sys.path.insert(0, ".")

from benchmarking.scripts.load_ground_truth import load_ground_truth

gt = load_ground_truth()
print(f"Ground truth entries: {len(gt)}")

# Tally entries by the part of each key before the first "/"
# (presumably "<domain>/<file>" — verify against load_ground_truth)
domains = Counter()
for key in gt:
    domains[key.partition("/")[0]] += 1

# Largest domain first
for d, c in domains.most_common():
    print(f"  {d}: {c}")

## 4. Quick Test — Verify One Model Works

In [None]:
# Smoke test: transcribe a single audio file with one model before
# committing to the full benchmark run.
from pathlib import Path

from src.bhai.stt.registry import get_stt, list_models

print("Available models:", list_models())

# Find a test audio file first, so the model is only loaded when there is
# actually something to transcribe.
test_file = next(Path("data/sharepoint_sync").glob("**/*.ogg"), None)

if test_file is None:
    print("No audio files found — check step 2.")
else:
    # Quick test with meta_mms (smallest model, ~2GB)
    stt = get_stt("meta_mms", work_dir=Path(".bhai_temp/test"), device="cuda")
    try:
        result = stt.transcribe(test_file)
        print(f"\nFile: {test_file.name}")
        print(f"Transcription: {result['text']}")
    finally:
        # Always run cleanup, even when transcription raises, so a failed
        # test does not leave the model's resources held
        # (presumably GPU memory / temp files — confirm in the STT class).
        stt.cleanup()
    print("\nQuick test passed!")

## 5. Run All Models

This runs each model sequentially across all audio files for each domain.
Models are loaded one at a time to manage GPU memory.

**Estimated time**: ~30-60 min per domain depending on GPU.

In [None]:
# Benchmark configuration — trim either list to narrow a run.

# Domains to benchmark, in the order they will be processed
DOMAINS = ["hr_admin", "helpdesk", "production"]

# STT model names to benchmark, in the order they will be run
MODELS = [
    "vaani_whisper", "indic_conformer", "whisper_large_v3",
    "meta_mms", "indic_wav2vec",
]

In [None]:
# Run benchmarks
for domain in DOMAINS:
    input_dir = f"data/sharepoint_sync/{domain}"
    if not Path(input_dir).exists():
        print(f"Skipping {domain} — no audio directory")
        continue

    for model in MODELS:
        print(f"\n{'#'*60}")
        print(f"# {model} on {domain}")
        print(f"{'#'*60}")
        !python benchmarking/scripts/generate_initial_transcriptions.py \
            --model {model} \
            --input {input_dir} \
            --domain {domain} \
            --device cuda \
            --append

## 6. Compare Models

In [None]:
# Run comparison for each domain
for domain in DOMAINS:
    print(f"\n{'='*60}")
    print(f"  {domain.upper()}")
    print(f"{'='*60}")
    !python benchmarking/scripts/compare_models.py \
        --domain {domain} \
        --output benchmarking/results/comparison_{domain}.csv

In [None]:
# Display results as a pandas table (if pandas available)
from pathlib import Path

# Keep the try body to the import alone, so an ImportError raised by
# unrelated code is not misreported as "pandas is missing".
try:
    import pandas as pd
except ImportError:
    pd = None
    print("Install pandas for table display: pip install pandas")

if pd is not None:
    for domain in DOMAINS:
        csv_path = f"benchmarking/results/comparison_{domain}.csv"
        if not Path(csv_path).exists():
            # Say so explicitly rather than silently showing nothing
            print(f"\n{domain.upper()}: no results at {csv_path} — run the comparison cell first.")
            continue
        df = pd.read_csv(csv_path)
        print(f"\n{domain.upper()}:")
        # Sort ascending by the "wer" column, so the lowest WER is listed first
        display(df.sort_values("wer"))

## 7. Download Results

In [None]:
# Send the benchmark outputs to the browser: one comparison CSV per domain,
# plus a zip of everything under data/transcription_dataset.
import shutil
from google.colab import files

# Per-domain comparison CSVs — domains without a CSV are skipped
for domain in DOMAINS:
    csv_path = f"benchmarking/results/comparison_{domain}.csv"
    if not Path(csv_path).exists():
        continue
    files.download(csv_path)

# Bundle the transcription dataset directory into transcription_results.zip
shutil.make_archive(
    "transcription_results",
    "zip",
    "data/transcription_dataset",
)
files.download("transcription_results.zip")

print("Done! Results downloaded.")