# Exercise 4 (Automatic Speech Recognition system for an airport virtual assistant)

This notebook implements Exercise 4.1.
1: Implement offline ASR using Vosk to handle messages in 3 different languages.
2: Add noise handling in the form of a light band-pass filter and a small noise gate.
3: Add custom messages and generate a WER report for the system's output.

In [28]:
# === Notebook settings ===
# Languages to evaluate: any subset of 'en', 'it', 'es'
# (more can be listed once extra models are added).
LANGS = ['en', 'it', 'es']

# Noise-handling pipeline (band-pass + noise gate); switched off for the initial run.
USE_DENOISE = False

# Debug switch: print REF/HYP for each sample.
DEBUG_ASR_PER_SAMPLE = True

# Debug switch: enable verbose Vosk logging.
DEBUG_VOSK = False

In [29]:
# === Import project modules ===
import pandas as pd
import vosk

from config import ASSETS_DIR, MODELS_DIR, TRANSCRIPT_CSV_PATH, LANG_FOLDERS
from asr.vosk_asr import load_model
from eval.manifest import load_transcriptions
from eval.wer import (
    build_hypotheses_from_assets_vosk,
    evaluate_transcriptions,
    aggregate_corpus,
    aggregate_by_lang,
    print_sample_debug,
)

In [30]:
# === Vosk setup and basic checks ===
# Vosk log level: 1 when DEBUG_VOSK is switched on, -1 otherwise.
if DEBUG_VOSK:
    vosk.SetLogLevel(1)
else:
    vosk.SetLogLevel(-1)

# Normalise the configured language codes (trim + lowercase), then reject
# any code that has no entry in LANG_FOLDERS.
langs = [code.strip().lower() for code in LANGS]
invalid = [code for code in langs if code not in LANG_FOLDERS]
if invalid:
    raise ValueError(f'Unknown languages: {invalid}. Allowed: {list(LANG_FOLDERS.keys())}')

# Echo the effective configuration so the run is self-describing.
print(
    '=== Exercise 4 notebook run ===',
    f'Assets folder: {ASSETS_DIR}',
    f'Models folder: {MODELS_DIR}',
    f'Languages: {langs}',
    f'Denoise usage: {USE_DENOISE}',
    f'Debug per sample: {DEBUG_ASR_PER_SAMPLE}',
    f'Debug Vosk: {DEBUG_VOSK}',
    sep='\n',
)

=== Exercise 4 notebook run ===
Assets folder: /home/penguin/my_projects/cm3065_isp/exercise4/assets
Models folder: /home/penguin/my_projects/cm3065_isp/exercise4/models
Languages: ['en', 'it', 'es']
Denoise usage: False
Debug per sample: True
Debug Vosk: False


In [31]:
# === Load references for asset files (manifest) ===
# Reference transcriptions are read from the CSV manifest at TRANSCRIPT_CSV_PATH.
references = load_transcriptions(TRANSCRIPT_CSV_PATH)
print(f'Loaded {len(references)} reference transcriptions from: {TRANSCRIPT_CSV_PATH}')

# === Load Vosk models ===
# One model per validated language code, keyed by that code.
models = {code: load_model(code) for code in langs}
print(f'Loaded {len(models)} Vosk model(s): {list(models.keys())}')

Loaded 22 reference transcriptions from: /home/penguin/my_projects/cm3065_isp/exercise4/transcriptions.csv
Loaded 3 Vosk model(s): ['en', 'it', 'es']


##### Comparative evaluation

In this section the ASR system is evaluated twice on the same dataset to show the effect of noise handling:
1. The baseline configuration does not use any noise handling.
2. The second configuration passes the audio through the denoising pipeline (band-pass + noise gate).

This allows a direct comparison of Word Error Rate (WER) and helps to highlight the effect of noise suppression on recognition performance.

In [32]:
# === Run every WAV through the ASR pipeline to obtain hypotheses ===
# Primary run: denoising follows the USE_DENOISE setting.
hypotheses = build_hypotheses_from_assets_vosk(models, use_denoise=USE_DENOISE)
print(f'Generated {len(hypotheses)} ASR hypotheses from assets folders.')

# Comparison run: when the primary run is the un-denoised baseline,
# produce a second set of hypotheses with denoising switched on.
if not USE_DENOISE:
    hypotheses2 = build_hypotheses_from_assets_vosk(models, use_denoise=True)

Generated 17 ASR hypotheses from assets folders.


In [33]:
# === Evaluate WER by comparing hypothesis vs references ===
# Primary run, scored with denoising as configured by USE_DENOISE.
rows = evaluate_transcriptions(hypotheses, references)
debug_runs = [(f'=== DENOISE PIPELINE: {USE_DENOISE} ===', rows)]

# Comparison run WITH the noise reduction pipeline; it only exists when the
# primary run is the un-denoised baseline.
if not USE_DENOISE:
    rows2 = evaluate_transcriptions(hypotheses2, references)
    debug_runs.append(('=== DENOISE PIPELINE: True ===', rows2))

# Optional per-sample REF/HYP dump for every run that was scored.
if DEBUG_ASR_PER_SAMPLE:
    for banner, scored in debug_runs:
        print(banner)
        print_sample_debug(scored)

print(f'Scored samples: {len(rows)}')

=== DENOISE PIPELINE: False ===

=== Per-sample debug ===
------------------------------
Lang    : en
File    : checkin.wav
REF     : Where is the check in desk?
HYP     : where is the check in desk
S/D/I/N : 0 / 0 / 0 / 6
WER     : 0.0000
------------------------------
Lang    : en
File    : my_sentence_1.wav
REF     : Is there a place to relax here?
HYP     : is there place to relax yeah
S/D/I/N : 1 / 1 / 0 / 7
WER     : 0.2857
------------------------------
Lang    : en
File    : my_sentence_2.wav
REF     : Can I charge my phone here?
HYP     : can they charge my phone here
S/D/I/N : 1 / 0 / 0 / 6
WER     : 0.1667
------------------------------
Lang    : en
File    : parents.wav
REF     : I have lost my parents.
HYP     : i lost my parents
S/D/I/N : 0 / 1 / 0 / 5
WER     : 0.2000
------------------------------
Lang    : en
File    : suitcase.wav
REF     : Please, I have lost my suitcase.
HYP     : please have lost my suitcase
S/D/I/N : 0 / 1 / 0 / 6
WER     : 0.1667
----------------

In [34]:
# === Display results in table ===
# `rows` was scored with denoising as configured by USE_DENOISE, so the
# primary table must be titled accordingly; a fixed 'No denoise' title
# mislabels the data whenever USE_DENOISE is True.
baseline_title = 'With denoise' if USE_DENOISE else 'No denoise'
df_baseline = pd.DataFrame(rows)

# Comparison table with denoising; only available when the primary run
# is the un-denoised baseline (rows2 is not produced otherwise).
df_alternative = pd.DataFrame(rows2) if (not USE_DENOISE) else None

tables = [
    (baseline_title, df_baseline),
    ('With denoise', df_alternative)
]

# Columns to show; REF/HYP text is included only in per-sample debug mode.
display_cols = ['lang', 'filename', 'WER_%']
if DEBUG_ASR_PER_SAMPLE:
    display_cols += ['ref', 'hyp']

for title, df in tables:
    # Skip if there is nothing to show for this configuration
    if df is None or df.empty:
        continue

    # Derive the percentage column on a new frame so the source frame is not mutated
    table = (
        df.assign(**{'WER_%': (df['wer'] * 100.0).round(2)})[display_cols]
        .sort_values(['lang', 'filename'])
        .reset_index(drop=True)
    )

    print(title)
    display(table)

# Small guard clause: tell the user when neither configuration produced scored rows
if all(frame is None or frame.empty for frame in (df_baseline, df_alternative)):
    print('No scored rows. Check that assets filenames match transcriptions.csv.')

No denoise


Unnamed: 0,lang,filename,WER_%,ref,hyp
0,en,checkin.wav,0.0,Where is the check in desk?,where is the check in desk
1,en,my_sentence_1.wav,28.57,Is there a place to relax here?,is there place to relax yeah
2,en,my_sentence_2.wav,16.67,Can I charge my phone here?,can they charge my phone here
3,en,parents.wav,20.0,I have lost my parents.,i lost my parents
4,en,suitcase.wav,16.67,"Please, I have lost my suitcase.",please have lost my suitcase
5,en,what_time.wav,0.0,What time is my plane?,what time is my plane
6,en,where.wav,0.0,Where are the restaurants and shops?,where are the restaurants and shops
7,es,checkin_es.wav,0.0,¿Dónde están los mostradores?,dónde están los mostradores
8,es,parents_es.wav,0.0,He perdido a mis padres.,he perdido a mis padres
9,es,suitcase_es.wav,0.0,"Por favor, he perdido mi maleta.",por favor he perdido mi maleta


With denoise


Unnamed: 0,lang,filename,WER_%,ref,hyp
0,en,checkin.wav,0.0,Where is the check in desk?,where is the check in desk
1,en,my_sentence_1.wav,14.29,Is there a place to relax here?,is there a place to relax yeah
2,en,my_sentence_2.wav,16.67,Can I charge my phone here?,can they charge my phone here
3,en,parents.wav,20.0,I have lost my parents.,i lost my parents
4,en,suitcase.wav,16.67,"Please, I have lost my suitcase.",please have lost my suitcase
5,en,what_time.wav,0.0,What time is my plane?,what time is my plane
6,en,where.wav,0.0,Where are the restaurants and shops?,where are the restaurants and shops
7,es,checkin_es.wav,25.0,¿Dónde están los mostradores?,dónde están los mostrador
8,es,parents_es.wav,0.0,He perdido a mis padres.,he perdido a mis padres
9,es,suitcase_es.wav,0.0,"Por favor, he perdido mi maleta.",por favor he perdido mi maleta


In [35]:
# === Aggregate results (overall + language grouped) ===
# `rows` was scored with denoising as configured by USE_DENOISE, so its label
# follows that setting. The comparison run `rows2` is only created when
# USE_DENOISE is False; referencing it unconditionally raises NameError on a
# fresh kernel with USE_DENOISE = True, so it is added conditionally.
all_rows = [('With denoise' if USE_DENOISE else 'No denoise', rows)]
if not USE_DENOISE:
    all_rows.append(('With denoise', rows2))

for label, scored_rows in all_rows:
    # Corpus-level totals and the per-language breakdown for this configuration
    overall = aggregate_corpus(scored_rows)
    by_lang = aggregate_by_lang(scored_rows)

    print(f'\n=== Summary {label} ===')
    print(f"Overall WER: {overall['wer']:.4f}  (S={overall['S']}, D={overall['D']}, I={overall['I']}, N={overall['N']})")

    # Languages are listed alphabetically by code
    for lang, stats in sorted(by_lang.items()):
        print(f"{lang.upper()} WER: {stats['wer']:.4f}  (S={stats['S']}, D={stats['D']}, I={stats['I']}, N={stats['N']})")


=== Summary No denoise ===
Overall WER: 0.1212  (S=5, D=7, I=0, N=99)
EN WER: 0.1220  (S=2, D=3, I=0, N=41)
ES WER: 0.1786  (S=3, D=2, I=0, N=28)
IT WER: 0.0667  (S=0, D=2, I=0, N=30)

=== Summary With denoise ===
Overall WER: 0.1111  (S=5, D=6, I=0, N=99)
EN WER: 0.0976  (S=2, D=2, I=0, N=41)
ES WER: 0.1786  (S=3, D=2, I=0, N=28)
IT WER: 0.0667  (S=0, D=2, I=0, N=30)
