In [72]:
import wave
import json
from vosk import Model, KaldiRecognizer
import soundfile as sf
from noisereduce import reduce_noise
from tabulate import tabulate
import librosa

# self written code start

# Preprocess audio: noise reduction, normalisation, resampling
def preprocess_audio(file_path, output_path):
    """Denoise, peak-normalise and (if necessary) resample a recording.

    The clip is loaded at its own sample rate, passed through
    ``reduce_noise`` and ``librosa.util.normalize``, converted to 16 kHz
    when it is not already at that rate, and written to ``output_path``.

    Args:
        file_path: Path of the recording to read.
        output_path: Path the processed audio is written to.

    Returns:
        ``output_path``, unchanged, so the call can be chained.
    """
    target_rate = 16000

    # sr=None asks librosa to keep the file's native rate instead of resampling on load
    samples, native_rate = librosa.load(file_path, sr=None)

    # Noise reduction first, then normalisation of the cleaned signal
    cleaned = librosa.util.normalize(reduce_noise(y=samples, sr=native_rate))

    # Already at the target rate: write as-is
    if native_rate == target_rate:
        sf.write(output_path, cleaned, native_rate)
        return output_path

    # Otherwise bring the signal to 16 kHz before saving
    resampled = librosa.resample(cleaned, orig_sr=native_rate, target_sr=target_rate)
    sf.write(output_path, resampled, target_rate)
    return output_path

# Recognise speech from an audio file
def recognise_audio(file_path, model_path):
    """Transcribe a WAV file with Vosk and return the result as a JSON string.

    The audio is streamed to the recogniser in chunks of 4000 frames. Each
    time the recogniser reports a finished utterance its text is kept, and
    the text of the final flush is appended, so the returned transcript
    covers the whole file rather than only the last utterance.

    Args:
        file_path: Path of a WAV file readable by the ``wave`` module.
        model_path: Directory of the Vosk model to load.

    Returns:
        A JSON string: the recogniser's final result object with its
        ``'text'`` field replaced by the space-joined text of every
        utterance recognised in the file.
    """
    # `with` releases the file handle even if model loading or decoding raises
    with wave.open(file_path, "rb") as wf:
        # Load the Vosk model for the given language
        model = Model(model_path)
        recogniser = KaldiRecognizer(model, wf.getframerate())

        utterances = []

        # Read frames and process them
        while True:
            data = wf.readframes(4000)
            if not data:
                break
            if recogniser.AcceptWaveform(data):
                # An utterance was finalised mid-file: keep its text, since
                # the final result below only holds what came after it
                utterances.append(json.loads(recogniser.Result()).get('text', ''))

        # Flush whatever audio is still buffered in the recogniser
        final = json.loads(recogniser.FinalResult())

    utterances.append(final.get('text', ''))
    # Skip empty segments so the joined transcript has no doubled spaces
    final['text'] = ' '.join(part for part in utterances if part)
    return json.dumps(final, ensure_ascii=False)

# Calculate WER
def calculate_wer(reference, recognised):
    """Return the word error rate of ``recognised`` against ``reference``.

    Both strings are split on whitespace and compared word by word
    (case-sensitively). The error count is the word-level Levenshtein
    distance, i.e. the minimum number of substitutions, deletions and
    insertions that turn the reference into the recognised text.

    Args:
        reference: The expected transcript.
        recognised: The transcript produced by the recogniser.

    Returns:
        The WER as a percentage (``errors / len(reference words) * 100``).
        It can exceed 100 when the recognised text contains many extra
        words. With an empty reference the result is ``0.0`` if the
        recognised text is empty too, otherwise ``float('inf')``.
    """
    ref_words = reference.split()
    rec_words = recognised.split()

    # No reference words: the ratio is undefined, so avoid dividing by zero
    if not ref_words:
        return 0.0 if not rec_words else float("inf")

    # previous_row[j] is the edit distance between the reference words
    # consumed so far and the first j recognised words
    previous_row = list(range(len(rec_words) + 1))
    for i, ref_word in enumerate(ref_words, start=1):
        current_row = [i]
        for j, rec_word in enumerate(rec_words, start=1):
            current_row.append(min(
                previous_row[j] + 1,                           # deletion
                current_row[j - 1] + 1,                        # insertion
                previous_row[j - 1] + (ref_word != rec_word),  # match / substitution
            ))
        previous_row = current_row

    return previous_row[-1] / len(ref_words) * 100

# Evaluation inputs: model locations and the clips to score

# Maps each language code used in `audio_files` to the path that is passed
# to recognise_audio() as the Vosk model directory
languages = {
    'en': 'models/en',
    'it': 'models/it',
    'es': 'models/es'
}

# One entry per clip:
#   'file'      - path of the WAV recording
#   'language'  - key into `languages` selecting the model
#   'reference' - expected transcript, used as the WER reference
audio_files = [
    # English clips
    {'file': '../exercise4/EN/checkin.wav', 'language': 'en', 'reference': "where is the check in desk"},
    {'file': '../exercise4/EN/parents.wav', 'language': 'en', 'reference': "i've lost my parents"},
    {'file': '../exercise4/EN/suitcase.wav', 'language': 'en', 'reference': "please i've lost my suitcase"},
    {'file': '../exercise4/EN/what_time.wav', 'language': 'en', 'reference': "what time is my plane"},
    {'file': '../exercise4/EN/where.wav', 'language': 'en', 'reference': "where are the restaurants and shops"},

    # Additional English sentences (presumably self-recorded, going by the file names)
    {'file': '../exercise4/EN/your_sentence1.wav', 'language': 'en', 'reference': "where is the washroom"},
    {'file': '../exercise4/EN/your_sentence2.wav', 'language': 'en', 'reference': "im late to my flight"},

    # Spanish clips
    {'file': '../exercise4/ES/checkin_es.wav', 'language': 'es', 'reference': "dónde están los mostradores"},
    {'file': '../exercise4/ES/parents_es.wav', 'language': 'es', 'reference': "he perdido a mis padres"},
    {'file': '../exercise4/ES/suitcase_es.wav', 'language': 'es', 'reference': "por favor he perdido mi maleta"},
    {'file': '../exercise4/ES/what_time_es.wav', 'language': 'es', 'reference': "a qué hora es mi avión"},
    {'file': '../exercise4/ES/where_es.wav', 'language': 'es', 'reference': "dónde están los restaurantes y las tiendas"},

    # Italian clips
    # NOTE(review): the what_time_it reference spells "e’" with a typographic
    # apostrophe; confirm this matches how the recogniser writes the word,
    # otherwise it will always count as an error
    {'file': '../exercise4/IT/checkin_it.wav', 'language': 'it', 'reference': "dove il bancone"},
    {'file': '../exercise4/IT/parents_it.wav', 'language': 'it', 'reference': "ho perso i miei genitori"},
    {'file': '../exercise4/IT/suitcase_it.wav', 'language': 'it', 'reference': "per favore ho perso la mia valigia"},
    {'file': '../exercise4/IT/what_time_it.wav', 'language': 'it', 'reference': "a che ora e’ il mio aereo"},
    {'file': '../exercise4/IT/where_it.wav', 'language': 'it', 'reference': "dove sono i ristoranti e i negozi"}
]

# Run every clip through preprocess -> recognise -> score, one row per clip
results = []

for audio in audio_files:
    file_path, lang = audio['file'], audio['language']

    # The cleaned copy is written alongside the original recording
    preprocessed_path = file_path.replace('.wav', '_preprocessed.wav')
    preprocess_audio(file_path, preprocessed_path)

    # The recogniser hands back JSON; only the transcript under 'text' is scored
    recognised_result = recognise_audio(preprocessed_path, languages[lang])
    recognised_text = json.loads(recognised_result).get('text', '')

    results.append({
        'Language': lang,
        'File': file_path,
        'WER': calculate_wer(audio['reference'], recognised_text)
    })

# Column titles, in the same order as the cells of each row below
headers = ["Language", "File", "WER"]

# One table row per scored clip, WER shown with two decimals
table_data = [
    [row['Language'], row['File'], f"{row['WER']:.2f}"]
    for row in results
]

# Render the summary as a grid table
print(tabulate(table_data, headers=headers, tablefmt="grid"))

# self written code end


LOG (VoskAPI:ReadDataFiles():model.cc:213) Decoding params beam=10 max-active=3000 lattice-beam=2
LOG (VoskAPI:ReadDataFiles():model.cc:216) Silence phones 1:2:3:4:5:6:7:8:9:10
LOG (VoskAPI:RemoveOrphanNodes():nnet-nnet.cc:948) Removed 0 orphan nodes.
LOG (VoskAPI:RemoveOrphanComponents():nnet-nnet.cc:847) Removing 0 orphan components.
LOG (VoskAPI:ReadDataFiles():model.cc:248) Loading i-vector extractor from models/en/ivector/final.ie
LOG (VoskAPI:ComputeDerivedVars():ivector-extractor.cc:183) Computing derived variables for iVector extractor
LOG (VoskAPI:ComputeDerivedVars():ivector-extractor.cc:204) Done.
LOG (VoskAPI:ReadDataFiles():model.cc:282) Loading HCL and G from models/en/graph/HCLr.fst models/en/graph/Gr.fst
LOG (VoskAPI:ReadDataFiles():model.cc:303) Loading winfo models/en/graph/phones/word_boundary.int
LOG (VoskAPI:ReadDataFiles():model.cc:213) Decoding params beam=10 max-active=3000 lattice-beam=2
LOG (VoskAPI:ReadDataFiles():model.cc:216) Silence phones 1:2:3:4:5:6:7:8:

+------------+------------------------------------+-------+
| Language   | File                               |   WER |
| en         | ../exercise4/EN/checkin.wav        |  0    |
+------------+------------------------------------+-------+
| en         | ../exercise4/EN/parents.wav        |  0    |
+------------+------------------------------------+-------+
| en         | ../exercise4/EN/suitcase.wav       | 20    |
+------------+------------------------------------+-------+
| en         | ../exercise4/EN/what_time.wav      | 20    |
+------------+------------------------------------+-------+
| en         | ../exercise4/EN/where.wav          |  0    |
+------------+------------------------------------+-------+
| en         | ../exercise4/EN/your_sentence1.wav |  0    |
+------------+------------------------------------+-------+
| en         | ../exercise4/EN/your_sentence2.wav | 20    |
+------------+------------------------------------+-------+
| es         | ../exercise4/ES/checkin_e