In [2]:
import torch
import whisper
import json
from pathlib import Path
import numpy as np
import os 
from model_loader import load_model_simple, WhisperModelLoader
import wave 
import whisper

In [2]:
def get_wav_length(path):
    with wave.open(path, 'r') as wf:
        frames = wf.getnframes()
        rate = wf.getframerate()
        return frames / float(rate)

In [3]:
# load the fine tuned model 

checkpoint_dir = 'whisper-torgo-finetuned/best_model_epoch_7_wer_0.2290/'
model = load_model_simple(checkpoint_dir)

50257

In [26]:
audio_dir_path = os.path.expanduser("~/torgo_data/M02/Session1/wav_headMic")
second_path = os.path.expanduser("~/torgo_data/M02/Session1/wav_headMic")
file_lists = [f for f in os.listdir(audio_dir_path) if os.path.isfile(os.path.join(audio_dir_path, f))]
second_list = [f for f in os.listdir(second_path) if os.path.isfile(os.path.join(second_path, f))]

In [18]:
def finetuned_infer(audio_dir_path, audio_file, prompt_path):
    audio_path = Path(audio_file)
    txt_file = audio_path.with_suffix('.txt')

    print(audio_file)
    result = WhisperModelLoader.transcribe_with_finetuned_model(
        model, 
        os.path.join(audio_dir_path, audio_path),
        language='en',
        beam_size = 5
    )
    with open(os.path.join(prompt_path, txt_file)) as txt_file: 
        contents = txt_file.read()
        print(f"Transcribed: {result.text}, Ground Truth: {contents}")

In [9]:
def infer(audio_dir_path, audio_file, prompt_path):
    if prompt_path is None:
        pass
    else:
        audio_path = Path(audio_file)
        txt_file = audio_path.with_suffix(".txt")
        
        print(audio_file)
        audio = whisper.load_audio(os.path.join(audio_dir_path, audio_file))
        audio = whisper.pad_or_trim(audio)
        mel = whisper.log_mel_spectrogram(audio, n_mels=model.dims.n_mels).to(model.device)
        _, probs = model.detect_language(mel)
    
        
        options = whisper.DecodingOptions()
        result = whisper.decode(model, mel, options)
    
        # print the recognized text
        with open(os.path.join(prompt_path, txt_file)) as txt_file: 
            contents = txt_file.read()
            print(f"Transcribed: {result.text}, Ground Truth: {contents}")


In [27]:
prompt_path = os.path.expanduser("~/torgo_data/M02/Session1/prompts")

for index, f in enumerate(file_lists):
    print(f"index: {index}")
    finetuned_infer(audio_dir_path, f, prompt_path)

index: 0
0013.wav
Transcribed: I liked the wash and dishes and the guy putting away the dishes and falling off his stool., Ground Truth: input/images/kitchen.jpg
index: 1
0137.wav
Transcribed: Everything went real smooth, the sheriff said., Ground Truth: Everything went real smooth, the sheriff said. 
index: 2
0220.wav
Transcribed: freedom, Ground Truth: fear
index: 3
0130.wav
Transcribed: I tried to tell people in the community,, Ground Truth: I tried to tell people in the community.
index: 4
0227.wav
Transcribed: much, Ground Truth: much
index: 5
0068.wav
Transcribed: floor, Ground Truth: floor
index: 6
0014.wav
Transcribed: tiger, Ground Truth: dagger
index: 7
0229.wav
Transcribed: Shh., Ground Truth: [relax your mouth in its normal position]
index: 8
0066.wav
Transcribed: yet he still thinks that swiftly as ever., Ground Truth: yet he still thinks as swiftly as ever.
index: 9
0142.wav
Transcribed: whoop, Ground Truth: whoop
index: 10
0193.wav
Transcribed: go, Ground Truth: though
i

In [30]:
from IPython.display import Audio
Audio(os.path.join(audio_dir_path, file_lists[177]))