In [1]:
!pip install numpy
!pip install torch torchvision torchaudio
!pip install pandas
!pip install -U openai-whisper



In [2]:
import os
import numpy as np

try:
    import tensorflow  # required in Colab to avoid protobuf compatibility issues
except ImportError:
    pass

import torch
import pandas as pd
import whisper
import torchaudio

from tqdm.notebook import tqdm


# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"


In [3]:


class LibriSpeech(torch.utils.data.Dataset):
    """
    Thin wrapper around torchaudio's LibriSpeech dataset that yields
    ``(log-Mel spectrogram, transcript)`` pairs.

    Each waveform is padded or trimmed to 30 seconds, so the final few seconds
    of the small number of longer utterances are discarded.
    """

    def __init__(self, split="test-other", device=DEVICE):
        # Device the padded waveform is moved to before the spectrogram is computed.
        self.device = device

        # The split is stored under the user's ~/.cache directory.
        cache_dir = os.path.expanduser("~/.cache")
        self.dataset = torchaudio.datasets.LIBRISPEECH(
            root=cache_dir, url=split, download=True
        )

    def __len__(self):
        # One item per entry of the wrapped torchaudio dataset.
        return len(self.dataset)

    def __getitem__(self, item):
        # Only the first three fields of the six-field record are used.
        waveform, sample_rate, transcript, _, _, _ = self.dataset[item]
        # Guard: every clip is expected to be sampled at 16 kHz.
        assert sample_rate == 16000

        # Collapse to 1-D, pad/trim, then move to the target device.
        padded = whisper.pad_or_trim(waveform.flatten())
        padded = padded.to(self.device)

        return whisper.log_mel_spectrogram(padded), transcript



In [4]:
# Evaluate on the "test-other" split, feeding four utterances per batch.
dataset = LibriSpeech(split="test-other")
loader = torch.utils.data.DataLoader(dataset=dataset, batch_size=4)


In [4]:
# Load the "base.en" checkpoint and report whether it is multilingual and how
# many parameters it has.
# NOTE(review): the next cell reassigns `model` to "medium.en", so when the
# notebook is run top to bottom this checkpoint is not the one evaluated below.
model = whisper.load_model("base.en")
# p.numel() returns an exact Python int for every parameter, so the total is
# always an integer; np.prod(p.shape) yields a float for zero-dimensional shapes.
print(
    f"Model is {'multilingual' if model.is_multilingual else 'English-only'} "
    f"and has {sum(p.numel() for p in model.parameters()):,} parameters."
)

  checkpoint = torch.load(fp, map_location=device)


Model is English-only and has 71,825,408 parameters.


In [5]:
# Load the "medium.en" checkpoint and report whether it is multilingual and how
# many parameters it has. When the notebook is run top to bottom, this is the
# `model` used by the decoding loop further down.
model = whisper.load_model("medium.en")
# p.numel() returns an exact Python int for every parameter, so the total is
# always an integer; np.prod(p.shape) yields a float for zero-dimensional shapes.
print(
    f"Model is {'multilingual' if model.is_multilingual else 'English-only'} "
    f"and has {sum(p.numel() for p in model.parameters()):,} parameters."
)

  checkpoint = torch.load(fp, map_location=device)


Model is English-only and has 762,320,896 parameters.


In [6]:
# Short-form transcription: decode English and skip timestamp prediction.
options = whisper.DecodingOptions(
    without_timestamps=True,
    language="en",
)


In [7]:
# Decode the dataset batch by batch, collecting every predicted transcript
# alongside its reference text in matching order. Both lists are reset here,
# so re-running the cell does not accumulate duplicates.
hypotheses, references = [], []

for mels, texts in tqdm(loader):
    results = model.decode(mels, options)
    hypotheses += [result.text for result in results]
    references += texts


  0%|          | 0/735 [00:00<?, ?it/s]

In [8]:
# Pair each prediction with its reference transcript, one row per utterance.
data = pd.DataFrame({"hypothesis": hypotheses, "reference": references})
data

Unnamed: 0,hypothesis,reference
0,"There's iron, they say, in all our blood, And ...",THERE'S IRON THEY SAY IN ALL OUR BLOOD AND A G...
1,"ìMargaret,î said Mr. Hale, as he returned from...",MARGARET SAID MISTER HALE AS HE RETURNED FROM ...
2,you don't mean that you thought me so silly,YOU DON'T MEAN THAT YOU THOUGHT ME SO SILLY
3,"I really like that account of himself, better ...",I REALLY LIKED THAT ACCOUNT OF HIMSELF BETTER ...
4,His statement of having been a shop boy was th...,HIS STATEMENT OF HAVING BEEN A SHOP BOY WAS TH...
...,...,...
2934,"Poor Isaac was hurried off accordingly, and ex...",POOR ISAAC WAS HURRIED OFF ACCORDINGLY AND EXP...
2935,The assurance that she possessed some friend i...,THE ASSURANCE THAT SHE POSSESSED SOME FRIEND I...
2936,She gazed accordingly upon a scene which might...,SHE GAZED ACCORDINGLY UPON A SCENE WHICH MIGHT...
2937,At his feet was placed a table occupied by two...,AT HIS FEET WAS PLACED A TABLE OCCUPIED BY TWO...


In [9]:
import jiwer
from whisper.normalizers import EnglishTextNormalizer

# Text normalizer applied to both predictions and references before scoring.
# Judging by the table printed by the next cell, it lower-cases the text,
# strips punctuation, expands contractions ("there's" -> "there is") and
# writes number words as digits ("two" -> "2").
normalizer = EnglishTextNormalizer()



In [10]:
# Add a normalized copy of each text column; the raw columns are kept so the
# two forms can be compared side by side in the displayed table.
data["hypothesis_clean"] = list(map(normalizer, data["hypothesis"]))
data["reference_clean"] = list(map(normalizer, data["reference"]))
data



Unnamed: 0,hypothesis,reference,hypothesis_clean,reference_clean
0,"There's iron, they say, in all our blood, And ...",THERE'S IRON THEY SAY IN ALL OUR BLOOD AND A G...,there is iron they say in all our blood and a ...,there is iron they say in all our blood and a ...
1,"ìMargaret,î said Mr. Hale, as he returned from...",MARGARET SAID MISTER HALE AS HE RETURNED FROM ...,imargaret i said mister hale as he returned fr...,margaret said mister hale as he returned from ...
2,you don't mean that you thought me so silly,YOU DON'T MEAN THAT YOU THOUGHT ME SO SILLY,you do not mean that you thought me so silly,you do not mean that you thought me so silly
3,"I really like that account of himself, better ...",I REALLY LIKED THAT ACCOUNT OF HIMSELF BETTER ...,i really like that account of himself better t...,i really liked that account of himself better ...
4,His statement of having been a shop boy was th...,HIS STATEMENT OF HAVING BEEN A SHOP BOY WAS TH...,his statement of having been a shop boy was th...,his statement of having been a shop boy was th...
...,...,...,...,...
2934,"Poor Isaac was hurried off accordingly, and ex...",POOR ISAAC WAS HURRIED OFF ACCORDINGLY AND EXP...,poor isaac was hurried off accordingly and exp...,poor isaac was hurried off accordingly and exp...
2935,The assurance that she possessed some friend i...,THE ASSURANCE THAT SHE POSSESSED SOME FRIEND I...,the assurance that she possessed some friend i...,the assurance that she possessed some friend i...
2936,She gazed accordingly upon a scene which might...,SHE GAZED ACCORDINGLY UPON A SCENE WHICH MIGHT...,she gazed accordingly upon a scene which might...,she gazed accordingly upon a scene which might...
2937,At his feet was placed a table occupied by two...,AT HIS FEET WAS PLACED A TABLE OCCUPIED BY TWO...,at his feet was placed a table occupied by 2 s...,at his feet was placed a table occupied by 2 s...


In [11]:


# Word error rate of the normalized hypotheses against the normalized
# references (reference first, hypothesis second), printed as a percentage.
wer = jiwer.wer(data["reference_clean"].tolist(), data["hypothesis_clean"].tolist())

print(f"WER: {wer * 100:.2f} %")



WER: 5.83 %
