<a href="https://colab.research.google.com/github/ypyo01/Thesis/blob/main/timit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install git+https://github.com/openai/whisper.git
! pip install jiwer

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-wsli0w19
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-wsli0w19
  Resolved https://github.com/openai/whisper.git to commit 279133e3107392276dc509148da1f41bfb532c7e
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting tiktoken (from openai-whisper==20231117)
  Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting triton>=2.0.0 (from openai-whisper==20231117)
  Downloading triton-3.0.0-1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.3 kB)
Downloading triton-3.0.0-1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (209.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m 

In [None]:
import os
import numpy as np

try:
    import tensorflow  # required in Colab to avoid protobuf compatibility issues
except ImportError:
    pass

import torch
import pandas as pd
import whisper
import torchaudio

from tqdm.notebook import tqdm


# Use the GPU when PyTorch reports one is available, otherwise fall back to CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


In [None]:
import os
import torch
import torchaudio
import whisper
from torch.utils.data import DataLoader
import numpy as np
import pandas as pd
from tqdm import tqdm

class TIMITDataset(torch.utils.data.Dataset):
    """
    Map-style dataset over every TIMIT ``.WAV`` utterance found below a directory.

    Each item is a ``(mel, transcription)`` tuple:

    * ``mel`` - log-Mel spectrogram of the audio after it has been trimmed or
      padded to Whisper's 30-second window, computed on ``device``.
    * ``transcription`` - the stripped contents of the sibling ``.TXT`` file,
      returned exactly as stored (in this corpus that includes the leading
      sample-index fields, e.g. ``"0 41165 But she ..."``).

    Args:
        root_dir: Directory that is searched recursively for ``.WAV`` files.
        device: Torch device the padded audio is moved to before the
            spectrogram is computed.
    """
    def __init__(self, root_dir, device="cuda" if torch.cuda.is_available() else "cpu"):
        self.root_dir = root_dir
        self.device = device
        self.file_paths = []

        # Walk through the dataset directory and gather file paths
        for subdir, _, files in os.walk(root_dir):
            for file in files:
                # Archives created on macOS contain "._NAME" AppleDouble
                # sidecar files that share the extension but hold no audio.
                if file.endswith(".WAV") and not file.startswith("._"):
                    self.file_paths.append(os.path.join(subdir, file))

        # os.walk yields entries in filesystem order; sort so that item
        # indices (and therefore result rows) are reproducible across runs.
        self.file_paths.sort()

    def __len__(self):
        """Return the number of utterances discovered under ``root_dir``."""
        return len(self.file_paths)

    def __getitem__(self, item):
        """
        Load one utterance.

        Args:
            item: Integer index into the sorted list of ``.WAV`` paths.

        Returns:
            ``(mel, transcription)`` as described in the class docstring.

        Raises:
            AssertionError: If the audio is not sampled at 16 kHz.
        """
        wav_path = self.file_paths[item]
        # Swap only the extension; str.replace would also rewrite any ".WAV"
        # that happens to appear in a directory name.
        txt_path = os.path.splitext(wav_path)[0] + ".TXT"

        # Load the audio file
        audio, sample_rate = torchaudio.load(wav_path)

        # Ensure sample rate is 16kHz
        assert sample_rate == 16000, (
            f"expected 16000 Hz audio, got {sample_rate} Hz: {wav_path}"
        )

        # Trim or pad the audio to 30 seconds
        audio = whisper.pad_or_trim(audio.flatten()).to(self.device)

        # Convert audio to log-Mel spectrogram
        mel = whisper.log_mel_spectrogram(audio)

        # Load the transcription
        with open(txt_path, 'r') as f:
            transcription = f.read().strip()

        return (mel, transcription)



In [None]:
%ls

dataset.zip  [0m[01;34msample_data[0m/


In [None]:
!unzip dataset.zip
%ls

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: __MACOSX/dataset/timit/TIMIT/TRAIN/DR6/MSMR0/._SX145.TXT  
  inflating: dataset/timit/TIMIT/TRAIN/DR6/MSMR0/SI1150.WRD  
  inflating: __MACOSX/dataset/timit/TIMIT/TRAIN/DR6/MSMR0/._SI1150.WRD  
  inflating: dataset/timit/TIMIT/TRAIN/DR6/MSMR0/SX415.WRD  
  inflating: __MACOSX/dataset/timit/TIMIT/TRAIN/DR6/MSMR0/._SX415.WRD  
  inflating: dataset/timit/TIMIT/TRAIN/DR6/MSMR0/SI1150.PHN  
  inflating: __MACOSX/dataset/timit/TIMIT/TRAIN/DR6/MSMR0/._SI1150.PHN  
  inflating: dataset/timit/TIMIT/TRAIN/DR6/MSMR0/SX415.PHN  
  inflating: __MACOSX/dataset/timit/TIMIT/TRAIN/DR6/MSMR0/._SX415.PHN  
  inflating: dataset/timit/TIMIT/TRAIN/DR6/MSMR0/SX145.WAV  
  inflating: __MACOSX/dataset/timit/TIMIT/TRAIN/DR6/MSMR0/._SX145.WAV  
  inflating: dataset/timit/TIMIT/TRAIN/DR6/MSMR0/SA2.PHN  
  inflating: __MACOSX/dataset/timit/TIMIT/TRAIN/DR6/MSMR0/._SA2.PHN  
  inflating: dataset/timit/TIMIT/TRAIN/DR6/MSMR0/SX55.TXT  
  inf

In [None]:
%cd dataset

/content/dataset


In [None]:
# Build the dataset once and hand that same object to the loader. The root is
# the top of the extracted TIMIT tree, so every .WAV below it is included
# (presumably both the TRAIN and TEST partitions - confirm this is intended).
dataset = TIMITDataset(root_dir="/content/dataset/timit/TIMIT")
# shuffle=False keeps hypotheses and references in dataset order.
loader = DataLoader(dataset, batch_size=4, shuffle=False)


In [None]:
# Load the Whisper checkpoint, then report its language coverage and size.
model = whisper.load_model("large-v1")

language_mode = "multilingual" if model.is_multilingual else "English-only"
parameter_count = sum(np.prod(p.shape) for p in model.parameters())
print(f"Model is {language_mode} and has {parameter_count:,} parameters.")

100%|█████████████████████████████████████| 2.87G/2.87G [24:46<00:00, 2.08MiB/s]


Model is multilingual and has 1,541,384,960 parameters.


In [None]:
# Short-form transcription: decode as English and do not emit timestamp tokens.
options = whisper.DecodingOptions(
    language="en",
    without_timestamps=True,
)


In [None]:
# Start from empty lists each time so re-running the cell does not append
# a second copy of the results.
hypotheses, references = [], []

# Decode one batch at a time; rows stay aligned because both lists are
# extended in the same iteration.
for mel_batch, text_batch in tqdm(loader):
    decoded = model.decode(mel_batch, options)
    for item in decoded:
        hypotheses.append(item.text)
    references.extend(text_batch)


100%|██████████| 1575/1575 [1:08:45<00:00,  2.62s/it]


In [None]:
# One row per utterance: the model transcript next to the ground-truth text.
data = pd.DataFrame({"hypothesis": hypotheses, "reference": references})
data

Unnamed: 0,hypothesis,reference
0,But she suffered in her off-duty hours.,0 41165 But she suffered in her off-duty hours.
1,Shaving cream is a popular item on Halloween.,0 58880 Shaving cream is a popular item on Hal...
2,Help Greg to pick a peck of potatoes.,0 121140 Help Greg to pick a peck of potatoes.
3,The misquote was retracted with an apology.,0 50279 The misquote was retracted with an apo...
4,She had your dark suit and greasy wash water a...,0 66560 She had your dark suit in greasy wash ...
...,...,...
6295,Brush fires are common in the dry underbrush o...,0 53248 Brush fires are common in the dry unde...
6296,She had your dark suit and greasy wash water a...,0 58880 She had your dark suit in greasy wash ...
6297,We know that actors can learn to portray a wid...,0 66048 We know that actors can learn to portr...
6298,I just saw Jim near the new archaeological mus...,0 55501 I just saw Jim near the new archeologi...


In [None]:
# Each reference begins with two integer fields before the sentence
# (e.g. "0 41165 But she suffered ..."). Remove both in a single anchored
# pass so the cell only has to be run once; running it again leaves the text
# unchanged, provided the sentence itself does not open with two numbers.
data["reference"] = data["reference"].str.replace(r"^\d+\s+\d+\s+", "", regex=True)

data.head()

                                          hypothesis  \
0            But she suffered in her off-duty hours.   
1      Shaving cream is a popular item on Halloween.   
2              Help Greg to pick a peck of potatoes.   
3        The misquote was retracted with an apology.   
4  She had your dark suit and greasy wash water a...   

                                           reference  \
0            But she suffered in her off-duty hours.   
1      Shaving cream is a popular item on Halloween.   
2              Help Greg to pick a peck of potatoes.   
3        The misquote was retracted with an apology.   
4  She had your dark suit in greasy wash water al...   

                                    hypothesis_clean  \
0             but she suffered in her off duty hours   
1       shaving cream is a popular item on halloween   
2               help greg to pick a peck of potatoes   
3         the misquote was retracted with an apology   
4  she had your dark suit and greasy wash wate

Unnamed: 0,hypothesis,reference,hypothesis_clean,reference_clean
0,But she suffered in her off-duty hours.,But she suffered in her off-duty hours.,but she suffered in her off duty hours,41165 but she suffered in her off duty hours
1,Shaving cream is a popular item on Halloween.,Shaving cream is a popular item on Halloween.,shaving cream is a popular item on halloween,58880 shaving cream is a popular item on hallo...
2,Help Greg to pick a peck of potatoes.,Help Greg to pick a peck of potatoes.,help greg to pick a peck of potatoes,121140 help greg to pick a peck of potatoes
3,The misquote was retracted with an apology.,The misquote was retracted with an apology.,the misquote was retracted with an apology,50279 the misquote was retracted with an apology
4,She had your dark suit and greasy wash water a...,She had your dark suit in greasy wash water al...,she had your dark suit and greasy wash water a...,66560 she had your dark suit in greasy wash wa...
...,...,...,...,...
6295,Brush fires are common in the dry underbrush o...,Brush fires are common in the dry underbrush o...,brush fires are common in the dry underbrush o...,53248 brush fires are common in the dry underb...
6296,She had your dark suit and greasy wash water a...,She had your dark suit in greasy wash water al...,she had your dark suit and greasy wash water a...,58880 she had your dark suit in greasy wash wa...
6297,We know that actors can learn to portray a wid...,We know that actors can learn to portray a wid...,we know that actors can learn to portray a wid...,66048 we know that actors can learn to portray...
6298,I just saw Jim near the new archaeological mus...,I just saw Jim near the new archeological museum.,i just saw jim near the new archaeological museum,55501 i just saw jim near the new archaeologic...


In [None]:
import jiwer
from whisper.normalizers import EnglishTextNormalizer

# Whisper's English text normalizer; applied below to both hypotheses and
# references so that they are compared in the same normalised form.
normalizer = EnglishTextNormalizer()



In [None]:
# Run the same normalizer over both text columns, writing each result to a
# sibling "<column>_clean" column.
for column in ("hypothesis", "reference"):
    data[f"{column}_clean"] = [normalizer(text) for text in data[column]]
data



Unnamed: 0,hypothesis,reference,hypothesis_clean,reference_clean
0,But she suffered in her off-duty hours.,But she suffered in her off-duty hours.,but she suffered in her off duty hours,but she suffered in her off duty hours
1,Shaving cream is a popular item on Halloween.,Shaving cream is a popular item on Halloween.,shaving cream is a popular item on halloween,shaving cream is a popular item on halloween
2,Help Greg to pick a peck of potatoes.,Help Greg to pick a peck of potatoes.,help greg to pick a peck of potatoes,help greg to pick a peck of potatoes
3,The misquote was retracted with an apology.,The misquote was retracted with an apology.,the misquote was retracted with an apology,the misquote was retracted with an apology
4,She had your dark suit and greasy wash water a...,She had your dark suit in greasy wash water al...,she had your dark suit and greasy wash water a...,she had your dark suit in greasy wash water al...
...,...,...,...,...
6295,Brush fires are common in the dry underbrush o...,Brush fires are common in the dry underbrush o...,brush fires are common in the dry underbrush o...,brush fires are common in the dry underbrush o...
6296,She had your dark suit and greasy wash water a...,She had your dark suit in greasy wash water al...,she had your dark suit and greasy wash water a...,she had your dark suit in greasy wash water al...
6297,We know that actors can learn to portray a wid...,We know that actors can learn to portray a wid...,we know that actors can learn to portray a wid...,we know that actors can learn to portray a wid...
6298,I just saw Jim near the new archaeological mus...,I just saw Jim near the new archeological museum.,i just saw jim near the new archaeological museum,i just saw jim near the new archaeological museum


In [None]:


# Corpus-level word error rate over the normalised texts (reference first).
reference_texts = data["reference_clean"].tolist()
hypothesis_texts = data["hypothesis_clean"].tolist()
wer = jiwer.wer(reference_texts, hypothesis_texts)

print(f"WER: {wer * 100:.2f} %")



WER: 2.91 %
