<a href="https://colab.research.google.com/github/ypyo01/Thesis/blob/main/timit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install -U openai-whisper

Collecting openai-whisper
  Downloading openai-whisper-20231117.tar.gz (798 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/798.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m788.5/798.6 kB[0m [31m26.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m798.6/798.6 kB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting triton<3,>=2.0.0 (from openai-whisper)
  Downloading triton-2.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Collecting tiktoken (from openai-whisper)
  Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Downloading triton-2.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux201

In [4]:
!unzip dataset.zip
%ls

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: dataset/timit/TIMIT/TRAIN/DR4/MGAG0/SX421.WRD  
  inflating: dataset/timit/TIMIT/TRAIN/DR4/MGAG0/SA2.TXT  
  inflating: dataset/timit/TIMIT/TRAIN/DR4/MGAG0/SX421.PHN  
  inflating: dataset/timit/TIMIT/TRAIN/DR4/MGAG0/SA1.TXT  
  inflating: dataset/timit/TIMIT/TRAIN/DR4/MGAG0/SI645.PHN  
  inflating: dataset/timit/TIMIT/TRAIN/DR4/MGAG0/SX241.TXT  
  inflating: dataset/timit/TIMIT/TRAIN/DR4/MGAG0/SI645.WAV  
  inflating: dataset/timit/TIMIT/TRAIN/DR4/MGAG0/SI691.TXT  
  inflating: dataset/timit/TIMIT/TRAIN/DR4/MGAG0/SX421.WAV  
  inflating: dataset/timit/TIMIT/TRAIN/DR4/MGAG0/SI645.TXT  
  inflating: dataset/timit/TIMIT/TRAIN/DR4/MGAG0/SA1.PHN  
  inflating: dataset/timit/TIMIT/TRAIN/DR4/MGAG0/SI645.WRD  
  inflating: dataset/timit/TIMIT/TRAIN/DR4/MGAG0/SX61.WRD  
  inflating: dataset/timit/TIMIT/TRAIN/DR4/MGAG0/SX151.PHN  
  inflating: dataset/timit/TIMIT/TRAIN/DR4/MGAG0/SI691.WAV  
  inflating: dataset/timit/

In [5]:
# Change into the extracted archive directory and list its contents.
# NOTE(review): the path is relative, so running this cell a second time from
# inside dataset/ will look for dataset/dataset — confirm whether the working
# directory matters, since the dataset path used later is absolute.
%cd dataset
%ls

/content/dataset
[0m[01;34mtimit[0m/


In [3]:
import os
import torch
import torchaudio
import whisper
from torch.utils.data import DataLoader
import numpy as np
import pandas as pd
from tqdm import tqdm

class TIMITDataset(torch.utils.data.Dataset):
    """
    A simple class to load TIMIT data and trim/pad the audio to 30 seconds.

    Every file under ``root_dir`` whose name ends in ``.WAV`` is one item.
    Indexing an item loads the audio, pads or trims it with
    ``whisper.pad_or_trim``, converts it to a log-Mel spectrogram and pairs it
    with the stripped contents of the ``.TXT`` file that sits next to the
    audio file.

    Args:
        root_dir: Directory searched recursively for ``.WAV`` files.
        device: Device the padded audio is moved to before the spectrogram is
            computed. Defaults to ``"cuda"`` when available, otherwise ``"cpu"``.

    Attributes:
        file_paths: Sorted list of the audio file paths that were found.
    """
    def __init__(self, root_dir, device="cuda" if torch.cuda.is_available() else "cpu"):
        self.root_dir = root_dir
        self.device = device
        self.file_paths = []

        # Walk through the dataset directory and gather file paths
        for subdir, _, files in os.walk(root_dir):
            for file in files:
                if file.endswith(".WAV"):
                    self.file_paths.append(os.path.join(subdir, file))

        # os.walk yields entries in an arbitrary, filesystem-dependent order;
        # sorting makes item indices (and result row order) reproducible.
        self.file_paths.sort()

    def __len__(self):
        """Return the number of audio files found under ``root_dir``."""
        return len(self.file_paths)

    def __getitem__(self, item):
        """
        Load one utterance.

        Args:
            item: Index into ``file_paths``.

        Returns:
            A ``(mel, transcription)`` tuple: the log-Mel spectrogram of the
            padded/trimmed audio and the stripped text of the matching
            ``.TXT`` file.

        Raises:
            AssertionError: If the audio file is not sampled at 16 kHz.
        """
        wav_path = self.file_paths[item]
        # Swap only the extension; str.replace would also rewrite any ".WAV"
        # that happens to appear in a directory name.
        txt_path = os.path.splitext(wav_path)[0] + ".TXT"

        # Load the audio file
        audio, sample_rate = torchaudio.load(wav_path)

        assert sample_rate == 16000, (
            f"{wav_path}: expected 16000 Hz audio, got {sample_rate} Hz"
        )

        # torchaudio.load returns (channels, frames). Averaging over the channel
        # axis is a no-op for mono files and avoids concatenating channels
        # end-to-end (what flatten() would do) for multi-channel files.
        audio = whisper.pad_or_trim(audio.mean(dim=0)).to(self.device)

        # Convert audio to log-Mel spectrogram
        mel = whisper.log_mel_spectrogram(audio)

        # Load the transcription
        with open(txt_path, 'r') as f:
            transcription = f.read().strip()

        return (mel, transcription)


In [6]:
# Root directory of the extracted TIMIT corpus
dataset_path = "/content/dataset/timit/TIMIT"

# Wrap the corpus in a loader that yields one utterance at a time,
# in the dataset's own order
dataset = TIMITDataset(dataset_path)
loader = DataLoader(dataset=dataset, shuffle=False, batch_size=1)


In [7]:
# Fetch the pretrained Whisper checkpoint and report its variant and size
model = whisper.load_model("large-v1")
variant = "multilingual" if model.is_multilingual else "English-only"
parameter_count = sum(np.prod(p.shape) for p in model.parameters())
print(f"Model is {variant} and has {parameter_count:,} parameters.")


100%|█████████████████████████████████████| 2.87G/2.87G [02:21<00:00, 21.9MiB/s]
  checkpoint = torch.load(fp, map_location=device)


Model is multilingual and has 1,541,384,960 parameters.


In [8]:
# Decoding configuration: language fixed to English, timestamps disabled
options = whisper.DecodingOptions(
    without_timestamps=True,
    language="en",
)


In [None]:
# Print the versions of the runtime this notebook executes on:
# Python interpreter, CUDA availability as seen by PyTorch, PyTorch itself,
# and the installed CUDA compiler.
import torch
!python --version
print(torch.cuda.is_available())
print(torch.__version__)
!nvcc --version

Python 3.10.12
True
2.4.0
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2021 NVIDIA Corporation
Built on Thu_Nov_18_09:45:30_PST_2021
Cuda compilation tools, release 11.5, V11.5.119
Build cuda_11.5.r11.5/compiler.30672275_0


In [None]:
import torch
# Ask PyTorch to release cached GPU memory it is no longer using before the
# long decoding run.
torch.cuda.empty_cache()

In [None]:
import re

# Each reference read from a TIMIT .TXT file starts with two integer fields
# (sample offsets) before the prompt text. Neither field has a fixed width, so
# match "any digits, whitespace, any digits" instead of assuming the second
# number has exactly five digits — a 6-digit value would otherwise leave a
# stray digit at the start of the reference and inflate the WER.
SAMPLE_RANGE_PREFIX = re.compile(r'^\d+\s+\d+\s*')

hypotheses = []
references = []

for mels, texts in tqdm(loader):
    # Decode the batch of spectrograms with the options configured earlier
    results = model.decode(mels, options)
    hypotheses.extend([result.text for result in results])

    # Clean up the references by removing the leading sample offsets
    cleaned_references = [SAMPLE_RANGE_PREFIX.sub('', text) for text in texts]
    references.extend(cleaned_references)

# Store the results in a DataFrame
data = pd.DataFrame(dict(hypothesis=hypotheses, reference=references))
data


 82%|████████▏ | 5159/6300 [1:26:25<19:35,  1.03s/it]

In [None]:
import jiwer
from whisper.normalizers import EnglishTextNormalizer

# Text normalizer shipped with Whisper; applied to both hypotheses and
# references before scoring so both sides go through the same text cleanup.
normalizer = EnglishTextNormalizer()

In [None]:
# Add a normalized copy of each text column next to the raw one
for raw_column, clean_column in (
    ("hypothesis", "hypothesis_clean"),
    ("reference", "reference_clean"),
):
    data[clean_column] = [normalizer(text) for text in data[raw_column]]
data

In [None]:
# Word error rate over the normalized transcripts
reference_texts = data["reference_clean"].tolist()
hypothesis_texts = data["hypothesis_clean"].tolist()
wer = jiwer.wer(reference_texts, hypothesis_texts)

print(f"WER: {wer * 100:.2f} %")