### Load Korean dataset

In [None]:
from datasets import load_dataset, Dataset
from datasets import Audio
from evaluate import load
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, pipeline
import torch
from jiwer import compute_measures
import pandas as pd

# Load the Korean ("ko") train split of Common Voice 17.0 in streaming mode.
dataset = load_dataset("mozilla-foundation/common_voice_17_0", "ko", split="train", streaming=True)
seed = 42  # shuffle seed; change to the value you want
# Shuffle with the fixed seed and keep the first 100 examples.
test_datasets = dataset.shuffle(seed=seed).take(100)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Word-error-rate metric object, loaded once and reused by Eval().
wer_metric = load("wer")
def Eval(reference, prediction):
    """Return the WER of a single prediction against a single reference.

    Both strings are wrapped in one-element lists before being handed to
    ``wer_metric.compute``.
    """
    return wer_metric.compute(predictions=[prediction], references=[reference])

In [3]:
def replace_sentence(reference):
    """Drop ASCII single and double quotes from *reference*, then trim it.

    Returns the cleaned string with leading and trailing whitespace removed.
    """
    # One translate pass deletes both quote characters.
    quote_table = str.maketrans("", "", "'\"")
    return reference.translate(quote_table).strip()

In [4]:
# Device selection: CUDA first, then MPS, otherwise CPU.
if torch.cuda.is_available():
    device = "cuda:0"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# float16 only when CUDA is available; float32 in every other case.
if torch.cuda.is_available():
    torch_dtype = torch.float16
else:
    torch_dtype = torch.float32

model_id = "openai/whisper-large-v3"
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)

processor = AutoProcessor.from_pretrained(model_id)
model.to(device)

# Speech-recognition pipeline built from the model and the processor's
# tokenizer and feature extractor.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
)

Device set to use cuda:0


In [5]:
# Print the first example only, to inspect the record layout.
for i in test_datasets.take(1):
    print(i)

Reading metadata...: 0it [00:00, ?it/s]

Reading metadata...: 376it [00:00, 1987.10it/s]


{'client_id': 'c6d812c8e94ecbb24fae83f2cd97e626f074b18a28f27bfdbdcd9038e80ef241e5fa751e85dd1ef11043d7966ea9deb57ff69e5398b4a84d3dbfe01b80c60684', 'path': 'ko_train_0/common_voice_ko_39744859.mp3', 'audio': {'path': 'ko_train_0/common_voice_ko_39744859.mp3', 'array': array([-4.08252167e-13, -3.11643480e-12, -1.19700678e-12, ...,
        8.09599365e-08, -5.31813565e-07, -3.35610241e-07]), 'sampling_rate': 48000}, 'sentence': '"쇰을 잡아채지 그냥 둬, 이 바보야!"', 'up_votes': 4, 'down_votes': 0, 'age': 'twenties', 'gender': 'male_masculine', 'accent': '서울', 'locale': 'ko', 'segment': '', 'variant': ''}


In [6]:
import os

In [10]:
Eval_list = []

# Settings for this evaluation run.
language = "korean"
max_samples = 30  # stop once this many accepted clips have been scored
output_path = "Eval/Korean_Eval.csv"

n = 1  # 1-based index given to the next accepted clip
for test_dataset in test_datasets:
    # Only clips with zero down-votes are scored.
    if test_dataset['down_votes'] != 0:
        continue

    audio_info = test_dataset['audio']
    file_path = audio_info['path']
    file_name = os.path.basename(file_path)
    reference = replace_sentence(test_dataset['sentence'])

    # Transcribe the clip, passing the language explicitly to generation.
    generate = pipe(audio_info, generate_kwargs={"language": language})
    prediction = generate['text'].strip()

    # Per-utterance error counts (jiwer) and the WER value from Eval().
    measures = compute_measures(reference, prediction)
    wer = Eval(reference, prediction)

    Eval_list.append({
        'index': n,
        'language': language,
        'file_path': file_path,
        'file_name': file_name,
        'reference': reference,
        'prediction': prediction,
        'S': measures['substitutions'],
        'I': measures['insertions'],
        'D': measures['deletions'],
        'N': len(reference.split()),  # whitespace-separated reference words
        'WER': wer,
    })
    n += 1
    torch.cuda.empty_cache()

    if n > max_samples:
        break

df = pd.DataFrame(Eval_list)
# Create the output folder first so that writing the CSV cannot fail on a
# missing "Eval" directory.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, encoding="utf-8-sig", index=False)

# The column is stored as 'WER' (upper case). With no accepted clips the
# frame has no columns at all, so report NaN instead of raising KeyError.
averages = df["WER"].mean() if not df.empty else float("nan")
print(f"WER : {averages}")

Reading metadata...: 376it [00:00, 1046.67it/s]
