In [None]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from datasets import load_dataset
 
# Pick the GPU when one is available; half precision is only used on GPU,
# the CPU fallback stays in float32.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Checkpoint used for this inference demo.
model_id = "openai/whisper-large-v3"

# Load the model weights in the selected dtype and move them to the device.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, dtype=dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)

# The processor bundles the tokenizer and the feature extractor of the checkpoint.
processor = AutoProcessor.from_pretrained(model_id)

# Build the speech-recognition pipeline() around the already-loaded model.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,
    batch_size=16,
    return_timestamps=True,
    # The variable defined above is `dtype`; the previous `torch_dtype` name
    # did not exist and made this cell fail with a NameError.
    torch_dtype=dtype,
    device=device,
)

# Load the long-form LibriSpeech validation split and take its first recording.
dataset = load_dataset("distil-whisper/librispeech_long", "clean", split="validation")
sample = dataset[0]["audio"]

# Transcribe the sample and show the recognised text.
result = pipe(sample)
print(result["text"])

In [5]:
from huggingface_hub import notebook_login
 
# Show the Hugging Face login widget inside the notebook.
# NOTE(review): presumably required so the Common Voice download below is
# authenticated — confirm the dataset's access requirements.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [10]:
from datasets import load_dataset, DatasetDict
# Korean ("ko") subset of Common Voice 17.0: the train and validation splits are
# merged into one training split, the test split is kept as-is for evaluation.
common_voice = DatasetDict(
    {
        split_name: load_dataset(
            "mozilla-foundation/common_voice_17_0",
            "ko",
            split=split_spec,
            trust_remote_code=True,
        )
        for split_name, split_spec in [("train", "train+validation"), ("test", "test")]
    }
)

# Drop the metadata columns that are not needed downstream.
unused_columns = ["accent", "age", "client_id", "down_votes", "gender", "locale", "path", "segment", "up_votes"]
common_voice = common_voice.remove_columns(unused_columns)
common_voice

Generating train split: 0 examples [00:00, ? examples/s]

Reading metadata...: 376it [00:00, 68155.85it/s]


Generating validation split: 0 examples [00:00, ? examples/s]

Reading metadata...: 330it [00:00, 36651.85it/s]


Generating test split: 0 examples [00:00, ? examples/s]

Reading metadata...: 339it [00:00, 56544.54it/s]


Generating other split: 0 examples [00:00, ? examples/s]

Reading metadata...: 2057it [00:00, 85076.41it/s]


Generating invalidated split: 0 examples [00:00, ? examples/s]

Reading metadata...: 270it [00:00, 49604.12it/s]


Generating validated split: 0 examples [00:00, ? examples/s]

Reading metadata...: 1046it [00:00, 104652.50it/s]


DatasetDict({
    train: Dataset({
        features: ['audio', 'sentence', 'variant'],
        num_rows: 706
    })
    test: Dataset({
        features: ['audio', 'sentence', 'variant'],
        num_rows: 339
    })
})

In [9]:
# Install soundfile into the kernel's environment.
# NOTE(review): presumably the audio decoding backend needed by `datasets`;
# if so this cell has to run before the audio data is loaded — confirm.
%pip install soundfile

Collecting soundfile
  Downloading soundfile-0.13.1-py2.py3-none-win_amd64.whl.metadata (16 kB)
Downloading soundfile-0.13.1-py2.py3-none-win_amd64.whl (1.0 MB)
   ---------------------------------------- 0.0/1.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.0 MB ? eta -:--:--
   -------------------- ------------------- 0.5/1.0 MB 1.9 MB/s eta 0:00:01
   ---------------------------------------- 1.0/1.0 MB 2.0 MB/s  0:00:00
Installing collected packages: soundfile
Successfully installed soundfile-0.13.1
Note: you may need to restart the kernel to use updated packages.


In [11]:
from transformers import WhisperFeatureExtractor
 
# Load the feature extractor that ships with the "openai/whisper-small" checkpoint.
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small")


preprocessor_config.json: 0.00B [00:00, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [12]:
from transformers import WhisperTokenizer
 
# Load the "openai/whisper-small" tokenizer, configured for Korean transcription.
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", language="Korean", task="transcribe")

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

normalizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

In [16]:
%pip install librosa
def prepare_dataset(batch):
    # 리샘플링 from 48 to 16kHz
    audio = batch["audio"]
 
    # audio array -> log-Mel spectrogram
    batch["input_features"] = feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]
 
    # label 추가
    batch["labels"] = tokenizer(batch["sentence"]).input_ids
    return batch
    
common_voice = common_voice.map(prepare_dataset, remove_columns=common_voice.column_names["train"], num_proc=4)


Collecting librosa
  Downloading librosa-0.11.0-py3-none-any.whl.metadata (8.7 kB)
Collecting audioread>=2.1.9 (from librosa)
  Downloading audioread-3.0.1-py3-none-any.whl.metadata (8.4 kB)
Collecting soxr>=0.3.2 (from librosa)
  Downloading soxr-1.0.0-cp311-cp311-win_amd64.whl.metadata (5.6 kB)
Downloading librosa-0.11.0-py3-none-any.whl (260 kB)
Downloading audioread-3.0.1-py3-none-any.whl (23 kB)
Downloading soxr-1.0.0-cp311-cp311-win_amd64.whl (173 kB)
Installing collected packages: soxr, audioread, librosa

   -------------------------- ------------- 2/3 [librosa]
   -------------------------- ------------- 2/3 [librosa]
   ---------------------------------------- 3/3 [librosa]

Successfully installed audioread-3.0.1 librosa-0.11.0 soxr-1.0.0
Note: you may need to restart the kernel to use updated packages.


Map (num_proc=4):   0%|          | 0/706 [00:00<?, ? examples/s]

NameError: name 'feature_extractor' is not defined

In [None]:
import torch
 
from dataclasses import dataclass
from typing import Any, Dict, List, Union
 
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate pre-processed speech examples into one padded batch.

    Audio features and label ids are padded separately, because they have
    different lengths and need different padding methods. Padded label
    positions are replaced by -100, and a leading start token shared by every
    label sequence is stripped.

    Args:
        processor: Object exposing ``feature_extractor.pad`` and
            ``tokenizer.pad`` (plus ``tokenizer.bos_token_id`` when
            ``decoder_start_token_id`` is not given).
        decoder_start_token_id: Token id to strip from the front of the labels.
            When omitted, ``processor.tokenizer.bos_token_id`` is used, which
            is the previous behaviour.
    """

    processor: Any
    # NOTE(review): for Whisper the tokenizer's bos token is presumably not the
    # token the labels start with; pass the model's decoder start token id
    # here so the stripping below actually triggers — confirm against the model config.
    decoder_start_token_id: Union[int, None] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Build a batch with padded "input_features" and masked "labels".

        Args:
            features: Examples that each hold "input_features" and "labels".

        Returns:
            The padded audio batch with a "labels" tensor added.
        """
        # Audio inputs: hand them to the feature extractor, which returns torch tensors.
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # Labels: let the tokenizer pad the tokenized sequences.
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # Replace padding with -100 so those positions are ignored by the loss.
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # Resolve which token id marks the start of a label sequence.
        start_token_id = self.decoder_start_token_id
        if start_token_id is None:
            start_token_id = self.processor.tokenizer.bos_token_id

        # If every sequence begins with the start token, cut it here because it is
        # appended again later. The guards skip the check when no start token is
        # known or when the label tensor has no columns.
        if (
            start_token_id is not None
            and labels.shape[1] > 0
            and (labels[:, 0] == start_token_id).all().cpu().item()
        ):
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch
 
 
# Initialise the data collator.
# NOTE(review): the only `processor` visible in this notebook is the one loaded
# from "openai/whisper-large-v3", while the dataset was prepared with the
# "openai/whisper-small" feature extractor and tokenizer — confirm this is the
# intended processor.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)