In [1]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

# Inference device. Kept on CPU as before; set to e.g. "cuda:0" to run on a GPU.
device = "cpu"
# Choose the precision from the device that is actually used rather than from
# CUDA availability: with device="cpu" on a CUDA-capable host the old check
# selected float16, and half-precision inference on CPU is typically far slower
# than float32 (and not supported by every CPU kernel).
torch_dtype = torch.float16 if str(device).startswith("cuda") else torch.float32

# Local checkpoint directory holding the speech seq2seq model weights.
model_id = "/home/khj6051/whisper/whisper-turbo-ko/checkpoint-2000"
# `dtype` replaces the deprecated `torch_dtype` keyword (the library warns about
# the old name when this cell runs).
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)

# Tokenizer and feature extractor are taken from the hub repository below,
# not from the local checkpoint directory.
processor = AutoProcessor.from_pretrained("openai/whisper-large-v3-turbo")

# Speech-recognition pipeline wired to the model and processor parts above.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    dtype=torch_dtype,
    device=device,
)

  from .autonotebook import tqdm as notebook_tqdm
`torch_dtype` is deprecated! Use `dtype` instead!
Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 10894.30it/s]
Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 11949.58it/s]
Device set to use cpu


In [2]:
import librosa

import IPython.display as ipd

# Load the sample clip at a 16 kHz sampling rate; `y` is the waveform and
# `sr` the sampling rate returned by librosa.
wav_path = "/home/khj6051/whisper/whisper-turbo-ko/samples/step_001000/sample_08.wav"
y, sr = librosa.load(wav_path, sr=16000)

# Report the waveform shape and the sampling rate, one per line.
print(y.shape, sr, sep="\n")

# Render an inline audio player for the clip (cell output).
ipd.Audio(y, rate=sr)

(59184,)
16000


In [3]:
# Transcribe the clip loaded earlier. The pipeline takes the raw NumPy waveform
# directly and moves the extracted features to its own device, so wrapping the
# audio in a torch tensor and calling .to(device) is unnecessary. Passing the
# sampling rate explicitly lets the pipeline resample if it ever differs from
# the feature extractor's rate instead of silently assuming it matches.
pipe({"raw": y, "sampling_rate": sr})

`return_token_timestamps` is deprecated for WhisperFeatureExtractor and will be removed in Transformers v5. Use `return_attention_mask` instead, as the number of frames can be inferred from it.
`generation_config` default values have been modified to match model-specific defaults: {'max_new_tokens': 128}. If this is not desired, please set these values explicitly.
`generation_config` default values have been modified to match model-specific defaults: {'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50359, 50360, 50361, 50362, 50363], 'begin_suppress_tok

KeyboardInterrupt: 

In [7]:
# Show the tokenizer's EOS and PAD token ids, one per line.
tokenizer = processor.tokenizer
print(tokenizer.eos_token_id, tokenizer.pad_token_id, sep="\n")

50257
50257


In [8]:
# Two sentences of different lengths, tokenized as one padded batch.
texts = ["Hello", "So what is your name?"]
tok = processor.tokenizer(
    texts,
    add_special_tokens=True,  # make sure special tokens (EOS included) are added
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors="pt",
)
# Keep the token ids and the matching attention mask for inspection.
labels, attn = tok["input_ids"], tok["attention_mask"]

In [9]:
# Display the token-id tensor assigned to `labels` from the tokenizer output.
labels

tensor([[50258, 50364, 15947, 50257, 50257, 50257, 50257, 50257, 50257],
        [50258, 50364,  6455,   437,   307,   428,  1315,    30, 50257]])

In [None]:
from datasets import load_dataset
# Glob for the Korean tar shards of the Emilia-YODAS subset
# (use "Emilia/KO/*.tar" for the Emilia subset instead).
path = "Emilia-YODAS/KO/*.tar"
# Stream the matched shards as the "train" split.
dataset = load_dataset(
    "amphion/Emilia-Dataset",
    data_files={"train": path},
    split="train",
    streaming=True,
)
# Prints the dataset summary: its features and n_shards.
print(dataset)


Downloading readme: 100%|██████████| 13.9k/13.9k [00:00<00:00, 11.5MB/s]


IterableDataset({
    features: ['__key__', '__url__', 'json', 'mp3'],
    n_shards: 208
})
{'__key__': 'KO_vxuhJOtKu-c_W000000', '__url__': 'hf://datasets/amphion/Emilia-Dataset@d7f2f7340a6385696f3766c8049fa920a4707c07/Emilia-YODAS/KO/KO-B000000.tar', 'json': {'_id': 'KO_vxuhJOtKu-c_W000000', 'dnsmos': 3.1546, 'duration': 13.48, 'language': 'ko', 'phone_count': 93, 'speaker': 'KO_vxuhJOtKu-c_SPEAKER_01', 'text': ' 코리안 시트, 시티 파크래요. 해전 파크라고 불립니다. 이렇게 오면, 어, 여기에, 저기, 정자가 있어요.'}, 'mp3': {'path': 'KO_vxuhJOtKu-c_W000000.mp3', 'array': array([ 0.00551886,  0.00785224,  0.00119063, ..., -0.00414539,
       -0.00259165, -0.00125598], shape=(323544,)), 'sampling_rate': 24000}}


In [10]:
from datasets import load_dataset

# Glob for the Korean tar shards of the Emilia subset
# (use "Emilia-YODAS/KO/*.tar" for the Emilia-YODAS subset instead).
path = "Emilia/KO/*.tar"
# Stream the matched shards as the "train" split.
dataset = load_dataset(
    "amphion/Emilia-Dataset",
    data_files={"train": path},
    split="train",
    streaming=True,
)
# Prints the dataset summary: its features and n_shards.
print(dataset)

IterableDataset({
    features: ['__key__', '__url__', 'json', 'mp3'],
    n_shards: 40
})


In [None]:
# Print the transcript text of a window of streamed samples: indices 500
# through 601 inclusive. Earlier samples are iterated over but not printed.
first_shown, last_shown = 500, 601
for idx, data in enumerate(dataset):
    if idx >= first_shown:
        print(data['json']['text'])
        # Stop right after the last sample of the window has been printed.
        if idx >= last_shown:
            break