In [2]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from datasets import load_dataset


# Use the first CUDA GPU with half precision when CUDA is available;
# otherwise stay on the CPU with full precision.
if torch.cuda.is_available():
    device, torch_dtype = "cuda:0", torch.float16
else:
    device, torch_dtype = "cpu", torch.float32

# Checkpoint identifier passed to both from_pretrained() calls below.
model_id = "ghost613/whisper-large-v3-turbo-korean"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
model.to(device)

# Kept as the last statement so the cell ends on an assignment and shows no output.
processor = AutoProcessor.from_pretrained(model_id)

In [2]:
# Load model directly
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq

# One identifier for the checkpoint so the processor and the model are always
# loaded from the same place.
ko_model_id = "o0dimplz0o/Whisper-Large-v3-turbo-STT-Zeroth-KO-v2"

koprocessor = AutoProcessor.from_pretrained(ko_model_id)
komodel = AutoModelForSpeechSeq2Seq.from_pretrained(ko_model_id)

# Persist a local copy of the model. Only `komodel` is saved here; `koprocessor`
# is not written to this directory.
komodel.save_pretrained("/workspace/o0dimplz0o_zeroth_ko_v2")

In [3]:
# Assemble the speech-recognition pipeline from the already-loaded `komodel`
# and the tokenizer / feature extractor carried by `koprocessor`.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=komodel,
    feature_extractor=koprocessor.feature_extractor,
    tokenizer=koprocessor.tokenizer,
    device=device,
    torch_dtype=torch_dtype,
)

Device set to use cuda:0


In [4]:
import io
import base64
import time
import soundfile as sf
import librosa

# Example input: read an MP3 from disk and base64-encode it, standing in for
# audio that arrives as a base64 string.
with open("eo_05.mp3", "rb") as f:
    b64_audio = base64.b64encode(f.read()).decode()

audio_bytes = base64.b64decode(b64_audio)

# Time decode + downmix + resample. perf_counter() is a monotonic,
# high-resolution clock, so the measured interval is not affected by
# system clock adjustments the way time.time() can be.
stt = time.perf_counter()
# always_2d=True guarantees a 2-D array even for single-channel audio, so the
# channel average below works for both mono and multi-channel input.
data, sr = sf.read(io.BytesIO(audio_bytes), dtype="float32", always_2d=True)
mono = data.mean(axis=1)  # convert to mono
mono_16k = librosa.resample(mono, orig_sr=sr, target_sr=16000)
print(time.perf_counter() - stt)

10.055109977722168


In [7]:
import time

# NOTE(review): `params` is defined but never passed to `pipe(...)` below, so
# none of these settings affect this run. The key names look like
# faster-whisper `transcribe()` options rather than Transformers pipeline
# arguments -- confirm before wiring them in.
params = dict(
    beam_size=1,                # 1 = greedy; beam search (>=2) is slower. 1-2 usually recommended
    best_of=1,                  # meaningless when greedy; keep it small
    temperature=0.0,            # more stable / faster (a fallback list such as [0.0, 0.2, 0.4] is possible when uncertain)
    vad_filter=True,            # drop silence -> less audio to decode
    vad_parameters=dict(min_silence_duration_ms=500),
    chunk_length=15,            # seconds; 10-20 s recommended (20-30 if long context is needed)
    no_speech_threshold=0.6,    # stricter no-speech detection to cut wasted work
    log_prob_threshold=-1.0,    # if too low, treat as a failure and consider temperature fallback
    word_timestamps=False,      # faster with word timestamps off (True only when needed)
    condition_on_previous_text=False,  # less dependence between chunks -> faster / more stable
)

# Time a single pipeline call on the 16 kHz mono array. perf_counter() is
# monotonic and high-resolution, which makes it the right clock for intervals.
st = time.perf_counter()
result = pipe(
    mono_16k,
    generate_kwargs={"language": "korean"},
)
print(result, "\n", time.perf_counter() - st)

{'text': '그런 얘기들 되게 많이 하시는데 정석을 급히 작겠어요'} 
 0.40485095977783203


In [20]:
import inspect
# Print the call signature of `model.forward` to see which keyword arguments
# it accepts.
print(inspect.signature(model.forward))

(input_features: Optional[torch.FloatTensor] = None, attention_mask: Optional[torch.LongTensor] = None, decoder_input_ids: Optional[torch.LongTensor] = None, decoder_attention_mask: Optional[torch.LongTensor] = None, head_mask: Optional[torch.Tensor] = None, decoder_head_mask: Optional[torch.Tensor] = None, cross_attn_head_mask: Optional[torch.Tensor] = None, encoder_outputs: Optional[tuple[tuple[torch.FloatTensor]]] = None, past_key_values: Optional[transformers.cache_utils.Cache] = None, decoder_inputs_embeds: Optional[tuple[torch.FloatTensor]] = None, decoder_position_ids: Optional[tuple[torch.LongTensor]] = None, labels: Optional[torch.LongTensor] = None, use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None) -> Union[tuple[torch.Tensor], transformers.modeling_outputs.Seq2SeqLMOutput]


변환 완료: output.mp3


In [None]:
from datasets import load_dataset

# Load the "o0dimplz0o/Zeroth-STT-Korean" dataset; the following cells read
# ds['train']['text'] from it.
ds = load_dataset("o0dimplz0o/Zeroth-STT-Korean")

In [12]:
# Collect the distinct values of the train split's 'text' column.
texts = set(ds['train']['text'])

In [14]:
# Number of rows in the train split. Asking the split for its length avoids
# materialising the whole 'text' column as a Python list just to count it.
len(ds["train"])

102263

In [1]:
import nemo.collections.asr as nemo_asr

# Load the "nvidia/canary-1b-v2" checkpoint; later cells call
# asr_model.transcribe() on it.
asr_model = nemo_asr.models.ASRModel.from_pretrained("nvidia/canary-1b-v2")

[NeMo I 2025-08-19 01:46:35 nemo_logging:393] Tokenizer CanaryBPETokenizer initialized with 16384 tokens


[NeMo W 2025-08-19 01:46:35 nemo_logging:405] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    use_lhotse: true
    skip_missing_manifest_entries: true
    input_cfg: null
    tarred_audio_filepaths: null
    manifest_filepath: null
    sample_rate: 16000
    shuffle: true
    num_workers: 4
    pin_memory: true
    prompt_format: canary2
    max_duration: 40.0
    min_duration: 0.01
    text_field: answer
    lang_field: target_lang
    use_bucketing: true
    max_tps: null
    bucket_duration_bins: null
    bucket_batch_size: null
    num_buckets: null
    bucket_buffer_size: 20000
    shuffle_buffer_size: 10000
    
[NeMo W 2025-08-19 01:46:35 nemo_logging:405] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the 

[NeMo I 2025-08-19 01:46:35 nemo_logging:393] PADDING: 0
[NeMo I 2025-08-19 01:46:47 nemo_logging:393] Model EncDecMultiTaskModel was successfully restored from /root/.cache/huggingface/hub/models--nvidia--canary-1b-v2/snapshots/efbc7e5840703084f201cd086cf936c02d363b22/canary-1b-v2.nemo.


In [2]:
import io
import base64
import time
import soundfile as sf
import librosa
import re

# Alternative input path (disabled): decode a base64-encoded MP3 in memory,
# convert it to mono and resample to 16 kHz.
# with open("eo_05.mp3", "rb") as f:
#     b64_audio = base64.b64encode(f.read()).decode()

# audio_bytes = base64.b64decode(b64_audio)
# data, sr = sf.read(io.BytesIO(audio_bytes), dtype="float32", always_2d=True)
# mono = data.mean(axis=1)  # convert to mono
# mono_16k = librosa.resample(mono, orig_sr=sr, target_sr=16000)

# Load the clip as 16 kHz mono before the timer starts, so file loading is
# not part of the measured interval.
audio, sr = librosa.load('talks.mp3', sr=16000, mono=True)

# perf_counter() is monotonic and high-resolution, which makes it the right
# clock for measuring an elapsed interval.
st = time.perf_counter()
transcriptions = asr_model.transcribe([audio])
# Remove any <|...|> tags from the first result's text and trim whitespace.
cleaned = re.sub(r"<\|.*?\|>", "", transcriptions[0].text).strip()
print(cleaned, "\n", time.perf_counter() - st)

Transcribing: 100%|██████████| 1/1 [00:01<00:00,  1.87s/it]

They walked hand in hand back to their village, the rising sun illuminating their path. The villagers, who had long mourned Finn, gasped in disbelief, then erupted in joyous cries. Elara had not only found her brother, but she had also unveiled a truth long hidden, proving that even in the face of the unknown, hope and an unyielding will could bridge the impossible. The veiled realm remained, a mystery just beyond their perception, but for Elara and Finn, the most important journey had been the one back home. 
 1.8883419036865234





In [3]:
# Display the raw list returned by asr_model.transcribe(); each item's `.text`
# attribute holds the transcript used in the previous cell.
transcriptions

[Hypothesis(score=0.0, y_sequence=tensor([   16,    64,     5,     9,    11,    13,  6103,  9538,  3908,  3327,
          1220,  3327,  6004,  1237,  3755,  5130,  2472, 16067,  1289,  3324,
          1260,  8218,  7766,  1276,  1170,  9403,  3755,  2118, 16071, 16073,
          1839,  5130,  1264,  1385, 16067,  3602,  2421,  5639,  1510,  3830,
          1212,  7184, 16067, 13685, 14437,  1220,  1857,  6750,  6184, 16067,
          4551,  1359,  2128,  2960,  1220,  1783, 16081,  3636,  6937,  1172,
         16073,  2445,  1781,  2421,  2019,  5397,  9633,  2173,  4033, 16058,
          1871, 16067,  2867,  3930,  2421,  2935,  1243,  1395,  1194,  1212,
          1168,  1434,  6224,  5639, 13004,  1598, 16067,  5939,  1260,  1644,
          4499,  1220,  1289,  4314,  1381,  1289,  1243, 16066,  1358,  2210,
         16067, 14777,  1392,  1274,  1243, 16081, 16056,  2319,  1260,  2735,
          4972,  2099, 10312,  1289,  2062,  1198,  1423,  2655, 16073,  1839,
          1501,  11

In [None]:
import time

# Time transcription when the model is given a file path instead of an array.
# perf_counter() is monotonic and high-resolution, which makes it the right
# clock for measuring an elapsed interval.
st = time.perf_counter()
transcriptions = asr_model.transcribe(["eo_05.mp3"])
# Print only the text of each result; printing the result objects themselves
# also dumps their full token-id tensors.
print([hyp.text for hyp in transcriptions], "\n", time.perf_counter() - st)