In [2]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from datasets import load_dataset


# Choose the device and a matching precision together: half precision on the
# first CUDA device when one is visible, full precision on CPU otherwise.
device, torch_dtype = (
    ("cuda:0", torch.float16) if torch.cuda.is_available() else ("cpu", torch.float32)
)

# Hugging Face Hub repository id used for both `model` and `processor`.
model_id = "ghost613/whisper-large-v3-turbo-korean"

# Load the checkpoint in the chosen precision, then move it to `device`.
# `Module.to` returns the module itself, so `model` is the moved model.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
).to(device)

# Processor loaded from the same repository as the model.
processor = AutoProcessor.from_pretrained(model_id)

In [2]:
# Load a second checkpoint (model + processor) straight from the Hub and keep
# a local copy of the model.
# This import duplicates the one in the first cell; it is kept so this cell
# does not depend on that cell having been run.
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor

# Single source of truth for the repository both objects are loaded from.
ko_repo_id = "o0dimplz0o/Whisper-Large-v3-turbo-STT-Zeroth-KO-v2"

koprocessor = AutoProcessor.from_pretrained(ko_repo_id)
# No dtype is requested here, so the library's default precision applies.
komodel = AutoModelForSpeechSeq2Seq.from_pretrained(ko_repo_id)

# Only the model is written to this directory; the processor is not saved.
komodel.save_pretrained("/workspace/o0dimplz0o_zeroth_ko_v2")

In [3]:
# Build the speech-recognition pipeline around the already-loaded `komodel`,
# reusing the tokenizer and feature extractor held by `koprocessor`.
#
# `torch_dtype` is a shortcut for `model_kwargs`, which the pipeline only
# consumes when it loads a checkpoint itself. `komodel` is already
# instantiated and was loaded without a dtype, so passing `torch_dtype` alone
# leaves the model in its load-time precision. Cast it explicitly so inference
# really runs in `torch_dtype` (float16 when CUDA is available).
# `Module.to` converts the module in place and returns that same module.
pipe = pipeline(
    "automatic-speech-recognition",
    model=komodel.to(dtype=torch_dtype),
    tokenizer=koprocessor.tokenizer,
    feature_extractor=koprocessor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
)

Device set to use cuda:0


In [4]:
import io
import base64
import time
import soundfile as sf
import librosa

# Example base64 string: read a local MP3 and encode its bytes as base64 text.
with open("eo_05.mp3", "rb") as f:
    b64_audio = base64.b64encode(f.read()).decode()

# Turn the base64 text back into raw bytes (outside the timed section).
audio_bytes = base64.b64decode(b64_audio)

# Timed section: decode the audio, downmix to mono, resample to 16 kHz.
stt = time.time()
data, sr = sf.read(
    file=io.BytesIO(audio_bytes),
    dtype="float32",
    always_2d=True,  # keep a 2-D array even for single-channel input
)
mono = data.mean(axis=1)  # convert to mono by averaging over axis 1
mono_16k = librosa.resample(y=mono, orig_sr=sr, target_sr=16000)
print(time.time() - stt)

10.055109977722168


In [7]:
import time

# Decoding options kept for reference. Nothing in this cell reads `params`;
# it is NOT passed to `pipe(...)` below.
# NOTE(review): these option names look like faster-whisper `transcribe`
# arguments rather than transformers generate kwargs -- confirm before wiring
# them into the pipeline call.
params = {
    "beam_size": 1,  # 1 = greedy; beam search (>=2) is slow. 1-2 usually recommended
    "best_of": 1,  # meaningless when greedy; keep it small
    "temperature": 0.0,  # stable/faster (a fallback such as [0.0, 0.2, 0.4] is possible when uncertain)
    "vad_filter": True,  # remove silence -> less to decode
    "vad_parameters": {"min_silence_duration_ms": 500},
    "chunk_length": 15,  # seconds; 10-20 s recommended (20-30 if longer context is needed)
    "no_speech_threshold": 0.6,  # stricter silence detection to reduce wasted work
    "log_prob_threshold": -1.0,  # if too low, treat as failure and consider temperature fallback
    "word_timestamps": False,  # turning word timestamps off is faster (True only when needed)
    "condition_on_previous_text": False,  # less dependence between chunks -> speed/stability
}

# Time one transcription of the 16 kHz mono waveform, requesting Korean.
st = time.time()
result = pipe(mono_16k, generate_kwargs=dict(language="korean"))
print(result, "\n", time.time() - st)

{'text': '그런 얘기들 되게 많이 하시는데 정석을 급히 작겠어요'} 
 0.40485095977783203


In [20]:
import inspect
# Print the parameters accepted by `model.forward`. `model` is the object
# assigned in the first cell, not the `komodel` that `pipe` wraps.
forward_signature = inspect.signature(model.forward)
print(forward_signature)

(input_features: Optional[torch.FloatTensor] = None, attention_mask: Optional[torch.LongTensor] = None, decoder_input_ids: Optional[torch.LongTensor] = None, decoder_attention_mask: Optional[torch.LongTensor] = None, head_mask: Optional[torch.Tensor] = None, decoder_head_mask: Optional[torch.Tensor] = None, cross_attn_head_mask: Optional[torch.Tensor] = None, encoder_outputs: Optional[tuple[tuple[torch.FloatTensor]]] = None, past_key_values: Optional[transformers.cache_utils.Cache] = None, decoder_inputs_embeds: Optional[tuple[torch.FloatTensor]] = None, decoder_position_ids: Optional[tuple[torch.LongTensor]] = None, labels: Optional[torch.LongTensor] = None, use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None) -> Union[tuple[torch.Tensor], transformers.modeling_outputs.Seq2SeqLMOutput]


변환 완료: output.mp3


In [1]:
from datasets import load_dataset

# Load the dataset from the Hub; no split or config is requested.
# In the recorded run this resolved 23 parquet shards (roughly 440-450 MB
# each) and generated 102263 train examples, so expect a large download.
ds = load_dataset(path="o0dimplz0o/Zeroth-STT-Korean")

README.md:   0%|          | 0.00/926 [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/23 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/23 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/23 [00:00<?, ?files/s]

data/train-00000-of-00023.parquet:   0%|          | 0.00/453M [00:00<?, ?B/s]

data/train-00001-of-00023.parquet:   0%|          | 0.00/449M [00:00<?, ?B/s]

data/train-00002-of-00023.parquet:   0%|          | 0.00/443M [00:00<?, ?B/s]

data/train-00003-of-00023.parquet:   0%|          | 0.00/443M [00:00<?, ?B/s]

data/train-00004-of-00023.parquet:   0%|          | 0.00/441M [00:00<?, ?B/s]

data/train-00005-of-00023.parquet:   0%|          | 0.00/450M [00:00<?, ?B/s]

data/train-00006-of-00023.parquet:   0%|          | 0.00/446M [00:00<?, ?B/s]

data/train-00007-of-00023.parquet:   0%|          | 0.00/444M [00:00<?, ?B/s]

data/train-00008-of-00023.parquet:   0%|          | 0.00/444M [00:00<?, ?B/s]

data/train-00009-of-00023.parquet:   0%|          | 0.00/438M [00:00<?, ?B/s]

data/train-00010-of-00023.parquet:   0%|          | 0.00/445M [00:00<?, ?B/s]

data/train-00011-of-00023.parquet:   0%|          | 0.00/439M [00:00<?, ?B/s]

data/train-00012-of-00023.parquet:   0%|          | 0.00/438M [00:00<?, ?B/s]

data/train-00013-of-00023.parquet:   0%|          | 0.00/444M [00:00<?, ?B/s]

data/train-00014-of-00023.parquet:   0%|          | 0.00/444M [00:00<?, ?B/s]

data/train-00015-of-00023.parquet:   0%|          | 0.00/442M [00:00<?, ?B/s]

data/train-00016-of-00023.parquet:   0%|          | 0.00/440M [00:00<?, ?B/s]

data/train-00017-of-00023.parquet:   0%|          | 0.00/442M [00:00<?, ?B/s]

data/train-00018-of-00023.parquet:   0%|          | 0.00/452M [00:00<?, ?B/s]

data/train-00019-of-00023.parquet:   0%|          | 0.00/437M [00:00<?, ?B/s]

data/train-00020-of-00023.parquet:   0%|          | 0.00/445M [00:00<?, ?B/s]

data/train-00021-of-00023.parquet:   0%|          | 0.00/444M [00:00<?, ?B/s]

data/train-00022-of-00023.parquet:   0%|          | 0.00/441M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/102263 [00:00<?, ? examples/s]

Loading dataset shards:   0%|          | 0/23 [00:00<?, ?it/s]