In [8]:
import torch
import tempfile
import torchaudio
from transformers import pipeline
from datasets import load_dataset
import os

In [9]:
# Use the GPU when PyTorch can see one; otherwise run on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Shared Whisper speech-recognition pipeline used by the cells below.
asr_pipeline = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-large-v3",
    chunk_length_s=20,        # long recordings are processed in 20-second chunks
    return_timestamps=True,   # construction-time default for timestamp output
    # Half precision is only requested on CUDA. On the CPU fallback the model
    # stays in float32, because float16 inference on CPU is slow and not every
    # operator is guaranteed to have a half-precision CPU kernel.
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    device=device,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [10]:
def transcribe_audio_file(audio_bytes: bytes, original_filename: str, return_timestamps: bool = False):
    """
    Transcribe an audio file, returning either plain text or timestamped chunks.

    The bytes are written to a temporary file that keeps the original
    extension, decoded with torchaudio, downmixed to a single channel,
    resampled to 16 kHz and passed to the module-level ``asr_pipeline``.

    :param audio_bytes: contents of the audio file as bytes
    :param original_filename: file name; only its extension is used (as the
        suffix of the temporary file)
    :param return_timestamps: whether to return timestamped chunks
    :return: the transcript string, or the pipeline's ``"chunks"`` value when
        ``return_timestamps`` is true
    :raises ValueError: if ``audio_bytes`` is empty
    """
    if not audio_bytes:
        raise ValueError("audio_bytes is empty; nothing to transcribe")

    ext = os.path.splitext(original_filename)[1].lower()

    # Write into a private temporary directory and close the file before the
    # decoder opens it by name. Re-opening a still-open NamedTemporaryFile by
    # name is not portable (it fails on Windows).
    with tempfile.TemporaryDirectory() as tmpdir:
        audio_path = os.path.join(tmpdir, "audio" + ext)
        with open(audio_path, "wb") as audio_file:
            audio_file.write(audio_bytes)

        # The waveform is fully loaded here, so the file can be removed as soon
        # as this block exits, before the (slow) model run starts.
        waveform, sample_rate = torchaudio.load(audio_path)

    # Downmix to mono first so that only one channel has to be resampled.
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # Bring the audio to 16 kHz if it is not already.
    if sample_rate != 16000:
        waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)

    # The pipeline is fed a 1-D NumPy array. Only the channel axis is dropped,
    # so a one-sample clip does not collapse to a 0-d array.
    audio_input = waveform.squeeze(0).numpy()

    result = asr_pipeline(
        audio_input,
        batch_size=8,
        return_timestamps=return_timestamps,
    )

    return result["chunks"] if return_timestamps else result["text"]


In [11]:
import yt_dlp
def download_audio_from_youtube(url: str) -> tuple[bytes, str]:
    """
    Download the audio track of a YouTube video and return it as MP3 bytes.

    The download goes into a temporary directory that is removed before the
    function returns, so nothing is left on disk.

    :param url: address of the video to download
    :return: ``(audio_bytes, filename)`` where ``filename`` is the base name of
        the extracted ``.mp3`` file
    :raises ValueError: if ``url`` is empty or whitespace only
    :raises FileNotFoundError: if no ``.mp3`` file was produced
    """
    if not url or not url.strip():
        raise ValueError("url must be a non-empty string")

    with tempfile.TemporaryDirectory() as tmpdir:
        output_path = os.path.join(tmpdir, "%(title)s.%(ext)s")
        ydl_opts = {
            "format": "bestaudio/best",
            "outtmpl": output_path,
            "noplaylist": True,
            "quiet": True,
            # Convert whatever container was downloaded into MP3.
            "postprocessors": [{
                "key": "FFmpegExtractAudio",
                "preferredcodec": "mp3",
                "preferredquality": "192",
            }],
        }

        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info_dict = ydl.extract_info(url, download=True)
            # prepare_filename() reports the pre-conversion name; the
            # post-processor is expected to keep the stem and change only the
            # extension.
            expected_path = os.path.splitext(ydl.prepare_filename(info_dict))[0] + ".mp3"

        if os.path.isfile(expected_path):
            mp3_path = expected_path
        else:
            # The guessed name did not match what was written, so look for the
            # converted file in the otherwise empty temporary directory.
            candidates = sorted(
                entry for entry in os.listdir(tmpdir) if entry.lower().endswith(".mp3")
            )
            if not candidates:
                raise FileNotFoundError(
                    f"no .mp3 file was produced in {tmpdir} for {url!r}"
                )
            mp3_path = os.path.join(tmpdir, candidates[0])

        with open(mp3_path, "rb") as f:
            audio_bytes = f.read()

        return audio_bytes, os.path.basename(mp3_path)


In [13]:
if __name__ == "__main__":
    # Interactive driver: ask for a video link, download its audio once and
    # transcribe the same bytes twice (plain text, then timestamped chunks).
    youtube_url = input("–í—Å—Ç–∞–≤—å—Ç–µ —Å—Å—ã–ª–∫—É –Ω–∞ YouTube: ").strip()

    print("\nüì• –°–∫–∞—á–∏–≤–∞–µ–º –∞—É–¥–∏–æ...")
    audio, filename = download_audio_from_youtube(youtube_url)

    print("\n=== –¢–æ–ª—å–∫–æ —Ç–µ–∫—Å—Ç ===")
    text = transcribe_audio_file(audio, filename, return_timestamps=False)
    print(text)

    print("\n=== –° —Ç–∞–π–º–∫–æ–¥–∞–º–∏ ===")
    chunks = transcribe_audio_file(audio, filename, return_timestamps=True)
    for chunk in chunks:
        print(chunk)


üì• –°–∫–∞—á–∏–≤–∞–µ–º –∞—É–¥–∏–æ...
                                                         
=== –¢–æ–ª—å–∫–æ —Ç–µ–∫—Å—Ç ===
 –†–æ–º–∞, –Æ—Ç—É–± –Ω–µ –≤–µ—á–Ω—ã–π, —Ä–∞–Ω–æ –∏–ª–∏ –ø–æ–∑–¥–Ω–æ –±–∞–±–æ—Å–∏–∫–∏ –∫–∞–ø–∞—Ç—å –ø–µ—Ä–µ—Å—Ç–∞–Ω—É—Ç, –∞ —É —Ç–µ–±—è –Ω–µ–æ—Å–≤–æ–µ–Ω–Ω–æ–π –ø—Ä–æ—Ñ–µ—Å—Å–∏–∏, –Ω–µ –æ–ø—ã—Ç–∞ —Ä–∞–±–æ—Ç—ã, —á—Ç–æ –±—É–¥–µ—à—å –¥–µ–ª–∞—Ç—å, –∫–æ–≥–¥–∞ –∑–æ–ª–æ—Ç–æ–π –æ—Å—ë–ª –∏—Å—Å—è–∫–Ω–µ—Ç? –ò —ç—Ç–æ—Ç –ø–∞—Ä–µ–Ω—å –ø—Ä–∞–≤, —á—É–≤–∞–∫–∏, —Å–µ—Ä—å—ë–∑–Ω–æ, –æ—Å–æ–±–µ–Ω–Ω–æ —É—á–∏—Ç—ã–≤–∞—è, —á—Ç–æ —è —ç—Ç–æ —Å–∞–º –∏ –Ω–∞–ø–∏—Å–∞–ª, —á—Ç–æ–±—ã —É –º–µ–Ω—è –±—ã–ª–∞ –ø–æ–¥–≤–æ–¥–∫–∞ –∫ –≤–∏–¥–æ—Å—É. –ê –≤–æ—Ç –ø—Ä—É—Ñ—ã. –ù–æ –∫–∞–∫ –∂–µ –º–Ω–µ –Ω–∞–π—Ç–∏ –Ω–æ—Ä–º–∞–ª—å–Ω—É—é —Ä–∞–±–æ—Ç—É, –µ—Å–ª–∏ —è –Ω–µ —É–º–µ—é –µ–µ —Ä–∞–±–æ—Ç–∞—Ç—å? –Ø –Ω–∞–¥–µ—é—Å—å, –≤ —ç—Ç–æ–º –º–Ω–µ —Å–º–æ–≥—É—Ç –ø–æ–º–æ—á—å –¥–≤–∞ —á–µ–ª–æ–≤–µ—á–∫–∞. –¢–æ –µ—Å—Ç—å —ç—Ç–æ –ö–æ–¥–µ—Ä –ü–∞–≤–µ–ª —Å –æ–≥—Ä–æ–º–Ω—ã–º —Ä–∞–±–æ—á–∏–º —Ö–µ—Ä–æ–º –∏ —Å—Ç–∞–∂–µ–º, –∏ –ú–∞–∫—Å–æ—Å. –†–µ–±—è—Ç

In [None]:
if __name__ == "__main__":
    # Read the video link from the user, dropping surrounding whitespace.
    youtube_url = input("–í—Å—Ç–∞–≤—å—Ç–µ —Å—Å—ã–ª–∫—É –Ω–∞ YouTube: ")
    youtube_url = youtube_url.strip()

    # Fetch the audio once; both transcription passes below reuse these bytes.
    print("\nüì• –°–∫–∞—á–∏–≤–∞–µ–º –∞—É–¥–∏–æ...")
    audio, filename = download_audio_from_youtube(url=youtube_url)

    # Pass 1: plain transcript.
    print("\n=== –¢–æ–ª—å–∫–æ —Ç–µ–∫—Å—Ç ===")
    text = transcribe_audio_file(
        audio_bytes=audio,
        original_filename=filename,
        return_timestamps=False,
    )
    print(text)

    # Pass 2: timestamped chunks, printed one per line.
    print("\n=== –° —Ç–∞–π–º–∫–æ–¥–∞–º–∏ ===")
    chunks = transcribe_audio_file(
        audio_bytes=audio,
        original_filename=filename,
        return_timestamps=True,
    )
    for chunk in chunks:
        print(chunk)

In [2]:
import os
import yt_dlp
from pathlib import Path

def download_youtube_audio(url: str, output_dir: "str | os.PathLike | None" = None):
    """
    Download the audio track of a YouTube video and save it as an MP3 file.

    :param url: address of the video to download
    :param output_dir: directory to save the file into; defaults to the
        ``Downloads`` folder inside the current user's home directory
    :return: None; the destination directory is printed after the download
    :raises ValueError: if ``url`` is empty or whitespace only
    """
    if not url or not url.strip():
        raise ValueError("url must be a non-empty string")

    # Default target: the "Downloads" folder under the user's home directory.
    if output_dir is None:
        downloads_path = str(Path.home() / "Downloads")
    else:
        downloads_path = os.fspath(output_dir)

    ydl_opts = {
        "format": "bestaudio/best",
        "outtmpl": os.path.join(downloads_path, "%(title)s.%(ext)s"),
        "noplaylist": True,
        "quiet": False,
        # Convert whatever container was downloaded into MP3.
        "postprocessors": [
            {
                "key": "FFmpegExtractAudio",
                "preferredcodec": "mp3",
                "preferredquality": "192",
            }
        ],
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        print(f"üì• –°–∫–∞—á–∏–≤–∞–µ–º {url} –≤ MP3...")
        ydl.download([url])
        print(f"‚úÖ –°–∫–∞—á–∞–Ω–æ –≤: {downloads_path}")

if __name__ == "__main__":
    # Prompt for the link, trim surrounding whitespace, then start the download.
    youtube_url = input("–í—Å—Ç–∞–≤—å—Ç–µ —Å—Å—ã–ª–∫—É –Ω–∞ YouTube-–≤–∏–¥–µ–æ: ")
    youtube_url = youtube_url.strip()
    download_youtube_audio(url=youtube_url)


üì• –°–∫–∞—á–∏–≤–∞–µ–º https://youtu.be/MrNbqT7-S-Y?si=CwJXMduWxFmoa1pa –≤ MP3...
[youtube] Extracting URL: https://youtu.be/MrNbqT7-S-Y?si=CwJXMduWxFmoa1pa
[youtube] MrNbqT7-S-Y: Downloading webpage
[youtube] MrNbqT7-S-Y: Downloading tv client config
[youtube] MrNbqT7-S-Y: Downloading player afb1da51-main
[youtube] MrNbqT7-S-Y: Downloading tv player API JSON
[youtube] MrNbqT7-S-Y: Downloading ios player API JSON
[youtube] MrNbqT7-S-Y: Downloading m3u8 information
[info] MrNbqT7-S-Y: Downloading 1 format(s): 251
[download] Destination: /home/deniska/Downloads/–í–Ω—É—à–∏–ª 56-–ª–µ—Ç–Ω–µ–º—É —Ç–∞–Ω–∫–∏—Å—Ç—É, —á—Ç–æ –æ–Ω –¥–æ–±—ã–≤–∞–µ—Ç —Ä–µ–∞–ª—å–Ω—ã–π –±–∏—Ç–∫–æ–∏–Ω –≤ –ú–∞–π–Ω–∫—Ä–∞—Ñ—Ç–µ, –∏ –º–µ—Å—è—Ü –ø–ª–∞—Ç–∏–ª –µ–º—É –∑–∞ —ç—Ç–æ.webm
[download] 100% of   64.47MiB in 00:00:09 at 6.51MiB/s   
[ExtractAudio] Destination: /home/deniska/Downloads/–í–Ω—É—à–∏–ª 56-–ª–µ—Ç–Ω–µ–º—É —Ç–∞–Ω–∫–∏—Å—Ç—É, —á—Ç–æ –æ–Ω –¥–æ–±—ã–≤–∞–µ—Ç —Ä–µ–∞–ª—å–Ω—ã–π –±–∏—Ç–∫–æ–∏–Ω –≤ –ú–∞–π–Ω–∫—Ä–∞—Ñ—Ç–µ, –∏