In [1]:
import yt_dlp
import re
import os
import glob
import json
import unicodedata

def safe_filename(s: str) -> str:
    """Return a filesystem-friendly version of *s*.

    The text is NFKD-normalized, then every run of the characters
    ``< > : " / \\ | ? *`` as well as newlines, carriage returns and tabs
    is removed, and surrounding whitespace is trimmed. If nothing is left,
    the placeholder ``"video"`` is returned instead of an empty string.
    """
    decomposed = unicodedata.normalize('NFKD', s)
    cleaned = re.sub(r'[<>:"/\\|?*\n\r\t]+', '', decomposed).strip()
    if not cleaned:
        return "video"
    return cleaned

def find_subtitle_file(title: str, lang: str):
    """Locate a subtitle file for *title* in the current working directory.

    Candidates are tried from most to least specific: first
    ``<title>.<lang>.vtt`` / ``.srt``, then ``<title>.vtt`` / ``.srt``, and
    finally any ``.vtt`` / ``.srt`` file whose name starts with the title.

    Args:
        title: Video title; it is passed through ``safe_filename`` first.
        lang: Subtitle language code used in the file name (e.g. ``"en"``).

    Returns:
        The path of the first match (alphabetically first when one pattern
        matches several files), or ``None`` when nothing matches.
    """
    # The title and language are literal text, not patterns. safe_filename
    # leaves "[" and "]" in place, so without escaping a title such as
    # "Song [Official Video]" would be read as a glob character class and
    # never match its own file.
    stem = glob.escape(safe_filename(title))
    lang_part = glob.escape(lang)
    patterns = [
        f"{stem}.{lang_part}.vtt",
        f"{stem}.{lang_part}.srt",
        f"{stem}.vtt",
        f"{stem}.srt",
        f"{stem}*.vtt",
        f"{stem}*.srt",
    ]
    for pat in patterns:
        # glob returns matches in arbitrary order; sort so repeated runs
        # pick the same file.
        matches = sorted(glob.glob(pat))
        if matches:
            return matches[0]
    return None

def parse_vtt(content: str):
    """Extract ``(start_time, text)`` pairs from subtitle text.

    The input is expected to be WebVTT; SRT-style timing lines (comma before
    the milliseconds) are recognized as well. Blocks without a timing line
    (header metadata, NOTE bodies, STYLE blocks) are skipped.

    Args:
        content: Full text of the subtitle file.

    Returns:
        A list of ``(start_time, text)`` tuples in file order. ``start_time``
        is the start timestamp exactly as written in the file; ``text`` is
        the cue payload joined onto one line with inline timestamps and
        markup tags removed. Cues whose cleaned text is empty are omitted.
    """
    # Normalize line endings so that CRLF input splits into blocks the same
    # way as LF input.
    content = content.replace('\r\n', '\n').replace('\r', '\n')
    content = re.sub(r'(?m)^WEBVTT.*\n?', '', content)
    content = re.sub(r'(?m)^NOTE.*(?:\n.*)*?(?=\n|$)', '', content)

    # The hours field is optional in WebVTT, and SRT uses "," instead of "."
    # before the milliseconds.
    stamp = r'(?:\d+:)?\d{2}:\d{2}[.,]\d{3}'
    timestamp_pattern = re.compile(rf'({stamp})\s*-->\s*({stamp})')

    # First pass: collect the raw payload lines of every cue.
    cues = []  # list of (start_time, [payload lines])
    for block in re.split(r'\n{2,}', content):
        current = None
        for raw_line in block.strip().splitlines():
            line = raw_line.strip()
            match = timestamp_pattern.match(line)
            if match:
                # A timing line opens a new cue.
                current = (match.group(1), [])
                cues.append(current)
            elif current is not None:
                current[1].append(line)
            # Lines that precede the timing line are cue identifiers (or SRT
            # counters), not spoken text, so they are dropped.

    # Second pass: strip inline timestamps and markup tags from the payload.
    entries = []
    for start_time, text_lines in cues:
        text = " ".join(text_lines)
        text = re.sub(r'<\d{1,2}:\d{2}:\d{2}(?:\.\d{1,3})?>', '', text)
        text = re.sub(r'<[^>]+>', '', text)
        text = text.strip()
        if text:
            entries.append((start_time, text))
    return entries

def download_and_create_outputs(video_url: str, lang: str = 'en'):
    """Fetch subtitles for a video and write a transcript plus a JSON file.

    Only subtitles are downloaded (manual or automatic) for the requested
    language; the video itself is skipped. Two files are written to the
    current working directory:

    * ``<title>.transcript.txt`` - one cue text per line.
    * ``<title>.transcript.json`` - a list of ``{"timestamp", "text"}``
      objects.

    Args:
        video_url: URL of the video to process.
        lang: Subtitle language code to request.

    Returns:
        A ``(transcript_file, json_file)`` tuple of the written file names.

    Raises:
        FileNotFoundError: If no subtitle file can be located after the
            download step.
    """
    ydl_opts = {
        'writesubtitles': True,
        'writeautomaticsub': True,
        'subtitleslangs': [lang],
        'skip_download': True,
        'quiet': True,
        'outtmpl': '%(title)s.%(ext)s',
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(video_url, download=True)
    title = info.get('title') or info.get('id') or 'video'
    title_safe = safe_filename(title)

    # yt-dlp applies its own sanitization when expanding %(title)s, so the
    # name on disk can differ from safe_filename(title). Prefer the path
    # yt-dlp itself reports for the subtitle it wrote; fall back to the
    # title-based search when that entry is missing or the file is not there.
    requested = info.get('requested_subtitles') or {}
    reported_path = (requested.get(lang) or {}).get('filepath')
    if reported_path and os.path.isfile(reported_path):
        subtitle_file = reported_path
    else:
        subtitle_file = find_subtitle_file(title, lang)
    if not subtitle_file:
        raise FileNotFoundError(f"Subtitle file not found for {title_safe}")

    # Undecodable bytes are dropped rather than aborting the whole run.
    with open(subtitle_file, 'r', encoding='utf-8', errors='ignore') as f:
        raw_content = f.read()
    entries = parse_vtt(raw_content)

    # Plain-text transcript: cue text only, one cue per line.
    transcript_file = f"{title_safe}.transcript.txt"
    with open(transcript_file, 'w', encoding='utf-8') as f_txt:
        for _, text in entries:
            f_txt.write(text + "\n")
    print(f"Transcript saved as {transcript_file}")

    # Timestamped JSON: start time and text for every cue.
    json_file = f"{title_safe}.transcript.json"
    data = [{"timestamp": ts, "text": txt} for ts, txt in entries]
    with open(json_file, 'w', encoding='utf-8') as f_json:
        json.dump(data, f_json, indent=2)
    print(f"Timestamped JSON saved {json_file}")
    return transcript_file, json_file

if __name__ == "__main__":
    # Script entry point: process one sample video and report the files
    # that were written, or print the error message if anything fails.
    video_url = "https://www.youtube.com/watch?v=0Goz0PnhEg8"
    try:
        transcript, json_out = download_and_create_outputs(video_url, lang='en')
        print("Files created:")
        for label, path in (("Transcript:", transcript), ("JSON:", json_out)):
            print(label, path)
    except Exception as e:
        print("Error:", str(e))

         player = https://www.youtube.com/s/player/b66835e2/player_es6.vflset/en_US/base.js
         n = QaCH3-KG0mFgmg72Zlb6 ; player = https://www.youtube.com/s/player/b66835e2/player_es6.vflset/en_US/base.js
         player = https://www.youtube.com/s/player/b66835e2/player_es6.vflset/en_US/base.js
         n = 1G9i5MLs71LYlOnyG9DV ; player = https://www.youtube.com/s/player/b66835e2/player_es6.vflset/en_US/base.js


Transcript saved as ABC7 Chicago meteorologist has hilarious realization that his TV is a touchscreen live on the air.transcript.txt
Timestamped JSON saved ABC7 Chicago meteorologist has hilarious realization that his TV is a touchscreen live on the air.transcript.json
Files created:
Transcript: ABC7 Chicago meteorologist has hilarious realization that his TV is a touchscreen live on the air.transcript.txt
JSON: ABC7 Chicago meteorologist has hilarious realization that his TV is a touchscreen live on the air.transcript.json
