# Import Dependencies

In [1]:
# Import Main Dependencies
import os, re, ffmpeg, whisper
from pytubefix import YouTube, Stream
from pytubefix.cli import on_progress
from pytubefix.innertube import _default_clients

# Import Other Dependencies
import torch
from tqdm.auto import tqdm

# Define Utilities

In [2]:
def sanitize_filename(filename: str) -> str:
    """Return `filename` with characters that are unsafe in file names replaced.

    Each of the characters < > : / | ? * and the backslash becomes a single
    underscore.  A double quote becomes TWO underscores; this mapping is kept
    on purpose because the resulting names are also used to detect existing
    output files, so it must stay stable between runs.
    """
    # Build a One-Pass Translation Table (Code Point -> Replacement Text)
    replacements = {ord(char): "_" for char in '<>:/\\|?*'}
    replacements[ord('"')] = "__"

    return filename.translate(replacements)
    
def read_unique_items_from_file(file: str) -> list:
    """Read the non-blank lines of a text file, de-duplicated, in file order.

    Args:
        file: Path of a text file holding one item (e.g. a URL) per line.

    Returns:
        The stripped, non-empty lines with duplicates removed.  The first
        occurrence of each item keeps its position, so the result is the same
        on every run.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # "utf-8-sig" Reads Plain UTF-8 and Also Drops a Leading BOM, Which Would
    # Otherwise Become Part of the First Item
    with open(file, "r", encoding="utf-8-sig") as f:
        stripped_lines = (line.strip() for line in f)
        # dict.fromkeys De-Duplicates While Preserving First-Seen Order
        return list(dict.fromkeys(item for item in stripped_lines if item))

# Set Configurations

In [3]:
# File Names
# Input List for the Data-Gathering Cell: One YouTube URL per Line
# (Lines Are Stripped; Blank Lines and Duplicates Are Dropped When Read)
yt_video_links_filename = "YouTube Video Links.txt"
# NOTE(review): Not Referenced by Any Cell Visible Here - Presumably Used by a Later Step; Confirm
transcript_sentences_filename = "transcript_sentences.csv"

# Folder Names (Relative Paths; Each Folder Is Created on Demand by the Step That Writes to It)
video_output_path = "Video"
audio_output_path = "Audio"
transcription_output_path = "Transcription"

# Boolean Flags
# True = Delete the Downloaded File Once Its Audio Has Been Extracted
remove_video = True
# True = Delete the Extracted .mp3 Once It Has Been Transcribed
remove_audio = True

# Additional Dependency Configurations
# Makes pytubefix's "ANDROID_MUSIC" Client Entry Point at the "ANDROID_CREATOR" Settings
# NOTE(review): Presumably a Workaround for a pytubefix Client Issue - Confirm It Is Still Needed
# (This Pokes at a Private Name, so It May Break on a pytubefix Upgrade)
_default_clients["ANDROID_MUSIC"] = _default_clients["ANDROID_CREATOR"]

# Collect Data (YouTube Videos)

In [4]:
def download_youtube_video(video_filename: str, stream: Stream) -> tuple[str, str]:
    """Download `stream` into the video folder under the name `video_filename`.

    Args:
        video_filename: File name (with extension) to save the download as.
        stream: The pytubefix stream to download.

    Returns:
        A `(video_file, video_filename)` pair, where `video_file` is the path
        of the downloaded file inside `video_output_path`.
    """
    # Make Sure the Destination Folder Is There
    os.makedirs(video_output_path, exist_ok=True)
    destination = os.path.join(video_output_path, video_filename)

    # Start From a Clean Slate: a File Left Over From an Earlier Run May Be Corrupted
    if os.path.exists(destination):
        os.remove(destination)

    # Announce, Download, Then Pad the Output With Blank Lines for Readability
    print()
    print(f'Downloading (Video): {video_filename}')
    print()
    stream.download(output_path=video_output_path, filename=video_filename)
    print()
    print()

    return destination, video_filename

# Audio Extraction (Video to Audio)

In [5]:
def extract_audio_from_video(video_file: str, video_filename: str) -> tuple[str, str]:
    """Convert a downloaded file to MP3 inside the audio folder using ffmpeg.

    Args:
        video_file: Path of the source media file.
        video_filename: File name of the source; its stem names the MP3.

    Returns:
        A `(audio_file, audio_filename)` pair, where `audio_file` is the path
        of the MP3 inside `audio_output_path`.
    """
    # Make Sure the Destination Folder Is There
    os.makedirs(audio_output_path, exist_ok=True)

    # Same Stem as the Video ("[YouTube Video ID] [title]"), With an .mp3 Extension
    stem, _ = os.path.splitext(video_filename)
    audio_filename = f'{stem}.mp3'
    audio_file = os.path.join(audio_output_path, audio_filename)

    # Drop Any Earlier Output: It May Be a Corrupted Leftover
    if os.path.exists(audio_file):
        os.remove(audio_file)

    print(f'Extracting (Audio): {audio_filename}')
    print()

    # Build the ffmpeg Job Step by Step, Then Run It
    source = ffmpeg.input(video_file)
    job = source.output(audio_file, format="mp3", acodec="libmp3lame", loglevel="info")
    job.run(overwrite_output=True)

    return audio_file, audio_filename

# Transcription (Audio to Text)

In [6]:
# Whisper Models Already Loaded in This Kernel, Keyed by (Model Name, Device)
_whisper_model_cache: dict = {}


def _load_whisper_model(model_name: str, device):
    """Return the Whisper model for (model_name, device), loading it at most once."""
    cache_key = (model_name, str(device))
    if cache_key not in _whisper_model_cache:
        _whisper_model_cache[cache_key] = whisper.load_model(model_name, device=device)
    return _whisper_model_cache[cache_key]


def transcribe_audio_to_text(audio_file: str, audio_filename: str, model_name: str = "small.en") -> None:
    """Transcribe an audio file with OpenAI Whisper and save the text as .txt.

    The transcription is written to
    `<transcription_output_path>/<stem of audio_filename>.txt`.

    Args:
        audio_file: Path of the audio file to transcribe.
        audio_filename: File name of the audio; its stem names the .txt file.
        model_name: Whisper model to use (default "small.en").

    Raises:
        Any error from loading the model, transcribing, or writing the file is
        propagated to the caller, so a failed transcription is never mistaken
        for a successful one.  No partial .txt file is left behind.

    Model reference (as recorded by the notebook author):
        Models:       tiny, base, small, medium, large, turbo
        English-only: tiny.en, base.en, small.en, medium.en

        Required VRAM:              Speed:
            1) 1GB - tiny, base         1) 10x - tiny
            2) 2GB - small              2) 8x - turbo
            3) 5GB - medium             3) 7x - base
            4) 6GB - turbo              4) 4x - small
            5) 10GB - large             5) 2x - medium
                                        6) 1x - large

        Quote from OpenAI:
            - The .en models for English-only applications tend to perform
              better, especially for the tiny.en and base.en models. We
              observed that the difference becomes less significant for the
              small.en and medium.en models.

        Note: small.en was chosen because only 4GB of VRAM was available.
    """
    # Create the Transcription Directory
    os.makedirs(transcription_output_path, exist_ok=True)

    # Set Transcription File Name ("[YouTube Video ID] [title].txt") and Path
    transcription_filename = f'{os.path.splitext(audio_filename)[0]}.txt'
    transcription_file = os.path.join(transcription_output_path, transcription_filename)

    print(f'Transcribing (Text): {transcription_filename}')
    print("") # Just New Line for Better Output

    # Get/Download OpenAI Whisper Model (Reused Across Calls Instead of Reloaded per File)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = _load_whisper_model(model_name, device)

    # Transcribe Audio File (Whole Text Is Held in Memory Before Touching the Disk)
    result = model.transcribe(audio_file, fp16=False, verbose=False)

    # Write to a Temporary File First and Rename It Into Place, so the Final
    # .txt Either Exists Completely or Not at All (Even if the Kernel Is Killed)
    temp_file = f'{transcription_file}.tmp'
    try:
        with open(temp_file, "w", encoding="utf-8") as f:
            f.write(result["text"])
        os.replace(temp_file, transcription_file)
    except BaseException:
        # Clean Up the Partial File, Then Let the Caller See the Failure
        if os.path.exists(temp_file):
            os.remove(temp_file)
        raise

# Execute Data Gathering

In [None]:
# Pipeline per URL: Resolve Stream -> Skip if Already Transcribed -> Download -> Extract Audio -> Transcribe
# A Failure on One URL Is Reported (With Its URL) and the Loop Moves On to the Next One
yt_urls = read_unique_items_from_file(yt_video_links_filename)

with tqdm(total=len(yt_urls), desc="Getting YouTube URLs") as pbar:
    for index, url in enumerate(yt_urls, start=1):
        current = f'{index}/{len(yt_urls)}'
        try:
            # Get Video Information
            yt = YouTube(url, on_progress_callback=on_progress)
            stream = yt.streams.get_audio_only()
            if stream is None:
                raise ValueError("no audio-only stream available")

            # Sanitize Video File Name and Add YouTube Video ID
            video_filename = f'[{yt.video_id}] {sanitize_filename(stream.default_filename)}'

            # Get File Name Without Extension (e.g., ".mp4")
            filename = os.path.splitext(video_filename)[0]

            # Skip If Transcription Already Exists (Direct Path Check, No Directory Scan)
            pbar.set_description(f'Checking Existing Transcriptions [{current} File]')
            transcription_file = os.path.join(transcription_output_path, f'{filename}.txt')
            if os.path.isfile(transcription_file):
                # Delete/Keep Leftover Video and Audio Files From an Earlier Run
                leftovers = []
                if remove_video:
                    leftovers.append(os.path.join(video_output_path, video_filename))
                if remove_audio:
                    leftovers.append(os.path.join(audio_output_path, f'{filename}.mp3'))
                for leftover in leftovers:
                    if os.path.exists(leftover):
                        os.remove(leftover)
                continue

            # Log YouTube URL being Processed
            print("") # Just New Line for Better Output
            print(f'Found YouTube Video (URL): {url}')

            # Download YouTube Video
            pbar.set_description(f'Downloading [{current} Video] ')
            video_file, video_filename = download_youtube_video(video_filename, stream)

            # Extract Audio from Video -> Delete/Keep Video File
            pbar.set_description(f'Extracting [{current} Audio]')
            audio_file, audio_filename = extract_audio_from_video(video_file, video_filename)
            if remove_video:
                os.remove(video_file)

            # Transcribe Audio to Text -> Delete/Keep Audio File
            pbar.set_description(f'Transcribing [{current} Text]')
            transcribe_audio_to_text(audio_file, audio_filename)
            if remove_audio:
                os.remove(audio_file)
        except Exception as e:
            # Name the URL and the Error Type so a Failure Can Be Traced in a Long Run
            print(f'Failed to Process [{current}] {url}: {type(e).__name__}: {e}')
        finally:
            # Advance Exactly Once per URL, Whether It Was Skipped, Processed, or Failed
            pbar.update(1)
    pbar.set_description("Finished Data Gathering")

Getting YouTube URLs:   0%|          | 0/269 [00:00<?, ?it/s]


Found YouTube Video (URL): https://www.youtube.com/watch?v=P46C21DTz6A

Downloading (Video): [P46C21DTz6A] Kamala Harris Touts Popular Progressive Policy During Pennsylvania Stop.mp4

 ↳ |█████████████████████████████████████████| 100.0%

Extracting (Audio): [P46C21DTz6A] Kamala Harris Touts Popular Progressive Policy During Pennsylvania Stop.mp3

Transcribing (Text): [P46C21DTz6A] Kamala Harris Touts Popular Progressive Policy During Pennsylvania Stop.txt



  checkpoint = torch.load(fp, map_location=device)

  0%|                                        | 0/48032 [00:00<?, ?frames/s][A
  6%|█▋                          | 2882/48032 [00:08<02:18, 325.63frames/s][A
 11%|███▏                        | 5514/48032 [00:14<01:52, 376.90frames/s][A
 18%|████▉                       | 8434/48032 [00:21<01:35, 414.73frames/s][A
 23%|██████▎                    | 11270/48032 [00:29<01:38, 374.60frames/s][A
 29%|███████▉                   | 14166/48032 [00:37<01:30, 372.31frames/s][A
 36%|█████████▌                 | 17118/48032 [00:45<01:22, 376.64frames/s][A
 42%|███████████▏               | 19990/48032 [00:51<01:07, 413.25frames/s][A
 47%|████████████▋              | 22498/48032 [00:58<01:06, 386.88frames/s][A
 52%|██████████████             | 25010/48032 [01:04<00:58, 393.52frames/s][A
 58%|███████████████▌           | 27698/48032 [01:13<00:55, 364.82frames/s][A
 63%|████████████████▉          | 30146/48032 [01:19<00:48, 370.76frames/s][A



Found YouTube Video (URL): https://www.youtube.com/watch?v=_1pbeLlqvt0

Downloading (Video): [_1pbeLlqvt0] Trump lost, confused on stage for 39 minutes while music plays.mp4

 ↳ |█████████████████████████████████████████| 100.0%

Extracting (Audio): [_1pbeLlqvt0] Trump lost, confused on stage for 39 minutes while music plays.mp3

Transcribing (Text): [_1pbeLlqvt0] Trump lost, confused on stage for 39 minutes while music plays.txt




  0%|                                        | 0/40709 [00:00<?, ?frames/s][A
  6%|█▋                          | 2460/40709 [00:05<01:19, 482.27frames/s][A
 13%|███▋                        | 5442/40709 [00:12<01:22, 427.78frames/s][A
 21%|█████▊                      | 8426/40709 [00:23<01:33, 345.19frames/s][A
 27%|███████▎                   | 10978/40709 [00:31<01:31, 325.82frames/s][A
 32%|████████▌                  | 12962/40709 [00:36<01:20, 344.17frames/s][A
 39%|██████████▌                | 15962/40709 [00:44<01:10, 351.26frames/s][A
 47%|████████████▌              | 18962/40709 [00:52<01:00, 360.77frames/s][A
 54%|██████████████▍            | 21798/40709 [00:59<00:50, 374.44frames/s][A
 61%|████████████████▍          | 24714/40709 [01:07<00:42, 377.68frames/s][A
 64%|█████████████████▎         | 26118/40709 [01:11<00:38, 375.42frames/s][A
 71%|███████████████████▎       | 29070/40709 [01:15<00:26, 446.81frames/s][A
 79%|█████████████████████▎     | 32070/40709 [01:2


Found YouTube Video (URL): https://www.youtube.com/watch?v=UGPWo4nK50I

Downloading (Video): [UGPWo4nK50I] Black Men Slam Barack Obama for “Lecturing” Them on Voting For Kamala Harris _ Firstpost America.mp4

 ↳ |█████████████████████████████████████████| 100.0%

Extracting (Audio): [UGPWo4nK50I] Black Men Slam Barack Obama for “Lecturing” Them on Voting For Kamala Harris _ Firstpost America.mp3

Transcribing (Text): [UGPWo4nK50I] Black Men Slam Barack Obama for “Lecturing” Them on Voting For Kamala Harris _ Firstpost America.txt




  0%|                                        | 0/49414 [00:00<?, ?frames/s][A
  6%|█▌                          | 2812/49414 [00:07<02:10, 358.34frames/s][A
 11%|██▉                         | 5194/49414 [00:12<01:46, 414.01frames/s][A
 16%|████▌                       | 8100/49414 [00:20<01:45, 390.59frames/s][A
 22%|██████                     | 11062/49414 [00:27<01:34, 407.90frames/s][A
 28%|███████▌                   | 13810/49414 [00:35<01:33, 381.04frames/s][A
 34%|█████████▏                 | 16794/49414 [00:41<01:18, 415.85frames/s][A
 39%|██████████▋                | 19496/49414 [00:46<01:06, 447.01frames/s][A
 45%|████████████▎              | 22422/49414 [00:54<01:03, 424.42frames/s][A
 51%|█████████████▉             | 25422/49414 [00:56<00:44, 538.86frames/s][A
 56%|███████████████▏           | 27908/49414 [01:01<00:39, 549.42frames/s][A
 62%|████████████████▊          | 30852/49414 [01:09<00:39, 467.98frames/s][A
 68%|██████████████████▍        | 33728/49414 [01:2


Found YouTube Video (URL): https://www.youtube.com/watch?v=uSIuSWRNzf4

Downloading (Video): [uSIuSWRNzf4] Incredible Arizona, Nevada and Pennsylvania Polls for October! Trump VS Harris.mp4

 ↳ |█████████████████████████████████████████| 100.0%

Extracting (Audio): [uSIuSWRNzf4] Incredible Arizona, Nevada and Pennsylvania Polls for October! Trump VS Harris.mp3

Transcribing (Text): [uSIuSWRNzf4] Incredible Arizona, Nevada and Pennsylvania Polls for October! Trump VS Harris.txt




  0%|                                        | 0/41456 [00:00<?, ?frames/s][A
  6%|█▋                          | 2584/41456 [00:06<01:38, 395.30frames/s][A
 12%|███▍                        | 5032/41456 [00:12<01:30, 404.63frames/s][A
 19%|█████▏                      | 7728/41456 [00:19<01:24, 401.05frames/s][A
 25%|██████▊                    | 10536/41456 [00:25<01:14, 416.60frames/s][A
 32%|████████▋                  | 13256/41456 [00:32<01:10, 402.24frames/s][A
 38%|██████████▏                | 15736/41456 [00:39<01:04, 398.09frames/s][A
 45%|████████████               | 18448/41456 [00:45<00:56, 404.55frames/s][A
 51%|█████████████▊             | 21168/41456 [00:52<00:50, 404.33frames/s][A
 58%|███████████████▌           | 23904/41456 [00:59<00:44, 398.81frames/s][A
 65%|█████████████████▍         | 26792/41456 [01:06<00:36, 402.64frames/s][A
 71%|███████████████████▏       | 29424/41456 [01:13<00:30, 400.93frames/s][A
 77%|████████████████████▊      | 32032/41456 [01:2


Found YouTube Video (URL): https://www.youtube.com/watch?v=f6clkqEJyUI

Downloading (Video): [f6clkqEJyUI] DANGER_ TRUMP PENNSYLVANIA CROWD LEGITIMATELY MASSIVE.mp4

 ↳ |█████████████████████████████████████████| 100.0%

Extracting (Audio): [f6clkqEJyUI] DANGER_ TRUMP PENNSYLVANIA CROWD LEGITIMATELY MASSIVE.mp3

Transcribing (Text): [f6clkqEJyUI] DANGER_ TRUMP PENNSYLVANIA CROWD LEGITIMATELY MASSIVE.txt




  0%|                                        | 0/27736 [00:00<?, ?frames/s][A
 10%|██▋                         | 2652/27736 [00:07<01:09, 361.96frames/s][A
 20%|█████▌                      | 5480/27736 [00:14<00:59, 371.98frames/s][A
 30%|████████▎                   | 8232/27736 [00:22<00:52, 369.94frames/s][A
 40%|██████████▊                | 11064/27736 [00:32<00:50, 327.95frames/s][A
 50%|█████████████▋             | 14004/27736 [00:40<00:40, 335.13frames/s][A
 61%|████████████████▎          | 16782/27736 [00:49<00:32, 334.28frames/s][A
 70%|██████████████████▊        | 19350/27736 [00:56<00:25, 334.32frames/s][A
 79%|█████████████████████▍     | 22048/27736 [01:05<00:17, 321.16frames/s][A
 89%|████████████████████████   | 24704/27736 [01:13<00:09, 328.03frames/s][A
100%|███████████████████████████| 27736/27736 [01:23<00:00, 331.14frames/s][A



Found YouTube Video (URL): https://www.youtube.com/watch?v=TsMh1YTa7aI

Downloading (Video): [TsMh1YTa7aI] Donald Trump Must Win Pennsylvania!.mp4

 ↳ |█████████████████████████████████████████| 100.0%

Extracting (Audio): [TsMh1YTa7aI] Donald Trump Must Win Pennsylvania!.mp3

Transcribing (Text): [TsMh1YTa7aI] Donald Trump Must Win Pennsylvania!.txt




  0%|                                        | 0/84657 [00:00<?, ?frames/s][A
  3%|▉                           | 2948/84657 [00:07<03:39, 372.78frames/s][A
  7%|█▉                          | 5926/84657 [00:15<03:29, 374.96frames/s][A
 10%|██▉                         | 8800/84657 [00:24<03:36, 350.97frames/s][A
 14%|███▋                       | 11540/84657 [00:32<03:31, 346.46frames/s][A
 17%|████▌                      | 14244/84657 [00:40<03:22, 348.28frames/s][A
 20%|█████▎                     | 16706/84657 [00:47<03:16, 345.00frames/s][A
 23%|██████▏                    | 19328/84657 [00:54<03:06, 349.68frames/s][A
 26%|███████                    | 22136/84657 [01:02<02:58, 350.13frames/s][A
 29%|███████▊                   | 24498/84657 [01:10<02:58, 337.77frames/s][A
 32%|████████▌                  | 27042/84657 [01:17<02:45, 348.09frames/s][A
 35%|█████████▌                 | 29976/84657 [01:24<02:26, 374.40frames/s][A
 39%|██████████▍                | 32876/84657 [01:3


Found YouTube Video (URL): https://www.youtube.com/watch?v=lKz-E0Y2dbo

Downloading (Video): [lKz-E0Y2dbo] Poll_ Trump-Harris Tied in Pennsylvania at 49% With 1% Undecided.mp4

 ↳ |█████████████████████████████████████████| 100.0%

Extracting (Audio): [lKz-E0Y2dbo] Poll_ Trump-Harris Tied in Pennsylvania at 49% With 1% Undecided.mp3

Transcribing (Text): [lKz-E0Y2dbo] Poll_ Trump-Harris Tied in Pennsylvania at 49% With 1% Undecided.txt




  0%|                                        | 0/32201 [00:00<?, ?frames/s][A
  9%|██▍                         | 2760/32201 [00:07<01:21, 359.18frames/s][A
 18%|████▉                       | 5640/32201 [00:15<01:14, 357.48frames/s][A