# Import Dependencies

In [1]:
# Import Main Dependencies
import os, re, ffmpeg, whisper
from pytubefix import YouTube, Stream
from pytubefix.cli import on_progress
from pytubefix.innertube import _default_clients

# Import Other Dependencies
import torch
from tqdm.auto import tqdm

# Define Utilities

In [2]:
def sanitize_filename(filename: str) -> str:
    """Return `filename` with characters that are invalid in file names replaced.

    Double quotes are first backslash-escaped, then every occurrence of
    `<`, `>`, `:`, `"`, `/`, `\\`, `|`, `?` or `*` is replaced by an underscore.
    Because the escape step runs first, each original double quote ends up
    as two underscores (one for the backslash, one for the quote).
    """
    # Escape double quotes before substitution
    escaped = filename.replace('"', '\\"')

    # Swap every invalid character for "_"
    return re.sub(r'[<>:"/\\|?*]', "_", escaped)
    
def read_unique_items_from_file(file: str) -> list:
    """Read the non-blank lines of a text file and return them without duplicates.

    Each line is stripped of surrounding whitespace; lines that are empty
    after stripping are ignored. Duplicates are dropped while keeping the
    order in which items first appear in the file, so repeated runs process
    the items in the same, reproducible order.

    Args:
        file: Path of the text file to read (one item per line).

    Returns:
        A list of unique, stripped, non-empty lines in first-seen order.
    """
    with open(file, "r") as f:
        stripped_lines = (line.strip() for line in f)
        # dict preserves insertion order, unlike set, so the result is deterministic
        return list(dict.fromkeys(item for item in stripped_lines if item))

# Set Configurations

In [3]:
# File Names
# Text file listing the YouTube URLs to process, one per line
yt_video_links_filename = "YouTube Video Links.txt"
# NOTE(review): not referenced by any cell visible here — presumably used by a later step; confirm
transcript_sentences_filename = "transcript_sentences.csv"

# Folder Names
# Output folders (relative to the working directory); each is created on demand by the step that writes to it
video_output_path = "Video"
audio_output_path = "Audio"
transcription_output_path = "Transcription"

# Boolean Flags
# When True, the downloaded video file is deleted once its audio has been extracted
remove_video = True
# When True, the extracted audio file is deleted once it has been transcribed
remove_audio = True

# Additional Dependency Configurations
# Make the "ANDROID_MUSIC" client entry use the same configuration as "ANDROID_CREATOR".
# NOTE(review): presumably a workaround for a pytubefix client issue — confirm it is still needed
# (this patches a private pytubefix name, so it may break on library upgrades)
_default_clients["ANDROID_MUSIC"] = _default_clients["ANDROID_CREATOR"]

# Collect Data (YouTube Videos)

In [4]:
def download_youtube_video(video_filename: str, stream: Stream) -> tuple[str, str]:
    """Download `stream` into the video output folder under `video_filename`.

    Any existing file with the same name is deleted first so that a
    corrupted file left over from an earlier run is never reused. If the
    download itself fails or is interrupted, the partially written file is
    removed before the error is re-raised, so no truncated file is left
    behind.

    Args:
        video_filename: File name (with extension) to save the download as.
        stream: The stream to download.

    Returns:
        A tuple `(video_file, video_filename)` where `video_file` is the
        path of the saved file inside `video_output_path`.

    Raises:
        Whatever `stream.download` raises (e.g. network errors); the
        exception is propagated unchanged after cleanup.
    """
    # Create Video Directory
    os.makedirs(video_output_path, exist_ok=True)

    # Set Path for Video File
    video_file = os.path.join(video_output_path, video_filename)

    # Delete Old Existing Video File (note: to clean any corrupted file)
    if os.path.exists(video_file):
        os.remove(video_file)

    # Download Video File
    print("") # Just New Line for Better Output
    print(f'Downloading (Video): {video_filename}')
    print("") # Just New Line for Better Output
    try:
        stream.download(output_path=video_output_path, filename=video_filename)
    except BaseException:
        # Remove the truncated file (also on KeyboardInterrupt), then let the caller see the error
        if os.path.exists(video_file):
            os.remove(video_file)
        raise
    print("") # Just New Line for Better Output
    print("") # Just New Line for Better Output

    # Return Video File and Name
    return video_file, video_filename

# Audio Extraction (Video to Audio)

In [5]:
def extract_audio_from_video(video_file: str, video_filename: str) -> tuple[str, str]:
    """Convert a downloaded video file to an MP3 in the audio output folder.

    The MP3 keeps the video's file name with the extension swapped for
    ".mp3". An existing MP3 of the same name is deleted before ffmpeg runs.

    Args:
        video_file: Path of the input video file.
        video_filename: File name of the video (used to derive the MP3 name).

    Returns:
        A tuple `(audio_file, audio_filename)`: the path of the MP3 inside
        `audio_output_path` and its file name.
    """
    # Audio name = video name with its extension replaced by ".mp3"
    stem, _ = os.path.splitext(video_filename)
    audio_filename = f'{stem}.mp3'
    audio_file = os.path.join(audio_output_path, audio_filename)

    # Make sure the audio folder exists and clear any stale (possibly corrupted) file
    os.makedirs(audio_output_path, exist_ok=True)
    if os.path.exists(audio_file):
        os.remove(audio_file)

    print(f'Extracting (Audio): {audio_filename}')
    print("") # Just New Line for Better Output

    # Build the ffmpeg pipeline step by step, then execute it (MP3 via libmp3lame)
    source = ffmpeg.input(video_file)
    target = source.output(audio_file, format="mp3", acodec="libmp3lame", loglevel="info")
    target.run(overwrite_output=True)

    return audio_file, audio_filename

# Transcription (Audio to Text)

In [6]:
# Loaded Whisper models keyed by (model name, device), so the weights are
# loaded once per kernel session instead of once per audio file.
_whisper_model_cache = {}


def _get_whisper_model(model_name: str, device):
    """Return the Whisper model for `model_name` on `device`, loading it on first use."""
    cache_key = (model_name, str(device))
    if cache_key not in _whisper_model_cache:
        _whisper_model_cache[cache_key] = whisper.load_model(model_name, device=device)
    return _whisper_model_cache[cache_key]


def transcribe_audio_to_text(audio_file: str, audio_filename: str, model_name: str = "small.en"):
    """Transcribe an audio file with OpenAI Whisper and save the text to disk.

    The transcript is written to `transcription_output_path` using the audio
    file's name with the extension replaced by ".txt". The whole transcript
    is produced in memory before the file is opened; if writing fails, the
    incomplete file is deleted and the error is re-raised so the caller does
    not treat the item as finished.

    Args:
        audio_file: Path of the audio file to transcribe.
        audio_filename: File name of the audio (used to derive the .txt name).
        model_name: Whisper model to use. Defaults to "small.en".

    Raises:
        Any exception raised while loading the model, transcribing, or
        writing the transcript.

    Whisper model notes (kept from the original author):
        Models:       tiny, base, small, medium, large, turbo
        English-only: tiny.en, base.en, small.en, medium.en

        Required VRAM:              Speed:
            1) 1GB - tiny, base         1) 10x - tiny
            2) 2GB - small              2) 8x - turbo
            3) 5GB - medium             3) 7x - base
            4) 6GB - turbo              4) 4x - small
            5) 10GB - large             5) 2x - medium
                                        6) 1x - large

        Quote from OpenAI:
            - The .en models for English-only applications tend to perform
              better, especially for the tiny.en and base.en models. We
              observed that the difference becomes less significant for the
              small.en and medium.en models.

        Note: the author's GPU only has 4GB of VRAM, which is why small.en
        is the default.
    """
    # Create the Transcription Directory
    os.makedirs(transcription_output_path, exist_ok=True)

    # Set Transcription File Name ("[YouTube Video ID] [title].txt")
    transcription_filename = f'{os.path.splitext(audio_filename)[0]}.txt'

    # Set Path for Transcription File
    transcription_file = os.path.join(transcription_output_path, transcription_filename)

    # Get/Download OpenAI Whisper Model (cached after the first call)
    print(f'Transcribing (Text): {transcription_filename}')
    print("") # Just New Line for Better Output
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = _get_whisper_model(model_name, device)

    # Transcribe Audio File (Saves Whole Text in Memory Before Disk to Avoid Corruption)
    result = model.transcribe(audio_file, fp16=False, verbose=False)
    try:
        # Explicit UTF-8 so non-ASCII characters in the transcript never fail on
        # platforms whose default encoding cannot represent them
        with open(transcription_file, "w", encoding="utf-8") as f:
            f.write(result["text"])
    except BaseException:
        # Do not leave a partial transcript behind: its presence would make later
        # runs skip this video. Re-raise so the caller can report the failure.
        if os.path.exists(transcription_file):
            os.remove(transcription_file)
        raise

# Execute Data Gathering

In [None]:
yt_urls = read_unique_items_from_file(yt_video_links_filename)

# Captures the text inside the first "[...]" of a file name; output files are
# named "[<YouTube Video ID>] <title>.<ext>", so group 1 is the video ID.
video_id_pattern_in_filename = re.compile(r'\[(.*?)\]')


def _find_files_for_video_id(directory: str, video_id: str) -> list:
    """Return the full paths of files in `directory` whose "[...]" prefix equals `video_id`.

    Returns an empty list when `directory` does not exist.
    """
    if not os.path.isdir(directory):
        return []
    matching_paths = []
    for name in os.listdir(directory):
        found = video_id_pattern_in_filename.search(name)
        if found and found.group(1) == video_id:
            # Join with the directory: os.listdir yields bare names, not paths
            matching_paths.append(os.path.join(directory, name))
    return matching_paths


with tqdm(total=len(yt_urls), desc="Getting YouTube URLs") as pbar:
    for index, url in enumerate(yt_urls):
        current = f'{index+1}/{len(yt_urls)}'
        try:
            # Get Video Information
            yt = YouTube(url, on_progress_callback=on_progress)
            video_id = yt.video_id

            # Skip If Transcription with YouTube ID Already Exists
            # (checked before requesting streams, since only the video ID is needed)
            pbar.set_description(f'Checking Existing Transcriptions [{current} File]')
            if video_id and _find_files_for_video_id(transcription_output_path, video_id):
                # Delete/Keep Video File
                if remove_video:
                    for existing_video_file in _find_files_for_video_id(video_output_path, video_id):
                        os.remove(existing_video_file)

                # Delete/Keep Audio File
                if remove_audio:
                    for existing_audio_file in _find_files_for_video_id(audio_output_path, video_id):
                        os.remove(existing_audio_file)

                continue

            # Pick the audio-only stream; fail with a clear message if there is none
            stream = yt.streams.get_audio_only()
            if stream is None:
                raise RuntimeError("No audio-only stream available")

            # Sanitize Video File Name and Add YouTube Video ID
            video_filename = f'[{video_id}] {sanitize_filename(stream.default_filename)}'

            # Log YouTube URL being Processed
            print("") # Just New Line for Better Output
            print(f'Found YouTube Video (URL): {url}')

            # Download YouTube Video
            pbar.set_description(f'Downloading [{current} Video] ')
            video_file, video_filename = download_youtube_video(video_filename, stream)

            # Extract Audio from Video -> Delete/Keep Video File
            pbar.set_description(f'Extracting [{current} Audio]')
            audio_file, audio_filename = extract_audio_from_video(video_file, video_filename)
            if remove_video: os.remove(video_file)

            # Transcribe Audio to Text -> Delete/Keep Audio File
            pbar.set_description(f'Transcribing [{current} Text]')
            transcribe_audio_to_text(audio_file, audio_filename)
            if remove_audio: os.remove(audio_file)
        except Exception as e:
            # Best effort: report the failure for this URL and move on to the next one
            print(f'{e}: {url}')
        finally:
            # Advance exactly once per URL, whether it was skipped, processed, or failed
            pbar.update(1)

    pbar.set_description("Finished Data Gathering")

Getting YouTube URLs:   0%|          | 0/269 [00:00<?, ?it/s]


Found YouTube Video (URL): https://www.youtube.com/watch?v=qxlYlY8z6ug

Downloading (Video): [qxlYlY8z6ug] Why Obama's support is important for Harris campaign, Democrats.mp4

 ↳ |█████████████████████████████████████████| 100.0%

Extracting (Audio): [qxlYlY8z6ug] Why Obama's support is important for Harris campaign, Democrats.mp3

Transcribing (Text): [qxlYlY8z6ug] Why Obama's support is important for Harris campaign, Democrats.txt



  checkpoint = torch.load(fp, map_location=device)

  0%|                                        | 0/31850 [00:00<?, ?frames/s][A
  9%|██▍                         | 2724/31850 [00:06<01:10, 413.51frames/s][A
 18%|████▉                       | 5676/31850 [00:13<01:04, 406.89frames/s][A
 27%|███████▍                    | 8446/31850 [00:21<01:00, 385.40frames/s][A
 35%|█████████▍                 | 11146/31850 [00:28<00:54, 380.90frames/s][A
 44%|███████████▉               | 14062/31850 [00:37<00:49, 360.73frames/s][A
 52%|██████████████▏            | 16676/31850 [00:45<00:42, 354.78frames/s][A
 61%|████████████████▍          | 19326/31850 [00:52<00:34, 364.48frames/s][A
 69%|██████████████████▋        | 22022/31850 [01:00<00:28, 350.30frames/s][A
 77%|████████████████████▋      | 24446/31850 [01:08<00:21, 337.62frames/s][A
 86%|███████████████████████▏   | 27362/31850 [01:15<00:12, 352.16frames/s][A
 94%|█████████████████████████▍ | 29982/31850 [01:25<00:05, 326.18frames/s][A



Found YouTube Video (URL): https://www.youtube.com/watch?v=YBMP19BF26w

Downloading (Video): [YBMP19BF26w] Democrats DOMINATING Early Voting in Pennsylvania! Can Kamala Harris win the state.mp4

IncompleteRead(8961984 bytes read, 475200 more expected): https://www.youtube.com/watch?v=YBMP19BF26w

Found YouTube Video (URL): https://www.youtube.com/watch?v=UTpIrlywFlI

Downloading (Video): [UTpIrlywFlI] VP Harris & President Biden hit the campaign trail in Pennsylvania.mp4

 ↳ |█████████████████████████████████████████| 100.0%

Extracting (Audio): [UTpIrlywFlI] VP Harris & President Biden hit the campaign trail in Pennsylvania.mp3

Transcribing (Text): [UTpIrlywFlI] VP Harris & President Biden hit the campaign trail in Pennsylvania.txt




  0%|                                        | 0/32986 [00:00<?, ?frames/s][A
  7%|██                          | 2364/32986 [00:05<01:06, 460.33frames/s][A
 14%|████                        | 4772/32986 [00:11<01:07, 419.78frames/s][A
 24%|██████▌                     | 7772/32986 [00:17<00:56, 445.01frames/s][A
 31%|████████▎                  | 10228/32986 [00:23<00:51, 438.73frames/s][A
 38%|██████████▎                | 12532/32986 [00:28<00:45, 451.80frames/s][A
 44%|███████████▉               | 14644/32986 [00:34<00:43, 416.93frames/s][A
 52%|██████████████▏            | 17300/32986 [00:40<00:37, 413.52frames/s][A
 59%|███████████████▉           | 19416/32986 [00:46<00:34, 393.47frames/s][A
 64%|█████████████████▏         | 21040/32986 [00:52<00:33, 354.13frames/s][A
 70%|██████████████████▊        | 23000/32986 [00:58<00:28, 345.95frames/s][A
 74%|████████████████████       | 24560/32986 [01:05<00:27, 310.68frames/s][A
 80%|█████████████████████▍     | 26260/32986 [01:1


Found YouTube Video (URL): https://www.youtube.com/watch?v=Tk4V4hjONHQ

Downloading (Video): [Tk4V4hjONHQ] What Pennsylvania voters want to hear from Harris, Trump at debate.mp4

 ↳ |█████████████████████████████████████████| 100.0%

Extracting (Audio): [Tk4V4hjONHQ] What Pennsylvania voters want to hear from Harris, Trump at debate.mp3

Transcribing (Text): [Tk4V4hjONHQ] What Pennsylvania voters want to hear from Harris, Trump at debate.txt




  0%|                                        | 0/48288 [00:00<?, ?frames/s][A
  6%|█▌                          | 2752/48288 [00:06<01:55, 395.53frames/s][A
 11%|███▏                        | 5532/48288 [00:13<01:45, 403.40frames/s][A
 17%|████▋                       | 8000/48288 [00:20<01:47, 375.43frames/s][A
 23%|██████                     | 10872/48288 [00:28<01:39, 377.26frames/s][A
 28%|███████▌                   | 13536/48288 [00:36<01:38, 353.05frames/s][A
 34%|█████████                  | 16316/48288 [00:44<01:29, 356.46frames/s][A
 39%|██████████▍                | 18668/48288 [00:53<01:32, 321.02frames/s][A
 45%|████████████               | 21528/48288 [01:02<01:22, 322.92frames/s][A
 50%|█████████████▌             | 24320/48288 [01:11<01:14, 320.20frames/s][A
 56%|███████████████▎           | 27280/48288 [01:18<01:01, 343.24frames/s][A
 63%|████████████████▉          | 30200/48288 [01:27<00:53, 340.54frames/s][A
 68%|██████████████████▍        | 32976/48288 [01:3


Found YouTube Video (URL): https://www.youtube.com/watch?v=b23FBPunJJ4

Downloading (Video): [b23FBPunJJ4] Trump and Harris Neck-and-Neck in Pennsylvania and Georgia.mp4

 ↳ |█████████████████████████████████████████| 100.0%

Extracting (Audio): [b23FBPunJJ4] Trump and Harris Neck-and-Neck in Pennsylvania and Georgia.mp3

Transcribing (Text): [b23FBPunJJ4] Trump and Harris Neck-and-Neck in Pennsylvania and Georgia.txt




  0%|                                       | 0/110016 [00:00<?, ?frames/s][A
  2%|▌                          | 2414/110016 [00:05<04:01, 445.67frames/s][A
  5%|█▏                         | 5058/110016 [00:12<04:20, 403.54frames/s][A
  7%|█▊                         | 7474/110016 [00:18<04:22, 391.21frames/s][A
  9%|██▍                       | 10374/110016 [00:26<04:22, 380.08frames/s][A
 12%|███                       | 12946/110016 [00:33<04:18, 375.15frames/s][A
 14%|███▋                      | 15454/110016 [00:40<04:15, 369.41frames/s][A
 16%|████▎                     | 18086/110016 [00:47<04:09, 368.85frames/s][A
 19%|████▉                     | 20758/110016 [00:55<04:02, 368.19frames/s][A
 21%|█████▍                    | 23238/110016 [01:02<04:01, 360.03frames/s][A
 24%|██████▏                   | 26078/110016 [01:08<03:40, 380.58frames/s][A
 26%|██████▊                   | 28850/110016 [01:16<03:34, 377.53frames/s][A
 29%|███████▍                  | 31386/110016 [01:2


Found YouTube Video (URL): https://www.youtube.com/watch?v=ssndF5S-TMo

Downloading (Video): [ssndF5S-TMo] TRUMP ENDORSES OZ! - Analyzing Trump's Pennsylvania Senate Race Endorsement.mp4

 ↳ |█████████████████████████████████████████| 100.0%

Extracting (Audio): [ssndF5S-TMo] TRUMP ENDORSES OZ! - Analyzing Trump's Pennsylvania Senate Race Endorsement.mp3

Transcribing (Text): [ssndF5S-TMo] TRUMP ENDORSES OZ! - Analyzing Trump's Pennsylvania Senate Race Endorsement.txt




  0%|                                        | 0/73082 [00:00<?, ?frames/s][A
  4%|█                           | 2808/73082 [00:08<03:36, 323.96frames/s][A
  8%|██▏                         | 5808/73082 [00:18<03:39, 306.07frames/s][A
 11%|███▏                        | 8300/73082 [00:26<03:30, 307.27frames/s][A
 15%|████                       | 10828/73082 [00:35<03:26, 301.24frames/s][A
 18%|████▉                      | 13480/73082 [00:43<03:10, 313.63frames/s][A
 22%|██████                     | 16248/73082 [00:52<03:00, 315.09frames/s][A
 26%|███████                    | 19040/73082 [01:01<02:52, 313.44frames/s][A
 30%|████████                   | 21766/73082 [01:08<02:38, 323.82frames/s][A
 34%|█████████▏                 | 24746/73082 [01:18<02:30, 321.18frames/s][A
 37%|██████████                 | 27326/73082 [01:28<02:33, 297.46frames/s][A
 41%|███████████                | 30070/73082 [01:37<02:22, 302.58frames/s][A
 45%|████████████▏              | 33054/73082 [01:4


Found YouTube Video (URL): https://www.youtube.com/watch?v=QPwatK1n4uU

Downloading (Video): [QPwatK1n4uU] How Pennsylvania voters reacted to Harris-Trump debate.mp4

 ↳ |█████████████████████████████████████████| 100.0%

Extracting (Audio): [QPwatK1n4uU] How Pennsylvania voters reacted to Harris-Trump debate.mp3

Transcribing (Text): [QPwatK1n4uU] How Pennsylvania voters reacted to Harris-Trump debate.txt




  0%|                                        | 0/30557 [00:00<?, ?frames/s][A
  8%|██▏                         | 2448/30557 [00:06<01:09, 406.77frames/s][A
 15%|████                        | 4500/30557 [00:12<01:11, 362.41frames/s][A
 24%|██████▋                     | 7340/30557 [00:18<00:56, 408.88frames/s][A
 33%|████████▉                  | 10072/30557 [00:25<00:51, 396.99frames/s][A
 42%|███████████▏               | 12696/30557 [00:34<00:51, 346.56frames/s][A
 49%|█████████████▎             | 15120/30557 [00:40<00:42, 359.28frames/s][A
 58%|███████████████▋           | 17764/30557 [00:46<00:33, 386.36frames/s][A
 66%|█████████████████▊         | 20112/30557 [00:52<00:26, 394.55frames/s][A
 75%|████████████████████       | 22776/30557 [00:58<00:18, 415.92frames/s][A
 82%|██████████████████████▏    | 25136/30557 [01:03<00:12, 425.74frames/s][A
 92%|████████████████████████▊  | 28024/30557 [01:10<00:06, 420.66frames/s][A
100%|███████████████████████████| 30557/30557 [01:1


Found YouTube Video (URL): https://www.youtube.com/watch?v=LDY3qoAwjqs

Downloading (Video): [LDY3qoAwjqs] Why Tim Pool and Jack Posobiec support Trump.mp4

 ↳ |█████████████████████████████████████████| 100.0%

Extracting (Audio): [LDY3qoAwjqs] Why Tim Pool and Jack Posobiec support Trump.mp3

Transcribing (Text): [LDY3qoAwjqs] Why Tim Pool and Jack Posobiec support Trump.txt




  0%|                                        | 0/52217 [00:00<?, ?frames/s][A
  6%|█▌                          | 2968/52217 [00:11<03:11, 257.50frames/s][A
 11%|███▏                        | 5872/52217 [00:25<03:29, 221.35frames/s][A
 11%|███▏                        | 5872/52217 [00:39<03:29, 221.35frames/s][A
 17%|████▋                       | 8688/52217 [00:41<03:36, 200.78frames/s][A
 17%|████▋                       | 8688/52217 [00:59<03:36, 200.78frames/s][A
 22%|█████▉                     | 11468/52217 [00:59<03:47, 178.90frames/s][A
 22%|█████▉                     | 11468/52217 [01:10<03:47, 178.90frames/s][A
 28%|███████▍                   | 14388/52217 [01:12<03:12, 196.03frames/s][A
 33%|████████▉                  | 17176/52217 [01:25<02:52, 202.94frames/s][A
 38%|██████████▎                | 19948/52217 [01:35<02:28, 217.29frames/s][A
 44%|███████████▊               | 22736/52217 [01:48<02:13, 220.92frames/s][A
 49%|█████████████▎             | 25644/52217 [01:5


Found YouTube Video (URL): https://www.youtube.com/watch?v=bCJt_Tf7ABs

Downloading (Video): [bCJt_Tf7ABs] Kamala Harris Wins Pennsylvania in 2024.mp4

 ↳ |█████████████████████████████████████████| 100.0%

Extracting (Audio): [bCJt_Tf7ABs] Kamala Harris Wins Pennsylvania in 2024.mp3

Transcribing (Text): [bCJt_Tf7ABs] Kamala Harris Wins Pennsylvania in 2024.txt




  0%|                                        | 0/38584 [00:00<?, ?frames/s][A
  8%|██▏                         | 2952/38584 [00:06<01:15, 474.73frames/s][A
 15%|████▎                       | 5888/38584 [00:12<01:08, 476.45frames/s][A
 22%|██████▏                     | 8608/38584 [00:19<01:09, 432.84frames/s][A
 30%|███████▉                   | 11396/38584 [00:25<01:03, 429.42frames/s][A
 37%|█████████▉                 | 14136/38584 [00:33<01:00, 406.23frames/s][A
 43%|███████████▌               | 16580/38584 [00:39<00:53, 407.69frames/s][A
 50%|█████████████▍             | 19280/38584 [00:46<00:48, 401.04frames/s][A
 57%|███████████████▍           | 22032/38584 [00:52<00:39, 419.58frames/s][A
 64%|█████████████████▏         | 24600/38584 [00:58<00:34, 409.32frames/s][A
 71%|███████████████████        | 27260/38584 [01:04<00:26, 430.92frames/s][A
 77%|████████████████████▋      | 29612/38584 [01:10<00:21, 413.51frames/s][A
 83%|██████████████████████▍    | 32016/38584 [01:1


Found YouTube Video (URL): https://www.youtube.com/watch?v=1IkNvUaCwIU

Downloading (Video): [1IkNvUaCwIU] Trump ends town hall early, sways to music for over 30 minutes.mp4

 ↳ |█████████████████████████████████████████| 100.0%

Extracting (Audio): [1IkNvUaCwIU] Trump ends town hall early, sways to music for over 30 minutes.mp3

Transcribing (Text): [1IkNvUaCwIU] Trump ends town hall early, sways to music for over 30 minutes.txt




  0%|                                        | 0/64669 [00:00<?, ?frames/s][A
  5%|█▎                          | 2984/64669 [00:09<03:11, 321.77frames/s][A
  9%|██▍                         | 5720/64669 [00:17<03:04, 319.84frames/s][A
 13%|███▋                        | 8560/64669 [00:26<02:48, 332.21frames/s][A
 18%|████▊                      | 11448/64669 [00:34<02:40, 331.14frames/s][A
 18%|████▊                      | 11448/64669 [00:45<02:40, 331.14frames/s][A
 22%|█████▉                     | 14248/64669 [00:47<03:00, 278.78frames/s][A
 26%|███████▏                   | 17120/64669 [00:58<02:53, 273.37frames/s][A
 31%|████████▎                  | 20048/64669 [01:07<02:34, 288.47frames/s][A
 35%|█████████▌                 | 22808/64669 [01:17<02:26, 284.77frames/s][A
 40%|██████████▊                | 25760/64669 [01:29<02:24, 270.18frames/s][A
 44%|███████████▉               | 28728/64669 [01:43<02:24, 248.15frames/s][A
 44%|███████████▉               | 28728/64669 [01:5