# Import Dependencies

In [1]:
# Import Dependencies
import os, re, torch, ffmpeg, whisper
from pytubefix import YouTube, Stream
from pytubefix.cli import on_progress
from pytubefix.innertube import _default_clients
from tqdm.auto import tqdm

# Define Utilities

In [2]:
def sanitize_filename(filename: str) -> str:
    """Return ``filename`` with characters that are unsafe in file names replaced.

    The characters ``<``, ``>``, ``:``, ``/``, backslash, ``|``, ``?`` and ``*``
    each become a single underscore. A double quote becomes two underscores:
    it is treated as an escaped quote (backslash + quote) and both of those
    characters are replaced.
    """
    # Single-character replacements for everything except the double quote.
    replacements = {char: "_" for char in '<>:/\\|?*'}

    # A quote stands for "backslash + quote", i.e. two replaced characters.
    replacements['"'] = "__"

    return filename.translate(str.maketrans(replacements))
    
def read_unique_items_from_file(file: str) -> list:
    """Read the non-blank lines of ``file`` and return them without duplicates.

    Every line is stripped of surrounding whitespace. Only the first occurrence
    of each item is kept and the order of the file is preserved, so repeated
    runs process the items in the same sequence.

    Args:
        file: Path of a text file holding one item per line.

    Returns:
        The unique, stripped, non-empty lines in file order, or an empty list
        when ``file`` does not exist.
    """
    if not os.path.exists(file):
        return []

    with open(file, "r") as f:
        stripped_lines = (line.strip() for line in f)
        # dict.fromkeys de-duplicates while keeping insertion order; a set would
        # return the items in an arbitrary order that can change between runs.
        return list(dict.fromkeys(item for item in stripped_lines if item))

# Set Configurations

In [3]:
# File Names
# Text file listing the YouTube URLs to process (read line by line, blank lines ignored).
yt_video_links_filename = os.path.join("YouTube URL Collection", "Used URLs.txt")

# Folder Names
# Output folders for each pipeline stage; each is created on demand by the stage that writes to it.
video_output_path = "Video"
audio_output_path = "Audio"
transcription_output_path = "Transcription"

# Boolean Flags
# remove_video: delete the downloaded file once its audio has been extracted.
# remove_audio: delete the extracted audio once it has been transcribed.
remove_video = True
remove_audio = True
authorize_yt = False # Set to True if YouTube detects you as a bot (passed as use_oauth / allow_oauth_cache)

# Additional Dependency Configurations
# Makes pytubefix's "ANDROID_MUSIC" client entry reuse the "ANDROID_CREATOR" client settings.
# NOTE(review): presumably a workaround for a client that stopped working - confirm it is still
# needed with the installed pytubefix version.
_default_clients["ANDROID_MUSIC"] = _default_clients["ANDROID_CREATOR"]

# Collect Data (YouTube Videos)

In [4]:
def download_youtube_video(video_filename: str, stream: Stream) -> tuple[str, str]:
    """Download ``stream`` into the video folder under the name ``video_filename``.

    A file already present at the destination is deleted first, so a leftover
    (possibly corrupted) download is never reused.

    Returns:
        ``(video_file, video_filename)``: the destination path and the file name.
    """
    os.makedirs(video_output_path, exist_ok=True)
    video_file = os.path.join(video_output_path, video_filename)

    # Drop any stale file so the download always starts fresh.
    if os.path.exists(video_file):
        os.remove(video_file)

    # Blank lines around the messages keep the cell output readable.
    print()
    print(f'Downloading (Video): {video_filename}')
    print()
    stream.download(output_path=video_output_path, filename=video_filename)
    print()
    print()

    return video_file, video_filename

# Audio Extraction (Video to Audio)

In [5]:
def extract_audio_from_video(video_file: str, video_filename: str) -> tuple[str, str]:
    """Convert ``video_file`` to an MP3 stored in the audio folder.

    The audio file keeps the base name of ``video_filename`` and gets the
    ``.mp3`` extension. A file already present at the destination is deleted
    before the conversion starts.

    Returns:
        ``(audio_file, audio_filename)``: the destination path and the file name.
    """
    os.makedirs(audio_output_path, exist_ok=True)

    # "[YouTube Video ID] [title].mp3"
    stem, _extension = os.path.splitext(video_filename)
    audio_filename = f'{stem}.mp3'
    audio_file = os.path.join(audio_output_path, audio_filename)

    # Drop any stale file so a corrupted earlier extraction is never reused.
    if os.path.exists(audio_file):
        os.remove(audio_file)

    print(f'Extracting (Audio): {audio_filename}')
    print()

    # Build the ffmpeg job step by step, then execute it.
    source = ffmpeg.input(video_file)
    conversion = source.output(audio_file, format="mp3", acodec="libmp3lame", loglevel="info")
    conversion.run(overwrite_output=True)

    return audio_file, audio_filename

# Transcription (Audio to Text)

In [6]:
# Whisper models already loaded in this session, keyed by (model name, device name).
_whisper_model_cache = {}


def _get_whisper_model(model_name: str, device_name: str):
    """Return the Whisper model ``model_name`` on ``device_name``, loading it only once."""
    cache_key = (model_name, device_name)
    if cache_key not in _whisper_model_cache:
        _whisper_model_cache[cache_key] = whisper.load_model(
            model_name, device=torch.device(device_name)
        )
    return _whisper_model_cache[cache_key]


def transcribe_audio_to_text(audio_file: str, audio_filename: str) -> None:
    """Transcribe ``audio_file`` with Whisper and save the text in the transcription folder.

    The transcript is named after ``audio_filename`` with a ``.txt`` extension
    ("[YouTube Video ID] [title].txt") and is written as UTF-8.

    Args:
        audio_file: Path of the audio file to transcribe.
        audio_filename: File name of the audio file; its base name is reused
            for the transcript.

    Raises:
        Any exception raised while writing the transcript is re-raised after
        the partially written file has been removed, so the caller can tell
        that no transcript was produced.
    """
    # Create the Transcription Directory
    os.makedirs(transcription_output_path, exist_ok=True)

    # Set Transcription File Name ("[YouTube Video ID] [title].txt")
    transcription_filename = f'{os.path.splitext(audio_filename)[0]}.txt'

    # Set Path for Transcription File
    transcription_file = os.path.join(transcription_output_path, transcription_filename)

    # Get/Download OpenAI Whisper Model
    #
    # Models:
    #     tiny, base, small, medium, large, turbo
    # English-Only:
    #     tiny.en, base.en, small.en, medium.en
    #
    # Required VRAM:              Speed:
    #     1) 1GB - tiny, base         1) 10x - tiny
    #     2) 2GB - small              2) 8x - turbo
    #     3) 5GB - medium             3) 7x - base
    #     4) 6GB - turbo              4) 4x - small
    #     5) 10GB - large             5) 2x - medium
    #                                 6) 1x - large
    #
    # Quote from OpenAI:
    #     - The .en models for English-only applications tend to perform better,
    #       especially for the tiny.en and base.en models. We observed that the
    #       difference becomes less significant for the small.en and medium.en models.
    #
    # Note: small.en is used because the author's GPU only has 4GB of VRAM.
    print(f'Transcribing (Text): {transcription_filename}')
    print("") # Just New Line for Better Output
    device_name = "cuda" if torch.cuda.is_available() else "cpu"
    # Reuse the model across calls instead of reloading it for every file.
    model = _get_whisper_model("small.en", device_name)

    # Transcribe Audio File (Saves Whole Text in Memory Before Disk to Avoid Corruption)
    result = model.transcribe(audio_file, fp16=False, verbose=False)
    try:
        # Explicit UTF-8: the platform default encoding may not be able to
        # represent every character Whisper emits.
        with open(transcription_file, "w", encoding="utf-8") as f:
            f.write(result["text"])
    except BaseException:
        # Never leave a partial transcript behind, then let the caller see the
        # failure instead of silently continuing without a transcript.
        if os.path.exists(transcription_file):
            os.remove(transcription_file)
        raise

# Execute Data Gathering

In [None]:
# Matches the first "[...]" tag in a file name; files produced by this notebook
# are named "[<YouTube video id>] <title>.<ext>".
video_id_pattern_in_filename = re.compile(r'\[(.*?)\]')


def _find_files_for_video(directory: str, video_id: str) -> list:
    """Return the paths of entries in ``directory`` whose first "[...]" tag is ``video_id``.

    Returns an empty list when ``video_id`` is empty or ``directory`` does not exist.
    """
    if not video_id or not os.path.isdir(directory):
        return []

    matches = []
    for name in os.listdir(directory):
        if name == ".ipynb_checkpoints":
            continue
        tag = video_id_pattern_in_filename.search(name)
        if tag and tag.group(1) == video_id:
            # os.listdir yields bare names; join with the folder to get a usable path.
            matches.append(os.path.join(directory, name))
    return matches


def _remove_files_for_video(directory: str, video_id: str) -> None:
    """Delete every regular file in ``directory`` that belongs to ``video_id``."""
    for path in _find_files_for_video(directory, video_id):
        if os.path.isfile(path):
            os.remove(path)


yt_urls = read_unique_items_from_file(yt_video_links_filename)

with tqdm(total=len(yt_urls), desc="Getting YouTube URLs") as pbar:
    for position, url in enumerate(yt_urls, start=1):
        current = f'{position}/{len(yt_urls)}'
        try:
            # Get Video Information
            yt = YouTube(
                url,
                use_oauth=authorize_yt,
                allow_oauth_cache=authorize_yt,
                on_progress_callback=on_progress
            )
            stream = yt.streams.get_audio_only()
            if stream is None:
                raise RuntimeError("No audio-only stream available")
            video_id = yt.video_id

            # Sanitize Video File Name and Add YouTube Video ID
            video_filename = f'[{video_id}] {sanitize_filename(stream.default_filename)}'

            # Skip If Transcription with YouTube ID Already Exists
            pbar.set_description(f'Checking Existing Transcriptions [{current} File]')
            if _find_files_for_video(transcription_output_path, video_id):
                # Clean up leftovers of an earlier run for this video.
                if remove_video:
                    _remove_files_for_video(video_output_path, video_id)
                if remove_audio:
                    _remove_files_for_video(audio_output_path, video_id)
                continue

            # Log YouTube URL being Processed
            print("") # Just New Line for Better Output
            print(f'Found YouTube Video (URL): {url}')

            # Download YouTube Video
            pbar.set_description(f'Downloading [{current} Video] ')
            video_file, video_filename = download_youtube_video(video_filename, stream)

            # Extract Audio from Video -> Delete/Keep Video File
            pbar.set_description(f'Extracting [{current} Audio]')
            audio_file, audio_filename = extract_audio_from_video(video_file, video_filename)
            if remove_video: os.remove(video_file)

            # Transcribe Audio to Text -> Delete/Keep Audio File
            pbar.set_description(f'Transcribing [{current} Text]')
            transcribe_audio_to_text(audio_file, audio_filename)
            if remove_audio: os.remove(audio_file)
        except Exception as e:
            # Log the failure and move on to the next URL.
            print(f'{e}: {url}')
        finally:
            # Exactly one progress tick per URL: processed, skipped, or failed.
            pbar.update(1)

    pbar.set_description("Finished Data Gathering")

Getting YouTube URLs:   0%|          | 0/270 [00:00<?, ?it/s]

QQk3hmin-7k is unavailable: https://www.youtube.com/watch?v=QQk3hmin-7k

Found YouTube Video (URL): https://www.youtube.com/watch?v=QIyWAm84bVc

Downloading (Video): [QIyWAm84bVc] Trump makes vulgar comment about Arnold Palmer at Pennsylvania rally.mp4

 ↳ |██████████████████████████████████████████████████████████████████| 100.0%

Extracting (Audio): [QIyWAm84bVc] Trump makes vulgar comment about Arnold Palmer at Pennsylvania rally.mp3

Transcribing (Text): [QIyWAm84bVc] Trump makes vulgar comment about Arnold Palmer at Pennsylvania rally.txt



  checkpoint = torch.load(fp, map_location=device)

  0%|                                                                                    | 0/42557 [00:00<?, ?frames/s][A
  7%|████▉                                                                   | 2900/42557 [00:08<02:00, 327.86frames/s][A
 13%|█████████▍                                                              | 5600/42557 [00:16<01:50, 334.65frames/s][A
 20%|██████████████▍                                                         | 8500/42557 [00:23<01:32, 369.03frames/s][A
 26%|██████████████████▎                                                    | 11000/42557 [00:29<01:20, 393.59frames/s][A
 32%|██████████████████████▋                                                | 13600/42557 [00:35<01:10, 412.01frames/s][A
 38%|██████████████████████████▊                                            | 16100/42557 [00:43<01:10, 375.18frames/s][A
 44%|███████████████████████████████▎                                       | 18800/425


Found YouTube Video (URL): https://www.youtube.com/watch?v=usVH4wPHFAE

Downloading (Video): [usVH4wPHFAE] Trump visits a Pennsylvania McDonalds while Harris goes to a Georgia church as election nears.mp4

 ↳ |██████████████████████████████████████████████████████████████████| 100.0%

Extracting (Audio): [usVH4wPHFAE] Trump visits a Pennsylvania McDonalds while Harris goes to a Georgia church as election nears.mp3

Transcribing (Text): [usVH4wPHFAE] Trump visits a Pennsylvania McDonalds while Harris goes to a Georgia church as election nears.txt




  0%|                                                                                    | 0/20108 [00:00<?, ?frames/s][A
 11%|████████▏                                                               | 2284/20108 [00:05<00:46, 381.65frames/s][A
 23%|████████████████▎                                                       | 4544/20108 [00:11<00:37, 409.88frames/s][A
 37%|██████████████████████████▍                                             | 7400/20108 [00:17<00:30, 418.31frames/s][A
 50%|███████████████████████████████████▋                                   | 10108/20108 [00:24<00:24, 412.18frames/s][A