# Initialize Dependencies

In [200]:
import os, re, ffmpeg, whisper
from pytubefix import YouTube, Stream
from pytubefix.cli import on_progress

from tqdm.auto import tqdm

# Set Variables

In [201]:
yt_video_links_file = "YouTube Video Links.txt"
video_output_path = "Video"
audio_output_path = "Audio"
transcription_output_path = "Transcription"

# Define Utilities

In [202]:
def sanitize_filename(filename: str) -> str:
    """Replace characters that are invalid in file names with underscores.

    Each occurrence of one of  < > : " / \\ | ? *  is replaced by a single '_'.
    All other characters are left untouched.

    Args:
        filename: Proposed file name (not a full path; separators are replaced).

    Returns:
        The file name with every invalid character replaced by '_'.
    """
    # Note: double quotes are NOT backslash-escaped first. The backslash is
    # itself an invalid character, so escaping only turned one '"' into '__'.
    invalid_chars = re.compile(r'[<>:"/\\|?*]')
    return invalid_chars.sub('_', filename)

# Collect Data (YouTube Videos)

In [203]:
def download_youtube_video(video_filename: str, stream: Stream) -> tuple[str, str]:
    """Download a stream into the video output directory.

    Any file already present at the target path is deleted first, so a
    partial or corrupted file from an earlier run is never reused.

    Args:
        video_filename: File name (with extension) to save the download as.
        stream: The stream object whose `download` method is called.

    Returns:
        A `(video_file, video_filename)` tuple: the path of the downloaded
        file inside `video_output_path`, and the file name that was passed in.
    """
    os.makedirs(video_output_path, exist_ok=True)
    target_path = os.path.join(video_output_path, video_filename)

    # Start from a clean slate: drop whatever an earlier run left behind
    if os.path.exists(target_path):
        os.remove(target_path)

    # Status line, then a blank line before the download output
    print(f'Downloading (Video): {video_filename}')
    print("")
    stream.download(output_path=video_output_path, filename=video_filename)

    # Two blank lines to separate the download output from what follows
    for _ in range(2):
        print("")

    return target_path, video_filename

# Audio Extraction (Video to Audio)

In [204]:
def extract_audio_from_video(video_file: str, video_filename: str) -> tuple[str, str]:
    """Convert a downloaded file to MP3 inside the audio output directory.

    The audio file keeps the base name of `video_filename` with the extension
    replaced by ".mp3". Any file already present at that path is deleted
    before the conversion runs.

    Args:
        video_file: Path of the input file handed to ffmpeg.
        video_filename: File name of the input, used to derive the MP3 name.

    Returns:
        A `(audio_file, audio_filename)` tuple: the path of the MP3 inside
        `audio_output_path`, and its file name.
    """
    os.makedirs(audio_output_path, exist_ok=True)

    # Same base name as the video, extension swapped for ".mp3"
    base_name, _extension = os.path.splitext(video_filename)
    audio_filename = f'{base_name}.mp3'
    audio_file = os.path.join(audio_output_path, audio_filename)

    # Remove a possibly corrupted leftover from an earlier run
    if os.path.exists(audio_file):
        os.remove(audio_file)

    # Status line, then a blank line before ffmpeg's own log output
    print(f'Extracting (Audio): {audio_filename}')
    print("")

    # Build the ffmpeg job step by step, then execute it
    source = ffmpeg.input(video_file)
    conversion = source.output(audio_file, format='mp3', acodec='libmp3lame', loglevel="info")
    conversion.run(overwrite_output=True)

    return audio_file, audio_filename

# Transcription (Audio to Text)

In [205]:
def transcribe_audio(audio_file: str, audio_filename: str, index: int):
    """Transcribe an audio file with Whisper and save the text to disk.

    The transcription is written to
    `<transcription_output_path>/[<index>] <audio base name>.txt`.
    Any file already present at that path is deleted before transcribing.

    Args:
        audio_file: Path of the audio file to transcribe.
        audio_filename: File name of the audio, used to derive the .txt name.
        index: Number placed in the "[index]" prefix of the output file name.

    Returns:
        None. The result is the text file written to disk.
    """
    # Create the Transcription Directory
    os.makedirs(transcription_output_path, exist_ok=True)

    # Set Transcription File Name ("[index]....txt")
    transcription_filename = f'[{index}] {os.path.splitext(audio_filename)[0]}.txt'

    # Set Path for Transcription File
    transcription_file = os.path.join(transcription_output_path, transcription_filename)

    # Delete Old Existing Transcription File (note: to clean any corrupted file)
    if os.path.exists(transcription_file):
        os.remove(transcription_file)

    # Get/Download OpenAI Whisper Model
    # Models: "tiny", "base", "small", "medium", "large", "turbo"
    # English Only Models: "tiny.en", "base.en", "small.en", "medium.en"
    # NOTE: the model is loaded again on every call.
    print(f'Transcribing (Text): {transcription_filename}')
    print("") # Just New Line for Better Output
    model = whisper.load_model("small.en", device="cpu")

    # Transcribe Audio File
    result = model.transcribe(audio_file, verbose=False)

    # Write as UTF-8 explicitly: without an encoding, open() uses the platform
    # default, which can fail on (or mangle) non-ASCII characters in the text.
    with open(transcription_file, 'w', encoding='utf-8') as f:
        f.write(result['text'])

# Execute Data Gathering

In [206]:
# Read the URL list, skipping blank lines and dropping duplicates.
# dict.fromkeys keeps the first occurrence of each URL in file order, so the
# index given to each video (used in its "[index] ..." transcription file name)
# is the same on every run; a set would not guarantee any particular order.
with open(yt_video_links_file, "r") as file:
    yt_urls = list(dict.fromkeys(line.strip() for line in file if line.strip()))

with tqdm(total=len(yt_urls), desc="Getting YouTube URLs") as pbar:
    for index, url in enumerate(yt_urls):
        current = f"[{index+1}/{len(yt_urls)}]"

        # Get Video Information
        yt = YouTube(url, on_progress_callback=on_progress)
        stream = yt.streams.get_audio_only()

        # Sanitize Video File Name
        video_filename = sanitize_filename(stream.default_filename)

        # Expected transcription file name without the "[index] " prefix
        native_transcription_filename = f'{os.path.splitext(video_filename)[0]}.txt'

        # Skip If Transcription Already Exists (under any "[index] " prefix)
        pbar.set_description(f"Checking Existing Transcription File {current}")
        transcription_exists = os.path.exists(transcription_output_path) and any(
            re.sub(r'^\[\d+\]\s+', '', existing_transcription_filename) == native_transcription_filename
            for existing_transcription_filename in os.listdir(transcription_output_path)
        )
        if transcription_exists:
            pbar.update(1)
            continue

        # Download YouTube Video
        pbar.set_description(f"Downloading (Video) {current}")
        video_file, video_filename = download_youtube_video(video_filename, stream)

        # Extract Audio from Video -> Delete Video File
        pbar.set_description(f"Extracting (Audio) {current}")
        audio_file, audio_filename = extract_audio_from_video(video_file, video_filename)
        os.remove(video_file)

        # Transcribe Audio to Text -> Delete Audio File
        pbar.set_description(f"Transcribing (Text) {current}")
        transcribe_audio(audio_file, audio_filename, index)
        os.remove(audio_file)

        pbar.update(1)
    pbar.set_description("Finished Data Gathering")

Getting YouTube URLs:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading (Video): Arizona Gen Z voters sit down to talk 2024 election.mp4

 ↳ |██████████████████████████████████████████████████████████████████| 100.0%

Extracting (Audio): Arizona Gen Z voters sit down to talk 2024 election.mp3

Transcribing (Text): [0] Arizona Gen Z voters sit down to talk 2024 election.txt

Detected language: English



  0%|                                                                                    | 0/46755 [00:00<?, ?frames/s][A
  6%|████                                                                    | 2636/46755 [00:12<03:36, 203.43frames/s][A
 11%|███████▉                                                                | 5168/46755 [00:25<03:25, 201.98frames/s][A
 11%|███████▉                                                                | 5168/46755 [00:36<03:25, 201.98frames/s][A
 17%|████████████▏                                                           | 7952/46755 [00:44<03:44, 172.82frames/s][A
 17%|████████████▏                                                           | 7952/46755 [00:56<03:44, 172.82frames/s][A
 23%|████████████████                                                       | 10600/46755 [01:01<03:41, 163.30frames/s][A
 23%|████████████████                                                       | 10600/46755 [01:12<03:41, 163.30frames/s][A
 29%|██████████

Downloading (Video): Michigan’s Muslims Helped Biden Win in 2020. Will They Back Harris in Nov._ _ Amanpour and Company.mp4

 ↳ |██████████████████████████████████████████████████████████████████| 100.0%

Extracting (Audio): Michigan’s Muslims Helped Biden Win in 2020. Will They Back Harris in Nov._ _ Amanpour and Company.mp3

Transcribing (Text): [1] Michigan’s Muslims Helped Biden Win in 2020. Will They Back Harris in Nov._ _ Amanpour and Company.txt

Detected language: English



  0%|                                                                                   | 0/111430 [00:00<?, ?frames/s][A
  3%|█▊                                                                     | 2902/111430 [00:15<09:34, 188.82frames/s][A
  3%|█▊                                                                     | 2902/111430 [00:27<09:34, 188.82frames/s][A
  5%|███▋                                                                   | 5784/111430 [00:32<09:59, 176.30frames/s][A
  5%|███▋                                                                   | 5784/111430 [00:47<09:59, 176.30frames/s][A
  8%|█████▌                                                                 | 8688/111430 [00:50<10:01, 170.80frames/s][A
  8%|█████▌                                                                 | 8688/111430 [01:02<10:01, 170.80frames/s][A
 10%|███████▎                                                              | 11544/111430 [01:06<09:46, 170.25frames/s][A
 10%|███████▎  

 83%|██████████████████████████████████████████████████████████            | 92526/111430 [09:42<02:00, 157.37frames/s][A
 86%|███████████████████████████████████████████████████████████▉          | 95386/111430 [09:50<01:40, 159.03frames/s][A
 86%|███████████████████████████████████████████████████████████▉          | 95386/111430 [10:02<01:40, 159.03frames/s][A
 88%|█████████████████████████████████████████████████████████████▊        | 98342/111430 [10:07<01:19, 164.39frames/s][A
 88%|█████████████████████████████████████████████████████████████▊        | 98342/111430 [10:18<01:19, 164.39frames/s][A
 91%|██████████████████████████████████████████████████████████████▋      | 101154/111430 [10:23<01:01, 166.28frames/s][A
 91%|██████████████████████████████████████████████████████████████▋      | 101154/111430 [10:38<01:01, 166.28frames/s][A
 93%|████████████████████████████████████████████████████████████████▍    | 103974/111430 [10:41<00:45, 162.80frames/s][A
 93%|███████████

Downloading (Video): The ‘battleground state’ of Pennsylvania is most important in US presidential election.mp4

 ↳ |██████████████████████████████████████████████████████████████████| 100.0%

Extracting (Audio): The ‘battleground state’ of Pennsylvania is most important in US presidential election.mp3

Transcribing (Text): [2] The ‘battleground state’ of Pennsylvania is most important in US presidential election.txt

Detected language: English



  0%|                                                                                    | 0/32589 [00:00<?, ?frames/s][A
  9%|██████▏                                                                 | 2812/32589 [00:18<03:14, 152.85frames/s][A
  9%|██████▏                                                                 | 2812/32589 [00:30<03:14, 152.85frames/s][A
 18%|████████████▋                                                           | 5768/32589 [00:34<02:40, 167.52frames/s][A
 18%|████████████▋                                                           | 5768/32589 [00:45<02:40, 167.52frames/s][A
 26%|██████████████████▉                                                     | 8572/32589 [00:53<02:30, 159.66frames/s][A
 26%|██████████████████▉                                                     | 8572/32589 [01:05<02:30, 159.66frames/s][A
 35%|████████████████████████▌                                              | 11260/32589 [01:09<02:11, 162.13frames/s][A
 35%|██████████

# Data Preprocessing

# Merge Transcriptions

# Text Cleaning: Topic Modeling and Sentence Filtering

# Data Sentiment Annotation

# Train-Validation-Test Split

# Training: Sentiment Analysis with BERT

# Validation: Hyperparameter Tuning and Model Optimization

# Testing: Model Evaluation

# Comparative Analysis 

# Proof of Concept