# Initialize Dependencies

In [20]:
import os, re, subprocess
from pytubefix import YouTube
from pytubefix.cli import on_progress

# Data Gathering

In [25]:
# Text file that lists the YouTube URLs to fetch; it is read line by line below.
yt_video_links_file = "YouTube Video Links.txt"
# Directory that receives the downloaded media files (created on demand by the downloader).
video_output_path = "./Video"

def sanitize_filename(filename):
    """Return ``filename`` with path-hostile characters replaced.

    Each run of one or more of the characters ``/ \\ : | < > * ? "`` becomes
    the separator ``" - "``; afterwards every run of whitespace is collapsed
    to a single space and leading/trailing whitespace is removed.

    Note that two forbidden runs separated only by whitespace produce two
    consecutive separators (e.g. ``"a? | b"`` -> ``"a - - b"``).
    """
    forbidden_run = re.compile(r'[/\\:|<>*?"]+')
    whitespace_run = re.compile(r'\s+')

    with_separators = forbidden_run.sub(" - ", filename)
    single_spaced = whitespace_run.sub(" ", with_separators)
    return single_spaced.strip()

def download_youtube_video(url, index):
    """Download one YouTube item into ``video_output_path`` unless it is already there.

    A ``YouTube`` object is built for ``url`` (with ``on_progress`` as its
    progress callback) and the stream returned by ``streams.get_audio_only()``
    is used. The target name is the stream's default filename passed through
    ``sanitize_filename`` and prefixed with ``[index]``.

    Args:
        url: Address of the YouTube item to fetch.
        index: Number placed in the ``[index]`` filename prefix.

    Returns:
        None. The outcome (downloaded or skipped) is printed.
    """
    os.makedirs(video_output_path, exist_ok=True)

    media_stream = YouTube(url, on_progress_callback=on_progress).streams.get_audio_only()

    # The prefix keeps files distinguishable by their position in the links list.
    target_name = f'[{index}] {sanitize_filename(media_stream.default_filename)}'

    # Guard clause: nothing to do when the target file is already on disk.
    if os.path.exists(os.path.join(video_output_path, target_name)):
        print(f'Skipped (video already exists): {target_name}')
        return

    media_stream.download(output_path=video_output_path, filename=target_name)
    print(f'Downloaded: {target_name}')

# Feed every non-blank line of the links file to the downloader. The index is
# the line's position in the file, so blank lines are skipped but still counted.
with open(yt_video_links_file, "r") as file:
    urls = file.readlines()
    for index, url in enumerate(urls):
        url = url.strip()
        if not url:
            continue
        download_youtube_video(url, index)

Skipped (video already exists): [0] Michigan’s Muslims Helped Biden Win in 2020. Will They Back Harris in Nov. - - Amanpour and Company.mp4
Skipped (video already exists): [1] Arizona Gen Z voters sit down to talk 2024 election.mp4
Skipped (video already exists): [2] The ‘battleground state’ of Pennsylvania is most important in US presidential election.mp4


# Audio Extraction

In [26]:
# Directory that receives the extracted MP3 files.
audio_output_path = "./Audio"

def extract_audio_from_video(video_filename, video_output_path, index, audio_output_path="./Audio"):
    """Convert one media file to MP3 by invoking the ``ffmpeg`` command-line tool.

    The conversion runs through ``subprocess`` (already imported by this
    notebook) rather than an ``ffmpeg`` Python module, which the notebook never
    imports. ffmpeg's stderr is captured so a failure can be reported, and a
    partially written output file is removed on failure so that a later run
    does not mistake it for a finished conversion.

    Args:
        video_filename: Name of the source file inside ``video_output_path``.
        video_output_path: Directory that contains the source file.
        index: Unused; kept so existing callers keep working.
        audio_output_path: Directory for the MP3 (created if missing).

    Returns:
        The path of the target MP3 file. The path is returned even when the
        conversion failed; in that case the file does not exist.
    """
    os.makedirs(audio_output_path, exist_ok=True)  # Create audio output directory

    filename, _ = os.path.splitext(video_filename)
    audio_filename = f'{filename}.mp3'

    # Construct paths
    video_file = os.path.join(video_output_path, video_filename)
    audio_file = os.path.join(audio_output_path, audio_filename)

    # Check if the audio file already exists
    if os.path.exists(audio_file):
        print(f'Skipped (audio already exists): {audio_filename}')
        return audio_file

    # Argument list (no shell), so filenames with spaces or quotes need no escaping.
    command = [
        'ffmpeg',
        '-i', video_file,
        '-f', 'mp3',
        '-acodec', 'libmp3lame',
        audio_file,
        '-y',  # overwrite without prompting
    ]

    try:
        subprocess.run(
            command,
            stdin=subprocess.DEVNULL,  # keep ffmpeg from reading the kernel's stdin
            stderr=subprocess.PIPE,    # captured so the failure reason can be printed
            check=True,
        )
    except FileNotFoundError:
        print(f'Error converting {video_filename}: ffmpeg executable not found on PATH')
    except subprocess.CalledProcessError as e:
        # Drop any partial output so the "already exists" check stays trustworthy.
        if os.path.exists(audio_file):
            os.remove(audio_file)
        details = e.stderr.decode(errors='replace') if e.stderr else f'exit code {e.returncode}'
        print(f'Error converting {video_filename}: {details}')
    else:
        print(f'Converted: {video_filename} to {audio_filename}')

    return audio_file

# Convert every file in the download directory whose name ends with one of the
# listed media extensions; anything else is ignored.
for video_filename in os.listdir(video_output_path):
    if not video_filename.endswith(('.mp4', '.mkv', '.webm', '.flv', '.avi')):
        continue
    # The third argument (index) is passed as a fixed placeholder value.
    extract_audio_from_video(video_filename, video_output_path, 0, audio_output_path)

Skipped (audio already exists): [0] Michigan’s Muslims Helped Biden Win in 2020. Will They Back Harris in Nov. - - Amanpour and Company.mp3
Skipped (audio already exists): [1] Arizona Gen Z voters sit down to talk 2024 election.mp3
Skipped (audio already exists): [2] The ‘battleground state’ of Pennsylvania is most important in US presidential election.mp3
