# Initialize Dependencies

In [54]:
import os, re, ffmpeg, whisper
from pytubefix import YouTube
from pytubefix.cli import on_progress

from tqdm.auto import tqdm

# Set Global Variables

In [55]:
# Text file that is read line by line for the YouTube URLs to download.
yt_video_links_file = "YouTube Video Links.txt"
# Folder the downloaded streams are written to (created on demand).
video_output_path = "./Video"
# Folder the extracted MP3 files are written to (created on demand).
audio_output_path = "./Audio"
# Folder the Whisper transcripts (.txt) are written to (created on demand).
transcriptions_output_path = "./Transcription"

# Data Gathering (YouTube Videos)

In [56]:
def sanitize_filename(filename):
    """Return ``filename`` with reserved characters replaced and whitespace tidied.

    Every run of slash, backslash, colon, pipe, angle bracket, asterisk,
    question mark or double quote is replaced by " - ". Each run of whitespace
    is then collapsed to one space, and leading/trailing whitespace is removed.
    """
    reserved_runs = re.compile(r'[/\\:|<>*?"]+')
    whitespace_runs = re.compile(r'\s+')

    cleaned = reserved_runs.sub(" - ", filename)
    cleaned = whitespace_runs.sub(" ", cleaned)
    return cleaned.strip()

def download_youtube_video(url, index):
    """Download the audio-only stream of one YouTube video into ``video_output_path``.

    The file is stored as ``[<index>] <sanitized default filename>``. When a
    file with that name is already present, the download is skipped.

    Args:
        url: Address of the YouTube video.
        index: Position of the video in the link list; used as filename prefix.

    Raises:
        RuntimeError: If no audio-only stream is offered for ``url``.
    """
    os.makedirs(video_output_path, exist_ok=True)

    yt = YouTube(url, on_progress_callback=on_progress)
    stream = yt.streams.get_audio_only()
    if stream is None:
        # get_audio_only() may return None when no stream matches; fail with a
        # message naming the URL instead of an AttributeError on the next line.
        raise RuntimeError(f'No audio-only stream available for: {url}')

    video_filename = sanitize_filename(stream.default_filename)
    video_filename = f'[{index}] {video_filename}'

    file_path = os.path.join(video_output_path, video_filename)
    if os.path.exists(file_path):
        print(f'Skipped (video already exists): {video_filename}')
        return

    stream.download(output_path=video_output_path, filename=video_filename)
    print(f'Downloaded (Video): {video_filename}')

# Collect the non-blank lines of the links file, stripped of surrounding whitespace.
yt_urls = []
with open(yt_video_links_file, "r") as file:
    yt_urls = [link for link in map(str.strip, file) if link]

# Download each link in order; the list position becomes the filename prefix.
with tqdm(total=len(yt_urls), desc="Downloading YouTube Videos", unit="video") as pbar:
    for index, url in enumerate(yt_urls):
        download_youtube_video(url, index)
        pbar.update()
# The list is not needed past this point; drop the reference.
yt_urls = None

Downloading YouTube Videos:   0%|          | 0/3 [00:00<?, ?video/s]

Skipped (video already exists): [0] Michigan’s Muslims Helped Biden Win in 2020. Will They Back Harris in Nov. - - Amanpour and Company.mp4
Skipped (video already exists): [1] Arizona Gen Z voters sit down to talk 2024 election.mp4
Skipped (video already exists): [2] The ‘battleground state’ of Pennsylvania is most important in US presidential election.mp4


# Audio Extraction (Video to Audio)

In [57]:
def extract_audio_from_video(video_filename):
    """Convert one file from ``video_output_path`` into an MP3 in ``audio_output_path``.

    The MP3 keeps the base name of the source file. An MP3 that already exists
    is left untouched. A failed conversion is reported with ``print`` rather
    than raised, so the calling loop carries on with the next file.

    Args:
        video_filename: Name (not path) of a file inside ``video_output_path``.
    """
    os.makedirs(audio_output_path, exist_ok=True)

    filename, _ = os.path.splitext(video_filename)
    audio_filename = f'{filename}.mp3'

    video_file = os.path.join(video_output_path, video_filename)
    audio_file = os.path.join(audio_output_path, audio_filename)

    if os.path.exists(audio_file):
        print(f'Skipped (audio already exists): {audio_filename}')
        return

    try:
        (
            ffmpeg
            .input(video_file)
            .output(audio_file, format='mp3', acodec='libmp3lame', loglevel="info")
            .run(overwrite_output=True)
        )
    except ffmpeg.Error as e:
        # run() is called without capture_stderr, so e.stderr is None here;
        # calling .decode() on it would raise inside this handler.
        details = e.stderr.decode(errors='replace') if e.stderr else str(e)
        print(f'Error converting {video_filename}: {details}')
        # A partial MP3 would be mistaken for a finished one by the
        # exists-check above on the next run, so remove it.
        if os.path.exists(audio_file):
            os.remove(audio_file)
    else:
        print(f'Extracted (Audio): {audio_filename}')

# Create the folder if no download has done so yet; os.listdir() would
# otherwise raise FileNotFoundError (e.g. when the links file was empty).
os.makedirs(video_output_path, exist_ok=True)
# os.listdir() returns entries in arbitrary order; sort for a reproducible run.
video_files = sorted(os.listdir(video_output_path))
with tqdm(total=len(video_files), desc='Extracting Audio Files', unit="audio") as pbar:
    for video_filename in video_files:
        extract_audio_from_video(video_filename)
        pbar.update(1)
# The listing is not needed past this point; drop the reference.
video_files = None

Extracting Audio Files:   0%|          | 0/3 [00:00<?, ?audio/s]

Skipped (audio already exists): [0] Michigan’s Muslims Helped Biden Win in 2020. Will They Back Harris in Nov. - - Amanpour and Company.mp3
Skipped (audio already exists): [1] Arizona Gen Z voters sit down to talk 2024 election.mp3
Skipped (audio already exists): [2] The ‘battleground state’ of Pennsylvania is most important in US presidential election.mp3


# Transcription (Audio to Text)

In [58]:
# Whisper models already loaded in this kernel, keyed by (model name, device).
# Models stay in memory until this dict is cleared (``_WHISPER_MODELS.clear()``).
_WHISPER_MODELS = {}


def _get_whisper_model(name="small", device="cpu"):
    """Return the Whisper model for ``name``/``device``, loading it on first use only."""
    key = (name, device)
    if key not in _WHISPER_MODELS:
        _WHISPER_MODELS[key] = whisper.load_model(name, device=device)
    return _WHISPER_MODELS[key]


def transcribe_audio(audio_filename):
    """Transcribe one file from ``audio_output_path`` into ``transcriptions_output_path``.

    The transcript is written to ``<audio base name>.txt``. When that file
    already exists the audio is not transcribed again and no model is loaded.

    Args:
        audio_filename: Name (not path) of a file inside ``audio_output_path``.
    """
    os.makedirs(transcriptions_output_path, exist_ok=True)

    audio_file = os.path.join(audio_output_path, audio_filename)
    transcription_filename = f'{os.path.splitext(audio_filename)[0]}.txt'
    transcription_file = os.path.join(transcriptions_output_path, transcription_filename)

    if os.path.exists(transcription_file):
        print(f'Skipped (transcription already exists): {transcription_filename}')
        return

    # Models: "tiny", "base", "small", "medium", "large", "turbo"
    # English Only Models:"tiny.en", "base.en", "small.en", "medium.en"
    # Loaded lazily and reused across calls instead of once per audio file.
    model = _get_whisper_model("small", device="cpu")

    result = model.transcribe(audio_file, verbose=False)

    # NOTE(review): the file is written with the platform default encoding;
    # confirm whoever reads these transcripts uses the same one.
    with open(transcription_file, 'w') as f:
        f.write(result['text'])

    print(f'Transcribed (Text): {transcription_filename}')

# Create the folder if no extraction has done so yet; os.listdir() would
# otherwise raise FileNotFoundError (e.g. when there were no videos).
os.makedirs(audio_output_path, exist_ok=True)
# os.listdir() returns entries in arbitrary order; sort for a reproducible run.
audio_files = sorted(os.listdir(audio_output_path))
with tqdm(total=len(audio_files), desc='Transcribing Audio Files', unit="transcription") as pbar:
    for audio_filename in audio_files:
        transcribe_audio(audio_filename)
        pbar.update(1)
# The listing is not needed past this point; drop the reference.
audio_files = None

Transcribing Audio Files:   0%|          | 0/3 [00:00<?, ?transcription/s]

Skipped (transcription already exists): [0] Michigan’s Muslims Helped Biden Win in 2020. Will They Back Harris in Nov. - - Amanpour and Company.txt
Skipped (transcription already exists): [1] Arizona Gen Z voters sit down to talk 2024 election.txt
Skipped (transcription already exists): [2] The ‘battleground state’ of Pennsylvania is most important in US presidential election.txt


# Merge Transcripts