In [None]:
pip install git+https://github.com/openai/whisper.git

In [None]:
pip install pytube pandas

In [9]:
import pytube as pt
import whisper
import pandas as pd
import os
import re

In [10]:
# Load the "base" Whisper checkpoint.
# No device is forced: whisper.load_model selects CUDA when it is available and
# falls back to the CPU otherwise, so the notebook also runs without a GPU.
model = whisper.load_model("base")

In [17]:
# Folder (relative to the working directory) that receives the audio files.
output_folder = "audio"

# Upper bound on the number of playlist entries that are processed.
num_videos = 90

# YouTube playlist whose videos are downloaded.
playlist_url = "https://youtube.com/playlist?list=PL4A2F331EE86DCC22&si=wHXUk-zrjM2IalVh"

In [18]:
# Download the audio track of the first `num_videos` playlist entries and
# record, for each successful download, its position, URL and local file path.
playlist = pt.Playlist(playlist_url)
video_data = []
missing_videos = 0

# The download below is only given a file name inside `output_folder`, so the
# folder has to exist before the first file is written.
os.makedirs(output_folder, exist_ok=True)

for i, url in enumerate(playlist.video_urls[:num_videos], start=1):
    print(url)

    try:
        # Fetch the YouTube video
        yt = pt.YouTube(url)

        # Filter and select the first audio-only stream
        stream = yt.streams.filter(only_audio=True).first()
        if stream is None:
            # Without this guard the failure would surface as an opaque
            # AttributeError on `None.download`.
            print(f"Video {url} has no audio-only stream and will be skipped.")
            missing_videos += 1
            continue

        # Output file name: "<playlist position>_tagesschau.mp3".
        # NOTE(review): the extension is always ".mp3", whatever format the
        # selected stream actually has -- confirm downstream tools do not
        # rely on the extension.
        output_file = os.path.join(output_folder, f"{i}_tagesschau.mp3")

        # Download the audio stream
        stream.download(filename=output_file)

        # Append the video data to the list
        video_data.append({
            "ID": i,
            "url": url,
            "path": output_file
        })

    except pt.exceptions.AgeRestrictedError:
        print(f"Video {url} is age restricted and will be skipped.")
        missing_videos += 1
    except Exception as e:
        # Best effort: one broken video must not abort the whole playlist.
        print(f"An error occurred with video {url}: {e}")
        missing_videos += 1

# Report the outcome so skipped videos do not go unnoticed.
print(f"Downloaded {len(video_data)} videos, skipped {missing_videos}.")

https://www.youtube.com/watch?v=pWuYkzRypf8
https://www.youtube.com/watch?v=MAmyhxJoZFM
https://www.youtube.com/watch?v=AVqitc2vJ0c
https://www.youtube.com/watch?v=Nmwe8gXfu1c
https://www.youtube.com/watch?v=-okNLhbCcHc
https://www.youtube.com/watch?v=SSOlNLbcA70
https://www.youtube.com/watch?v=0CL0PHE56dg
https://www.youtube.com/watch?v=bGv-H3Xdinw


KeyboardInterrupt: 

In [None]:
# One row per downloaded video: playlist position, source URL, local file path.
df = pd.DataFrame(
    data=video_data,
    columns=["ID", "url", "path"],
)

In [None]:
# Transcribe every downloaded audio file with Whisper.
# Each row starts with an empty transcript; rows whose transcription fails
# keep that empty string.
df['text'] = ""

for row_label, audio_path in df['path'].items():
    try:
        transcription = model.transcribe(audio_path)
        df.at[row_label, 'text'] = transcription['text']
    except Exception as e:
        print(f"An error occurred while transcribing {audio_path}: {e}")

In [None]:
def get_video_title(url):
    """Return the title of the YouTube video at ``url``.

    Args:
        url: URL of the video.

    Returns:
        The video title, or the text of the raised exception when the title
        could not be retrieved.
    """
    try:
        # pytube is imported as ``pt``; a bare ``YouTube`` name is not defined
        # in this notebook, so the call has to go through the module alias.
        return pt.YouTube(url).title
    except Exception as e:
        return str(e)

def extract_date(title):
    """Return the first dotted date found in ``title``.

    A date is one or two digits for the day, one or two digits for the month
    and two to four digits for the year, separated by dots and delimited by
    word boundaries.

    Returns:
        The matched substring, or ``None`` when ``title`` contains no date.
    """
    found = re.search(r'\b\d{1,2}\.\d{1,2}\.\d{2,4}\b', title)
    if found is None:
        return None
    return found.group(0)

def convert_date(date_str):
    """Parse a ``day.month.year`` string into a pandas timestamp.

    The four-digit-year format is tried first, then the two-digit-year one.

    Returns:
        The result of ``pd.to_datetime`` for the first format that parses,
        or ``None`` when both formats raise ``ValueError``.
    """
    candidate_formats = ['%d.%m.%Y', '%d.%m.%y']
    parsed = None
    for pattern in candidate_formats:
        try:
            parsed = pd.to_datetime(date_str, format=pattern)
        except ValueError:
            # This format does not fit; try the next one.
            continue
        break
    return parsed

In [None]:
# Look up each video's title, then pull the date string out of the title and
# parse it in a single chained step.
df['title'] = df['url'].apply(get_video_title)
df['date'] = df['title'].apply(extract_date).apply(convert_date)

df.head()

In [None]:
# Persist the results as CSV without the index column.
# The target directory is created first because DataFrame.to_csv does not
# create missing parent directories.
os.makedirs("data", exist_ok=True)
df.to_csv("data/data.csv", index=False)