In [1]:
# Import Dependencies
import html
import os, re
from urllib.parse import parse_qs, urlsplit

import pandas as pd
from IPython.display import HTML, IFrame, display, clear_output
from pytubefix.contrib.search import Search, Filter
from pytubefix.innertube import _default_clients

In [2]:
def read_unique_items_from_file(file: str) -> list:
    """Return the distinct non-blank lines of a text file.

    Each line is stripped of surrounding whitespace; blank lines are dropped.
    Duplicates are removed while keeping the order of first appearance, so the
    result is the same on every run.

    Args:
        file: Path of the text file to read.

    Returns:
        List of unique stripped lines, or an empty list when `file` does not
        exist.
    """
    if not os.path.exists(file):
        return []
    with open(file, "r") as f:
        stripped_lines = (line.strip() for line in f)
        # dict.fromkeys de-duplicates while preserving insertion order,
        # unlike set() whose iteration order varies between runs.
        return list(dict.fromkeys(item for item in stripped_lines if item))
def read_dict_from_csv(file: str, key: str = "URL", value: str = "Title") -> dict:
    """Load two columns of a CSV file as a `{key column: value column}` mapping.

    Args:
        file: Path of the CSV file.
        key: Name of the column whose cells become the dictionary keys.
        value: Name of the column whose cells become the dictionary values.

    Returns:
        The mapping built from the file, or an empty dict when `file` does not
        exist.
    """
    if not os.path.exists(file):
        return {}
    # Undecodable bytes are dropped instead of aborting the load.
    table = pd.read_csv(file, encoding_errors="ignore")
    value_by_key = table.set_index(key)[value]
    return value_by_key.to_dict()
def ask_user(prompt: str) -> bool:
    """Ask a yes/no question until the user gives a recognisable answer.

    Any non-empty, case-insensitive prefix of "yes" ("y", "ye", "yes") counts
    as yes and any non-empty prefix of "no" ("n", "no") counts as no.
    Surrounding whitespace is ignored. Everything else, including an empty
    line, asks again.

    Args:
        prompt: Text shown to the user on every attempt.

    Returns:
        True for a yes answer, False for a no answer.
    """
    while True:
        answer = input(prompt).strip().lower()
        # Every string starts with "", so an empty answer must be rejected
        # explicitly; otherwise just pressing Enter would count as "yes".
        if not answer:
            continue
        if "yes".startswith(answer):
            return True
        if "no".startswith(answer):
            return False
def display_video(url: str):
    """Show `url` in a 490x275 inline frame in the notebook output.

    Args:
        url: Address loaded by the frame (the caller passes an embed URL).
    """
    # Extra strings are passed to IFrame through its `extras` argument.
    frame_extras = ["controls", "autoplay", 'allow="autoplay"']
    player = IFrame(url, width=490, height=275, extras=frame_extras)
    display(player)
def get_yt_id(url: str):
    """Extract the value of the `v` query parameter from a URL.

    The URL is parsed with the standard library, so parameters without `=`
    (e.g. `?autoplay&v=...`), values containing `=`, an empty query and a
    trailing `#fragment` no longer raise or leak into the result.

    Args:
        url: A URL such as `https://www.youtube.com/watch?v=<id>`.

    Returns:
        The first `v` value (possibly an empty string for `?v=`), or None when
        `url` is not a string, cannot be parsed, has no query string, or has
        no `v` parameter.
    """
    if not isinstance(url, str):
        return None
    try:
        query_string = urlsplit(url).query
    except ValueError:
        # urlsplit rejects some malformed netlocs (e.g. a broken IPv6 literal).
        return None
    if not query_string:
        return None
    # keep_blank_values preserves the historical "" result for "?v=".
    values = parse_qs(query_string, keep_blank_values=True).get("v")
    return values[0] if values else None

In [3]:
# Folder that holds both URL collection files; the review loop creates it on
# demand before writing.
output_path = "YouTube URL Collection"
# File names inside `output_path`:
#   - used URLs are stored as a CSV with the columns "URL" and "Title"
#   - unused (rejected) URLs are stored as plain text, one URL per line
used_urls_filename = "Used URLs.csv"
unused_urls_filename = "Unused URLs.txt"
# Passed to Search as both `use_oauth` and `allow_oauth_cache`.
authorize_yt = False # Change to True if YouTube detects you as a bot
# Maximum height, in pixels, of the scrollable description box shown per video
max_description_height = 80
# Replace pytubefix's "ANDROID_MUSIC" client settings with the "ANDROID_CREATOR"
# ones. NOTE(review): presumably a workaround for a client that stopped
# working — confirm it is still needed with the installed pytubefix version.
_default_clients["ANDROID_MUSIC"] = _default_clients["ANDROID_CREATOR"]
# ANSI escape sequences used to colour printed text
class pcolors:
    """Terminal colour codes inserted into printed strings."""
    # Resets all text attributes back to the default
    ENDC = "\033[0m"
    # 24-bit foreground colour RGB(67, 227, 235)
    BRIGHTCYAN = "\033[38;2;67;227;235m"

In [None]:
# Load the video ids recorded by earlier sessions.
# `used_yt_dict` maps video id -> title for accepted videos and
# `unused_yt_ids` holds the ids of rejected videos; URLs from which no
# non-empty id can be extracted are ignored.
used_yt_dict = {
    saved_id: saved_title
    for saved_id, saved_title in (
        (get_yt_id(saved_url), saved_title)
        for saved_url, saved_title in read_dict_from_csv(
            os.path.join(output_path, used_urls_filename)
        ).items()
    )
    if saved_id not in (None, "")
}
unused_yt_ids = {
    saved_id
    for saved_id in map(
        get_yt_id,
        read_unique_items_from_file(os.path.join(output_path, unused_urls_filename)),
    )
    if saved_id not in (None, "")
}
# YouTube search query.
# All whitespace, including the line breaks, is collapsed to single spaces and
# the ends are trimmed, so the terms can be spread over several lines.
# NOTE(review): terms prefixed with "-" are presumably YouTube exclusion
# operators — confirm against the search behaviour.
search_query = re.sub(r'\s+', " ", """
    
    trump kamala harris donald 
    pennsylvania PA P.A.
    -survivor -assassin -shot -campaign -rally -rallies
    
""".replace("\n", " ")).strip()
# Required/prohibited keywords used by the review loop to filter videos.
# They are matched as substrings of the lower-cased title/description, so
# write the entries in lower case.
required_uncased_keywords_in_title = [
    # A video is skipped when none of these keywords occurs in its title
    # "harris", "kamala"
] 
required_uncased_keywords_in_title_or_desc = [
    # A video is skipped when none of these keywords occurs in its title
    # or its description
    # "pennsylvania", "p.a."
] 
prohibited_uncased_keywords_in_title = [
    # A video is skipped when any of these keywords occurs in its title
    # "trump", "donald"
] 
# Filters handed to the YouTube search
yt_filters = { 
    # Only return videos
    "type": Filter.get_type("Video"),
    # Available durations: Under 4 minutes | 4 - 20 minutes | Over 20 minutes
    "duration": Filter.get_duration("4 - 20 minutes") 
}
# Search YouTube Videos
print('...Searching...')
yt_list = Search(
    query=search_query,
    filters=yt_filters,
    use_oauth=authorize_yt,
    allow_oauth_cache=authorize_yt
)
last_yt_videos_len = len(yt_list.videos)
# The keyword lists are "uncased": lower-case them once so that entries such as
# "P.A." can match the lower-cased title/description.
prohibited_title_keys = [key.lower() for key in prohibited_uncased_keywords_in_title]
required_title_keys = [key.lower() for key in required_uncased_keywords_in_title]
required_title_or_desc_keys = [key.lower() for key in required_uncased_keywords_in_title_or_desc]
# Ids already evaluated during this run. Every batch iterates `yt_list.videos`
# from the start again, so this prevents re-evaluating (and re-fetching
# metadata for) videos that were filtered out in an earlier batch.
checked_video_ids = set()
# Review loop: go through the current results, then fetch the next batch
while True:
    for yt in yt_list.videos:
        url = yt.watch_url.strip()
        video_id = get_yt_id(url)
        # Skip videos without an id or that were already handled, either in
        # this run or in a previous session (used / unused collections)
        if (
            video_id is None or video_id == ""
            or video_id in checked_video_ids
            or video_id in used_yt_dict
            or video_id in unused_yt_ids
        ): continue
        checked_video_ids.add(video_id)
        title = yt.title
        if title is None or title == "": continue
        description = yt.description
        title_lower = title.lower()
        # The description may be missing; treat it as empty text for matching
        description_lower = (description or "").lower()
        # Prohibited keywords: any hit in the title rejects the video
        if any(key in title_lower for key in prohibited_title_keys): continue
        # Required keywords: an empty list means "no requirement". (A bare
        # all(...) over an empty list is True and would skip every video.)
        if required_title_keys and not any(
            key in title_lower for key in required_title_keys
        ): continue
        if required_title_or_desc_keys and not any(
            key in title_lower or key in description_lower
            for key in required_title_or_desc_keys
        ): continue
        # Check If Year > 2020 (videos without a publish date are kept)
        publish_date = yt.publish_date
        if publish_date is not None and publish_date.year <= 2020: continue
        # Reset Output Console
        clear_output(wait=True)
        # Display the Vid
        display_video(f'{yt.embed_url}?autoplay=1')
        print("") # Just New Line for Better Output
        print(f'Title: {pcolors.BRIGHTCYAN}{title}{pcolors.ENDC}')
        print("") # Just New Line for Better Output
        if description is not None and description != "":
            # The description is untrusted text from YouTube: escape it so it
            # is shown literally instead of being interpreted as HTML.
            display(HTML(f"""<div style="max-height:{max_description_height}px;overflow-y:scroll;border-bottom:1px solid #4c4c4c;padding: 0px 8px; color:rgb(67,227,235);font-family: menlo, consolas, 'DejaVu Sans Mono', monospace">{html.escape(description)}</div>"""))
            print("") # Just New Line for Better Output
        print(f'Publish Year: {pcolors.BRIGHTCYAN}{publish_date.year if publish_date is not None else "None"}{pcolors.ENDC}')
        print("") # Just New Line for Better Output
        print(f'URL If Video Failed To Load: {url}')
        print("") # Just New Line for Better Output
        accepted = ask_user("Do you want to add this Video URL (y/n)?")
        # Create Collection Directory
        os.makedirs(output_path, exist_ok=True)
        if accepted:
            # Record the video and rewrite the whole "used" CSV
            used_yt_dict[video_id] = title
            (
                pd.DataFrame([
                    (f'https://www.youtube.com/watch?v={yt_id}', saved_title)
                    for yt_id, saved_title in used_yt_dict.items()
                    if yt_id is not None and yt_id != ""
                ], columns=["URL", "Title"])
                .to_csv(os.path.join(output_path, used_urls_filename), index=False, errors="ignore")
            )
        else:
            # Record the rejection and rewrite the whole "unused" file; the
            # ids are sorted so the file content is stable between runs
            unused_yt_ids.add(video_id)
            with open(os.path.join(output_path, unused_urls_filename), "w") as file:
                file.write("\n".join([
                    f'https://www.youtube.com/watch?v={yt_id}'
                    for yt_id in sorted(
                        yt_id for yt_id in unused_yt_ids
                        if yt_id is not None and yt_id != ""
                    )
                ]))
    # Search Next Batch
    print("") # Just New Line for Better Output
    print(f'...Searching [{len(yt_list.videos)} Total Video]...')
    yt_list.get_next_results()
    yt_videos_len = len(yt_list.videos)
    # Stop If No More Videos Found (the result list did not grow)
    if yt_videos_len > last_yt_videos_len:
        last_yt_videos_len = yt_videos_len
        continue
    print("") # Just New Line for Better Output
    print("No More Videos Found")
    break


Title: [38;2;67;227;235mBREAKING: Union Leader WARNS Kamala Harris AGAINST Choosing Shapiro & Kelly For Vice President....[0m




Publish Year: [38;2;67;227;235m2024[0m

URL If Video Failed To Load: https://youtube.com/watch?v=gB9Hy93jec8

