In [5]:
from urllib.parse import urlparse, parse_qs
from googleapiclient.discovery import build
from dotenv import load_dotenv
import os

In [58]:
import pandas as pd
from pathlib import Path
import re

In [4]:
load_dotenv()

True

In [6]:
# Read the API key from the process environment (the earlier load_dotenv() cell loads a .env file).
# NOTE(review): os.getenv returns None when the variable is unset — confirm the key is present.
API_KEY = os.getenv("YOUTUBE_API_KEY")
# Build a "youtube" v3 API client that authenticates with the developer key.
YOUTUBE = build("youtube", "v3", developerKey=API_KEY)

In [7]:
def video_id(url):
    """Return the YouTube video ID contained in ``url``.

    Two URL shapes are handled:

    - watch URLs, where the ID is the ``v`` query parameter
      (``https://www.youtube.com/watch?v=VIDEO_ID``);
    - short links, where the ID is the last path segment
      (``https://youtu.be/VIDEO_ID``).

    Args:
        url: The URL string to inspect.

    Returns:
        The video ID as a string. For short links any query string
        (``?t=30``), fragment (``#t=30``) or trailing slash is ignored.
    """
    parts = urlparse(url)
    qs = parse_qs(parts.query)
    if "v" in qs:                # regular watch URL
        return qs["v"][0]
    # short youtu.be/VIDEO_ID: work on the parsed path so the query string and
    # fragment are already stripped, and drop a trailing "/" before splitting.
    return parts.path.rstrip("/").split("/")[-1]

# Sample inputs covering the two URL shapes that video_id() distinguishes.
urls = [
    "https://www.youtube.com/watch?v=dQw4w9WgXcQ",  # watch URL: ID is in the ?v= query parameter
    "https://youtu.be/3JZ_D3ELwOQ",                 # short link: ID is the last path segment
]

In [10]:
video_id(urls[1])

'3JZ_D3ELwOQ'

In [15]:
# Fetch the `snippet` part for the sample videos in one videos.list request.
# The comma-separated ID list is derived from `urls` via video_id(), so the
# request cannot drift from the example URLs defined earlier.
# maxResults is intentionally not passed: the videos.list reference documents
# it as unsupported in conjunction with the `id` filter.
resp = YOUTUBE.videos().list(
    part='snippet',
    id=",".join(video_id(u) for u in urls),
).execute()

In [18]:
len(resp['items'])

2

In [13]:
# Fetch the `snippet` and `statistics` parts for a single channel, looked up by channel ID.
channel = YOUTUBE.channels().list(
    part='snippet,statistics',
    id="UCuAXFkgsw1L7xaCfnd5JJOw",  # channel ID; the recorded response for it has the title 'Rick Astley'
    maxResults=50  # NOTE(review): the recorded response reports resultsPerPage 5 — confirm this has any effect with an id filter
).execute()

In [14]:
channel

{'kind': 'youtube#channelListResponse',
 'etag': 'fLHT0aNl0lY-YoqQDfqLm5L0XR8',
 'pageInfo': {'totalResults': 1, 'resultsPerPage': 5},
 'items': [{'kind': 'youtube#channel',
   'etag': '36fApYAnyvJU4cXlkS3HRug1ZIU',
   'id': 'UCuAXFkgsw1L7xaCfnd5JJOw',
   'snippet': {'title': 'Rick Astley',
    'description': 'Never: The Autobiography -  Out now 📚',
    'customUrl': '@rickastleyyt',
    'publishedAt': '2015-02-01T16:32:30Z',
    'thumbnails': {'default': {'url': 'https://yt3.ggpht.com/K2ecE5j90a_DFzugHo0bW98vFlIQ1JJgs9mbcav7RGy1t7adJRnd2jaIv-oc6XzTRvDdWlFCAfc=s88-c-k-c0x00ffffff-no-rj',
      'width': 88,
      'height': 88},
     'medium': {'url': 'https://yt3.ggpht.com/K2ecE5j90a_DFzugHo0bW98vFlIQ1JJgs9mbcav7RGy1t7adJRnd2jaIv-oc6XzTRvDdWlFCAfc=s240-c-k-c0x00ffffff-no-rj',
      'width': 240,
      'height': 240},
     'high': {'url': 'https://yt3.ggpht.com/K2ecE5j90a_DFzugHo0bW98vFlIQ1JJgs9mbcav7RGy1t7adJRnd2jaIv-oc6XzTRvDdWlFCAfc=s800-c-k-c0x00ffffff-no-rj',
      'width': 800,
 

# YouTube data

In [29]:
# Relative directory from which the cleaned arena parquet files are read.
cleaned_arena_data_root = Path("../data", "intermediate", "cleaned_arena_data")

In [24]:
# Relative directory from which the YouTube analysis parquet files are read.
youtube_analysis_root = Path("../data/intermediate/youtube_analysis")

In [63]:
# Load the YouTube citations table (the recorded output shows citation_id, response_id, url and video_id columns).
youtube_citations_df = pd.read_parquet(youtube_analysis_root / "youtube_citations.parquet")

In [30]:
# Load the citations table from the cleaned arena data directory; it is filtered by `domain` further down.
citations_df = pd.read_parquet(cleaned_arena_data_root / "citations.parquet")

In [87]:
# Load the unique-videos table (presumably one row per cited video_id — TODO confirm).
unique_videos_df = pd.read_parquet(youtube_analysis_root / "unique_videos.parquet")

In [91]:
# Load the per-video metadata table; the api_fetch_date column in the recorded output suggests it was fetched from the API — TODO confirm.
video_metadata_df = pd.read_parquet(youtube_analysis_root / "video_metadata.parquet")

In [103]:
# Load the channel-level tables: the unique-channels table with its cited-video aggregates,
# and the per-channel metadata table.
unique_channels_df = pd.read_parquet(youtube_analysis_root / "unique_channels.parquet")
channel_metadata_df = pd.read_parquet(youtube_analysis_root / "channel_metadata.parquet")

In [115]:
# Spot-check ten channels. random_state pins the draw so the same rows are
# shown on every Restart & Run All instead of a different sample each run.
channel_metadata_df[['title', 'description', 'country']].sample(10, random_state=0)

Unnamed: 0,title,description,country
5764,Mka,ŸÖŸÜ ŸÖŸáŸÜÿßÿ≤ŸÖ Ÿà ÿ™Ÿà€å ÿ¢ŸÑŸÖÿßŸÜ ⁄©ÿ≥ÿ® Ÿà ⁄©ÿßÿ± ÿÆŸàÿØŸÖŸà ÿØÿßÿ±ŸÖ ÿ®ÿß...,DE
4635,Îã§ÎÇòÏò∑,Ìå®ÏÖòÏù¥ Î¨∏ÌôîÍ∞Ä ÎêòÍ≥† Ï∑®ÎØ∏Í∞Ä Îê† Ïàò ÏûàÎèÑÎ°ù\nÎªîÌïòÏßÄ ÏïäÍ≥† Ïû¨Î∞åÎäî Ïù¥ÏïºÍ∏∞Î•º Ìï©ÎãàÎã§\n\...,
9388,Huda TV,A light in every home\nwelcome to Huda Tv YouT...,EG
6096,Shakitum,,
1314,The Kiboomers - Kids Music Channel,"\nWelcome to The Kiboomers, where learning is ...",CA
447,ÌïëÌîÑ,Ìò∏Ìò∏Ìò∏\nÏõÉÏúºÎ©¥ Î≥µÏù¥Ïò®ÎåÄ Ìò∏ÌôìÌôì\n,KR
1823,Father Daughter Duo,Just A Dad and His Daughter Reacting To Pop Cu...,US
2564,FlexiViews,”FlexiViews” is a participant in the Amazon Se...,AU
3920,Noticias de MMA,Noticias y contenido relevante de MMA en españ...,MX
9491,Ters Maske,Minecraft Troll Videosu Yaparım :D\n\n,TR


In [116]:
channel_metadata_df.head()

Unnamed: 0,channel_id,title,description,custom_url,published_at,country,default_language,view_count,subscriber_count,video_count,hidden_subscriber_count,keywords,unsubscribed_trailer,privacy_status,is_linked,made_for_kids,api_fetch_date,created_year
0,UCd21m0AHf4Vx88Znty7v4Cw,ViniiTube,"Find anime-related videos which include Top's,...",@viniitube1,2015-03-25T12:58:59Z,US,,282488898,1120000,525,False,"ViniiTube ""anime tops"" ""top 10"" ""anime fights""...",02T3cBPUpHc,public,True,False,2025-06-23T17:52:46.637847,2015.0
1,UCkinYTS9IHqOEwR1Sze2JTw,SBS Îâ¥Ïä§,ÎåÄÌïúÎØºÍµ≠ No.1 SBSÎâ¥Ïä§,@sbsnews8,2014-05-02T10:13:29Z,KR,,14399958667,4900000,287635,False,"SBSNEWS sbs sbsÎâ¥Ïä§ korea KOREA ""korea news"" ""ko...",jNme9PEd4TQ,public,True,False,2025-06-23T17:52:46.637868,2014.0
2,UC8butISFwT-Wl7EV0hUK0BQ,freeCodeCamp.org,"Learn math, programming, and computer science ...",@freecodecamp,2014-12-16T21:18:48Z,US,,874889987,10900000,1871,False,"""coding bootcamp"" ""learn to code"" ""software en...",pCtkD5AMbDo,public,True,False,2025-06-23T17:52:46.637877,2014.0
3,UCz8QaiQxApLq8sLNcszYyJw,Firstpost,"At Firstpost, we focus on facts, keep the nois...",@firstpost,2019-10-10T08:29:47.771142Z,IN,,4086269380,8360000,50413,False,"firstpost ""palki sharma"" ""firstpost live"" ""eng...",hM2f1GrqknQ,public,True,False,2025-06-23T17:52:46.637885,
4,UC2WmuBuFq6gL08QYG-JjXKw,WorldofAI,World of Ai is here to push the creative use o...,@intheworldofai,2023-02-02T17:05:59.178275Z,CA,,12293693,142000,795,False,"Ai ""Artificial intelligence"" ""chat GPT"" gpt Op...",W2Ur7FGqsJE,public,True,False,2025-06-23T17:52:46.637891,


In [102]:
unique_channels_df

Unnamed: 0,channel_id,channel_title,cited_video_count,total_views_of_cited_videos,avg_views_of_cited_videos,max_views_of_cited_videos,earliest_cited_video,latest_cited_video
0,UCBEQPygCM9o0xoO0t45yXdQ,David Wowee,82,400939,4.889500e+03,17205,2022-06-09T09:15:58Z,2025-04-22T14:34:46Z
1,UCtyA0gFmjKC8sG78pwN-OOg,Wil Mak,51,213423,4.184765e+03,14327,2019-05-08T00:51:31Z,2024-04-12T03:08:16Z
2,UCPix8N6PMRI4KzgyjuZeF0g,Fahd Mirza,25,131101,5.244040e+03,16735,2023-07-20T04:52:02Z,2025-04-18T22:00:32Z
3,UChlgI3UHCOnwUGzWzbJ3H5w,YTN,24,1802831,7.511796e+04,531935,2025-02-10T01:02:49Z,2025-04-10T04:22:36Z
4,UC8ZDo-g26XlHXqp_MHFN9sw,Casual Crono,23,19909,8.656087e+02,3754,2023-07-19T22:45:05Z,2025-04-14T00:09:12Z
...,...,...,...,...,...,...,...,...
10084,UCMOBsdfCrAUFmG5Y9zE7SFg,Neeks,1,30274,3.027400e+04,30274,2017-01-03T04:36:37Z,2017-01-03T04:36:37Z
10085,UCMNOd06m6Eef2Qo8QP8X4eA,Rudik TV,1,350,3.500000e+02,350,2024-12-18T19:05:25Z,2024-12-18T19:05:25Z
10086,UCMMv7Nh_cjUnsz6nsdJtqlw,ÂªÉÂéü„É°„É¢„É™ / Haibara memory,1,4984236,4.984236e+06,4984236,2025-02-25T10:00:06Z,2025-02-25T10:00:06Z
10087,UCMLnVs6Kjbil0dRmKJXdkXA,Ìå©Ïä§ÌÜ†Î¶¨facstory,1,17096,1.709600e+04,17096,2025-04-05T09:01:09Z,2025-04-05T09:01:09Z


In [95]:
len(unique_videos_df)

13514

In [96]:
len(video_metadata_df)

13397

In [98]:
video_metadata_df

Unnamed: 0,video_id,title,description,channel_id,channel_title,published_at,category_id,default_language,tags,view_count,like_count,comment_count,duration,captions_available,privacy_status,upload_status,api_fetch_date,published_year
0,aEZ-CiML1pA,Ìè¨ÎØ∏Îãõ_Ïù¥Î¶ÑÏù¥ Î≠êÏòàÏöî? (What's Your Name? by 4minute@Mco...,2013ÎÖÑ 5Ïõî 2Ïùº Î™©ÏöîÏùº \nÌè¨ÎØ∏Îãõ_Ïù¥Î¶ÑÏù¥ Î≠êÏòàÏöî?\nWhat's Your Na...,UCbD8EppRX3ZwJSou-TVo90A,Mnet K-POP,2013-05-02T12:00:37Z,24,,"[""\uc5e0\ub137"", ""mnet"", ""kpop"", ""k-pop"", ""sta...",5904345,30559,1780,PT3M15S,False,public,processed,2025-06-23T17:35:48.359224,2013
1,rByIjbW5eLw,Ìè¨ÎØ∏Îãõ_Ïù¥Î¶ÑÏù¥ Î≠êÏòàÏöî? (What's Your Name? by 4minute@M C...,2013ÎÖÑ 5Ïõî 16Ïùº Î™©ÏöîÏùº\nÌè¨ÎØ∏Îãõ_Ïù¥Î¶ÑÏù¥ Î≠êÏòàÏöî?\nWhat's Your Na...,UCbD8EppRX3ZwJSou-TVo90A,Mnet K-POP,2013-05-16T12:19:44Z,24,,"[""\uc5e0\ub137"", ""mnet"", ""kpop"", ""k-pop"", ""sta...",430447,1867,227,PT3M6S,False,public,processed,2025-06-23T17:35:48.359259,2013
2,MCo_VNcVHHg,The Story of Malin Kundang | Stories for Kids ...,Malin Kundang is a popular Indonesian folktale...,UCkaXNqgjEepsGSNPDPfxWyA,Kirana Studio,2023-08-07T06:43:42Z,22,,[],5517,53,0,PT3M53S,False,public,processed,2025-06-23T17:35:48.359270,2023
3,QlEdrKDRxq8,Ìè¨ÎØ∏Îãõ_Ïù¥Î¶ÑÏù¥ Î≠êÏòàÏöî? (What's Your Name? by 4minute@Mco...,2013ÎÖÑ 5Ïõî 30Ïùº Î™©ÏöîÏùº \nÌè¨ÎØ∏Îãõ_Ïù¥Î¶ÑÏù¥ Î≠êÏòàÏöî?\nWhat's Your N...,UCbD8EppRX3ZwJSou-TVo90A,Mnet K-POP,2013-05-30T12:58:53Z,24,,"[""\uc5e0\ub137"", ""mnet"", ""kpop"", ""k-pop"", ""sta...",91057,711,40,PT3M6S,False,public,processed,2025-06-23T17:35:48.359284,2013
4,yemP_n_YenM,Where Was My Hero? - Sonic comic dub,This dub took over a year to complete due to s...,UCwuIB-1IEtYAnV9_W1ETTJA,RioDies va,2022-07-09T19:00:09Z,23,,[],599338,20017,1294,PT19M6S,False,public,processed,2025-06-23T17:35:48.359295,2022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13392,NUWu2mVbGpg,9 months learning Latin update #9: Latin Vulga...,The is my progress update of 9 months of learn...,UCL9WHWp10NHnBrw9ngXSuBA,JustinLearnsLatin,2023-03-14T05:13:10Z,22,,[],2085,46,37,PT41M42S,False,public,processed,2025-06-23T17:41:30.835611,2023
13393,NV1dRr7WGZ0,Transcribe YouTube Videos Using OpenAI Speech ...,#openai #whisper #openaiapi \n----------------...,UCKe7TlwyDiH2RM7jhI5vnXw,Skolo Online,2023-12-16T09:34:18Z,28,,"[""openai whisper"", ""openai"", ""python"", ""whispe...",3456,59,12,PT33M48S,False,public,processed,2025-06-23T17:41:30.835617,2023
13394,NV7N4roDnA4,"(Î≥¥Ïïà, ÏïàÎìúÎ°úÏù¥Îìú) apkpure ÏÇ¨Ïù¥Ìä∏ÏóêÏÑú apkÌååÏùºÏùÑ Í≤ÄÏÉâÌïòÍ≥† Îã§Ïö¥Î°úÎìú ÌïòÎäî ...",Î≥¥ÏïàÌîÑÎ°úÏ†ùÌä∏( www.boanproject.com )ÏóêÏÑú Ï†úÍ≥µÌïòÎäî ITÎ≥¥Ïïà Î¨¥Î£å Í∞ï...,UC7X47oW4QqJS9z4uanD9qIw,Î≥¥ÏïàÌîÑÎ°úÏ†ùÌä∏[boanproject],2018-11-18T13:57:27Z,28,,"[""\uc548\ub4dc\ub85c\uc774\ub4dc"", ""apk"", ""\ub...",10184,23,7,PT5M35S,False,public,processed,2025-06-23T17:41:30.835622,2018
13395,NVBe8hqFOPQ,–í—Å–µ –∞–≤—Ç–æ–º–æ–±–∏–ª–∏ –≤ Need For Speed: Unbound / All...,–í –¥–∞–Ω–Ω–æ–º —Ä–æ–ª–∏–∫–µ –ø—Ä–µ–¥—Å—Ç–∞–≤–ª–µ–Ω—ã –≤—Å–µ –∞–≤—Ç–æ–º–æ–±–∏–ª–∏ –≤ ...,UCUO8htVNJA_licrMD3KDGNQ,TOPDRIVE GAMES,2022-12-01T21:31:24Z,20,ru,"[""1080p"", ""60FPS"", ""HD"", ""PC"", ""NFS Unbound"", ...",15711,166,48,PT7M33S,False,public,processed,2025-06-23T17:41:30.835630,2022


In [97]:
unique_videos_df

Unnamed: 0,video_id,citation_count,unique_response_count,sample_url,min_citation_number,max_citation_number,min_citation_order,max_citation_order
0,aEZ-CiML1pA,60,60,https://www.youtube.com/watch?v=aEZ-CiML1pA,2,8,1,7
1,rByIjbW5eLw,44,44,https://www.youtube.com/watch?v=rByIjbW5eLw,2,4,1,3
2,MCo_VNcVHHg,27,27,https://www.youtube.com/watch?v=MCo_VNcVHHg,3,7,2,6
3,QlEdrKDRxq8,21,21,https://www.youtube.com/watch?v=QlEdrKDRxq8,4,6,3,5
4,yemP_n_YenM,19,19,https://www.youtube.com/watch?v=yemP_n_YenM,1,7,0,6
...,...,...,...,...,...,...,...,...
13509,NUWu2mVbGpg,1,1,https://www.youtube.com/watch?v=NUWu2mVbGpg,5,5,4,4
13510,NV1dRr7WGZ0,1,1,https://www.youtube.com/watch?v=NV1dRr7WGZ0,12,12,11,11
13511,NV7N4roDnA4,1,1,https://www.youtube.com/watch?v=NV7N4roDnA4,4,4,3,3
13512,NVBe8hqFOPQ,1,1,https://www.youtube.com/watch?v=NVBe8hqFOPQ,10,10,9,9


In [64]:
youtube_citations_df

Unnamed: 0,citation_id,response_id,citation_number,url,domain_full,domain,url_valid,citation_order,video_id
0,cite_00000020,resp_00000003,10,https://www.youtube.com/watch?v=yqyqsr15aKc,youtube.com,youtube.com,True,9,yqyqsr15aKc
1,cite_00000070,resp_00000011,4,https://www.youtube.com/watch?v=0HjUPnh_2Vw,youtube.com,youtube.com,True,3,0HjUPnh_2Vw
2,cite_00000076,resp_00000012,5,https://www.youtube.com/watch?v=_S0OzTJ9dS0,youtube.com,youtube.com,True,4,_S0OzTJ9dS0
3,cite_00000088,resp_00000016,1,https://www.youtube.com/watch?v=QCHogqIDY_M,youtube.com,youtube.com,True,0,QCHogqIDY_M
4,cite_00000090,resp_00000016,3,https://www.youtube.com/watch?v=QpH2KQ8mHjg,youtube.com,youtube.com,True,2,QpH2KQ8mHjg
...,...,...,...,...,...,...,...,...,...
19118,cite_00365819,resp_00065717,5,https://www.youtube.com/watch?v=amvlNWsWGq0,youtube.com,youtube.com,True,4,amvlNWsWGq0
19119,cite_00365854,resp_00065722,8,https://www.youtube.com/watch?v=eL8oSnBDt4M,youtube.com,youtube.com,True,7,eL8oSnBDt4M
19120,cite_00366003,resp_00065745,8,https://www.youtube.com/watch?v=CURb2tJBpIA,youtube.com,youtube.com,True,7,CURb2tJBpIA
19121,cite_00366012,resp_00065747,5,https://www.youtube.com/watch?v=vBIX_gshils,youtube.com,youtube.com,True,4,vBIX_gshils


In [65]:
# Keep only the citation rows whose `domain` column equals "youtube.com".
all_youtube_df = citations_df.query('domain == "youtube.com"')

In [66]:
# Citation IDs that are in the domain-filtered frame but absent from youtube_citations_df.
# sorted() gives the list a stable order: iterating a set of strings can differ between
# kernel sessions, which would make positional lookups such as filtered_out_citations[17]
# return a different ID on each run.
filtered_out_citations = sorted(set(all_youtube_df.citation_id) - set(youtube_citations_df.citation_id))

In [67]:
len(filtered_out_citations)

626

In [86]:
# Show the URL of one filtered-out citation (index 17 is presumably an arbitrary spot check).
# A boolean mask compares against the ID directly instead of splicing it into a
# query string, which would break on an ID containing a quote character.
all_youtube_df.loc[all_youtube_df["citation_id"] == filtered_out_citations[17], "url"].iloc[0]

'https://www.youtube.com/c/samanthamaria'

In [60]:
def extract_video_id_from_url(url):
    """
    Extract YouTube video ID from various URL formats.

    Supports:
    - https://www.youtube.com/watch?v=VIDEO_ID
    - https://youtu.be/VIDEO_ID
    - https://www.youtube.com/embed/VIDEO_ID
    - https://youtube.com/watch?v=VIDEO_ID

    Args:
        url: Candidate URL. Any value that is not a ``str`` yields ``None``.

    Returns:
        The video ID as a string, or ``None`` when the URL matches none of the
        supported shapes, cannot be parsed, or carries an empty ID.
    """
    if not isinstance(url, str):
        return None

    # Handle youtu.be format. The ID is the first path segment after the host,
    # so cut at the first query ("?", "&"), fragment ("#") or path ("/") delimiter.
    if "youtu.be/" in url:
        candidate = url.split("youtu.be/")[-1]
        for delimiter in "?&#/":
            candidate = candidate.split(delimiter)[0]
        # An empty remainder (e.g. "https://youtu.be/") is not a usable ID.
        return candidate or None

    # Handle youtube.com formats
    if "youtube.com" in url:
        try:
            parsed = urlparse(url)
            query_params = parse_qs(parsed.query)
        except ValueError:
            # urlparse rejects malformed authorities (e.g. an unbalanced "[");
            # treat those as "no video ID" rather than swallowing every exception.
            return None

        # Standard watch URL
        if "v" in query_params:
            return query_params["v"][0]

        # Embed URL: the ID is the path segment right after /embed/. The parsed
        # path never contains the query string, so only extra "/" segments
        # (including a trailing slash) need to be dropped.
        if "/embed/" in parsed.path:
            return parsed.path.split("/embed/")[-1].split("/")[0] or None

    return None

def is_youtube_video_url(url):
    """Tell whether ``url`` looks like a link to a YouTube video.

    The check is a case-insensitive regex search anywhere in the string for a
    watch URL carrying a ``v`` parameter, a ``youtu.be/`` short link, or an
    ``/embed/`` link. Pages that merely live on youtube.com (channels, etc.)
    do not match.

    Args:
        url: Value to test; anything that is not a ``str`` is rejected.

    Returns:
        ``True`` if any video pattern is found in ``url``, otherwise ``False``.
    """
    if isinstance(url, str):
        # The m.youtube.com entries are already covered by the generic
        # youtube.com ones (re.search matches mid-string); they are kept
        # for explicitness.
        video_url_regexes = (
            r"youtube\.com/watch\?.*[&?]v=",      # v= preceded by other query parameters
            r"youtube\.com/watch\?v=",            # v= as the first query parameter
            r"youtu\.be/",                        # short link
            r"youtube\.com/embed/",               # embedded player link
            r"m\.youtube\.com/watch\?.*[&?]v=",   # mobile host, v= after other parameters
            r"m\.youtube\.com/watch\?v=",         # mobile host, v= first
        )
        for regex in video_url_regexes:
            if re.search(regex, url, re.IGNORECASE):
                return True
    return False

In [61]:
extract_video_id_from_url('https://www.youtube.com/watch?app=desktop&v=7fLRgrVwLS4')

'7fLRgrVwLS4'

In [62]:
is_youtube_video_url('https://www.youtube.com/watch?app=desktop&v=7fLRgrVwLS4')

True