# API Data Collection

This notebook details how we collect the data we need from the YouTube Data API.

We would like to source our data from various political news sources:

- CNN
- MSNBC
- ABC
- Reuters
- FOX
- Tucker Carlson
- The Daily Wire

## 0. Import Dependencies

In [156]:
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
scopes = ["https://www.googleapis.com/auth/youtube.readonly"]

from dotenv import load_dotenv
import os
import sys
from pathlib import Path

from typing import List, Dict, Any, Optional
from typing_extensions import TypedDict
from datetime import datetime

from pprint import pprint

# Make the parent of the current working directory importable.
# sys.path entries must be plain strings: non-string entries such as Path
# objects are ignored by the import system, so the path is converted first.
project_root = str(Path.cwd().parent)
if project_root not in sys.path:  # avoid duplicate entries when the cell is re-run
    sys.path.append(project_root)

# Load environment variables (e.g. GOOGLE_API_KEY) from a .env file.
# NOTE(review): load_dotenv() does not consult sys.path; presumably it locates
# the .env file by searching upward from the working directory — confirm.
load_dotenv()

True

## 1. API Configuration/Exploration

We first verify that the API works by making a few small example requests to the YouTube Data API.

In [157]:
# YouTube channel handles (the name that follows "@") of the channels to scrape.
# NOTE(review): the notebook intro lists Reuters, but this list contains
# "briantylercohen" instead — confirm which set of sources is intended.
CHANNEL_HANDLES = [
    "msnbc", "cnn", "FoxNews", "briantylercohen",
    "DailyWirePlus", "ABCNews", "TuckerCarlson",
]

In [None]:
# Service coordinates and credentials for the API client.
API_SERVICE_NAME = "youtube"
API_VERSION = "v3"
API_KEY = os.environ.get("GOOGLE_API_KEY")  # None when the variable is not set

# Shared client object used by the request helpers in this notebook.
youtube = build(
    serviceName=API_SERVICE_NAME,
    version=API_VERSION,
    developerKey=API_KEY,
)

In [None]:
from functools import wraps

def api_status_handler(f: callable):
    """
    Decorator that turns the outcome of an API call into a status dict.

    On success the wrapped function's return value is stored under "data"
    alongside a "status_code" of 200. If the call raises HttpError, the error
    body is printed (plus a short hint for 400/403/404) and a dict holding only
    the HTTP "status_code" is returned.

    Args:
        f: The function performing the API request

    Returns:
        The wrapped function, returning a dict as described above
    """
    # Extra hint printed for specific HTTP status codes
    hints = {
        403: "Comments disabled or quota exceeded",
        404: "Video not found",
        400: "Invalid video ID",
    }

    @wraps(f)
    def wrapper(*args, **kwargs):
        try:
            data = f(*args, **kwargs)
        except HttpError as e:
            status_code = e.resp.status
            error_content = e.content.decode("utf-8")
            print(f"HTTP Error {status_code}: {error_content}")

            hint = hints.get(status_code)
            if hint is not None:
                print(hint)

            return {"status_code": status_code}

        return {"status_code": 200, "data": data}

    return wrapper

### 1.1 Retrieving Youtube Channel

In [124]:
@api_status_handler
def get_channel(handle: str):
    """
    Look up a channel by its handle through the channels().list endpoint.

    Requests the snippet, contentDetails and statistics parts and returns the
    raw API response (the decorator wraps it in a status dict).

    Args:
        handle: The channel handle to look up
    """
    return youtube.channels().list(
        part="snippet,contentDetails,statistics",
        forHandle=handle,
    ).execute()

response = get_channel(CHANNEL_HANDLES[0])
# Fall back to an empty dict when the lookup did not succeed
channel = response["data"] if response["status_code"] == 200 else {}
pprint(channel)

{'etag': 'p097-GXn5FKgAptEff-v5n7UpIc',
 'items': [{'contentDetails': {'relatedPlaylists': {'likes': '',
                                                    'uploads': 'UUaXkIU1QidjPwiAYu6GcHjg'}},
            'etag': 'updSCvQ6eNwbLQTPjXjgHL5sTRY',
            'id': 'UCaXkIU1QidjPwiAYu6GcHjg',
            'kind': 'youtube#channel',
            'snippet': {'customUrl': '@msnbc',
                        'description': 'The official MSNBC YouTube Channel. '
                                       'MSNBC is the premier destination for '
                                       'in-depth analysis of the news, '
                                       'insightful political commentary and '
                                       'diverse perspectives. \n',
                        'localized': {'description': 'The official MSNBC '
                                                     'YouTube Channel. MSNBC '
                                                     'is the premier '
                   

### 1.2 Retrieving Channel's Uploads

In [125]:
@api_status_handler
def get_channel_uploads(uploads_playlist_id: str):
    """
    Fetch one page of items from an uploads playlist via playlistItems().list.

    Args:
        uploads_playlist_id: The playlist ID, found under
            contentDetails.relatedPlaylists.uploads of a channel resource

    Returns the raw API response (the decorator wraps it in a status dict).
    """
    return youtube.playlistItems().list(
        part="contentDetails,id,snippet,status",
        playlistId=uploads_playlist_id,
    ).execute()

# The uploads playlist ID lives on the channel resource
related_playlists = channel["items"][0]["contentDetails"]["relatedPlaylists"]
playlist_id = related_playlists["uploads"]

response = get_channel_uploads(playlist_id)
# Fall back to an empty dict when the request did not succeed
uploads = response["data"] if response["status_code"] == 200 else {}

pprint(uploads)

{'etag': 'Jxz2j242d9zLrizL8YZrQ8_npRA',
 'items': [{'contentDetails': {'videoId': 'LHfr1qngP-c',
                               'videoPublishedAt': '2025-10-10T22:41:57Z'},
            'etag': 'GS-sSaB9WdaHZkl7NFpjT0bn738',
            'id': 'VVVhWGtJVTFRaWRqUHdpQVl1NkdjSGpnLkxIZnIxcW5nUC1j',
            'kind': 'youtube#playlistItem',
            'snippet': {'channelId': 'UCaXkIU1QidjPwiAYu6GcHjg',
                        'channelTitle': 'MSNBC',
                        'description': 'The Trump administration announced '
                                       'that "substantial" layoffs of federal '
                                       'workers "have begun," as the '
                                       'government shutdown continues. The '
                                       'move, according to a White House '
                                       'official, could impact "thousands." '
                                       'MSNBC White House correspondent Laura '
          

### 1.3 Paginating Through Channel Uploads

We can expand on the section above by paginating through additional pages of uploads.

In [126]:
# Walk through up to five pages of the uploads playlist by following the
# page token returned with each response.
next_page_token = ""
for _ in range(5):
    request = youtube.playlistItems().list(
        part="contentDetails,id,snippet,status",
        playlistId=playlist_id,
        pageToken=next_page_token
    )

    page_response = request.execute()
    # The final page of a playlist has no "nextPageToken" key, so use .get()
    # instead of indexing to avoid a KeyError on short playlists.
    next_page_token = page_response.get("nextPageToken")

    print(f"NextPageToken: {next_page_token}")

    # Stop early once there are no more pages to request
    if not next_page_token:
        break

NextPageToken: EAAaHlBUOkNBVWlFREUwUmpNeFJUVkZPVGcyTkVORk5VSQ
NextPageToken: EAAaHlBUOkNBb2lFRVF3UWpKRU9FSXdNVVZHTURNMk9FRQ
NextPageToken: EAAaHlBUOkNBOGlFRFpETlRBek5rWTRNMEkwUmpGRk9ERQ
NextPageToken: EAAaHlBUOkNCUWlFREkyTkRVM05ETTNOa0ZEUVVFd056Zw
NextPageToken: EAAaHlBUOkNCa2lFRUkxT1RkRU4wSkdNall3TVRRek1UTQ


### 1.4 Comment Section Retrieval

Now that we have access to the video lists, we can obtain the comments under specific videos

In [127]:
# Fetch the first page of the uploads playlist
request = youtube.playlistItems().list(
    part="contentDetails,id,snippet,status",
    playlistId=playlist_id,
)
video_list_response = request.execute()

# Pick a single item from the page (index 3) and read its video ID
selected_upload = video_list_response["items"][3]
video_id = selected_upload["contentDetails"]["videoId"]

In [128]:
video_id  # display the video ID selected in the previous cell

'5zroLgAcCmY'

In [129]:
@api_status_handler
def get_comment_thread(video_id: str):
    """
    Fetch one page of comment threads for a video via commentThreads().list.

    Requests the id, replies and snippet parts and returns the raw API
    response (the decorator wraps it in a status dict).

    Args:
        video_id: The ID of the video whose comment threads are requested
    """
    return youtube.commentThreads().list(
        part="id,replies,snippet",
        videoId=video_id,
    ).execute()

response = get_comment_thread(video_id)

# Fall back to an empty dict when the request did not succeed
comment_thread = response["data"] if response["status_code"] == 200 else {}
pprint(comment_thread)

{'etag': 'xiCyplm8EETPwKPGFL84VLqt4rg',
 'items': [{'etag': 'te_ZzwOCuXWLoNy_4V0tXPnECHs',
            'id': 'UgwMczLTEV4RCfursEV4AaABAg',
            'kind': 'youtube#commentThread',
            'snippet': {'canReply': True,
                        'channelId': 'UCaXkIU1QidjPwiAYu6GcHjg',
                        'isPublic': True,
                        'topLevelComment': {'etag': 'YT2ZX2_nRMHMEKCi94Gxhzho3PU',
                                            'id': 'UgwMczLTEV4RCfursEV4AaABAg',
                                            'kind': 'youtube#comment',
                                            'snippet': {'authorChannelId': {'value': 'UCAFB0tUYdwlKJ8p2CE1U_Ww'},
                                                        'authorChannelUrl': 'http://www.youtube.com/@deanbeavers7372',
                                                        'authorDisplayName': '@deanbeavers7372',
                                                        'authorProfileImageUrl': 'https://yt3.ggpht.com

### 1.5 Commenter Details Retrieval

We should get insights on the commenter's channel, such as
- account creation date
- account subscriber count (if applicable)
- account country origin

In [144]:
@api_status_handler
def get_commenter_details(account_id: str):
    """
    Fetch the channel resource of a commenter via channels().list.

    Requests the id, snippet and statistics parts for the given channel ID and
    returns the raw API response (the decorator wraps it in a status dict).

    Args:
        account_id: The channel ID of the commenter's account
    """
    return youtube.channels().list(
        part="id,snippet,statistics",
        id=account_id,
    ).execute()

## 2. Comment API Scraping

We will construct a more involved scraper, building on what we found above.

In [None]:
class CommentData(TypedDict):
    author_display_name: str
    author_channel_id: str
    video_channel_id: str
    video_id: str
    channel_id: str
    text: str
    like_count: int
    updated_at: str | datetime
    published_at: str | datetime
    is_reply: bool
    
    commenter_created_at: str | datetime
    commenter_sub_count: int
    commenter_video_count: int
    

def scraper():
    """
    Retrieves comments from the most recent videos of all channels in CHANNEL_HANDLES.

    For each channel, only the first page of its uploads playlist is processed
    (5 videos with the API's default page size). Channels whose lookup or
    uploads request fails, or whose handle matches no channel, are skipped.

    Returns:
        List[CommentData]: All comments retrieved across the channels
    """
    all_data = []
    for channel_handle in CHANNEL_HANDLES:
        print("="*80)
        print(f"\tRetrieving comments from: {channel_handle:^30}")
        print("="*80)

        # Get channel info
        channel_response = get_channel(channel_handle)
        if channel_response["status_code"] != 200:
            print(f"Unable to retrieve channel data; skipping...")
            continue

        # A successful response can still contain no channel (e.g. unknown
        # handle), in which case there is nothing to index into.
        channel_items = channel_response["data"].get("items", [])
        if not channel_items:
            print(f"No channel found for handle {channel_handle}; skipping...")
            continue

        # Retrieve uploads ID and get videos
        playlist_id = channel_items[0]["contentDetails"]["relatedPlaylists"]["uploads"]
        uploads_response = get_channel_uploads(playlist_id)     # Should retrieve 5 videos per page,
                                                                #   as per default pagination settings
        if uploads_response["status_code"] != 200:
            print(f"Unable to process {channel_handle} uploads playlist; skipping...")
            continue

        uploads = uploads_response["data"].get("items", [])
        all_data.extend(process_videos(uploads))

    return all_data
        

def process_videos(uploads: List[Dict[str, Any]]) -> List[CommentData]:
    """
    Helper function for parsing videos from a list of uploads from a channel.

    Videos whose comment thread cannot be retrieved are skipped.

    Args:
        uploads (List[Dict]): A list of video items, represented as a Dict

    Returns:
        List[CommentData]: A list of all retrieved comments
    """
    comments = []
    for video_entry in uploads:
        # Get an entry's video ID
        video_id = video_entry["contentDetails"]["videoId"]

        # Use video ID for comment thread
        comment_thread_response = get_comment_thread(video_id)
        if comment_thread_response["status_code"] != 200:
            print(f"Video [{video_id}] has comments disabled. Skipping...")
            continue

        # Tolerate a response without an "items" list (treated as no comments)
        comment_thread = comment_thread_response["data"].get("items", [])

        # Retrieve all comments from video's comment thread
        comments.extend(process_comment_thread(comment_thread))

    return comments


def process_comment_thread(comment_thread: List[Dict[str, Any]]) -> List[CommentData]:
    """
    Flattens one video's comment threads into a list of comment records.
    Each top-level comment is followed by the replies included with its thread.

    Args:
        comment_thread (List[Dict]): The comment thread items of a video

    Returns:
        List[CommentData]: The parsed top-level comments and replies, in order
    """
    collected = []
    for thread in comment_thread:
        thread_snippet = thread["snippet"]
        video_id = thread_snippet["videoId"]
        top_level = thread_snippet["topLevelComment"]

        # Top-level comment first
        collected.append(process_comment(top_level, video_id=video_id, is_reply=False))

        # Replies reference the ID of the top-level comment they belong to
        head_comment_id = top_level["id"]

        # "replies" is only looked up defensively; threads without it yield no replies
        for reply in thread.get("replies", {}).get("comments", []):
            collected.append(
                process_comment(reply, video_id=video_id, is_reply=True, head_comment_id=head_comment_id)
            )

    return collected

def process_comment(comment: Dict[str, Any], video_id: str, is_reply: bool, head_comment_id: Optional[str] = None) -> CommentData:
    """
    Parses a single comment from a comment thread and enriches it with
    details about the commenter's account.

    Args:
        comment: The Comment Resource (https://developers.google.com/youtube/v3/docs/comments#resource)
        video_id: The underlying video in which the comment exists
        is_reply: Whether or not the comment is a reply within a thread
        head_comment_id: For replies, the ID of the top-level comment of the thread

    Returns:
        CommentData: The aggregated data retrieved. The commenter_* fields are
            only present when the commenter's account could be retrieved.
    """
    snippet = comment["snippet"]

    author_display_name = snippet["authorDisplayName"]
    author_channel_id = snippet["authorChannelId"]["value"]
    updated_at = snippet["updatedAt"]
    published_at = snippet["publishedAt"]

    # Fields available regardless of whether the account lookup succeeds
    comment_data = CommentData(
        author_display_name = author_display_name,
        author_channel_id = author_channel_id,
        like_count = snippet["likeCount"],
        text = snippet["textOriginal"],
        video_id = video_id,
        video_channel_id = snippet["channelId"],
        updated_at = updated_at,
        published_at = published_at,
        # A comment counts as updated when its timestamps differ
        is_updated = updated_at != published_at,
        is_reply = is_reply,
        head_comment_id = head_comment_id,
    )

    account_details_response = get_commenter_details(account_id=author_channel_id)

    # A successful response may still contain no matching channel, so treat a
    # missing or empty "items" list the same way as a failed request.
    if account_details_response["status_code"] == 200:
        account_items = account_details_response["data"].get("items", [])
    else:
        account_items = []

    if not account_items:
        print(f"Unable to retrieve commenter account info for: {author_display_name}")
        return comment_data

    account_details = account_items[0]
    statistics = account_details["statistics"]
    is_hidden_sub_count = statistics["hiddenSubscriberCount"]

    comment_data["commenter_created_at"] = account_details["snippet"]["publishedAt"]
    comment_data["commenter_is_hidden_sub_count"] = is_hidden_sub_count
    # The subscriber count is only read when it is not hidden; counts are
    # normalised to int to match the CommentData schema.
    comment_data["commenter_sub_count"] = \
        int(statistics["subscriberCount"]) if not is_hidden_sub_count else 0
    comment_data["commenter_video_count"] = int(statistics["videoCount"])

    return comment_data
    
        
# Run the full scrape over every channel in CHANNEL_HANDLES.
# Each processed comment triggers its own get_commenter_details request.
scraper_results = scraper()

	Retrieving comments from:             msnbc             
	Retrieving comments from:              cnn              
	Retrieving comments from:            FoxNews            
	Retrieving comments from:            Reuters            
HTTP Error 403: {
  "error": {
    "code": 403,
    "message": "The video identified by the \u003ccode\u003e\u003ca href=\"/youtube/v3/docs/commentThreads/list#videoId\"\u003evideoId\u003c/a\u003e\u003c/code\u003e parameter has disabled comments.",
    "errors": [
      {
        "message": "The video identified by the \u003ccode\u003e\u003ca href=\"/youtube/v3/docs/commentThreads/list#videoId\"\u003evideoId\u003c/a\u003e\u003c/code\u003e parameter has disabled comments.",
        "domain": "youtube.commentThread",
        "reason": "commentsDisabled",
        "location": "videoId",
        "locationType": "parameter"
      }
    ]
  }
}

Comments disabled or quota exceeded
Video [og0hOz3indU] has comments disabled. Skipping...
HTTP Error 403: {
  "error": {

In [158]:
len(scraper_results)  # total number of comment records (top-level comments and replies)

604