In [3]:
import os
import pandas as pd
from googleapiclient.discovery import build
import regex

# API key for the YouTube Data API, read from the GOOGLE_API_KEY environment
# variable. os.getenv returns None when the variable is unset, hence the
# optional annotation.
API_KEY: str | None = os.getenv('GOOGLE_API_KEY')
# Shared 'youtube' v3 API client; the lookup/fetch functions in this file
# issue their requests through it.
service = build('youtube', 'v3', developerKey=API_KEY)


def is_valid_sentence(text: str) -> bool:
    """Return True when ``text`` matches the accepted-sentence pattern.

    The pattern, anchored at both ends, allows in order: an optional run of
    ``\\p{So}`` (Unicode "Symbol, other") characters, optional whitespace,
    one character outside that category, any run of letters, digits,
    whitespace and the punctuation ``.,!?"'():;-``, then one more character
    outside ``\\p{So}`` followed by optional whitespace.

    Args:
        text: The string to check.

    Returns:
        True if the whole string matches the pattern, otherwise False.
    """
    # Uses the third-party ``regex`` module because the stdlib ``re`` does
    # not support ``\p{...}`` Unicode property classes.
    matched = regex.match(
        r'^\p{So}*\s*\P{So}\s*[\p{L}\d\s.,!?"\'():;-]*\s*\P{So}\s*$',
        text,
    )
    return matched is not None


def snippet_to_dict(snippet: dict, performer: str) -> dict:
    """Flatten a comment snippet into one row for the comments table.

    Args:
        snippet: Comment snippet mapping; must contain the keys
            ``videoId``, ``authorDisplayName`` and ``textOriginal``.
        performer: Channel name to store alongside the comment.

    Returns:
        A dict with the keys ``ChannelName``, ``VideoId``, ``Author`` and
        ``Comment``, in that order.

    Raises:
        KeyError: If one of the required snippet keys is missing.
    """
    # (output column, snippet key) pairs, in output order.
    column_sources = (
        ('VideoId', 'videoId'),
        ('Author', 'authorDisplayName'),
        ('Comment', 'textOriginal'),
    )
    row = {'ChannelName': performer}
    for column, source_key in column_sources:
        row[column] = snippet[source_key]
    return row


def get_channel_id_by_name(channel_name: str) -> tuple | None:
    """Look up a channel by name and return its ID and title.

    Issues a single channel-type search through the module-level
    ``service`` client and uses only the first result.

    Args:
        channel_name: Text used as the search query.

    Returns:
        A ``(channel_id, channel_title)`` tuple for the first search hit,
        or None when the search returns no items.
    """
    request = service.search().list(
        q=channel_name,
        part='snippet',
        type='channel',
        maxResults=1
    )
    matches = request.execute()['items']
    if not matches:
        return None

    top_snippet = matches[0]['snippet']
    return top_snippet['channelId'], top_snippet['title']


def get_channels_id() -> dict:
    """Interactively collect channels and resolve them to channel IDs.

    Prompts for how many channels to look up, then for each channel's
    nickname, and resolves every nickname with ``get_channel_id_by_name``.
    Nicknames that yield no search result are reported and skipped.

    Returns:
        A dict mapping channel ID to channel title for every nickname that
        was resolved. Empty if nothing was resolved.

    Raises:
        ValueError: If the entered channel count is not an integer.
    """
    channels = {}
    numbers_of_channels = int(input("Enter number of channels: "))
    for _ in range(numbers_of_channels):
        nickname: str = input("Enter channel nickname: ex'@nickname'")
        found = get_channel_id_by_name(nickname)
        if found is None:
            # The lookup returns None (not a tuple) when nothing matches;
            # check before unpacking, otherwise unpacking raises TypeError.
            print(f"No channel found for {nickname!r}; skipping.")
            continue
        channel_id, channel_title = found
        channels[channel_id] = channel_title
    return channels


def get_comments() -> pd.DataFrame:
    """Interactively fetch comments for user-chosen channels.

    Asks for the channels (via ``get_channels_id``) and for a page count,
    then requests up to that many pages of comment threads per channel
    through the module-level ``service`` client. Each top-level comment and
    every reply included in the thread response becomes one row.

    Returns:
        A DataFrame built from the rows produced by ``snippet_to_dict``
        (``ChannelName``, ``VideoId``, ``Author``, ``Comment``).

    Raises:
        ValueError: If the entered page count is not an integer.
    """
    rows: list[dict] = []
    channels = get_channels_id()
    pages = int(input("Input pages numbers: "))

    for channel_id, performer in channels.items():
        # Fresh request arguments per channel so that a page token from one
        # channel is never carried over to the next.
        request_args = {
            'allThreadsRelatedToChannelId': channel_id,
            'part': 'id, snippet, replies',
            'maxResults': 20
        }
        for _ in range(pages):
            response = service.commentThreads().list(**request_args).execute()

            for thread in response['items']:
                top_snippet = thread['snippet']['topLevelComment']['snippet']
                rows.append(snippet_to_dict(top_snippet, performer))
                # 'replies' is only present on some threads.
                if 'replies' in thread:
                    rows.extend(
                        snippet_to_dict(reply['snippet'], performer)
                        for reply in thread['replies']['comments']
                    )

            next_token = response.get('nextPageToken')
            request_args['pageToken'] = next_token
            if not next_token:
                # No further pages for this channel.
                break

    return pd.DataFrame(rows)

In [4]:
# Run the interactive collection (prompts for channel count, nicknames and
# page count) and keep the resulting DataFrame for the cells below.
comments = get_comments()

In [9]:
# Preview the first rows of the collected comments.
comments.head()

Unnamed: 0,ChannelName,VideoId,Author,Comment
0,Taylor Swift,9Wx2It8SbLs,@enzaiandolo9652,Benjamin is so cuteeee
1,Taylor Swift,zz9YgUoisOA,@MaddieWilliams-gb8qo,I love your videos and songs TAYLOR
2,Taylor Swift,ilkvTXfv2TY,@LynnJeremy-xy2hf,Wow taylor
3,Taylor Swift,pQq9eP5OFhw,@lamabambam,"Stop, your losing me to sea taffy! Yeah, you'r..."
4,Taylor Swift,Mi-FBBqUuJE,@Agggtm22_11,Am I the only one who wants the first fans to ...


In [12]:
# Column summary; memory_usage='deep' makes pandas measure the actual size
# of the object (string) columns instead of estimating it.
comments.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 421 entries, 0 to 420
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   ChannelName  421 non-null    object
 1   VideoId      421 non-null    object
 2   Author       421 non-null    object
 3   Comment      421 non-null    object
dtypes: object(4)
memory usage: 183.8 KB
