In [None]:
import os

import googleapiclient.discovery
import googleapiclient.errors
import pandas as pd

# SECURITY: an API key committed to source (and echoed in request URLs inside
# error messages) should be treated as compromised -- rotate it and supply the
# replacement through the YOUTUBE_API_KEY environment variable. The literal is
# kept only as a backward-compatible fallback for existing runs.
API_KEY = os.environ.get('YOUTUBE_API_KEY', 'AIzaSyAE3LhQlmckC7boqdjcPjZmTfyrw9TS3ZA')
# Channel whose videos and comments are scraped.
CHANNEL_ID = 'UC0zYzxRCvUD5p1XWwE00Zzw'

def get_youtube_service():
    """Build a YouTube Data API v3 client authenticated with ``API_KEY``."""
    service = googleapiclient.discovery.build(
        'youtube',
        'v3',
        developerKey=API_KEY,
    )
    return service

def get_channel_videos(youtube, channel_id):
    """Return every video search result for a channel, following pagination.

    Args:
        youtube: API client exposing ``search().list`` / ``search().list_next``.
        channel_id: ID of the channel to search.

    Returns:
        A list with the ``items`` entries of every result page, in page order.
    """
    collected = []
    page_request = youtube.search().list(
        part='id',
        channelId=channel_id,
        maxResults=50,
        type='video'
    )

    while True:
        # list_next() hands back None once the last page has been consumed.
        if page_request is None:
            return collected

        page = page_request.execute()
        collected.extend(page['items'])
        page_request = youtube.search().list_next(page_request, page)

def get_video_metadata(youtube, video_id):
    """Fetch snippet, content details and statistics for one video.

    Args:
        youtube: API client exposing ``videos().list``.
        video_id: ID of the video to look up.

    Returns:
        A dict keyed by ID, TITLE, DESCRIPTION, PUBLISHED_AT, CATEGORY_ID,
        DURATION, CAPTION, LIKE_COUNT, COMMENT_COUNT and VIEW_COUNT, or
        ``None`` when the response has no item or lacks a required field.
        The three counters are ``None`` when the API omits them.
    """
    request = youtube.videos().list(
        part='snippet,contentDetails,statistics',
        id=video_id
    )
    response = request.execute()

    try:
        video = response['items'][0]
        snippet = video['snippet']
        details = video['contentDetails']
        # Individual counters are left out of ``statistics`` when the owner
        # disables comments or hides likes; a missing counter must not make
        # us throw away the rest of the video's metadata.
        stats = video.get('statistics', {})
        return {
            'ID': video['id'],
            'TITLE': snippet['title'],
            'DESCRIPTION': snippet['description'],
            'PUBLISHED_AT': snippet['publishedAt'],
            'CATEGORY_ID': snippet['categoryId'],
            'DURATION': details['duration'],
            'CAPTION': details['caption'],
            'LIKE_COUNT': stats.get('likeCount'),
            'COMMENT_COUNT': stats.get('commentCount'),
            'VIEW_COUNT': stats.get('viewCount'),
        }
    except (KeyError, IndexError, TypeError) as e:
        # Empty ``items`` (unknown/private video) or a malformed resource.
        print(f"Error: {e}")
        return None

def _comment_row(snippet, reply_count, video_id):
    """Flatten one comment ``snippet`` into the row layout written to CSV."""
    return {
        'REPLY_COUNT': reply_count,
        'AUTHOR': snippet['authorDisplayName'],
        'TEXT': snippet['textDisplay'],
        'LIKE_COUNT': snippet['likeCount'],
        'PUBLISHED_AT': snippet['publishedAt'],
        'VIDEO_ID': video_id,
    }


def get_video_comments(youtube, video_id):
    """Collect top-level comments and their embedded replies for a video.

    Pages through ``commentThreads().list`` until no ``nextPageToken`` is
    returned. Each top-level comment becomes one row, immediately followed by
    one row (with ``REPLY_COUNT`` 0) per reply embedded in the thread.

    Args:
        youtube: API client exposing ``commentThreads().list``.
        video_id: ID of the video whose comments are fetched.

    Returns:
        A list of row dicts. If a request fails (e.g. comments are disabled or
        the quota runs out mid-way) the error is printed and the rows gathered
        so far are returned -- an empty list when the first page fails.
    """
    comments = []
    next_page_token = None

    try:
        while True:
            page = youtube.commentThreads().list(
                part="snippet,replies",
                order="relevance",
                videoId=video_id,
                maxResults=100,
                pageToken=next_page_token,
            ).execute()

            for thread in page['items']:
                thread_snippet = thread['snippet']
                comments.append(_comment_row(
                    thread_snippet['topLevelComment']['snippet'],
                    thread_snippet['totalReplyCount'],
                    video_id,
                ))

                # NOTE(review): the API appears to embed only a subset of the
                # replies here, so REPLY_COUNT can exceed the reply rows that
                # follow -- confirm against the commentThreads documentation.
                for reply in thread.get('replies', {}).get('comments', []):
                    comments.append(_comment_row(reply['snippet'], 0, video_id))

            next_page_token = page.get("nextPageToken")
            if not next_page_token:
                break
    except Exception as e:
        # Best-effort boundary: report the failure but keep the pages already
        # downloaded instead of discarding them.
        print(f"Error fetching comments: {e}")

    return comments

def main():
    """Scrape the configured channel and write videos.csv and comments.csv.

    Looks up every video of ``CHANNEL_ID``, fetches each video's metadata and
    comments, and saves both collections as CSV files in the working directory.
    """
    youtube = get_youtube_service()

    video_ids = [
        hit['id']['videoId']
        for hit in get_channel_videos(youtube, CHANNEL_ID)
    ]

    all_videos_data, all_comments_data = [], []

    for i, video_id in enumerate(video_ids, 1):
        print(f"Processing video {i}/{len(video_ids)}: {video_id}")

        # Videos whose metadata could not be read are skipped, but their
        # comments are still requested.
        metadata = get_video_metadata(youtube, video_id)
        if metadata:
            all_videos_data.append(metadata)

        all_comments_data.extend(get_video_comments(youtube, video_id))

    pd.DataFrame(all_videos_data).to_csv('videos.csv', index=False)
    pd.DataFrame(all_comments_data).to_csv('comments.csv', index=False)

    print(f"\nDone! Saved {len(all_videos_data)} videos and {len(all_comments_data)} comments.")


if __name__ == '__main__':
    main()

Processing video 1/228: 3t5-YzRuduw
Processing video 2/228: GE1hRKeQQDE
Processing video 3/228: uW02XBOhsDk
Processing video 4/228: KeYHxf0Bnc8
Processing video 5/228: 3xcLLI9qU3U
Processing video 6/228: ioVUC5ZSnLk
Processing video 7/228: 7-O1ViYpXV8
Processing video 8/228: cNAAdEEYZF8
Processing video 9/228: C9HSMFW6jJY
Processing video 10/228: dczISLNK3FI
Processing video 11/228: biOL0_HVWek
Processing video 12/228: C4mZGuluzus
Processing video 13/228: VKCEUSlTFRI
Processing video 14/228: nfIF45yiLME
Processing video 15/228: w-Yu8XuBk8g
Processing video 16/228: ueCRrtz201k
Processing video 17/228: j056PyJcWZA
Processing video 18/228: Ll4NrKXEcL8
Processing video 19/228: BlL0UmRpEuo
Processing video 20/228: KrYCISP2A6o
Processing video 21/228: fRO2aQLYTDI
Processing video 22/228: dkTtv3UB9m4
Processing video 23/228: o5siMJVKSKo
Processing video 24/228: 0lhmEFmXoM0
Processing video 25/228: yBUgPaqU7ck
Processing video 26/228: p5UlQlp_ef0
Processing video 27/228: yigba9-gMfU
Processing



Processing video 218/228: Jt2i2CEzBSo
Error: 'commentCount'
Error fetching comments: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/commentThreads?part=snippet%2Creplies&order=relevance&videoId=Jt2i2CEzBSo&maxResults=100&key=AIzaSyAE3LhQlmckC7boqdjcPjZmTfyrw9TS3ZA&alt=json returned "The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.". Details: "[{'message': 'The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.', 'domain': 'youtube.commentThread', 'reason': 'commentsDisabled', 'location': 'videoId', 'locationType': 'parameter'}]">
Processing video 219/228: V8efNQoLa0o
Processing video 220/228: rx5rDMPSjnI
Processing video 221/228: n6qWskrXS4w
Processing video 222/228: kWrtyD5zkZk
Processing video 223/228: xIFbIKUxaHw
Processing video 224/228: Lm6z9DPOsuE
Processing video 225/228: 0nK8cs

In [None]:
import re
from html import unescape
import string
from textblob import TextBlob
import pandas as pd

def clean_text(text):
    """Normalise a comment for sentiment analysis.

    Removes HTML tags, decodes HTML entities, lower-cases, strips ASCII
    punctuation and collapses runs of whitespace.

    Args:
        text: Raw comment text (a ``str``), possibly containing HTML markup.

    Returns:
        The cleaned text with single spaces and no leading/trailing blanks.
    """
    # A <br> separates lines; replace it with a space *before* stripping the
    # remaining tags, otherwise the words on either side get fused together
    # ("hello<br>world" -> "helloworld").
    text = re.sub(r'<br\s*/?>', ' ', text, flags=re.IGNORECASE)
    text = re.sub(r'<.*?>', '', text)
    # Entities are decoded after tag removal so that an escaped "&lt;b&gt;"
    # is treated as text rather than as markup.
    text = unescape(text)
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    return re.sub(r'\s+', ' ', text).strip()

def iso8601_to_minutes(duration) -> int:
    """Convert an ISO 8601 duration such as ``PT1H2M30S`` to whole minutes.

    Args:
        duration: A duration string, or an ``int``/``float`` that is passed
            through as-is; anything else is converted with ``str()`` first.

    Returns:
        Total minutes, truncated. Values with no recognisable component
        (including NaN and ``None``) yield 0.
    """
    if isinstance(duration, (int, float)):
        # NaN (an empty CSV cell) cannot go through int(); treat it like any
        # other unparseable value.
        if duration != duration:
            return 0
        return int(duration)

    duration = str(duration)

    def _component(designator):
        # Value of the number directly preceding ``designator``, or 0.
        match = re.search(r'(\d+)' + designator, duration)
        return int(match.group(1)) if match else 0

    # Durations of a day or longer carry a day component ("P1DT2H3M4S"),
    # which must be counted as well.
    return (
        _component('D') * 24 * 60
        + _component('H') * 60
        + _component('M')
        + _component('S') // 60
    )

def get_sentiment_score(text):
    """Return the TextBlob sentiment polarity of ``text``.

    Anything that is not a ``str`` scores a neutral 0.0.
    """
    if isinstance(text, str):
        return TextBlob(text).sentiment.polarity
    return 0.0

def main():
    """Clean and score the comments, convert video durations, save in place.

    Both CSV files under /content are read, transformed and written back to
    the same paths.
    """
    comments_path = '/content/comments_preprocessed.csv'
    videos_path = '/content/videos_preprocessed.csv'

    comments_df = pd.read_csv(comments_path, encoding='utf-8')
    videos_df = pd.read_csv(videos_path, encoding='utf-8')

    # Rows with a missing value in any column are dropped before cleaning.
    comments_df.dropna(inplace=True)
    cleaned = comments_df['TEXT'].apply(clean_text)
    comments_df['TEXT'] = cleaned
    comments_df['SCORE'] = cleaned.apply(get_sentiment_score)

    videos_df['DURATION'] = videos_df['DURATION'].apply(iso8601_to_minutes)

    for frame, path in ((comments_df, comments_path), (videos_df, videos_path)):
        frame.to_csv(path, index=False, encoding='utf-8-sig')

    print(f"Processed {len(comments_df)} comments and {len(videos_df)} videos")


if __name__ == "__main__":
    main()

Processed 93109 comments and 226 videos
