In [None]:
"""
STEPS:
1. Get list of video IDs in playlist from calling YT Data API v3 (playlistItems list)
2. Get video details and metadata from video ID list (videos list)
    > snippet (tags, published date, title, description)
    > statistics (comment, like, dislike, view counts)
3. Get publishedAt, user ID, video ID for comments on each video (commentThreads list, plus comments list for replies)
4. Convert publishedAt to timestamp
5. Use boto3 to import to S3
6. Use Glue to create schema in avro format
7. Create model with AWS Personalize
"""

In [None]:
import os

import openpyxl
import pandas as pd
import requests
import xlsxwriter

# Read the YouTube Data API key from the YOUTUBE_API_KEY environment variable so
# the credential does not have to be stored in the notebook; the original
# placeholder is kept as the fallback value.
api_key = os.environ.get('YOUTUBE_API_KEY', '<API_KEY>')
# Playlist whose videos (and their comments) are collected below.
playlist_id = 'PLvahqwMqN4M2N01FfQy2wXkyVyucAL86b'

In [None]:
def get_vid_list(playlist_id, token=''):
    """Return the IDs of every video in a YouTube playlist.

    Pages through the YouTube Data API v3 ``playlistItems`` endpoint and
    follows ``nextPageToken`` until a response no longer carries one.

    Args:
        playlist_id: ID of the playlist to list.
        token: Page token to start from. The default ``''`` starts at the
            first page.

    Returns:
        A list of video ID strings in the order the API returned them.

    Raises:
        RuntimeError: If a response has no ``items`` key (for example an API
            error payload), instead of failing with an opaque ``KeyError``.
    """
    videos = []
    while token or token == '':
        # maxResults=50 is the documented maximum for playlistItems.list (the
        # default is 5), so the playlist is fetched in far fewer requests.
        api_call = 'https://youtube.googleapis.com/youtube/v3/playlistItems?part=contentDetails&maxResults=50&playlistId={}&key={}'.format(playlist_id, api_key)
        if token:
            api_call += '&pageToken={}'.format(token)
        result = requests.get(api_call).json()
        if 'items' not in result:
            raise RuntimeError('playlistItems request failed: {}'.format(result.get('error', result)))
        videos += [item['contentDetails']['videoId'] for item in result['items']]
        # A missing (or empty) nextPageToken marks the last page; False ends the loop.
        token = result.get('nextPageToken') or False
    return videos

# Collect the video IDs of the configured playlist (issues the API requests immediately).
videos = get_vid_list(playlist_id)


def get_vid_data(video_id):
    """Fetch selected snippet and statistics fields for one video.

    Calls the YouTube Data API v3 ``videos`` endpoint with
    ``part=snippet,statistics`` and keeps only the fields of interest.

    Args:
        video_id: ID of the video to look up.

    Returns:
        A dict with ``video_id`` plus whichever of ``publishedAt``, ``title``,
        ``description``, ``tags``, ``viewCount``, ``likeCount`` and
        ``commentCount`` are present in the response, or ``None`` when the
        response holds no usable item (no ``items``, an empty list, or a
        missing ``snippet``/``statistics`` part).
    """
    api_call = 'https://youtube.googleapis.com/youtube/v3/videos?part=snippet%2Cstatistics&id={}&key={}'.format(video_id, api_key)
    raw = requests.get(api_call).json()
    fields = ['publishedAt','title','description','tags','viewCount','likeCount','commentCount']
    try:
        item = raw['items'][0]
        # Statistics first, then snippet, so snippet wins on a key clash.
        combined = {**item['statistics'], **item['snippet']}
    except (KeyError, IndexError, TypeError):
        # Unavailable video or error payload: the caller filters out None.
        return None
    result = {'video_id': video_id}
    result.update({key: value for key, value in combined.items() if key in fields})
    return result

# Look up metadata for every playlist video, drop the lookups that did not
# return a dict, and persist the table keyed by video ID.
video_details = list(map(get_vid_data, videos))
usable_details = [detail for detail in video_details if type(detail) == dict]
videos_df = pd.DataFrame(usable_details)
videos_df.set_index('video_id').to_csv('videos_df.csv')

In [None]:
def get_sub_comments(parentId, video_id='', token=''):
    """Return every reply to one top-level comment.

    Pages through the YouTube Data API v3 ``comments`` endpoint filtered by
    ``parentId`` and follows ``nextPageToken`` until the last page.

    Args:
        parentId: ID of the top-level comment whose replies are fetched.
        video_id: Video ID stored on each record under ``videoId``; a
            ``videoId`` inside the reply's own snippet overrides it.
        token: Page token to start from. The default ``''`` starts at the
            first page.

    Returns:
        A list of dicts, one per reply: ``id`` and ``videoId`` merged with all
        fields of the reply's ``snippet``.

    Raises:
        RuntimeError: If a response has no ``items`` key (for example an API
            error payload), instead of failing with an opaque ``KeyError``.
    """
    sub_comments = []
    while token or token == '':
        # maxResults=100 is the documented maximum for comments.list (default 20).
        api_call = 'https://youtube.googleapis.com/youtube/v3/comments?part=snippet&maxResults=100&parentId={}&key={}'.format(parentId, api_key)
        if token:
            api_call += '&pageToken={}'.format(token)
        result = requests.get(api_call).json()
        if 'items' not in result:
            raise RuntimeError('comments request failed: {}'.format(result.get('error', result)))
        for item in result['items']:
            record = {"id": item['id'], "videoId": video_id}
            record.update(item['snippet'])
            sub_comments.append(record)
        # A missing (or empty) nextPageToken marks the last page; False ends the loop.
        token = result.get('nextPageToken') or False
    return sub_comments

def get_all_comments(video_id, token=''):
    """Return all comments on a video (top-level and replies) as a DataFrame.

    Pages through the YouTube Data API v3 ``commentThreads`` endpoint. For each
    thread with ``totalReplyCount > 0`` the replies are fetched through
    ``get_sub_comments`` and placed directly after their top-level comment.

    Args:
        video_id: ID of the video whose comments are fetched.
        token: Page token to start from. The default ``''`` starts at the
            first page.

    Returns:
        A DataFrame with the columns ``videoId``, ``id``, ``parentId``,
        ``textOriginal``, ``authorId``, ``authorDisplayName``, ``publishedAt``
        and ``likeCount``. Columns absent from the fetched data (``parentId``
        when no reply was fetched, or everything when the video has no
        comments) are still present and filled with missing values.

    Raises:
        RuntimeError: If a response has no ``items`` key (for example an API
            error payload), instead of failing with an opaque ``KeyError``.
    """
    columns = ['videoId','id','parentId','textOriginal','authorId','authorDisplayName','publishedAt','likeCount']
    comments = []
    while token or token == '':
        # maxResults=100 is the documented maximum for commentThreads.list (default 20).
        api_call = 'https://youtube.googleapis.com/youtube/v3/commentThreads?part=snippet&maxResults=100&videoId={}&key={}'.format(video_id, api_key)
        if token:
            api_call += '&pageToken={}'.format(token)
        result = requests.get(api_call).json()
        if 'items' not in result:
            raise RuntimeError('commentThreads request failed: {}'.format(result.get('error', result)))
        for item in result['items']:
            # Copy the snippet so the API response itself is not mutated.
            record = dict(item['snippet']['topLevelComment']['snippet'])
            record['id'] = item['id']
            comments.append(record)
            if item['snippet']['totalReplyCount'] > 0:
                comments += get_sub_comments(item['id'], video_id)
        # A missing (or empty) nextPageToken marks the last page; False ends the loop.
        token = result.get('nextPageToken') or False

    # build pandas dataframe
    comments_df = pd.DataFrame(comments)
    if 'authorChannelId' in comments_df.columns:
        # authorChannelId is a {'value': ...} dict when present; anything else becomes ''.
        comments_df['authorId'] = comments_df['authorChannelId'].apply(lambda x: x['value'] if isinstance(x, dict) else '')
    else:
        comments_df['authorId'] = ''
    # reindex (rather than [[...]]) tolerates missing columns: top-level
    # snippets have no parentId, and a video without comments has no columns.
    return comments_df.reindex(columns=columns)

In [None]:
# Resume from the previously saved comments when the file exists; otherwise
# start from an empty table that already has the expected columns, so the
# fetch loop can filter on 'videoId' on a first run.
try:
    all_comments = pd.read_excel('all_comments_df.xlsx')
except FileNotFoundError:
    all_comments = pd.DataFrame(columns=['videoId','id','parentId','textOriginal','authorId','authorDisplayName','publishedAt','likeCount'])
videos_df = pd.read_csv('videos_df.csv')

# IDs of videos whose comments could not be fetched.
errors=[]

In [None]:
# Fetch comments for one batch of videos (rows 100-104 by publish date).
for v in videos_df.sort_values('publishedAt')[100:105]['video_id']:
    # Drop rows already stored for this video so a re-run does not duplicate them.
    all_comments = all_comments[all_comments['videoId']!=v]
    print(v)
    try:
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
        # with the old bare except that AttributeError was silently swallowed
        # and every video ended up in `errors`.
        all_comments = pd.concat([all_comments, get_all_comments(v)])
    except Exception:
        # Record the failed video and carry on with the next one.
        errors.append(v)

all_comments

c078AVNTjM4
nz7F2K4-waw
OjljgkCQv5c
tPsLcrVlwt4
aAZac21Y9D8


Unnamed: 0,id,videoId,parentId,textOriginal,authorId,authorDisplayName,publishedAt,likeCount
0,UgzmjkZjEVFcarVg9wR4AaABAg,BroESGFxWM4,,So cringey.,UC39z85yXGRZ6MGiwpW0qn6w,Mat G,2021-11-12T09:39:56Z,1
1,Ugx__BHUFiS59gREuel4AaABAg,BroESGFxWM4,,He's amazing! 😂😂😂,UC0IIqpv4hp_yjbOpMLw2NAw,Mj.,2021-04-01T13:19:59Z,1
2,UgxK-iLVzB-M7R3fylZ4AaABAg,BroESGFxWM4,,If only MLKJ knew his words would be used like...,UCRiVFpllPbQquw28P33Qe5g,Rachel Chen,2021-02-10T21:05:06Z,0
3,UgwuFIGeaVa2cqf2UX54AaABAg,BroESGFxWM4,,DO NOT LIKE THIS COMMENT.,UC6RbjI1f8owJl78DLK4w2_w,Francis Dourado,2020-12-22T11:38:53Z,0
4,Ugy77deTOW-4fMiNPn94AaABAg,BroESGFxWM4,,This guy is proof that anyone with a microphon...,UCsWzFTspp19uPTdydMWOYqQ,Guang,2020-09-25T05:53:11Z,0
...,...,...,...,...,...,...,...,...
4593,UgxXkHIz3u0DRcIGrc54AaABAg,aCv29JKmHNY,,Bill!,UCggCMFFovyEpAJYjbL3ApTg,Adam Henry,2019-08-29T15:04:31Z,1
4594,UgwUIJv_3pHGBcLaV1h4AaABAg,aCv29JKmHNY,,cool,UC3R7rD_g5Dzs-Ga_FQ-M9EQ,Sovath Zebra,2019-08-29T15:03:43Z,0
4595,UgwWgo7kzWs9OCrI3Vt4AaABAg,aCv29JKmHNY,,*if this is blue you watch Netflix*\n👇🏻(I'm gi...,UC6rpfofXt4duimS0ame8NSQ,Subscribe to me For no reason,2019-08-29T15:03:36Z,7
4596,Ugwx3ObNwTHndXaWsMl4AaABAg,aCv29JKmHNY,,Фу виндавс гавна,UCf_-sYO_dj1JSbfX0qn_E8w,ПОНЯШКИТОН,2019-08-29T15:03:07Z,0


In [None]:
# DEBUGGING

In [None]:
# video_id='NfpXeLVzJIw'
# token=''
# comments = []
# while token or token=='':
#     api_call = 'https://youtube.googleapis.com/youtube/v3/commentThreads?part=snippet&videoId={}&key={}'.format(video_id, api_key)
#     if token:
#         api_call+= '&pageToken={}'.format(token) 
#     result = requests.get(api_call).json()
# #         return result
#     for item in result['items']:
#         record = item['snippet']['topLevelComment']['snippet']
#         record['id'] = item['id']
#         comments += [record]
#         if item['snippet']['totalReplyCount']>0:
#             comments+=get_sub_comments(item['id'], video_id)
#     try:
#         token= result['nextPageToken']
#     except:
#         token = False
# #     return comments

# # build pandas dataframe
# comments_df = pd.DataFrame(comments)
# comments_df['authorId'] = comments_df['authorChannelId'].apply(lambda x: x['value'] if type(x)==dict else '')
# comments_df = comments_df[['videoId','id','parentId','textOriginal','authorId','authorDisplayName','publishedAt','likeCount']]
# comments_df

In [None]:
# Inspect the raw commentThreads response for a single video.
# video_id was otherwise only assigned inside commented-out code, so this cell
# raised NameError on a fresh kernel; the ID below is the one visible in the
# recorded output of this cell.
video_id = 'oYJGWcZ4lVQ'
api_call = 'https://youtube.googleapis.com/youtube/v3/commentThreads?part=snippet&videoId={}&key={}'.format(video_id, api_key)
result = requests.get(api_call).json()
result

{'kind': 'youtube#commentThreadListResponse',
 'etag': 'F-rf9-lIPSS0zH-u0UYJA9nJNs4',
 'nextPageToken': 'QURTSl9pMmNEeHZIWW42THBmWGRhM1RqWmVsaG5xQTFuV3RsNVhpeG1WVTNyMzVsYUFtX3lNWUpIYnVUbkFheHRmcnBpT0ZiNEJZLURPSQ==',
 'pageInfo': {'totalResults': 20, 'resultsPerPage': 20},
 'items': [{'kind': 'youtube#commentThread',
   'etag': 'ASUSCdZDbOlyxjAp6_z5FWnUiNg',
   'id': 'Ugzj-DulT4Hjk4ePrtJ4AaABAg',
   'snippet': {'videoId': 'oYJGWcZ4lVQ',
    'topLevelComment': {'kind': 'youtube#comment',
     'etag': 'oJrrhqARtDMto7zXDPslXRvxbFY',
     'id': 'Ugzj-DulT4Hjk4ePrtJ4AaABAg',
     'snippet': {'videoId': 'oYJGWcZ4lVQ',
      'textDisplay': 'You wondered why people were dressed in suits when you were at Clarke Quay. Both Clarke &amp; Boat Quay are adjacent to the Central Business District where all the financial institutions, local &amp; international, are headquartered.',
      'textOriginal': 'You wondered why people were dressed in suits when you were at Clarke Quay. Both Clarke & Boat Quay 