### **Step 1: Import packages**

In [1]:
import pandas as pd
import requests
import time
import html
import datetime

### **Step 2: Get API key and channel id info**

In [2]:
# API key
api_key = '*****' # concealed for privacy reasons
# channel id for 'Food Wishes'
channel_id = 'UCRIZtPl9nb9RiXc9btSTQNw'

### **Step 3: Define functions to acquire stats for videos on the channel**

##### **Step 3.1: Define the function to acquire stats for a particular video**

- YouTube videos are identified by video ids.

- We need to provide the video id to see those stats.

- Youtube provides access to view counts, like counts, and comment counts via the API.

- The access to dislike counts has been discontinued.

In [3]:
def get_video_stats(video_id):
    video_stats_url = 'https://www.googleapis.com/youtube/v3/videos?id='+video_id+\
                    '&part=statistics&key='+api_key
    response_video_stats = requests.get(video_stats_url).json()
    view_count = response_video_stats['items'][0]['statistics']['viewCount']
    like_count = response_video_stats['items'][0]['statistics']['likeCount']
    comment_count = response_video_stats['items'][0]['statistics']['commentCount']
    return view_count, like_count, comment_count

##### **Step 3.2: Define the function to consolidate the stats of all videos into a dataframe**

1. The dataframe will record video ids, video title, upload dates, and the three counts.

1. The dataframe will collect all stats in the period of start year (1 Jan) -> end year (31 Dec).

1. A start year is always needed. Unless the end year is specifically given, its default is set to the current year.

1. API queries will be made in half-yearly batches to avoid Google's stipulation of maximum 500-entry limit per query.

In [4]:
def get_channel_data(start_year, end_year=datetime.datetime.now().year):
    # create an empty dataframe to store video info, provide column names
    df = pd.DataFrame(columns=['video_id', 'video_title', 'upload_date',\
                               'view_count', 'like_count', 'comment_count'])
    # create a list for time periods
    dates = list()
    for date in range(start_year, end_year+1):
        dates.append(f'{date}-01-01,{date}-12-31')
    # as Google imposes a 500-entry limit on one single API query,
    # we need to go through smaller time periods one at a time
    for date in dates:
        after, before = date.split(',')
        after_date = after+'T00:00:00Z'
        before_date = before+'T00:00:00Z'
        # if this is the first time running requests  
        # define first-time url
        url_initial = 'https://www.googleapis.com/youtube/v3/search?key='+api_key+\
        '&channelId='+channel_id+\
        '&part=snippet,id&Order=date&maxResults=50&publishedBefore='+before_date+'&publishedAfter='+after_date
        # retrieve responses as JSON
        response = requests.get(url_initial).json()
        # iterate through responses to fetch video info
        time.sleep(1)
        for item in response['items']:
            if item['id']['kind'] == 'youtube#video':
                video_id = item['id']['videoId']
                video_title = html.unescape(item['snippet']['title']) # covert escaped chars back to original forms
                upload_date = item['snippet']['publishTime'].split('T')[0] # extract only the            
                # get stats
                view_count, like_count, comment_count = get_video_stats(video_id)
                # append the info to the dataframe
                df = df.append(
                    dict(
                        video_id = video_id, 
                        video_title = video_title,
                        upload_date = upload_date,
                        view_count = view_count,
                        like_count = like_count,
                        comment_count = comment_count,
                    ), ignore_index=True
                )
        # when requests have been done once 
        # firstly check if the period has less than 50 entries
        # if less than 50 entries, the 'nextPageToken' field won't appear in response
        try: 
            response['nextPageToken']
        except KeyError:
            # print 'less than 50 entries' message
            print(f'{after} -> {before}: Less than 50 entries in this period. All entries collected.')
        else:
            # get page tokens
            page_token = response['nextPageToken']
            # traverse all pages until page tokens are no longer available in responses
            try:
                # when page tokens are showing
                while page_token != '':
                    url = 'https://www.googleapis.com/youtube/v3/search?key='+api_key+\
                    '&channelId='+channel_id+\
                    '&part=snippet,id&Order=date&maxResults=50&publishedBefore='+before_date+'&publishedAfter='+after_date+\
                    '&pageToken='+page_token
                    new_response = requests.get(url).json()
                    page_token = new_response['nextPageToken']
                    time.sleep(1)       
                    # iterate through responses to fetch video info
                    for item in new_response['items']:
                        if item['id']['kind'] == 'youtube#video':
                            video_id = item['id']['videoId']
                            video_title = html.unescape(item['snippet']['title']) # covert escaped chars back to original forms
                            upload_date = item['snippet']['publishTime'].split('T')[0] # extract only the 
                            # get stats
                            view_count, like_count, comment_count = get_video_stats(video_id)
                            # append the info to the dataframe
                            df = df.append(
                                dict(
                                    video_id = video_id, 
                                    video_title = video_title,
                                    upload_date = upload_date,
                                    view_count = view_count,
                                    like_count = like_count,
                                    comment_count = comment_count,
                                ), ignore_index=True
                            )
            # when page tokens are not available
            except KeyError:
                # remove page token variable
                del response
                del page_token
                # if limit reached, print limit warning message
                if df.shape[0] == 500:
                    print('The 500-entry limit impose by Google is reached. Reduce the collection size.')
                # if traversal finished, print completion message
                else:
                    print(f'{after} -> {before}: All entries collected for this period.')
    return df

### **Step 4: Make API queries and collect data**

In [5]:
food_wishes_from2012 = get_channel_data(start_year=2012)

2012-01-01 -> 2012-12-31: All entries collected for this period.
2013-01-01 -> 2013-12-31: All entries collected for this period.
2014-01-01 -> 2014-12-31: All entries collected for this period.
2015-01-01 -> 2015-12-31: All entries collected for this period.
2016-01-01 -> 2016-12-31: All entries collected for this period.
2017-01-01 -> 2017-12-31: All entries collected for this period.
2018-01-01 -> 2018-12-31: All entries collected for this period.
2019-01-01 -> 2019-12-31: All entries collected for this period.
2020-01-01 -> 2020-12-31: All entries collected for this period.
2021-01-01 -> 2021-12-31: All entries collected for this period.
2022-01-01 -> 2022-12-31: Less than 50 entries in this period. All entries collected.


In [6]:
# see what the dataframe looks like
food_wishes_from2012.head(10)

Unnamed: 0,video_id,video_title,upload_date,view_count,like_count,comment_count
0,CQPLo8hECWg,Twice Baked Potatoes -- How to Make Fancy Stuf...,2012-12-15,4194422,47208,1950
1,TsrTU3CJn2c,Irish Shepherd's Pie - Classic Shepherd Pie fo...,2012-03-05,2730601,44227,2843
2,wRtGM3f-UBc,How to Flip Food in a Pan Like a Chef!,2012-10-04,2764300,17836,1383
3,QGAJokcwBXI,Garlic Shrimp Recipe - Quick & Easy Garlic Shrimp,2012-02-17,5094363,55387,1904
4,ME9CM0zqubg,Rosemary & Honey Pull-Apart Dinner Rolls - Hol...,2012-12-11,606245,10940,599
5,pYhiIrlXY7I,Hash Browns - Hash Browned Potato Recipe - Cla...,2012-03-01,2198124,34812,2023
6,fP45d9xxNt0,Banana Bread Recipe - Chocolate Banana Nut Loaf,2012-01-06,881425,16767,1032
7,S4z2gmtUzHE,Drunken Mussels Recipe - Mussels Steamed in a ...,2012-05-21,2224430,27334,1303
8,rYm2hHaN2i0,Roasted Chicken Broth Recipe - Part 1 of How t...,2012-02-22,389670,4512,364
9,dKtySwuIZec,Chicken & Dumplings - Stewed Chicken with Thym...,2012-05-29,976791,21267,1025


In [7]:
# see how many entries have been collected
food_wishes_from2012.shape

(794, 6)

In [8]:
# check the datatypes of all columns
food_wishes_from2012.dtypes

video_id         object
video_title      object
upload_date      object
view_count       object
like_count       object
comment_count    object
dtype: object

In [9]:
# save the dataframe to a csv file
food_wishes_from2012.to_csv('food_wishes_from2007.csv')

### **Step 5: Conclusion**

1. **We have collected 794 entries in the dataframe.**

2. **The data are up-to-date as of Mar 12, 2022.**

1. **The informaton collected was all stats since 2012 (the most recent 10 years).**

1. **The dataframe is exported to a csv file for later processing.**