In [5]:
import os

import google_auth_oauthlib.flow
import googleapiclient.discovery
import googleapiclient.errors

from googleapiclient.discovery import build
from dateutil import parser
import pandas as pd
from IPython.display import JSON

# Data viz packages
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

# NLP
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')
from wordcloud import WordCloud

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mixel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mixel\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
api_key = 'AIzaSyAM4iLMk96aP2EL5YVDHKh6kl1wGaJWJR8'
channel_ids =['UCo9ZZ04kIhN_8xGxvnjaduQ']
#channel_id = 'UCLeBjrmfpAFGqDRJNL1PF5g' // missmangobutt
playlist_id = "UUo9ZZ04kIhN_8xGxvnjaduQ"

In [7]:
api_service_name = "youtube"
api_version = "v3"

# Get credentials and create an API client

youtube = googleapiclient.discovery.build(
    api_service_name, api_version, developerKey=api_key)


In [8]:
def get_channel_stats(youtube, channel_ids):
    
    """
    Get channel stats
    
    Params:
    ------
    youtube: build object of Youtube API
    channel_ids: list of channel IDs
    
    Returns:
    ------
    dataframe with all channel stats for each channel ID
    
    """
    
    all_data = []
    
    request = youtube.channels().list(
        part="snippet,contentDetails,statistics",
        id=','.join(channel_ids)
    )
    response = request.execute()

    # loop through items
    for item in response['items']:
        data = {'channelName': item['snippet']['title'],
                'subscribers': item['statistics']['subscriberCount'],
                'views': item['statistics']['viewCount'],
                'totalVideos': item['statistics']['videoCount'],
                'playlistId': item['contentDetails']['relatedPlaylists']['uploads']
        }
        
        all_data.append(data)
        
    return(pd.DataFrame(all_data))

def get_video_ids(youtube, playlist_id):
    
    video_ids = []
    
    request = youtube.playlistItems().list(
        part="snippet,contentDetails",
        playlistId=playlist_id,
        maxResults = 50
    )
    response = request.execute()
    
    for item in response['items']:
        video_ids.append(item['contentDetails']['videoId'])
        
    next_page_token = response.get('nextPageToken')
    while next_page_token is not None:
        request = youtube.playlistItems().list(
                    part='contentDetails',
                    playlistId = playlist_id,
                    maxResults = 50,
                    pageToken = next_page_token)
        response = request.execute()

        for item in response['items']:
            video_ids.append(item['contentDetails']['videoId'])

        next_page_token = response.get('nextPageToken')
        
    return video_ids
    
    
def get_video_details(youtube, video_ids):

    all_video_info = []
    
    for i in range(0, len(video_ids), 50):
        request = youtube.videos().list(
            part="snippet,contentDetails,statistics",
            id=','.join(video_ids[i:i+50])
        )
        response = request.execute() 

        for video in response['items']:
            stats_to_keep = {'snippet': ['channelTitle', 'title', 'description', 'tags', 'publishedAt'],
                             'statistics': ['viewCount', 'likeCount', 'favouriteCount', 'commentCount'],
                             'contentDetails': ['duration', 'definition', 'caption']
                            }
            video_info = {}
            video_info['video_id'] = video['id']

            for k in stats_to_keep.keys():
                for v in stats_to_keep[k]:
                    try:
                        video_info[v] = video[k][v]
                    except:
                        video_info[v] = None

            all_video_info.append(video_info)
    
    return pd.DataFrame(all_video_info)

def get_comments_in_videos(youtube, video_ids):
    """
    Get top level comments as text from all videos with given IDs (only the first 10 comments due to quote limit of Youtube API)
    Params:
    
    youtube: the build object from googleapiclient.discovery
    video_ids: list of video IDs
    
    Returns:
    Dataframe with video IDs and associated top level comment in text.
    
    """
    all_comments = []
    
    for video_id in video_ids:
        try:   
            request = youtube.commentThreads().list(
                part="snippet,replies",
                videoId=video_id
            )
            response = request.execute()
        
            comments_in_video = [comment['snippet']['topLevelComment']['snippet']['textOriginal'] for comment in response['items'][0:10]]
            comments_in_video_info = {'video_id': video_id, 'comments': comments_in_video}

            all_comments.append(comments_in_video_info)
            
        except: 
            # When error occurs - most likely because comments are disabled on a video
            print('Could not get comments for video ' + video_id)
        
    return pd.DataFrame(all_comments)     

In [9]:
#get channel status
channel_stats = get_channel_stats(youtube, channel_ids)
channel_stats

Unnamed: 0,channelName,subscribers,views,totalVideos,playlistId
0,Stephanie Soo,2880000,642329404,1019,UUo9ZZ04kIhN_8xGxvnjaduQ


In [10]:
#get youtube playlist
video_ids = get_video_ids(youtube, playlist_id)
len(video_ids)

1020

In [11]:
# get video details
video_df = get_video_details(youtube, video_ids)
video_df

Unnamed: 0,video_id,channelTitle,title,description,tags,publishedAt,viewCount,likeCount,favouriteCount,commentCount,duration,definition,caption
0,N_dQ_6CroaM,Stephanie Soo,"7 ""Friends"" Went To Cabo Only 6 Came Back Aliv...","If you’re ever injured in an accident, you can...","[Stephanie Soo, MissMangoButt, Stephanie Soo S...",2023-02-28T00:10:57Z,200671,12456,,655,PT51M11S,hd,false
1,99Rxr-LRdqw,Stephanie Soo,"The ""1 Mom 2 Sons"" That SCAMMED All Of SOUTH K...",CASETiFY's Bounce Cases are available at http:...,"[Stephanie Soo, MissMangoButt, Stephanie Soo S...",2023-02-25T00:13:39Z,294704,12463,,545,PT58M30S,hd,false
2,t_9sPQsJ7nk,Stephanie Soo,Did you know this about South Korea?,Full Video: https://www.youtube.com/watch?v=q0...,[mukbang],2023-02-21T20:03:15Z,485559,39828,,172,PT47S,hd,false
3,jlPIXxT5DEY,Stephanie Soo,He injected HIS FLUID into random women at the...,❤️‍🔥Click the link https://www.temu.com/k/c72c...,"[Stephanie Soo, MissMangoButt, Stephanie Soo S...",2023-02-19T00:49:04Z,320335,14516,,631,PT49M35S,hd,false
4,XEmevmTZeUU,Stephanie Soo,I found the groom cheating with the bride's mo...,🍟 For 65% off with HelloFresh PLUS free shippi...,"[Stephanie Soo, MissMangoButt, Stephanie Soo S...",2023-02-15T00:44:18Z,356563,15970,,673,PT47M14S,hd,false
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1015,HWsaP7Mvtc4,Stephanie Soo,Grilled Beef Short Ribs (Korean Kalbi) MUKBANG...,Grilled Beef Short Ribs (Korean Kalbi) MUKBANG...,"[mukbang, eating show, social eating, 먹방, Step...",2018-04-28T19:21:37Z,157484,4581,,373,PT28M24S,hd,false
1016,YZW8f2vlXKo,Stephanie Soo,Vietnamese Egg Rolls + Noodles MUKBANG | Eatin...,Vietnamese Egg Rolls + Noodles MUKBANG | Eatin...,"[mukbang, eating show, social eating, 먹방, Step...",2018-04-27T22:23:30Z,166720,4775,,432,PT24M22S,hd,false
1017,4AE61ymXvro,Stephanie Soo,Korean Spicy Rice Cakes Mukbang + Recipe | Eat...,Korean Spicy Rice Cakes Mukbang + Recipe | Eat...,"[mukbang, eating show, social eating, 먹방, cook...",2018-04-24T22:24:10Z,471128,11236,,606,PT38M42S,hd,false
1018,eK-6n6XSsAQ,Stephanie Soo,The Best Burger in LA Mukbang | Eating Show,The Best Burger in LA Mukbang | Eating Show \n...,"[mukbang, eating show, social eating, 먹방, Step...",2018-04-22T21:48:34Z,657255,12227,,735,PT28M1S,hd,false


In [42]:
# video_details.to_csv('video_details.csv', index=False)

In [12]:
# Check for NULL values
video_df.isnull().any()

video_id          False
channelTitle      False
title             False
description       False
tags              False
publishedAt       False
viewCount         False
likeCount         False
favouriteCount     True
commentCount      False
duration          False
definition        False
caption           False
dtype: bool

In [13]:
# Check data types
video_df.dtypes

video_id          object
channelTitle      object
title             object
description       object
tags              object
publishedAt       object
viewCount         object
likeCount         object
favouriteCount    object
commentCount      object
duration          object
definition        object
caption           object
dtype: object

In [14]:
# Convert count columns to numeric
numeric_cols = ['viewCount', 'likeCount', 'favouriteCount', 'commentCount']
video_df[numeric_cols] = video_df[numeric_cols].apply(pd.to_numeric, errors = 'coerce', axis = 1)

In [15]:
# Publish day in the week
video_df['publishedAt'] = video_df['publishedAt'].apply(lambda x: parser.parse(x)) 
video_df['pushblishDayName'] = video_df['publishedAt'].apply(lambda x: x.strftime("%A")) 

In [16]:
# convert duration to seconds
import isodate
video_df['durationSecs'] = video_df['duration'].apply(lambda x: isodate.parse_duration(x))
video_df['durationSecs'] = video_df['durationSecs'].astype('timedelta64[s]')

In [17]:
video_df[['durationSecs', 'duration']] 

Unnamed: 0,durationSecs,duration
0,3071.0,PT51M11S
1,3510.0,PT58M30S
2,47.0,PT47S
3,2975.0,PT49M35S
4,2834.0,PT47M14S
...,...,...
1015,1704.0,PT28M24S
1016,1462.0,PT24M22S
1017,2322.0,PT38M42S
1018,1681.0,PT28M1S


In [18]:
# Add tag count
video_df['tagCount'] = video_df['tags'].apply(lambda x: 0 if x is None else len(x))

In [19]:
video_df

Unnamed: 0,video_id,channelTitle,title,description,tags,publishedAt,viewCount,likeCount,favouriteCount,commentCount,duration,definition,caption,pushblishDayName,durationSecs,tagCount
0,N_dQ_6CroaM,Stephanie Soo,"7 ""Friends"" Went To Cabo Only 6 Came Back Aliv...","If you’re ever injured in an accident, you can...","[Stephanie Soo, MissMangoButt, Stephanie Soo S...",2023-02-28 00:10:57+00:00,200671.0,12456.0,,655.0,PT51M11S,hd,false,Tuesday,3071.0,31
1,99Rxr-LRdqw,Stephanie Soo,"The ""1 Mom 2 Sons"" That SCAMMED All Of SOUTH K...",CASETiFY's Bounce Cases are available at http:...,"[Stephanie Soo, MissMangoButt, Stephanie Soo S...",2023-02-25 00:13:39+00:00,294704.0,12463.0,,545.0,PT58M30S,hd,false,Saturday,3510.0,35
2,t_9sPQsJ7nk,Stephanie Soo,Did you know this about South Korea?,Full Video: https://www.youtube.com/watch?v=q0...,[mukbang],2023-02-21 20:03:15+00:00,485559.0,39828.0,,172.0,PT47S,hd,false,Tuesday,47.0,1
3,jlPIXxT5DEY,Stephanie Soo,He injected HIS FLUID into random women at the...,❤️‍🔥Click the link https://www.temu.com/k/c72c...,"[Stephanie Soo, MissMangoButt, Stephanie Soo S...",2023-02-19 00:49:04+00:00,320335.0,14516.0,,631.0,PT49M35S,hd,false,Sunday,2975.0,34
4,XEmevmTZeUU,Stephanie Soo,I found the groom cheating with the bride's mo...,🍟 For 65% off with HelloFresh PLUS free shippi...,"[Stephanie Soo, MissMangoButt, Stephanie Soo S...",2023-02-15 00:44:18+00:00,356563.0,15970.0,,673.0,PT47M14S,hd,false,Wednesday,2834.0,31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1015,HWsaP7Mvtc4,Stephanie Soo,Grilled Beef Short Ribs (Korean Kalbi) MUKBANG...,Grilled Beef Short Ribs (Korean Kalbi) MUKBANG...,"[mukbang, eating show, social eating, 먹방, Step...",2018-04-28 19:21:37+00:00,157484.0,4581.0,,373.0,PT28M24S,hd,false,Saturday,1704.0,22
1016,YZW8f2vlXKo,Stephanie Soo,Vietnamese Egg Rolls + Noodles MUKBANG | Eatin...,Vietnamese Egg Rolls + Noodles MUKBANG | Eatin...,"[mukbang, eating show, social eating, 먹방, Step...",2018-04-27 22:23:30+00:00,166720.0,4775.0,,432.0,PT24M22S,hd,false,Friday,1462.0,20
1017,4AE61ymXvro,Stephanie Soo,Korean Spicy Rice Cakes Mukbang + Recipe | Eat...,Korean Spicy Rice Cakes Mukbang + Recipe | Eat...,"[mukbang, eating show, social eating, 먹방, cook...",2018-04-24 22:24:10+00:00,471128.0,11236.0,,606.0,PT38M42S,hd,false,Tuesday,2322.0,22
1018,eK-6n6XSsAQ,Stephanie Soo,The Best Burger in LA Mukbang | Eating Show,The Best Burger in LA Mukbang | Eating Show \n...,"[mukbang, eating show, social eating, 먹방, Step...",2018-04-22 21:48:34+00:00,657255.0,12227.0,,735.0,PT28M1S,hd,false,Sunday,1681.0,22


In [57]:
comment_df = get_comments_in_videos(youtube, video_ids)
comment_df

Could not get comments for video Oo9Y78uL0IE


Unnamed: 0,video_id,comments
0,N_dQ_6CroaM,"[If you’re ever injured in an accident, you ca..."
1,99Rxr-LRdqw,[This case is wild... you might drop your phon...
2,t_9sPQsJ7nk,[I would be under pressure like imagine the wh...
3,jlPIXxT5DEY,[❤‍🔥Click the link https://www.temu.com/k/c72c...
4,XEmevmTZeUU,[What is the weirdest family tradition you've ...
...,...,...
1014,uEn86Z1XkYw,[i saw this episode on my For you page and the...
1015,HWsaP7Mvtc4,[Okay first off stop judging me I was rewatchi...
1016,YZW8f2vlXKo,[The show with the blind girl is sad love stor...
1017,4AE61ymXvro,[I watched all ur vids and I had nothing to wa...


In [59]:
comment_df.to_csv('comment_df.csv', index=False)