In [1]:
import googleapiclient.discovery
import googleapiclient.errors

import pandas as pd
import os
import re
import csv

from tqdm import tqdm

In [2]:
# Retrieve the API key from the first line of a local text file.
api_path = "api_key.txt"
with open(api_path, 'r') as file:
    # readline() keeps the trailing newline; strip it so the key is sent
    # to the API exactly as issued.
    api_key = file.readline().strip()

# Fail here, with a clear message, rather than on the first API request.
if not api_key:
    raise ValueError(f"No API key found on the first line of {api_path!r}")

In [3]:
# Keep only each category's 'id' and 'title'
def vid_category(response):
    """Build an ``{id: title}`` lookup from a category listing response.

    ``response["items"]`` is expected to be a sequence of entries, each with
    an ``'id'`` and a ``'snippet'`` containing a ``'title'``. If an id occurs
    more than once, the last title wins.
    """
    return {
        entry['id']: entry['snippet']['title']
        for entry in response["items"]
    }

In [4]:
api_service_name = "youtube"
api_version = "v3"
vid_category_dict = {}

# Create the API client, authenticated with the developer key read earlier
youtube = googleapiclient.discovery.build(
    api_service_name,
    api_version,
    developerKey=api_key,
)

# Region 'ID' is an arbitrary choice: per the original author, all ASEAN
# countries share the same video category id configuration.
request = youtube.videoCategories().list(regionCode='ID', part="snippet")
response = request.execute()

# Reduce the raw response to an {id: title} lookup and display it
vid_category_dict = vid_category(response)

vid_category_dict

{'1': 'Film & Animation',
 '2': 'Autos & Vehicles',
 '10': 'Music',
 '15': 'Pets & Animals',
 '17': 'Sports',
 '18': 'Short Movies',
 '19': 'Travel & Events',
 '20': 'Gaming',
 '21': 'Videoblogging',
 '22': 'People & Blogs',
 '23': 'Comedy',
 '24': 'Entertainment',
 '25': 'News & Politics',
 '26': 'Howto & Style',
 '27': 'Education',
 '28': 'Science & Technology',
 '30': 'Movies',
 '31': 'Anime/Animation',
 '32': 'Action/Adventure',
 '33': 'Classics',
 '34': 'Comedy',
 '35': 'Documentary',
 '36': 'Drama',
 '37': 'Family',
 '38': 'Foreign',
 '39': 'Horror',
 '40': 'Sci-Fi/Fantasy',
 '41': 'Thriller',
 '42': 'Shorts',
 '43': 'Shows',
 '44': 'Trailers'}

# Extract Most Popular Videos Based on Video Category in Each Country

### Function to Extract the Required Attributes

In [5]:
# Region codes to query (ASEAN countries)
country_code = [
    "BN", "MM", "KH", "ID",
    "MY", "PH", "SG", "TH",
    "VN", "LA", "TL",
]

In [6]:
# Build a DataFrame of video statistics from a most-popular-videos response

def VideoStatCSV(input_dict, cat_id):
    """Tabulate the videos in ``input_dict`` that belong to category ``cat_id``.

    Despite the name, nothing is written to disk here; the frame is returned
    and the caller decides where to save it.

    Parameters
    ----------
    input_dict : dict
        Response holding an ``"items"`` list. Each item is read for ``id``,
        ``snippet`` (``channelId``, ``channelTitle``, ``title``,
        ``categoryId``, ``publishedAt``) and ``statistics``.
    cat_id : str
        Category id; only items whose ``snippet.categoryId`` equals it are kept.

    Returns
    -------
    pandas.DataFrame
        One row per matching video. Counts are stored as found in
        ``statistics``; a missing count is recorded as ``0``, and for likes
        and comments the matching ``*_disabled`` flag is set to ``True``.
        The frame is empty (but keeps its columns) when nothing matches.

    Notes
    -----
    Relies on the notebook-level ``clean_emoji`` helper and the
    ``vid_category_dict`` lookup.
    """
    columns = [
        'video_id',
        'channel_id',
        'channel_name',
        'video_title',
        'category_id',
        'video_category',
        'publish_time',
        'view_count',
        'like_count',
        'likes_disabled',
        'comment_count',
        'comments_disabled',
    ]

    rows = []
    for item in input_dict["items"]:
        snippet = item["snippet"]

        # only extract data that match with the category id
        if snippet["categoryId"] != cat_id:
            continue

        stats = item.get("statistics", {})

        rows.append({
            'video_id': item["id"],
            'channel_id': snippet["channelId"],
            'channel_name': clean_emoji(snippet["channelTitle"]),
            'video_title': clean_emoji(snippet["title"]),
            'category_id': snippet["categoryId"],
            'video_category': vid_category_dict[snippet["categoryId"]],
            'publish_time': snippet["publishedAt"],
            # A missing view count must not discard the whole category
            # (the caller swallows exceptions), so fall back to 0 like the
            # other counters do.
            'view_count': stats.get("viewCount", 0),
            # An absent counter is treated as "disabled" and stored as 0.
            'like_count': stats.get("likeCount", 0),
            'likes_disabled': 'likeCount' not in stats,
            'comment_count': stats.get("commentCount", 0),
            'comments_disabled': 'commentCount' not in stats,
        })

    # Passing `columns` keeps the schema stable even when `rows` is empty.
    return pd.DataFrame(rows, columns=columns)


### Extract Comments Function

In [7]:
def get_comment(country, dataset,
                output_dir='/Users/verenakarina/Desktop/Project/youtube/comment_data'):
    """Collect top-level comments for every video in ``dataset`` and save them.

    For each id in ``dataset['video_id']`` a single ``commentThreads().list``
    request is made (up to 50 threads, ordered by relevance). The comment
    text is passed through ``prepare_feature`` and ``clean_emoji``. Videos
    whose request fails, or whose response is malformed, are skipped as a
    whole and counted in a summary line printed at the end.

    Parameters
    ----------
    country : str
        Label stored in the ``country`` column and used in the CSV file name.
    dataset : pandas.DataFrame
        Must contain a ``video_id`` column.
    output_dir : str, optional
        Directory receiving ``{country}_comments_dataset.csv``. Defaults to
        the location the notebook originally wrote to; created if missing.

    Returns
    -------
    pandas.DataFrame
        Columns ``country``, ``video_id``, ``comment``, ``comment_like_count``;
        empty (with those columns) when no comment could be collected.

    Notes
    -----
    Relies on the notebook-level ``api_key``, ``clean_emoji`` and
    ``prepare_feature``.
    """
    columns = ['country', 'video_id', 'comment', 'comment_like_count']

    # Make sure the destination exists *before* spending API requests.
    os.makedirs(output_dir, exist_ok=True)

    youtube = googleapiclient.discovery.build(
        "youtube", "v3", developerKey=api_key)

    print(f"Writing {country} comments in progress...")

    rows = []
    skipped = []  # (video id, error) for every video that had to be dropped

    for videoid in tqdm(dataset['video_id']):
        try:
            response = youtube.commentThreads().list(
                part="snippet",
                videoId=videoid,
                maxResults=50,
                order='relevance'
            ).execute()

            # Iterate over the threads actually returned instead of trusting
            # pageInfo.totalResults to match the length of `items`.
            video_rows = []
            for thread in response.get('items', []):
                top_comment = thread['snippet']['topLevelComment']['snippet']
                video_rows.append({
                    'country': country,
                    'video_id': videoid,
                    'comment': clean_emoji(prepare_feature(top_comment['textOriginal'])),
                    'comment_like_count': top_comment['likeCount'],
                })
        except googleapiclient.errors.HttpError as err:
            # The API rejected the request for this video; keep going.
            skipped.append((videoid, err))
            continue
        except (KeyError, TypeError) as err:
            # Unexpected response shape: drop this video, keep the rest.
            skipped.append((videoid, err))
            continue

        rows.extend(video_rows)

    if skipped:
        print(f"{country}: skipped {len(skipped)} video(s) whose comments could not be retrieved")

    # Build the frame once; `columns` keeps the schema when `rows` is empty.
    df = pd.DataFrame(rows, columns=columns)

    df.to_csv(os.path.join(output_dir, f'{country}_comments_dataset.csv'), index=False)

    return df

### Data Cleaning Function

In [8]:
def clean_emoji(text):
    """Return ``text`` with every run of emoji/symbol characters removed.

    A character is removed when it falls in one of the code-point ranges
    listed below; all other characters are kept unchanged.
    """
    removable_ranges = (
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "\U00002500-\U00002BEF"
        "\U00002702-\U000027B0"
        "\U00002702-\U000027B0"
        "\U00010000-\U0010ffff"  # everything above U+FFFF
    )
    matcher = re.compile("[" + removable_ranges + "]+", re.UNICODE)
    stripped = matcher.sub(r'', text)
    return stripped

In [9]:
# Characters stripped from free text before it is stored
unsafe_characters = ['\n', '"']

def prepare_feature(feature):
    """Return ``feature`` as a string with every unsafe character deleted.

    Each entry of ``unsafe_characters`` is removed outright (nothing is
    substituted in its place). The result is not wrapped in quotes.
    """
    cleaned = str(feature)
    for unsafe in unsafe_characters:
        cleaned = cleaned.replace(unsafe, "")
    return cleaned

### Code Starts Here:

In [10]:
api_service_name = "youtube"
api_version = "v3"

# One client serves every request; it does not need to be rebuilt per category.
youtube = googleapiclient.discovery.build(
    api_service_name, api_version, developerKey=api_key)

# {country: {category id: raw videos().list response}}
category_based_vids = {}

# (country, category id, error) for every request the API rejected, kept so
# that missing countries/categories can be explained instead of vanishing.
failed_requests = []

for country in country_code:

    print(f"{country} video category in progress...")
    for categoryid in tqdm(vid_category_dict):
        try:
            request = youtube.videos().list(
                part="snippet,contentDetails,statistics",
                chart="mostPopular",
                regionCode=country,
                videoCategoryId=categoryid,
                maxResults=50
            )
            response = request.execute()
        except googleapiclient.errors.HttpError as err:
            # Best effort: skip this country/category pair but remember why.
            failed_requests.append((country, categoryid, err))
            continue

        category_based_vids.setdefault(country, {})[categoryid] = response

print(f"{len(failed_requests)} request(s) were rejected by the API; see failed_requests")


BN video category in progress...


100%|███████████████████████████████████████████| 31/31 [00:02<00:00, 14.23it/s]


MM video category in progress...


100%|███████████████████████████████████████████| 31/31 [00:01<00:00, 16.79it/s]


KH video category in progress...


100%|███████████████████████████████████████████| 31/31 [00:05<00:00,  5.38it/s]


ID video category in progress...


100%|███████████████████████████████████████████| 31/31 [00:04<00:00,  6.21it/s]


MY video category in progress...


100%|███████████████████████████████████████████| 31/31 [00:05<00:00,  5.40it/s]


PH video category in progress...


100%|███████████████████████████████████████████| 31/31 [00:07<00:00,  4.13it/s]


SG video category in progress...


100%|███████████████████████████████████████████| 31/31 [00:08<00:00,  3.70it/s]


TH video category in progress...


100%|███████████████████████████████████████████| 31/31 [00:08<00:00,  3.73it/s]


VN video category in progress...


100%|███████████████████████████████████████████| 31/31 [00:08<00:00,  3.70it/s]


LA video category in progress...


100%|███████████████████████████████████████████| 31/31 [00:07<00:00,  4.04it/s]


TL video category in progress...


100%|███████████████████████████████████████████| 31/31 [00:05<00:00,  6.09it/s]


In [11]:
# Countries for which at least one category request returned data
updated_country = list(category_based_vids.keys())
updated_country

['KH', 'ID', 'MY', 'PH', 'SG', 'TH', 'VN', 'LA']

In [12]:
path = '/Users/verenakarina/Desktop/Project/youtube/raw_data2'
# Fail early if the destination cannot be created, before any API calls.
os.makedirs(path, exist_ok=True)

for country in updated_country:
    # One frame per category that could be tabulated for this country.
    frames = []
    for cat_id in vid_category_dict:
        try:
            frames.append(VideoStatCSV(category_based_vids[country][cat_id], cat_id))
        except KeyError:
            # No stored response for this category, or a response missing an
            # expected field: skip the category, keep the others.
            continue

    if not frames:
        print(f"{country}: no video statistics could be built, skipping")
        continue

    # Concatenate once; empty frames are left out, but if every frame is
    # empty one of them is kept so the CSV still carries the column headers.
    non_empty = [frame for frame in frames if not frame.empty]
    df = pd.concat(non_empty, ignore_index=True) if non_empty else frames[0]

    # Save the statistics first so they survive a failure while the
    # (much longer) comment download is running.
    df.to_csv(os.path.join(path, f'{country}_dataset.csv'), index=False)

    get_comment(country, df)


Writing KH comments in progress...


100%|█████████████████████████████████████████| 179/179 [00:33<00:00,  5.41it/s]


Writing ID comments in progress...


100%|█████████████████████████████████████████| 242/242 [00:45<00:00,  5.30it/s]


Writing MY comments in progress...


100%|█████████████████████████████████████████| 229/229 [00:57<00:00,  3.95it/s]


Writing PH comments in progress...


100%|█████████████████████████████████████████| 225/225 [00:59<00:00,  3.81it/s]


Writing SG comments in progress...


100%|█████████████████████████████████████████| 238/238 [01:22<00:00,  2.88it/s]


Writing TH comments in progress...


100%|█████████████████████████████████████████| 187/187 [00:43<00:00,  4.28it/s]


Writing VN comments in progress...


100%|█████████████████████████████████████████| 190/190 [00:38<00:00,  4.95it/s]


Writing LA comments in progress...


100%|█████████████████████████████████████████| 152/152 [00:27<00:00,  5.49it/s]
