In [86]:
import argparse
import json
import pandas as pd

from apiclient.discovery import build
from csv import writer
from urllib.parse import urlparse, parse_qs
from collections import defaultdict

def get_keys(filename):
    """Read the YouTube developer key from a file and describe the service.

    Only the first line of the file is used.  Surrounding whitespace,
    including the newline that ``readline`` leaves on the line, is removed
    so the key can be handed to the API unchanged.

    Args:
        filename: Path of a text file whose first line is the developer key.

    Returns:
        dict: ``{'key': <developer key>, 'name': 'youtube', 'version': 'v3'}``.

    Raises:
        OSError: If ``filename`` cannot be opened.
    """
    with open(filename) as f:
        # readline() keeps the trailing "\n"; a key with a newline in it is
        # not the key that was issued.
        key = f.readline().strip()
    return {'key': key, 'name': 'youtube', 'version': 'v3'}

def build_service(filename):
    """Create a YouTube Data API v3 client from a key stored in a file.

    Args:
        filename: Path of a text file whose first line is the developer key.

    Returns:
        Whatever ``build("youtube", "v3", developerKey=...)`` returns.

    Raises:
        OSError: If ``filename`` cannot be opened.
    """
    with open(filename) as f:
        # readline() keeps the trailing "\n"; strip it so the newline is not
        # sent as part of the developer key.
        key = f.readline().strip()

    YOUTUBE_API_SERVICE_NAME = "youtube"
    YOUTUBE_API_VERSION = "v3"
    return build(YOUTUBE_API_SERVICE_NAME,
                 YOUTUBE_API_VERSION,
                 developerKey=key)

# https://stackoverflow.com/questions/45579306/get-youtube-video-url-or-youtube-video-id-from-a-string-using-regex
def get_id(url):
    """Extract a video id from a YouTube URL.

    The first ``v`` query parameter wins (``.../watch?v=<id>``); without
    one, the last segment of the URL path is returned (``youtu.be/<id>``).
    """
    parsed = urlparse(url)
    v_values = parse_qs(parsed.query).get('v')
    if v_values:
        return v_values[0]
    # Splitting a string always yields at least one piece, so the last path
    # segment (possibly empty) is always available.
    return parsed.path.rsplit('/', 1)[-1]

def get_comments(**kwargs):
    """
    Collect the popular, non-duplicated top-level comments of a video.

    Pages through ``service.commentThreads().list(...)`` until a response
    carries no ``nextPageToken``.  A comment is kept when it has at least
    ``min_likes`` likes and at least ``min_replies`` replies.  Afterwards,
    every kept comment whose text occurs more than once among the kept
    comments is dropped (repeated text is treated as bot spam).

    Keyword Args:
        service: Client exposing ``commentThreads().list(**params).execute()``.
            Required.
        part: Space separated string (or iterable) of resource parts.
            Defaults to ``'snippet'``.
        maxResults: Page size, defaults to 100.
        textFormat: Defaults to ``'plainText'``.
        order: Defaults to ``'time'``.
        min_likes: Minimum like count for a comment to be kept (default 5).
        min_replies: Minimum reply count for a comment to be kept (default 2).
        write_lbl, csv_filename, token_filename: Accepted for backward
            compatibility and ignored; they are removed so they never reach
            the API call.
        **other: Forwarded unchanged to ``commentThreads().list`` (for
            example ``videoId`` or ``pageToken``).

    Returns:
        list[dict]: One dict per kept comment with the keys ``text``, ``id``,
        ``reply_count``, ``like_count`` and ``updated_at``.

    Raises:
        KeyError: If ``service`` is not supplied.

    ty:
    https://python.gotrained.com/youtube-api-extracting-comments/#Cache_Credentials
    https://www.pingshiuanchua.com/blog/post/using-youtube-api-to-analyse-youtube-comments-on-python
    """
    # parameters needed for query
    part = kwargs.get('part', 'snippet')
    kwargs['part'] = part.split() if isinstance(part, str) else list(part)
    kwargs.setdefault('maxResults', 100)
    kwargs.setdefault('textFormat', 'plainText')
    kwargs.setdefault('order', 'time')
    service = kwargs.pop('service')

    # selection thresholds (not API parameters, so they are popped)
    min_likes = kwargs.pop('min_likes', 5)
    min_replies = kwargs.pop('min_replies', 2)

    # file-related options are unused here; pop them with a default so that
    # callers may omit them and so that they are not sent to the API
    kwargs.pop('write_lbl', None)
    kwargs.pop('csv_filename', None)
    kwargs.pop('token_filename', None)

    comments = []
    # how often each kept comment text was seen (used to spot duplicates)
    comment_text_count = defaultdict(int)

    # get the first page of comments
    response = service.commentThreads().list(**kwargs).execute()

    # continue until we crash or reach the end
    while response:
        for item in response.get('items', []):
            thread = item['snippet']
            top_level = thread['topLevelComment']
            details = top_level['snippet']

            _comment = {
                'text': details['textDisplay'],
                'id': top_level['id'],
                'reply_count': thread['totalReplyCount'],
                'like_count': details['likeCount'],
                'updated_at': details['updatedAt']
            }

            # keep only comments that drew enough likes AND enough replies
            if (_comment['like_count'] >= min_likes
                    and _comment['reply_count'] >= min_replies):
                comments.append(_comment)
                comment_text_count[_comment['text']] += 1

        # check if there's a next page
        if 'nextPageToken' not in response:
            break
        kwargs['pageToken'] = response['nextPageToken']
        response = service.commentThreads().list(**kwargs).execute()

    # cleanup comments
    # 1. remove bots' comments by looking at duplicates
    return [c for c in comments if comment_text_count[c['text']] == 1]

def getCommentsFromVideo(video_id, apikey='./apiKey.json'):
    """Fetch the filtered comments of one video with the default settings.

    Builds an API client from the key file and calls ``get_comments`` for
    ``video_id`` with ``part='snippet'``, pages of 100 and ``order='time'``.
    No ``pageToken`` is passed, so paging starts at the first page.

    Args:
        video_id: YouTube video id (a string; it is also used to derive the
            ``csv_filename`` / ``token_filename`` values that are forwarded).
        apikey: Path of the file holding the developer key, passed to
            ``build_service``.  Defaults to ``'./apiKey.json'``.

    Returns:
        Whatever ``get_comments`` returns for this video.
    """
    service = build_service(apikey)

    return get_comments(
        service=service,
        videoId=video_id,
        part='snippet',
        maxResults=100,
        order='time',
        write_lbl=True,
        csv_filename=video_id + "_csv",
        token_filename=video_id + "_page_token",
    )

In [87]:
import re
import boto3

# Module-wide Amazon Comprehend client (region us-east-1); retrieveKeywords
# sends its key-phrase requests through it.
comprehend = boto3.client('comprehend', region_name='us-east-1')
# Words that carry no meaning at the edge of a key phrase.  Kept in a
# module-level frozenset so the collection is built once and each lookup is
# a hash probe instead of a scan over the whole list.
_STOP_WORDS = frozenset([
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself",
    "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself",
    "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that",
    "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had",
    "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as",
    "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through",
    "during", "before", "after", "above", "below", "to", "from", "in", "out", "on", "off",
    "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how",
    "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not",
    "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should",
    "now",
    # domain-specific additions
    "yeah", "thing", "things", "hey", "ohh", "really", "mine", "everybody", "anybody", "everyone",
])


def stripStopWords(phrase):
    """Trim stop words from both ends of ``phrase['Text']``.

    Stop words are removed repeatedly from the beginning and the end
    (case-insensitively) until neither edge is a stop word; words in the
    middle of the phrase are never touched and the original letter case is
    kept.  A phrase that consists only of stop words becomes ``""``.
    Whitespace left at either edge is removed as well, so ``"the  cat"``
    yields ``"cat"`` rather than ``" cat"``.

    Args:
        phrase: dict with a string under the key ``'Text'``; other keys are
            left alone.

    Returns:
        The same dict object, with ``'Text'`` updated in place.
    """
    text = phrase['Text'].strip()
    while text:
        lowered = text.lower()
        if lowered in _STOP_WORDS:
            # nothing but a stop word is left
            text = ''
            break
        # Stop words contain no spaces, so only the first / last
        # space-delimited word can match at an edge.
        first_word = lowered.partition(' ')[0]
        last_word = lowered.rpartition(' ')[2]
        if first_word in _STOP_WORDS:
            text = text[len(first_word) + 1:].lstrip()
        elif last_word in _STOP_WORDS:
            text = text[:-(len(last_word) + 1)].rstrip()
        else:
            break
    phrase['Text'] = text
    return phrase

def _truncateForComprehend(text, max_chars=4000, max_bytes=4999):
    """Shorten ``text`` so that its UTF-8 encoding stays below 5000 bytes.

    The character cap alone is only safe for ASCII: one character may need
    up to four bytes, so the encoded size is checked separately.
    """
    text = text[:max_chars]
    encoded = text.encode('utf-8', errors='surrogatepass')
    if len(encoded) <= max_bytes:
        return text
    # A byte cut can land inside a multi-byte character; 'ignore' drops the
    # incomplete tail instead of raising.
    return encoded[:max_bytes].decode('utf-8', errors='ignore')


def _filterPhrases(phrases):
    """Clean up the key phrases detected for a single comment.

    Each phrase is copied (original text saved under ``'ori_text'``), line
    breaks become spaces, edge stop words are stripped, and a phrase is kept
    only if it is the first occurrence of its lower-cased text, has at least
    3 characters, consists solely of letters, digits, '-' and whitespace,
    and is not made up of digits, '-' and whitespace alone.
    """
    kept = []
    seen = set()
    for raw in phrases:
        phrase = stripStopWords({
            **raw,
            'ori_text': raw['Text'],
            # substitute "\n" with a space so words on adjacent lines do not
            # fuse into one token
            'Text': raw['Text'].replace("\n", " "),
        })
        text = phrase['Text']

        # remove duplicates (case-insensitive)
        lowered = text.lower()
        if lowered in seen:
            continue
        seen.add(lowered)

        # NOTE: phrases of any word count are kept, single words included.

        # phrase should have at least 3 chars
        if len(text) < 3:
            continue
        # valid phrase should be words only, optionally connected by '-' or whitespace
        if not re.fullmatch(r'[a-zA-Z0-9\s-]+', text):
            continue
        # phrase should not contain only numbers, optionally connected by '-' or whitespace
        if re.fullmatch(r'[\d\s-]+', text):
            continue
        kept.append(phrase)
    return kept


def retrieveKeywords(comments):
    """Attach the cleaned key phrases of every comment under ``'phrases'``.

    The comment texts are sent to the module-level ``comprehend`` client via
    ``batch_detect_key_phrases`` in batches of 25, each text truncated to fit
    the service's per-document size limit
    (see https://docs.aws.amazon.com/comprehend/latest/dg/guidelines-and-limits.html).
    Results are matched to comments through each result's ``Index`` (its
    position within the submitted batch), so a document that the service
    rejects simply ends up with an empty phrase list instead of shifting the
    phrases of every following comment.

    Args:
        comments: list of dicts, each with the comment text under ``'text'``.

    Returns:
        The same list; every dict gains a ``'phrases'`` list of strings.
    """
    batch_size = 25
    comment_text_list = [_truncateForComprehend(comment['text']) for comment in comments]

    # one (initially empty) phrase list per comment
    keyPhrasesByContent = [[] for _ in comments]
    for offset in range(0, len(comment_text_list), batch_size):
        chunk = comment_text_list[offset:offset + batch_size]
        keyPhrasesResponse = comprehend.batch_detect_key_phrases(
            TextList=chunk, LanguageCode='en')
        for position, result in enumerate(keyPhrasesResponse['ResultList']):
            # fall back to the list position if a result carries no Index
            index = result.get('Index', position)
            keyPhrasesByContent[offset + index] = result['KeyPhrases']

    for comment, phrases in zip(comments, keyPhrasesByContent):
        comment['phrases'] = [phrase['Text'] for phrase in _filterPhrases(phrases)]

    return comments
    


In [88]:

def getKeywordsFromComments(comments):
    """Rank key phrases by how often they occur across ``comments``.

    Every entry of each comment's ``'phrases'`` list counts once.  The
    result is a list of ``{'text': phrase, 'comment_freq': count}`` dicts,
    highest count first; phrases with equal counts keep the order in which
    they were first seen.
    """
    freq_by_phrase = {}
    for entry in comments:
        for text in entry['phrases']:
            # the more comments mention a phrase, the more important it is
            freq_by_phrase[text] = freq_by_phrase.get(text, 0) + 1

    ranked = [
        {'text': text, 'comment_freq': freq}
        for text, freq in freq_by_phrase.items()
    ]
    return sorted(ranked, key=lambda row: row['comment_freq'], reverse=True)

In [89]:
import google_auth_oauthlib.flow
import googleapiclient.discovery
import googleapiclient.errors
from google.oauth2.credentials import Credentials

def getBestVideoList(keywords, topn=10):
    """Search YouTube for ``keywords`` and return the matching video results.

    Credentials are no longer embedded in the source:

    * the OAuth client id / secret are taken from the client-secrets JSON
      file that is loaded below, and
    * the refresh token is read from the ``YOUTUBE_REFRESH_TOKEN``
      environment variable.

    SECURITY NOTE: a refresh token and client secret used to be hard-coded
    in this function.  Treat them as leaked and revoke them.

    Args:
        keywords: Search query string.
        topn: Number of results requested (sent as ``maxResults``).

    Returns:
        list: The ``items`` of the search response.  The search is limited
        to ``type='video'``, so each item's ``id`` is expected to carry a
        ``videoId``.

    Raises:
        RuntimeError: If ``YOUTUBE_REFRESH_TOKEN`` is not set.
    """
    # local import: ``os`` is not imported at the top of this notebook, so a
    # fresh kernel would otherwise fail with NameError here
    import os

    scopes = ["https://www.googleapis.com/auth/youtube.force-ssl"]
    # oauthlib treats ANY non-empty value of this variable (even "0") as
    # permission to use plain HTTP, so the way to switch it off is to unset it.
    os.environ.pop("OAUTHLIB_INSECURE_TRANSPORT", None)

    api_service_name = "youtube"
    api_version = "v3"
    client_secrets_file = "./google_credentials/client_secret_754636752811-rmth1g8e3dl144jda8fddh1ihhj413um.apps.googleusercontent.com.json"

    # Load the OAuth client configuration (client id / secret) from disk
    flow = google_auth_oauthlib.flow.InstalledAppFlow.from_client_secrets_file(
        client_secrets_file, scopes)
    client_config = flow.client_config

    refresh_token = os.environ.get("YOUTUBE_REFRESH_TOKEN")
    if not refresh_token:
        raise RuntimeError(
            "Set the YOUTUBE_REFRESH_TOKEN environment variable to an OAuth "
            "refresh token for the YouTube Data API.")

    credentials = Credentials(
        None,
        refresh_token=refresh_token,
        token_uri="https://accounts.google.com/o/oauth2/token",
        client_id=client_config["client_id"],
        client_secret=client_config["client_secret"])

    youtube = googleapiclient.discovery.build(
        api_service_name, api_version, credentials=credentials)

    request = youtube.search().list(
        part="snippet",
        maxResults=topn,
        q=keywords,
        relevanceLanguage='en',
        # without this the search may also return channels and playlists,
        # whose ``id`` has no ``videoId``
        type='video'
    )
    response = request.execute()
    return response['items']

In [90]:
keywords = 'How To Buy Your First Rental'
print('Working on keywords: ' + keywords)
video_list = getBestVideoList(keywords, 15)

# step 1: retrieve all comments of the videos
all_comments = []
for video in video_list:
    # A search result can be a channel or a playlist; those have no
    # 'videoId' under 'id', so there are no comments to fetch for them.
    video_id = video['id'].get('videoId')
    if video_id is None:
        continue
    comments = getCommentsFromVideo(video_id)
    all_comments.extend(comments)
    print('{} comments retrieved for video {}'.format(len(comments), video_id))


Working on keywords: How To Buy Your First Rental
Comments retrieved for video bJx7_1rWC6U
Comments retrieved for video u83O2l1QEj4
Comments retrieved for video 7TB_eRhSNV4


KeyboardInterrupt: 

In [None]:
# step 2: identify keywords for each comment
# (retrieveKeywords sends the comment texts to Amazon Comprehend and stores
# the cleaned key phrases on each comment dict under 'phrases')
all_comments = retrieveKeywords(all_comments)

# step 3: extract keywords from these comments and sort by comment_freq
# (all_phrases is a list of {'text', 'comment_freq'} dicts, most frequent first)
all_phrases = getKeywordsFromComments(all_comments)