import requests
import json
import time

# Code is partially adapted from this repository:
# https://github.com/egbertbouman/youtube-comment-downloader

def search_dict(partial, key):
    """
    Recursively search the `partial` dictionary/list for the given `key`
    and yield every matching value.
    """
    if isinstance(partial, dict):
        for k, v in partial.items():
            if k == key:
                # found the key, yield its value
                yield v
            else:
                # the value may itself be a dict or a list, so search it recursively
                for o in search_dict(v, key):
                    yield o
    elif isinstance(partial, list):
        # if the passed data is a list,
        # search for the key in each of its items
        for i in partial:
            for o in search_dict(i, key):
                yield o
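
# A quick illustration of search_dict on hypothetical data (not a real YouTube payload):
# >>> list(search_dict({"a": {"target": 1}, "b": [{"target": 2}]}, "target"))
# [1, 2]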


def find_value(html, key, num_sep_chars=2, separator='"'):
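    """
    Return the substring of `html` that follows `key`, skipping
    `num_sep_chars` separator characters (usually ':' and '"') and
    reading up to the next occurrence of `separator`.
    """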
    # the start position is the position of the key +
    # the length of the key + the separator length (usually : and ")
    start_pos = html.find(key) + len(key) + num_sep_chars
    # the end position is the position of the separator (such as ")
    # starting from start_pos
    end_pos = html.find(separator, start_pos)
    # return the content in this range
    return html[start_pos:end_pos]
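
# For example, find_value('"XSRF_TOKEN":"abc123"', "XSRF_TOKEN", num_sep_chars=3)
# skips the three characters '":"' and returns 'abc123' (illustrative input only).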


def get_comments(url):
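    """
    Scrape the YouTube video page at `url` and yield one dictionary per
    comment, following continuation tokens until no comments are left.
    """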
    session = requests.Session()
    # make the request
    res = session.get(url)
    # extract the XSRF token
    xsrf_token = find_value(res.text, "XSRF_TOKEN", num_sep_chars=3)
    # parse the YouTube initial data in the <script> tag
    data_str = find_value(res.text, 'window["ytInitialData"] = ', num_sep_chars=0, separator="\n").rstrip(";")
    # convert the plain text string to a Python dictionary
    data = json.loads(data_str)
    # search for the ctoken & continuation parameter fields
    pagination_data = None
    for r in search_dict(data, "itemSectionRenderer"):
        pagination_data = next(search_dict(r, "nextContinuationData"), None)
        if pagination_data:
            # if we got something, break out of the loop,
            # we have the data we need
            break

    if pagination_data is None:
        # no comment section was found (comments may be disabled)
        return

    # each entry pairs a continuation token (ctoken) with its click-tracking
    # parameter (itct); YouTube uses these to paginate the comment feed
    continuation_tokens = [(pagination_data["continuation"], pagination_data["clickTrackingParams"])]

    while continuation_tokens:
        # keep looping until the continuation tokens list is empty (no more comments)
        continuation, itct = continuation_tokens.pop()

        # construct the URL query parameters
        params = {
            "action_get_comments": 1,
            "pbj": 1,
            "ctoken": continuation,
            "continuation": continuation,
            "itct": itct,
        }

        # construct the POST body data, which consists of the XSRF token
        data = {
            "session_token": xsrf_token,
        }

        # construct request headers
        headers = {
            "x-youtube-client-name": "1",
            "x-youtube-client-version": "2.20200731.02.01",
        }
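        # (these headers mimic the YouTube web client; the hard-coded client
        # version dates from 2020 and may need refreshing if requests start failing)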

        # make the POST request to get the comments data
        response = session.post("https://www.youtube.com/comment_service_ajax",
                                params=params, data=data, headers=headers)
        # convert the response to a Python dictionary
        comments_data = json.loads(response.text)

        for comment in search_dict(comments_data, "commentRenderer"):
            # iterate over loaded comments and yield the useful info
            yield {
                "commentId": comment["commentId"],
                "text": "".join([c["text"] for c in comment["contentText"]["runs"]]),
                "time": comment["publishedTimeText"]["runs"][0]["text"],
                "isLiked": comment["isLiked"],
                "likeCount": comment["likeCount"],
                # "replyCount": comment["replyCount"],  # not present on every comment
                "author": comment.get("authorText", {}).get("simpleText", ""),
                "channel": comment["authorEndpoint"]["browseEndpoint"]["browseId"],
                "votes": comment.get("voteCount", {}).get("simpleText", "0"),
                "photo": comment["authorThumbnail"]["thumbnails"][-1]["url"],
                "authorIsChannelOwner": comment["authorIsChannelOwner"],
            }

        # load continuation tokens for the next batch of comments (ctoken & itct)
        continuation_tokens = [(next_cdata["continuation"], next_cdata["clickTrackingParams"])
                               for next_cdata in search_dict(comments_data, "nextContinuationData")] + continuation_tokens

        # sleep a bit between requests to avoid putting heavy load on popular videos
        time.sleep(0.1)


if __name__ == "__main__":
    import argparse
    import os

    parser = argparse.ArgumentParser(description="Simple YouTube comment extractor")
    parser.add_argument("url", help="The full YouTube video URL")
    parser.add_argument("-l", "--limit", type=int, help="Maximum number of comments to extract, helpful for long videos")
    parser.add_argument("-o", "--output", help="Output JSON file, e.g. data.json")

    # parse the passed arguments
    args = parser.parse_args()
    limit = args.limit
    output = args.output
    url = args.url

    from pprint import pprint

    if output:
        # start with a fresh file; the writer below appends, so leftovers
        # from a previous run would produce invalid JSON
        open(output, "w").close()

    count = 0
    for comment in get_comments(url):
        if limit and count >= limit:
            # break out of the loop when we reach the specified limit
            break
        if output:
            # write the comment as JSON to the file
            with open(output, "a") as f:
                # begin with an opening bracket
                if count == 0:
                    f.write("[")
                f.write(json.dumps(comment, ensure_ascii=False) + ",")
        else:
            pprint(comment)
            print("="*50)
        count += 1
    print("total comments extracted:", count)
    if output:
        if count:
            # remove the trailing comma ','
            with open(output, "rb+") as f:
                f.seek(-1, os.SEEK_END)
                f.truncate()
        else:
            # no comments were extracted; open the list anyway so the file is valid JSON
            with open(output, "a") as f:
                f.write("[")
        # add "]" to close the list at the end of the file
        with open(output, "a") as f:
            print("]", file=f)
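
# Example usage from the command line (the filename here is just an assumption
# about how this script is saved):
#   python youtube_comment_extractor.py "https://www.youtube.com/watch?v=jNQXAC9IVRw" -l 10 -o data.json
# Or programmatically, printing the first few comments:
#   from pprint import pprint
#   for count, comment in enumerate(get_comments("https://www.youtube.com/watch?v=jNQXAC9IVRw")):
#       if count == 3:
#           break
#       pprint(comment)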