import requests
import json
import time

# Code is partially adapted from this repository:
# https://github.com/egbertbouman/youtube-comment-downloader

def search_dict(partial, key):
    """
    Recursively search the `partial` dictionary/list for the given `key`
    and yield every matching value.
    """
    if isinstance(partial, dict):
        for k, v in partial.items():
            if k == key:
                # found the key, yield its value
                yield v
            else:
                # the value may itself be a dict or a list, so search it recursively
                for o in search_dict(v, key):
                    yield o
    elif isinstance(partial, list):
        # if the passed data is a list,
        # search for the key in each of its items
        for i in partial:
            for o in search_dict(i, key):
                yield o
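
# A quick illustration of search_dict on hypothetical data (not a real YouTube payload):
# >>> list(search_dict({"a": {"target": 1}, "b": [{"target": 2}]}, "target"))
# [1, 2]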


def find_value(html, key, num_sep_chars=2, separator='"'):
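    """
    Return the substring of `html` that follows `key`, skipping
    `num_sep_chars` separator characters (usually ':' and '"') and
    reading up to the next occurrence of `separator`.
    """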
    # the start position is the position of the key +
    # the length of the key + the separator length (usually : and ")
    start_pos = html.find(key) + len(key) + num_sep_chars
    # the end position is the position of the separator (such as ")
    # starting from start_pos
    end_pos = html.find(separator, start_pos)
    # return the content in this range
    return html[start_pos:end_pos]
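
# For example, find_value('"XSRF_TOKEN":"abc123"', "XSRF_TOKEN", num_sep_chars=3)
# skips the three characters '":"' and returns 'abc123' (illustrative input only).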


def get_comments(url):
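    """
    Scrape the YouTube video page at `url` and yield one dictionary per
    comment, following continuation tokens until no comments are left.
    """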
    session = requests.Session()
    # make the request
    res = session.get(url)
    # extract the XSRF token
    xsrf_token = find_value(res.text, "XSRF_TOKEN", num_sep_chars=3)
    # parse the YouTube initial data in the <script> tag
    data_str = find_value(res.text, 'window["ytInitialData"] = ', num_sep_chars=0, separator="\n").rstrip(";")
    # convert the plain text string to a Python dictionary
    data = json.loads(data_str)
    # search for the ctoken & continuation parameter fields
    pagination_data = None
    for r in search_dict(data, "itemSectionRenderer"):
        pagination_data = next(search_dict(r, "nextContinuationData"), None)
        if pagination_data:
            # if we got something, break out of the loop,
            # we have the data we need
            break

    if pagination_data is None:
        # no comment section was found (comments may be disabled)
        return

    # each entry pairs a continuation token (ctoken) with its click-tracking
    # parameter (itct); YouTube uses these to paginate the comment feed
    continuation_tokens = [(pagination_data["continuation"], pagination_data["clickTrackingParams"])]

    while continuation_tokens:
        # keep looping until the continuation tokens list is empty (no more comments)
        continuation, itct = continuation_tokens.pop()

        # construct the URL query parameters
        params = {
            "action_get_comments": 1,
            "pbj": 1,
            "ctoken": continuation,
            "continuation": continuation,
            "itct": itct,
        }

        # construct the POST body data, which consists of the XSRF token
        data = {
            "session_token": xsrf_token,
        }

        # construct request headers
        headers = {
            "x-youtube-client-name": "1",
            "x-youtube-client-version": "2.20200731.02.01",
        }
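        # (these headers mimic the YouTube web client; the hard-coded client
        # version dates from 2020 and may need refreshing if requests start failing)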

        # make the POST request to get the comments data
        response = session.post("https://www.youtube.com/comment_service_ajax",
                                params=params, data=data, headers=headers)
        # convert the response to a Python dictionary
        comments_data = json.loads(response.text)

        for comment in search_dict(comments_data, "commentRenderer"):
            # iterate over loaded comments and yield the useful info
            yield {
                "commentId": comment["commentId"],
                "text": "".join([c["text"] for c in comment["contentText"]["runs"]]),
                "time": comment["publishedTimeText"]["runs"][0]["text"],
                "isLiked": comment["isLiked"],
                "likeCount": comment["likeCount"],
                # "replyCount": comment["replyCount"],  # not present on every comment
                "author": comment.get("authorText", {}).get("simpleText", ""),
                "channel": comment["authorEndpoint"]["browseEndpoint"]["browseId"],
                "votes": comment.get("voteCount", {}).get("simpleText", "0"),
                "photo": comment["authorThumbnail"]["thumbnails"][-1]["url"],
                "authorIsChannelOwner": comment["authorIsChannelOwner"],
            }

        # load continuation tokens for the next batch of comments (ctoken & itct)
        continuation_tokens = [(next_cdata["continuation"], next_cdata["clickTrackingParams"])
                               for next_cdata in search_dict(comments_data, "nextContinuationData")] + continuation_tokens

        # sleep a bit between requests to avoid putting heavy load on popular videos
        time.sleep(0.1)


if __name__ == "__main__":
    import argparse
    import os

    parser = argparse.ArgumentParser(description="Simple YouTube comment extractor")
    parser.add_argument("url", help="The full YouTube video URL")
    parser.add_argument("-l", "--limit", type=int, help="Maximum number of comments to extract, helpful for long videos")
    parser.add_argument("-o", "--output", help="Output JSON file, e.g. data.json")

    # parse the passed arguments
    args = parser.parse_args()
    limit = args.limit
    output = args.output
    url = args.url

    from pprint import pprint

    if output:
        # start with a fresh file; the writer below appends, so leftovers
        # from a previous run would produce invalid JSON
        open(output, "w").close()

    count = 0
    for comment in get_comments(url):
        if limit and count >= limit:
            # break out of the loop when we reach the specified limit
            break
        if output:
            # write the comment as JSON to the file
            with open(output, "a") as f:
                # begin with an opening bracket
                if count == 0:
                    f.write("[")
                f.write(json.dumps(comment, ensure_ascii=False) + ",")
        else:
            pprint(comment)
            print("="*50)
        count += 1
    print("total comments extracted:", count)
    if output:
        if count:
            # remove the trailing comma ','
            with open(output, "rb+") as f:
                f.seek(-1, os.SEEK_END)
                f.truncate()
        else:
            # no comments were extracted; open the list anyway so the file is valid JSON
            with open(output, "a") as f:
                f.write("[")
        # add "]" to close the list at the end of the file
        with open(output, "a") as f:
            print("]", file=f)
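
# Example usage from the command line (the filename here is just an assumption
# about how this script is saved):
#   python youtube_comment_extractor.py "https://www.youtube.com/watch?v=jNQXAC9IVRw" -l 10 -o data.json
# Or programmatically, printing the first few comments:
#   from pprint import pprint
#   for count, comment in enumerate(get_comments("https://www.youtube.com/watch?v=jNQXAC9IVRw")):
#       if count == 3:
#           break
#       pprint(comment)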