Skip to content

Commit 189dc86

Browse files
committed
added youtube comment extractor tutorial
1 parent 3ac53fa commit 189dc86

File tree

4 files changed

+190
-0
lines changed

4 files changed

+190
-0
lines changed

Diff for: README.md

+1
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@ This is a repository of all the tutorials of [The Python Code](https://www.thepy
8383
- [How to Extract Script and CSS Files from Web Pages in Python](https://www.thepythoncode.com/article/extract-web-page-script-and-css-files-in-python). ([code](web-scraping/webpage-js-css-extractor))
8484
- [How to Extract and Submit Web Forms from a URL using Python](https://www.thepythoncode.com/article/extracting-and-submitting-web-page-forms-in-python). ([code](web-scraping/extract-and-fill-forms))
8585
- [How to Get Domain Name Information in Python](https://www.thepythoncode.com/article/extracting-domain-name-information-in-python). ([code](web-scraping/get-domain-info))
86+
- [How to Extract YouTube Comments in Python](https://www.thepythoncode.com/article/extract-youtube-comments-in-python). ([code](web-scraping/youtube-comments-extractor))
8687

8788
- ### [Python Standard Library](https://www.thepythoncode.com/topic/python-standard-library)
8889
- [How to Transfer Files in the Network using Sockets in Python](https://www.thepythoncode.com/article/send-receive-files-using-sockets-python). ([code](general/transfer-files/))

Diff for: web-scraping/youtube-comments-extractor/README.md

+27
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
# [How to Extract YouTube Comments in Python](https://www.thepythoncode.com/article/extract-youtube-comments-in-python)
2+
To run this:
3+
- `pip3 install -r requirements.txt`
4+
- ```
5+
python youtube_comment_extractor.py --help
6+
```
7+
**Output:**
8+
```
9+
usage: youtube_comment_extractor.py [-h] [-l LIMIT] [-o OUTPUT] url
10+
11+
Simple YouTube Comment extractor
12+
13+
positional arguments:
14+
url The YouTube video full URL
15+
16+
optional arguments:
17+
-h, --help show this help message and exit
18+
-l LIMIT, --limit LIMIT
19+
Number of maximum comments to extract, helpful for
20+
longer videos
21+
-o OUTPUT, --output OUTPUT
22+
Output JSON file, e.g data.json
23+
```
24+
- To download the latest 50 comments from https://www.youtube.com/watch?v=jNQXAC9IVRw and save them to `data.json`:
25+
```
26+
python youtube_comment_extractor.py https://www.youtube.com/watch?v=jNQXAC9IVRw --limit 50 --output data.json
27+
```
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
requests
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,161 @@
1+
import requests
2+
import json
3+
import time
4+
5+
# Code is partially grabbed from this repository:
6+
# https://github.com/egbertbouman/youtube-comment-downloader
7+
8+
def search_dict(partial, key):
    """
    Recursively walk a nested dict/list structure and yield every value
    stored under `key`, wherever it appears.
    """
    if isinstance(partial, dict):
        for field, value in partial.items():
            if field == key:
                # direct hit: emit the stored value
                yield value
            else:
                # the value may itself contain nested dicts/lists to search
                yield from search_dict(value, key)
    elif isinstance(partial, list):
        # scan every element of the list for the key
        for element in partial:
            yield from search_dict(element, key)
27+
28+
29+
def find_value(html, key, num_sep_chars=2, separator='"'):
    """
    Extract the substring of `html` that follows `key` plus
    `num_sep_chars` separator characters (usually `":` or `":"`),
    reading up to (but not including) the next `separator`.
    """
    # content begins right after the key and its separator characters
    content_start = html.find(key) + len(key) + num_sep_chars
    # content ends at the first occurrence of the separator afterwards
    content_end = html.find(separator, content_start)
    return html[content_start:content_end]
38+
39+
40+
def get_comments(url):
    """Yield comment dictionaries for the YouTube video at `url`.

    Scrapes the watch page for the XSRF token and the embedded
    `ytInitialData` blob, then pages through the comment_service_ajax
    endpoint, yielding one dict per comment (id, text, author, votes,
    photo, ...).

    Raises:
        ValueError: if no comment pagination data can be found on the
            page (e.g. comments are disabled).
    """
    session = requests.Session()
    # fetch the watch page HTML
    res = session.get(url)
    # extract the XSRF token (sent later as the POST session_token)
    xsrf_token = find_value(res.text, "XSRF_TOKEN", num_sep_chars=3)
    # parse the YouTube initial data embedded in a <script> tag
    data_str = find_value(res.text, 'window["ytInitialData"] = ', num_sep_chars=0, separator="\n").rstrip(";")
    # convert to a Python dictionary instead of a plain text string
    data = json.loads(data_str)
    # search for the ctoken & continuation parameter fields
    pagination_data = None
    for r in search_dict(data, "itemSectionRenderer"):
        # use next(..., None) instead of bare next(): a StopIteration
        # escaping here would become a RuntimeError inside this
        # generator on Python 3.7+ (PEP 479)
        pagination_data = next(search_dict(r, "nextContinuationData"), None)
        if pagination_data:
            # we have the data we need, stop looking
            break
    if pagination_data is None:
        # previously this fell through with an unbound local (NameError);
        # fail with an explicit, diagnosable error instead
        raise ValueError(f"Could not find comment pagination data for {url}")

    continuation_tokens = [(pagination_data['continuation'], pagination_data['clickTrackingParams'])]

    while continuation_tokens:
        # keep looping until continuation tokens list is empty (no more comments)
        continuation, itct = continuation_tokens.pop()

        # construct params parameter (the ones in the URL)
        params = {
            "action_get_comments": 1,
            "pbj": 1,
            "ctoken": continuation,
            "continuation": continuation,
            "itct": itct,
        }

        # construct POST body data, which consists of the XSRF token
        data = {
            "session_token": xsrf_token,
        }

        # construct request headers identifying the web client version
        headers = {
            "x-youtube-client-name": "1",
            "x-youtube-client-version": "2.20200731.02.01"
        }

        # make the POST request to get the comments data
        response = session.post("https://www.youtube.com/comment_service_ajax",
                                params=params, data=data, headers=headers)
        # convert to a Python dictionary
        comments_data = json.loads(response.text)

        for comment in search_dict(comments_data, "commentRenderer"):
            # iterate over loaded comments and yield useful info
            yield {
                "commentId": comment["commentId"],
                "text": ''.join([c['text'] for c in comment['contentText']['runs']]),
                "time": comment['publishedTimeText']['runs'][0]['text'],
                "isLiked": comment["isLiked"],
                "likeCount": comment["likeCount"],
                'author': comment.get('authorText', {}).get('simpleText', ''),
                'channel': comment['authorEndpoint']['browseEndpoint']['browseId'],
                'votes': comment.get('voteCount', {}).get('simpleText', '0'),
                'photo': comment['authorThumbnail']['thumbnails'][-1]['url'],
                "authorIsChannelOwner": comment["authorIsChannelOwner"],
            }

        # queue continuation tokens (ctoken & itct) for the next comments
        continuation_tokens = [(next_cdata['continuation'], next_cdata['clickTrackingParams'])
                               for next_cdata in search_dict(comments_data, 'nextContinuationData')] + continuation_tokens

        # throttle a little to avoid heavy loads with popular videos
        time.sleep(0.1)
111+
112+
113+
114+
115+
116+
if __name__ == "__main__":
    import argparse
    from pprint import pprint

    parser = argparse.ArgumentParser(description="Simple YouTube Comment extractor")
    parser.add_argument("url", help="The YouTube video full URL")
    parser.add_argument("-l", "--limit", type=int, help="Number of maximum comments to extract, helpful for longer videos")
    parser.add_argument("-o", "--output", help="Output JSON file, e.g data.json")

    # parse passed arguments
    args = parser.parse_args()
    limit = args.limit
    output = args.output
    url = args.url

    total = 0        # number of comments actually extracted (fixes the
                     # off-by-one / unbound `count` of the old version)
    extracted = []   # buffered comments when writing to a file
    for count, comment in enumerate(get_comments(url)):
        if limit and count >= limit:
            # break out of the loop when we exceed the specified limit
            break
        total = count + 1
        if output:
            # buffer the comment; dumped as one JSON array below
            extracted.append(comment)
        else:
            pprint(comment)
            print("=" * 50)

    print("total comments extracted:", total)
    if output:
        # open in "w" (not "a") so reruns don't corrupt an existing file,
        # and dump a proper JSON array in one call instead of manually
        # patching commas and brackets afterwards
        with open(output, "w", encoding="utf-8") as f:
            json.dump(extracted, f, ensure_ascii=False)

0 commit comments

Comments
 (0)