Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ie/youtube] Extract comments with or without new format #9775

Merged
merged 28 commits into from
May 17, 2024
Merged
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
ee81ca4
apply patch from issues/9358#issuecomment-2072600506
jakeogh Apr 23, 2024
16cb4fe
fix typo in previous patch, like count, and use direct dict access
jakeogh Apr 23, 2024
6083596
handle KeyError: 'frameworkUpdates' when the old comment format is se…
jakeogh Apr 23, 2024
2ef6563
fix old comment extraction
jakeogh Apr 23, 2024
4da1db9
fix like_count
jakeogh Apr 24, 2024
800906c
fix indent
jakeogh Apr 24, 2024
2763473
fix another indent
jakeogh Apr 24, 2024
17bb443
replace dict access with try_get()
jakeogh Apr 24, 2024
3ef6517
replace dict access with traverse_obj() and use likeCountA11y
jakeogh Apr 24, 2024
a1102d7
add commentRenderer fix from @minamotorin
jakeogh May 6, 2024
f4c1de1
apply patch from @minamotorin of example code by @shoxie007
jakeogh May 6, 2024
5a3a4f1
@bbilly1 correctly parse like_count for cases > 1000
jakeogh May 6, 2024
cf9751a
two whitespace fixes
jakeogh May 15, 2024
f09e66b
use traverse_obj with mutations variable
jakeogh May 15, 2024
1a0cf3c
fix test for None
jakeogh May 15, 2024
4701ad6
remove .get() call from content
jakeogh May 15, 2024
6f5c669
move continue block, filter() comment_keys for None
jakeogh May 15, 2024
f6ced29
use get_first() and remove .get()
jakeogh May 15, 2024
8d428b4
use single traversal
jakeogh May 15, 2024
0ef6c93
use traverse_obj for time_text
jakeogh May 15, 2024
1cee8e7
whitespace change
jakeogh May 15, 2024
743ed06
remove pinned_text var
jakeogh May 15, 2024
1adea35
add {bool}
jakeogh May 15, 2024
1872982
fix author_is_verified
jakeogh May 15, 2024
90b1129
use single traversal pattern
jakeogh May 15, 2024
54b823b
readability
pukkandan May 15, 2024
053cde7
Apply suggestions from code review
bashonly May 15, 2024
47a6fb7
use traversal and check for `comment_keys`
bashonly May 15, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 56 additions & 8 deletions yt_dlp/extractor/youtube.py
Original file line number Diff line number Diff line change
Expand Up @@ -3307,7 +3307,36 @@ def _extract_heatmap(self, data):
'value': ('intensityScoreNormalized', {float_or_none}),
})) or None

def _extract_comment(self, comment_renderer, parent=None):
def _extract_comment(self, entities, parent=None):
    """Build a comment info dict from new-format (entity payload) comment data.

    @param entities  Entity mutations belonging to one comment; the
                     `commentEntityPayload` and
                     `engagementToolbarStateEntityPayload` are looked up
                     among them with `get_first`.
    @param parent    ID of the parent comment; falsy values are reported
                     as 'root'.
    @returns         The comment dict, or None when the comment payload
                     carries no non-empty string `commentId`.
    """
    payload = get_first(entities, ('payload', 'commentEntityPayload', {dict}))
    comment_id = traverse_obj(payload, ('properties', 'commentId', {str}))
    if not comment_id:
        return None

    toolbar_state = get_first(entities, ('payload', 'engagementToolbarStateEntityPayload', {dict}))
    # Fall back to an empty string so the time text is always a str
    published_text = traverse_obj(payload, ('properties', 'publishedTime', {str})) or ''

    info = {
        'id': comment_id,
        'parent': parent or 'root',
    }
    info.update(traverse_obj(payload, {
        'text': ('properties', 'content', 'content', {str}),
        'like_count': ('toolbar', 'likeCountA11y', {parse_count}),
        'author_id': ('author', 'channelId', {self.ucid_or_none}),
        'author': ('author', 'displayName', {str}),
        'author_thumbnail': ('author', 'avatarThumbnailUrl', {url_or_none}),
        'author_is_uploader': ('author', 'isCreator', {bool}),
        'author_is_verified': ('author', 'isVerified', {bool}),
        # Two candidate locations for the channel URL; resolved against the site root
        'author_url': ('author', 'channelCommand', 'innertubeCommand', (
            ('browseEndpoint', 'canonicalBaseUrl'), ('commandMetadata', 'webCommandMetadata', 'url'),
        ), {lambda path: urljoin('https://www.youtube.com', path)}),
    }, get_all=False))

    # Heart state is unknown (None) when no toolbar state entity was found
    if toolbar_state is None:
        info['is_favorited'] = None
    else:
        info['is_favorited'] = toolbar_state.get('heartState') == 'TOOLBAR_HEART_STATE_HEARTED'

    # FIXME: non-standard, but we need a way of showing that it is an estimate.
    info['_time_text'] = published_text
    info['timestamp'] = self._parse_time_text(published_text)
    return info

def _extract_comment_old(self, comment_renderer, parent=None):
comment_id = comment_renderer.get('commentId')
if not comment_id:
return
Expand Down Expand Up @@ -3388,21 +3417,39 @@ def extract_header(contents):
break
return _continuation

def extract_thread(contents):
def extract_thread(contents, entity_payloads):
if not parent:
tracker['current_page_thread'] = 0
for content in contents:
if not parent and tracker['total_parent_comments'] >= max_parents:
yield
comment_thread_renderer = try_get(content, lambda x: x['commentThreadRenderer'])
comment_renderer = get_first(
(comment_thread_renderer, content), [['commentRenderer', ('comment', 'commentRenderer')]],
expected_type=dict, default={})

comment = self._extract_comment(comment_renderer, parent)
# old comment format
if not entity_payloads:
comment_renderer = get_first(
(comment_thread_renderer, content), [['commentRenderer', ('comment', 'commentRenderer')]],
expected_type=dict, default={})

comment = self._extract_comment_old(comment_renderer, parent)

# new comment format
else:
view_model = (
traverse_obj(comment_thread_renderer, ('commentViewModel', 'commentViewModel', {dict}))
or traverse_obj(content, ('commentViewModel', {dict})))
comment_keys = traverse_obj(view_model, (('commentKey', 'toolbarStateKey'), {str}))
if not comment_keys:
continue
entities = traverse_obj(entity_payloads, lambda _, v: v['entityKey'] in comment_keys)
comment = self._extract_comment(entities, parent)
if comment:
comment['is_pinned'] = traverse_obj(view_model, ('pinnedText', {str})) is not None

if not comment:
continue
comment_id = comment['id']

jakeogh marked this conversation as resolved.
Show resolved Hide resolved
if comment.get('is_pinned'):
tracker['pinned_comment_ids'].add(comment_id)
# Sometimes YouTube may break and give us infinite looping comments.
Expand Down Expand Up @@ -3495,7 +3542,7 @@ def extract_thread(contents):
check_get_keys = None
if not is_forced_continuation and not (tracker['est_total'] == 0 and tracker['running_total'] == 0):
check_get_keys = [[*continuation_items_path, ..., (
'commentsHeaderRenderer' if is_first_continuation else ('commentThreadRenderer', 'commentRenderer'))]]
'commentsHeaderRenderer' if is_first_continuation else ('commentThreadRenderer', 'commentViewModel', 'commentRenderer'))]]
try:
response = self._extract_response(
item_id=None, query=continuation,
Expand All @@ -3519,6 +3566,7 @@ def extract_thread(contents):
raise
is_forced_continuation = False
continuation = None
mutations = traverse_obj(response, ('frameworkUpdates', 'entityBatchUpdate', 'mutations', ..., {dict}))
for continuation_items in traverse_obj(response, continuation_items_path, expected_type=list, default=[]):
if is_first_continuation:
continuation = extract_header(continuation_items)
Expand All @@ -3527,7 +3575,7 @@ def extract_thread(contents):
break
continue

for entry in extract_thread(continuation_items):
for entry in extract_thread(continuation_items, mutations):
if not entry:
return
yield entry
Expand Down
Loading