Skip to content

Commit

Permalink
[ie] Make _search_nextjs_data non fatal (#8937)
Browse files Browse the repository at this point in the history
Authored by: Grub4K
  • Loading branch information
Grub4K committed Apr 21, 2024
1 parent e3b42d8 commit 3ee1194
Show file tree
Hide file tree
Showing 5 changed files with 22 additions and 9 deletions.
9 changes: 9 additions & 0 deletions test/test_InfoExtractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -1906,6 +1906,15 @@ def test_response_with_expected_status_returns_content(self):
expected_status=TEAPOT_RESPONSE_STATUS)
self.assertEqual(content, TEAPOT_RESPONSE_BODY)

def test_search_nextjs_data(self):
    """Check `_search_nextjs_data` parsing, fallback values and the legacy default.

    Covers a page containing a `__NEXT_DATA__` script tag, pages without
    one (non-fatal and with explicit defaults), and the deprecated string
    default `'{}'`, which must warn and still yield an empty dict.
    """
    data = '<script id="__NEXT_DATA__" type="application/json">{"props":{}}</script>'
    self.assertEqual(self.ie._search_nextjs_data(data, None), {'props': {}})
    self.assertEqual(self.ie._search_nextjs_data('', None, fatal=False), {})
    self.assertEqual(self.ie._search_nextjs_data('', None, default=None), None)
    self.assertEqual(self.ie._search_nextjs_data('', None, default={}), {})
    # Use assertWarns rather than assertRaises: a DeprecationWarning only
    # becomes an exception when the active warning filters escalate it, so
    # assertRaises is environment dependent. assertWarns records the warning
    # regardless of filters and lets the call return, so the result is
    # actually verified as well.
    with self.assertWarns(DeprecationWarning):
        self.assertEqual(self.ie._search_nextjs_data('', None, default='{}'), {})


# Run the test suite through unittest when this file is executed as a script.
if __name__ == '__main__':
unittest.main()
2 changes: 1 addition & 1 deletion yt_dlp/extractor/asobistage.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ def _real_extract(self, url):
video_type = {'archive': 'archives', 'player': 'broadcasts'}[type_]
webpage = self._download_webpage(url, video_id)
event_data = traverse_obj(
self._search_nextjs_data(webpage, video_id, default='{}'),
self._search_nextjs_data(webpage, video_id, default={}),
('props', 'pageProps', 'eventCMSData', {
'title': ('event_name', {str}),
'thumbnail': ('event_thumbnail_image', {url_or_none}),
Expand Down
16 changes: 10 additions & 6 deletions yt_dlp/extractor/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -1738,12 +1738,16 @@ def traverse_json_ld(json_ld, at_top_level=True):
traverse_json_ld(json_ld)
return filter_dict(info)

def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
    """Find the `__NEXT_DATA__` script tag in `webpage` and parse its contents as JSON.

    The text between the opening `<script ... id="__NEXT_DATA__" ...>` tag
    and `</script>` is located with `_search_regex` and then handed to
    `_parse_json`. `fatal` is forwarded to both steps, `transform_source`
    to the parsing step, and any extra keyword arguments to the regex
    search.
    """
    next_data_pattern = r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>'
    raw_json = self._search_regex(
        next_data_pattern, webpage, 'next.js data', fatal=fatal, **kw)
    return self._parse_json(
        raw_json, video_id, transform_source=transform_source, fatal=fatal)
def _search_nextjs_data(self, webpage, video_id, *, fatal=True, default=NO_DEFAULT, **kw):
    """Locate the `__NEXT_DATA__` script tag in `webpage` and return its JSON payload.

    The search is delegated to `_search_json`, using the opening
    `<script ... id="__NEXT_DATA__" ...>` tag as the start pattern and
    `</script>` as the end pattern.

    @param fatal    Forwarded to `_search_json`; forced to False whenever
                    a `default` is supplied.
    @param default  Fallback value forwarded to `_search_json`. The legacy
                    string `'{}'` is still accepted, but triggers a
                    deprecation warning and is replaced by an empty dict.
    @param kw       Extra keyword arguments forwarded to `_search_json`.
    """
    if default == '{}':
        # Backwards compatibility for callers still passing the old string default
        self._downloader.deprecation_warning('using `default=\'{}\'` is deprecated, use `default={}` instead')
        default = {}

    start_pattern = r'<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>'
    return self._search_json(
        start_pattern, webpage, 'next.js data', video_id,
        end_pattern='</script>', default=default,
        # Supplying a default implies the lookup must not be fatal
        fatal=fatal if default is NO_DEFAULT else False, **kw)

def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
"""Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
Expand Down
2 changes: 1 addition & 1 deletion yt_dlp/extractor/stv.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def _real_extract(self, url):
ptype, video_id = self._match_valid_url(url).groups()

webpage = self._download_webpage(url, video_id, fatal=False) or ''
props = self._search_nextjs_data(webpage, video_id, default='{}').get('props') or {}
props = self._search_nextjs_data(webpage, video_id, default={}).get('props') or {}
player_api_cache = try_get(
props, lambda x: x['initialReduxState']['playerApiCache']) or {}

Expand Down
2 changes: 1 addition & 1 deletion yt_dlp/extractor/tiktok.py
Original file line number Diff line number Diff line change
Expand Up @@ -776,7 +776,7 @@ def _real_extract(self, url):
status = traverse_obj(sigi_data, ('VideoPage', 'statusCode', {int})) or 0
video_data = traverse_obj(sigi_data, ('ItemModule', video_id, {dict}))

elif next_data := self._search_nextjs_data(webpage, video_id, default='{}'):
elif next_data := self._search_nextjs_data(webpage, video_id, default={}):
self.write_debug('Found next.js data')
status = traverse_obj(next_data, ('props', 'pageProps', 'statusCode', {int})) or 0
video_data = traverse_obj(next_data, ('props', 'pageProps', 'itemInfo', 'itemStruct', {dict}))
Expand Down

2 comments on commit 3ee1194

@samoht0
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Causes a test regression for me:

=================================== FAILURES ===================================
__________________ TestInfoExtractor.test_search_nextjs_data ___________________

self = <test.test_InfoExtractor.TestInfoExtractor testMethod=test_search_nextjs_data>

    def test_search_nextjs_data(self):
        data = '<script id="__NEXT_DATA__" type="application/json">{"props":{}}</script>'
        self.assertEqual(self.ie._search_nextjs_data(data, None), {'props': {}})
        self.assertEqual(self.ie._search_nextjs_data('', None, fatal=False), {})
        self.assertEqual(self.ie._search_nextjs_data('', None, default=None), None)
        self.assertEqual(self.ie._search_nextjs_data('', None, default={}), {})
>       with self.assertRaises(DeprecationWarning):
E       AssertionError: DeprecationWarning not raised

test/test_InfoExtractor.py:1915: AssertionError
----------------------------- Captured stderr call -----------------------------
WARNING: [Dummy] unable to extract next.js data; please report this issue on  https://github.com/yt-dlp/yt-dlp/issues?q= , filling out the appropriate issue template. Confirm you are on the latest version using  yt-dlp -U

@bashonly
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@samoht0 please open a new issue, and include the output of python -m yt_dlp -v called from the same directory that you ran the tests

Please sign in to comment.