[ie] Make _search_nextjs_data non fatal (#8937)

Authored by: Grub4K
yt-dlp · Apr 21, 2024 · 3ee1194 · 3ee1194 · samoht0 · Apr 22, 2024
1 parent e3b42d8
commit 3ee1194
Show file tree

Hide file tree

Showing 5 changed files with 22 additions and 9 deletions.
diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py
@@ -1906,6 +1906,15 @@ def test_response_with_expected_status_returns_content(self):
             expected_status=TEAPOT_RESPONSE_STATUS)
         self.assertEqual(content, TEAPOT_RESPONSE_BODY)
 
+    def test_search_nextjs_data(self):
+        data = '<script id="__NEXT_DATA__" type="application/json">{"props":{}}</script>'
+        self.assertEqual(self.ie._search_nextjs_data(data, None), {'props': {}})
+        self.assertEqual(self.ie._search_nextjs_data('', None, fatal=False), {})
+        self.assertEqual(self.ie._search_nextjs_data('', None, default=None), None)
+        self.assertEqual(self.ie._search_nextjs_data('', None, default={}), {})
+        with self.assertRaises(DeprecationWarning):
+            self.assertEqual(self.ie._search_nextjs_data('', None, default='{}'), {})
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/yt_dlp/extractor/asobistage.py b/yt_dlp/extractor/asobistage.py
@@ -105,7 +105,7 @@ def _real_extract(self, url):
         video_type = {'archive': 'archives', 'player': 'broadcasts'}[type_]
         webpage = self._download_webpage(url, video_id)
         event_data = traverse_obj(
-            self._search_nextjs_data(webpage, video_id, default='{}'),
+            self._search_nextjs_data(webpage, video_id, default={}),
             ('props', 'pageProps', 'eventCMSData', {
                 'title': ('event_name', {str}),
                 'thumbnail': ('event_thumbnail_image', {url_or_none}),

diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py
@@ -1738,12 +1738,16 @@ def traverse_json_ld(json_ld, at_top_level=True):
         traverse_json_ld(json_ld)
         return filter_dict(info)
 
-    def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
-        return self._parse_json(
-            self._search_regex(
-                r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
-                webpage, 'next.js data', fatal=fatal, **kw),
-            video_id, transform_source=transform_source, fatal=fatal)
+    def _search_nextjs_data(self, webpage, video_id, *, fatal=True, default=NO_DEFAULT, **kw):
+        if default == '{}':
+            self._downloader.deprecation_warning('using `default=\'{}\'` is deprecated, use `default={}` instead')
+            default = {}
+        if default is not NO_DEFAULT:
+            fatal = False
+
+        return self._search_json(
+            r'<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>', webpage, 'next.js data',
+            video_id, end_pattern='</script>', fatal=fatal, default=default, **kw)
 
     def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
         """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""

diff --git a/yt_dlp/extractor/stv.py b/yt_dlp/extractor/stv.py
@@ -41,7 +41,7 @@ def _real_extract(self, url):
         ptype, video_id = self._match_valid_url(url).groups()
 
         webpage = self._download_webpage(url, video_id, fatal=False) or ''
-        props = self._search_nextjs_data(webpage, video_id, default='{}').get('props') or {}
+        props = self._search_nextjs_data(webpage, video_id, default={}).get('props') or {}
         player_api_cache = try_get(
             props, lambda x: x['initialReduxState']['playerApiCache']) or {}
 

diff --git a/yt_dlp/extractor/tiktok.py b/yt_dlp/extractor/tiktok.py
@@ -776,7 +776,7 @@ def _real_extract(self, url):
             status = traverse_obj(sigi_data, ('VideoPage', 'statusCode', {int})) or 0
             video_data = traverse_obj(sigi_data, ('ItemModule', video_id, {dict}))
 
-        elif next_data := self._search_nextjs_data(webpage, video_id, default='{}'):
+        elif next_data := self._search_nextjs_data(webpage, video_id, default={}):
             self.write_debug('Found next.js data')
             status = traverse_obj(next_data, ('props', 'pageProps', 'statusCode', {int})) or 0
             video_data = traverse_obj(next_data, ('props', 'pageProps', 'itemInfo', 'itemStruct', {dict}))