Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[extractor/iprima] Fix extractor (relax nuxt function regex, add js_to_json hack) #7216

Merged
merged 11 commits into from
Sep 21, 2023
Merged
2 changes: 1 addition & 1 deletion yt_dlp/extractor/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -1672,7 +1672,7 @@ def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal
def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
"""Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
rectx = re.escape(context_name)
FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){return\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)'
FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){(?:.*?)return\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)'
pukkandan marked this conversation as resolved.
Show resolved Hide resolved
js, arg_keys, arg_vals = self._search_regex(
(rf'<script>\s*window\.{rectx}={FUNCTION_RE}\s*\)\s*;?\s*</script>', rf'{rectx}\(.*?{FUNCTION_RE}'),
webpage, context_name, group=('js', 'arg_keys', 'arg_vals'),
Expand Down
11 changes: 10 additions & 1 deletion yt_dlp/extractor/iprima.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,10 +134,19 @@ def _real_extract(self, url):
), webpage, 'real id', group='id', default=None)

if not video_id:
nuxt_data = self._search_nuxt_data(webpage, video_id, traverse='data')
nuxt_data = self._search_nuxt_data(webpage, video_id, traverse='data', fatal=False)
video_id = traverse_obj(
nuxt_data, (..., 'content', 'additionals', 'videoPlayId', {str}), get_all=False)

if not video_id:
nuxt_data = self._parse_json(
self._search_regex(
r'(?s)<script[^>]+\bid=["\']__NUXT_DATA__["\'][^>]+>(.+?)</script>',
webpage, 'nuxt data'),
'nuxt data')
std-move marked this conversation as resolved.
Show resolved Hide resolved

video_id = traverse_obj(nuxt_data, lambda _, v: re.fullmatch(r'p\d+', v), get_all=False)

if not video_id:
self.raise_no_formats('Unable to extract video ID from webpage')

Expand Down
8 changes: 8 additions & 0 deletions yt_dlp/utils/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3234,7 +3234,15 @@ def fix_kv(m):
def create_map(mobj):
    """Return the JSON text of a dict built from the matched ``new Map(...)`` argument."""
    # The argument group is optional in the pattern; fall back to an empty list of pairs
    pairs_source = mobj.group(1) or '[]'
    pairs = json.loads(js_to_json(pairs_source, vars=vars))
    return json.dumps(dict(pairs))

def create_array(mobj):
    """Return the matched text with the ``Array(...)`` arguments run through ``js_to_json``."""
    leading, arguments, trailing = mobj.group(1, 2, 3)
    # Wrap the call arguments in brackets so they are converted as a single list literal
    converted = js_to_json(f'[{arguments}]', vars=vars)
    return leading + converted + trailing

code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
while True:
old_code = code
code = re.sub(r'^(.*?)(?:new\s+)?Array\((.*?)\)(.*?)$', create_array, old_code)
if old_code == code:
break
std-move marked this conversation as resolved.
Show resolved Hide resolved
if not strict:
code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)
Expand Down