From 8b7fb8b60da78b54a518246b251be3d1829fef38 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Mon, 3 Oct 2022 16:50:27 +0530 Subject: [PATCH] [extractor] Make search_json able to parse lists Now `contains_pattern` can be set to `\[.+\]` --- yt_dlp/extractor/common.py | 4 ++-- yt_dlp/extractor/dropbox.py | 2 +- yt_dlp/extractor/radiofrance.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 11e7158714..caec0ccf62 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -1227,7 +1227,7 @@ def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, f return None def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='', - contains_pattern='(?s:.+)', fatal=True, default=NO_DEFAULT, **kwargs): + contains_pattern=r'{(?s:.+)}', fatal=True, default=NO_DEFAULT, **kwargs): """Searches string for the JSON object specified by start_pattern""" # NB: end_pattern is only used to reduce the size of the initial match if default is NO_DEFAULT: @@ -1236,7 +1236,7 @@ def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='', fatal, has_default = False, True json_string = self._search_regex( - rf'(?:{start_pattern})\s*(?P<json>{{\s*(?:{contains_pattern})\s*}})\s*(?:{end_pattern})', + rf'(?:{start_pattern})\s*(?P<json>{contains_pattern})\s*(?:{end_pattern})', string, name, group='json', fatal=fatal, default=None if has_default else NO_DEFAULT) if not json_string: return default diff --git a/yt_dlp/extractor/dropbox.py b/yt_dlp/extractor/dropbox.py index 0d12513b29..54d97a25dc 100644 --- a/yt_dlp/extractor/dropbox.py +++ b/yt_dlp/extractor/dropbox.py @@ -54,7 +54,7 @@ def _real_extract(self, url): raise ExtractorError('Password protected video, use --video-password ', expected=True) info_json = self._search_json(r'InitReact\.mountComponent\(.*?,', webpage, 'mountComponent', video_id, - contains_pattern=r'.+?"preview".+?', 
end_pattern=r'\)')['props'] + contains_pattern=r'{.+?"preview".+?}', end_pattern=r'\)')['props'] transcode_url = traverse_obj(info_json, ((None, 'preview'), 'file', 'preview', 'content', 'transcode_url'), get_all=False) formats, subtitles = self._extract_m3u8_formats_and_subtitles(transcode_url, video_id) diff --git a/yt_dlp/extractor/radiofrance.py b/yt_dlp/extractor/radiofrance.py index 7b60b2617b..38420a15d6 100644 --- a/yt_dlp/extractor/radiofrance.py +++ b/yt_dlp/extractor/radiofrance.py @@ -84,7 +84,7 @@ def _real_extract(self, url): webpage = self._download_webpage(url, display_id) # _search_json_ld doesn't correctly handle this. See https://github.com/yt-dlp/yt-dlp/pull/3874#discussion_r891903846 - video_data = self._search_json('', webpage, 'audio data', display_id, contains_pattern=r'\s*"@type"\s*:\s*"AudioObject"\s*.+') + video_data = self._search_json('', webpage, 'audio data', display_id, contains_pattern=r'{\s*"@type"\s*:\s*"AudioObject".+}') return { 'id': video_id,