From bdd0b75e3f41ff35440eda6d395008beef19ef2f Mon Sep 17 00:00:00 2001 From: GD-Slime <82302542+GD-Slime@users.noreply.github.com> Date: Sun, 9 Jul 2023 06:26:03 +0800 Subject: [PATCH 1/7] [ie/BiliBiliBangumi] Fix extractors (#7337) - Overhaul BiliBiliBangumi extractor for the site's new API - Add BiliBiliBangumiSeason extractor - Refactor BiliBiliBangumiMedia extractor Closes #6701, Closes #7400 Authored by: GD-Slime --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/bilibili.py | 129 +++++++++++++++++++++----------- 2 files changed, 85 insertions(+), 45 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index c0a330dbe54..1e7f165ab94 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -214,6 +214,7 @@ from .bilibili import ( BiliBiliIE, BiliBiliBangumiIE, + BiliBiliBangumiSeasonIE, BiliBiliBangumiMediaIE, BiliBiliSearchIE, BilibiliCategoryIE, diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index 6629fbc08c4..e8714a33ab9 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -18,6 +18,7 @@ float_or_none, format_field, int_or_none, + join_nonempty, make_archive_id, merge_dicts, mimetype2ext, @@ -135,6 +136,17 @@ def _get_all_children(self, reply): for children in map(self._get_all_children, traverse_obj(reply, ('replies', ...))): yield from children + def _get_episodes_from_season(self, ss_id, url): + season_info = self._download_json( + 'https://api.bilibili.com/pgc/web/season/section', ss_id, + note='Downloading season info', query={'season_id': ss_id}, + headers={'Referer': url, **self.geo_verification_headers()}) + + for entry in traverse_obj(season_info, ( + 'result', 'main_section', 'episodes', + lambda _, v: url_or_none(v['share_url']) and v['id'])): + yield self.url_result(entry['share_url'], BiliBiliBangumiIE, f'ep{entry["id"]}') + class BiliBiliIE(BilibiliBaseIE): _VALID_URL = r'https?://www\.bilibili\.com/(?:video/|festival/\w+\?(?:[^#]*&)?bvid=)[aAbB][vV](?P[^/?#&]+)' @@ -403,76 +415,93 @@ def _real_extract(self, url): class BiliBiliBangumiIE(BilibiliBaseIE): - _VALID_URL = r'(?x)https?://www\.bilibili\.com/bangumi/play/(?P(?:ss|ep)\d+)' + _VALID_URL = r'https?://(?:www\.)?bilibili\.com/bangumi/play/(?Pep\d+)' _TESTS = [{ - 'url': 'https://www.bilibili.com/bangumi/play/ss897', + 'url': 'https://www.bilibili.com/bangumi/play/ep267851', 'info_dict': { - 'id': 'ss897', + 'id': '267851', 'ext': 'mp4', - 'series': '神的记事本', - 'season': '神的记事本', - 'season_id': 897, + 'series': '鬼灭之刃', + 'series_id': '4358', + 'season': '鬼灭之刃', + 'season_id': '26801', 'season_number': 1, - 'episode': '你与旅行包', - 'episode_number': 2, - 'title': '神的记事本:第2话 你与旅行包', - 'duration': 1428.487, - 'timestamp': 1310809380, - 'upload_date': '20110716', - 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', + 'episode': '残酷', + 'episode_id': '267851', + 'episode_number': 1, + 'title': '1 残酷', + 'duration': 1425.256, + 'timestamp': 1554566400, + 'upload_date': '20190406', + 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$' }, - }, { - 'url': 'https://www.bilibili.com/bangumi/play/ep508406', - 'only_matching': True, + 'skip': 'According to the copyright owner\'s request, you may only watch the video after you are premium member.' }] def _real_extract(self, url): video_id = self._match_id(url) + episode_id = video_id[2:] webpage = self._download_webpage(url, video_id) if '您所在的地区无法观看本片' in webpage: raise GeoRestrictedError('This video is restricted') - elif ('开通大会员观看' in webpage and '__playinfo__' not in webpage - or '正在观看预览,大会员免费看全片' in webpage): + elif '正在观看预览,大会员免费看全片' in webpage: self.raise_login_required('This video is for premium members only') - play_info = self._search_json(r'window\.__playinfo__\s*=', webpage, 'play info', video_id)['data'] + headers = {'Referer': url, **self.geo_verification_headers()} + play_info = self._download_json( + 'https://api.bilibili.com/pgc/player/web/v2/playurl', video_id, + 'Extracting episode', query={'fnval': '4048', 'ep_id': episode_id}, + headers=headers) + premium_only = play_info.get('code') == -10403 + play_info = traverse_obj(play_info, ('result', 'video_info', {dict})) or {} + formats = self.extract_formats(play_info) - if (not formats and '成为大会员抢先看' in webpage - and play_info.get('durl') and not play_info.get('dash')): + if not formats and (premium_only or '成为大会员抢先看' in webpage or '开通大会员观看' in webpage): self.raise_login_required('This video is for premium members only') - initial_state = self._search_json(r'window\.__INITIAL_STATE__\s*=', webpage, 'initial state', video_id) + bangumi_info = self._download_json( + 'https://api.bilibili.com/pgc/view/web/season', video_id, 'Get episode details', + query={'ep_id': episode_id}, headers=headers)['result'] - season_id = traverse_obj(initial_state, ('mediaInfo', 'season_id')) + episode_number, episode_info = next(( + (idx, ep) for idx, ep in enumerate(traverse_obj( + bangumi_info, ('episodes', ..., {dict})), 1) + if str_or_none(ep.get('id')) == episode_id), (1, {})) + + season_id = bangumi_info.get('season_id') season_number = season_id and next(( idx + 1 for idx, e in enumerate( - traverse_obj(initial_state, ('mediaInfo', 'seasons', ...))) + traverse_obj(bangumi_info, ('seasons', ...))) if e.get('season_id') == season_id ), None) + aid = episode_info.get('aid') + return { 'id': video_id, 'formats': formats, - 'title': traverse_obj(initial_state, 'h1Title'), - 'episode': traverse_obj(initial_state, ('epInfo', 'long_title')), - 'episode_number': int_or_none(traverse_obj(initial_state, ('epInfo', 'title'))), - 'series': traverse_obj(initial_state, ('mediaInfo', 'series')), - 'season': traverse_obj(initial_state, ('mediaInfo', 'season_title')), - 'season_id': season_id, + **traverse_obj(bangumi_info, { + 'series': ('series', 'series_title', {str}), + 'series_id': ('series', 'series_id', {str_or_none}), + 'thumbnail': ('square_cover', {url_or_none}), + }), + 'title': join_nonempty('title', 'long_title', delim=' ', from_dict=episode_info), + 'episode': episode_info.get('long_title'), + 'episode_id': episode_id, + 'episode_number': int_or_none(episode_info.get('title')) or episode_number, + 'season_id': str_or_none(season_id), 'season_number': season_number, - 'thumbnail': traverse_obj(initial_state, ('epInfo', 'cover')), - 'timestamp': traverse_obj(initial_state, ('epInfo', 'pub_time')), + 'timestamp': int_or_none(episode_info.get('pub_time')), 'duration': float_or_none(play_info.get('timelength'), scale=1000), - 'subtitles': self.extract_subtitles( - video_id, initial_state, traverse_obj(initial_state, ('epInfo', 'cid'))), - '__post_extractor': self.extract_comments(traverse_obj(initial_state, ('epInfo', 'aid'))), - 'http_headers': {'Referer': url, **self.geo_verification_headers()}, + 'subtitles': self.extract_subtitles(video_id, aid, episode_info.get('cid')), + '__post_extractor': self.extract_comments(aid), + 'http_headers': headers, } -class BiliBiliBangumiMediaIE(InfoExtractor): +class BiliBiliBangumiMediaIE(BilibiliBaseIE): _VALID_URL = r'https?://www\.bilibili\.com/bangumi/media/md(?P\d+)' _TESTS = [{ 'url': 'https://www.bilibili.com/bangumi/media/md24097891', @@ -485,16 +514,26 @@ class BiliBiliBangumiMediaIE(InfoExtractor): def _real_extract(self, url): media_id = self._match_id(url) webpage = self._download_webpage(url, media_id) + ss_id = self._search_json( + r'window\.__INITIAL_STATE__\s*=', webpage, 'initial_state', media_id)['mediaInfo']['season_id'] + + return self.playlist_result(self._get_episodes_from_season(ss_id, url), media_id) + + +class BiliBiliBangumiSeasonIE(BilibiliBaseIE): + _VALID_URL = r'(?x)https?://www\.bilibili\.com/bangumi/play/ss(?P\d+)' + _TESTS = [{ + 'url': 'https://www.bilibili.com/bangumi/play/ss26801', + 'info_dict': { + 'id': '26801' + }, + 'playlist_mincount': 26 + }] - initial_state = self._search_json(r'window\.__INITIAL_STATE__\s*=', webpage, 'initial_state', media_id) - episode_list = self._download_json( - 'https://api.bilibili.com/pgc/web/season/section', media_id, - query={'season_id': initial_state['mediaInfo']['season_id']}, - note='Downloading season info')['result']['main_section']['episodes'] + def _real_extract(self, url): + ss_id = self._match_id(url) - return self.playlist_result(( - self.url_result(entry['share_url'], BiliBiliBangumiIE, entry['aid']) - for entry in episode_list), media_id) + return self.playlist_result(self._get_episodes_from_season(ss_id, url), ss_id) class BilibiliSpaceBaseIE(InfoExtractor): From 325191d0c9bf3fe257b8a7c2eb95080f44f6ddfc Mon Sep 17 00:00:00 2001 From: Zprokkel <105783800+Zprokkel@users.noreply.github.com> Date: Mon, 10 Jul 2023 15:15:47 +0200 Subject: [PATCH 2/7] [ie/vrt] Update token signing key (#7519) Authored by: Zprokkel --- yt_dlp/extractor/vrt.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/vrt.py b/yt_dlp/extractor/vrt.py index bacd3df29ad..00583571221 100644 --- a/yt_dlp/extractor/vrt.py +++ b/yt_dlp/extractor/vrt.py @@ -44,9 +44,11 @@ class VRTBaseIE(GigyaBaseIE): 'version': '2.7.4-prod-2023-04-19T06:05:45' } } - # From https://player.vrt.be/vrtnws/js/main.js & https://player.vrt.be/ketnet/js/main.fd1de01a40a1e3d842ea.js + # From https://player.vrt.be/vrtnws/js/main.js & https://player.vrt.be/ketnet/js/main.8cdb11341bcb79e4cd44.js _JWT_KEY_ID = '0-0Fp51UZykfaiCJrfTE3+oMI8zvDteYfPtR+2n1R+z8w=' - _JWT_SIGNING_KEY = '2a9251d782700769fb856da5725daf38661874ca6f80ae7dc2b05ec1a81a24ae' + _JWT_SIGNING_KEY = 'b5f500d55cb44715107249ccd8a5c0136cfb2788dbb71b90a4f142423bacaf38' # -dev + # player-stag.vrt.be key: d23987504521ae6fbf2716caca6700a24bb1579477b43c84e146b279de5ca595 + # player.vrt.be key: 2a9251d782700769fb856da5725daf38661874ca6f80ae7dc2b05ec1a81a24ae def _extract_formats_and_subtitles(self, data, video_id): if traverse_obj(data, 'drm'): From 2af4eeb77246b8183aae75a0a8d19f18c08115b2 Mon Sep 17 00:00:00 2001 From: Mahmoud Abdel-Fattah Date: Tue, 11 Jul 2023 05:00:38 +0400 Subject: [PATCH 3/7] [utils] `clean_podcast_url`: Handle more trackers (#7556) Authored by: mabdelfattah, bashonly Closes #7544 --- test/test_utils.py | 2 ++ yt_dlp/utils/_utils.py | 10 +++++++--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index a22f25d730d..bdbd2d87960 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -1835,6 +1835,8 @@ def test_iri_to_uri(self): def test_clean_podcast_url(self): self.assertEqual(clean_podcast_url('https://www.podtrac.com/pts/redirect.mp3/chtbl.com/track/5899E/traffic.megaphone.fm/HSW7835899191.mp3'), 'https://traffic.megaphone.fm/HSW7835899191.mp3') self.assertEqual(clean_podcast_url('https://play.podtrac.com/npr-344098539/edge1.pod.npr.org/anon.npr-podcasts/podcast/npr/waitwait/2020/10/20201003_waitwait_wwdtmpodcast201003-015621a5-f035-4eca-a9a1-7c118d90bc3c.mp3'), 'https://edge1.pod.npr.org/anon.npr-podcasts/podcast/npr/waitwait/2020/10/20201003_waitwait_wwdtmpodcast201003-015621a5-f035-4eca-a9a1-7c118d90bc3c.mp3') + self.assertEqual(clean_podcast_url('https://pdst.fm/e/2.gum.fm/chtbl.com/track/chrt.fm/track/34D33/pscrb.fm/rss/p/traffic.megaphone.fm/ITLLC7765286967.mp3?updated=1687282661'), 'https://traffic.megaphone.fm/ITLLC7765286967.mp3?updated=1687282661') + self.assertEqual(clean_podcast_url('https://pdst.fm/e/https://mgln.ai/e/441/www.buzzsprout.com/1121972/13019085-ep-252-the-deep-life-stack.mp3'), 'https://www.buzzsprout.com/1121972/13019085-ep-252-the-deep-life-stack.mp3') def test_LazyList(self): it = list(range(10)) diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index 82d9ba4d578..3023c33b24d 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -5123,14 +5123,18 @@ def clean_podcast_url(url): (?: chtbl\.com/track| media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/ - play\.podtrac\.com - )/[^/]+| + play\.podtrac\.com| + chrt\.fm/track| + mgln\.ai/e + )(?:/[^/.]+)?| (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure flex\.acast\.com| pd(?: cn\.co| # https://podcorn.com/analytics-prefix/ st\.fm # https://podsights.com/docs/ - )/e + )/e| + [0-9]\.gum\.fm| + pscrb\.fm/rss/p )/''', '', url) return re.sub(r'^\w+://(\w+://)', r'\1', url) From 2cfe221fbbe46faa3f46552c08d947a51f424903 Mon Sep 17 00:00:00 2001 From: Aleri Kaisattera <73682764+alerikaisattera@users.noreply.github.com> Date: Thu, 13 Jul 2023 20:17:05 +0600 Subject: [PATCH 4/7] [ie/streamanity] Remove (#7571) Service is dead Authored by: alerikaisattera --- yt_dlp/extractor/_extractors.py | 1 - yt_dlp/extractor/streamanity.py | 47 --------------------------------- 2 files changed, 48 deletions(-) delete mode 100644 yt_dlp/extractor/streamanity.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 1e7f165ab94..2af99b3dad7 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1871,7 +1871,6 @@ StoryFireSeriesIE, ) from .streamable import StreamableIE -from .streamanity import StreamanityIE from .streamcloud import StreamcloudIE from .streamcz import StreamCZIE from .streamff import StreamFFIE diff --git a/yt_dlp/extractor/streamanity.py b/yt_dlp/extractor/streamanity.py deleted file mode 100644 index 6eaee52d95e..00000000000 --- a/yt_dlp/extractor/streamanity.py +++ /dev/null @@ -1,47 +0,0 @@ -from .common import InfoExtractor - - -class StreamanityIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?streamanity\.com/video/(?P[A-Za-z0-9]+)' - _TESTS = [{ - 'url': 'https://streamanity.com/video/9DFPTnuYi8f2', - 'md5': '6ab171e8d4a02ad5dcbff6bea44cf5a1', - 'info_dict': { - 'id': '9DFPTnuYi8f2', - 'ext': 'mp4', - 'title': 'Bitcoin vs The Lighting Network', - 'thumbnail': r're:https://res\.cloudinary\.com/.+\.png', - 'description': '', - 'uploader': 'Tom Bombadil (Freddy78)', - } - }, { - 'url': 'https://streamanity.com/video/JktOUjSlfzTD', - 'md5': '31f131e28abd3377c38be586a59532dc', - 'info_dict': { - 'id': 'JktOUjSlfzTD', - 'ext': 'mp4', - 'title': 'Share data when you see it', - 'thumbnail': r're:https://res\.cloudinary\.com/.+\.png', - 'description': 'Reposting as data should be public and stored on blockchain', - 'uploader': 'digitalcurrencydaily', - } - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - video_info = self._download_json( - f'https://app.streamanity.com/api/video/{video_id}', video_id)['data']['video'] - - formats = self._extract_m3u8_formats( - f'https://stream.mux.com/{video_info["play_id"]}.m3u8?token={video_info["token"]}', - video_id, ext='mp4', m3u8_id='hls') - - return { - 'id': video_id, - 'title': video_info['title'], - 'description': video_info.get('description'), - 'uploader': video_info.get('author_name'), - 'is_live': False, - 'thumbnail': video_info.get('thumb'), - 'formats': formats, - } From 8a4cd12c8f8e93292e3e95200b9d17a3af39624c Mon Sep 17 00:00:00 2001 From: Neurognostic Date: Thu, 13 Jul 2023 16:39:21 -0400 Subject: [PATCH 5/7] [pp/EmbedThumbnail] Support `m4v` (#7583) Authored by: Neurognostic --- yt_dlp/postprocessor/embedthumbnail.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/yt_dlp/postprocessor/embedthumbnail.py b/yt_dlp/postprocessor/embedthumbnail.py index 88a767132ac..d7be0b398e4 100644 --- a/yt_dlp/postprocessor/embedthumbnail.py +++ b/yt_dlp/postprocessor/embedthumbnail.py @@ -114,7 +114,7 @@ def run(self, info): self._report_run('ffmpeg', filename) self.run_ffmpeg(filename, temp_filename, options) - elif info['ext'] in ['m4a', 'mp4', 'mov']: + elif info['ext'] in ['m4a', 'mp4', 'm4v', 'mov']: prefer_atomicparsley = 'embed-thumbnail-atomicparsley' in self.get_param('compat_opts', []) # Method 1: Use mutagen if not mutagen or prefer_atomicparsley: @@ -213,7 +213,7 @@ def run(self, info): temp_filename = filename else: - raise EmbedThumbnailPPError('Supported filetypes for thumbnail embedding are: mp3, mkv/mka, ogg/opus/flac, m4a/mp4/mov') + raise EmbedThumbnailPPError('Supported filetypes for thumbnail embedding are: mp3, mkv/mka, ogg/opus/flac, m4a/mp4/m4v/mov') if success and temp_filename != filename: os.replace(temp_filename, filename) From 1bcb9fe8715b1f288efc322be3de409ee0597080 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Finn=20R=2E=20G=C3=A4rtner?= <65015656+FinnRG@users.noreply.github.com> Date: Fri, 14 Jul 2023 20:09:02 +0200 Subject: [PATCH 6/7] [ie/piapro] Support `/content` URL (#7592) Authored by: FinnRG --- yt_dlp/extractor/piapro.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/piapro.py b/yt_dlp/extractor/piapro.py index d8d9c780109..eb5923d110a 100644 --- a/yt_dlp/extractor/piapro.py +++ b/yt_dlp/extractor/piapro.py @@ -12,17 +12,22 @@ class PiaproIE(InfoExtractor): _NETRC_MACHINE = 'piapro' - _VALID_URL = r'https?://piapro\.jp/t/(?P\w+)/?' + _VALID_URL = r'https?://piapro\.jp/(?:t|content)/(?P\w+)/?' _TESTS = [{ 'url': 'https://piapro.jp/t/NXYR', - 'md5': 'a9d52f27d13bafab7ee34116a7dcfa77', + 'md5': 'f7c0f760913fb1d44a1c45a4af793909', 'info_dict': { 'id': 'NXYR', 'ext': 'mp3', 'uploader': 'wowaka', 'uploader_id': 'wowaka', 'title': '裏表ラバーズ', - 'thumbnail': r're:^https?://.*\.jpg$', + 'description': 'http://www.nicovideo.jp/watch/sm8082467', + 'duration': 189.0, + 'timestamp': 1251785475, + 'thumbnail': r're:^https?://.*\.(?:png|jpg)$', + 'upload_date': '20090901', + 'view_count': int, } }, { 'note': 'There are break lines in description, mandating (?s) flag', @@ -34,8 +39,16 @@ class PiaproIE(InfoExtractor): 'title': '青に溶けた風船 / 初音ミク', 'description': 'md5:d395a9bd151447631a5a1460bc7f9132', 'uploader': 'シアン・キノ', + 'duration': 229.0, + 'timestamp': 1644030039, + 'upload_date': '20220205', + 'view_count': int, + 'thumbnail': r're:^https?://.*\.(?:png|jpg)$', 'uploader_id': 'cyankino', } + }, { + 'url': 'https://piapro.jp/content/hcw0z3a169wtemz6', + 'only_matching': True }] _login_status = False From 1ba6fe9db5f660d5538588315c23ad6cf0371c5f Mon Sep 17 00:00:00 2001 From: coletdjnz Date: Sat, 15 Jul 2023 15:20:24 +1200 Subject: [PATCH 7/7] [ie/youtube:tab] Detect looping feeds (#6621) Closes https://github.com/yt-dlp/yt-dlp/issues/5555 Note: the first page may still be repeated, however this is better than nothing. Authored by: coletdjnz --- yt_dlp/extractor/youtube.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 73bfa662d26..826bbb20e18 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -4921,10 +4921,15 @@ def _entries(self, tab, item_id, ytcfg, account_syncid, visitor_data): or try_get(tab_content, lambda x: x['richGridRenderer'], dict) or {}) yield from extract_entries(parent_renderer) continuation = continuation_list[0] - + seen_continuations = set() for page_num in itertools.count(1): if not continuation: break + continuation_token = continuation.get('continuation') + if continuation_token is not None and continuation_token in seen_continuations: + self.write_debug('Detected YouTube feed looping - assuming end of feed.') + break + seen_continuations.add(continuation_token) headers = self.generate_api_headers( ytcfg=ytcfg, account_syncid=account_syncid, visitor_data=visitor_data) response = self._extract_response(