From a47fc807bdca899173142453b8ae0d96d836fb18 Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Wed, 24 Apr 2024 03:30:16 -0400 Subject: [PATCH 01/10] taptap extractor --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/taptap.py | 148 ++++++++++++++++++++++++++++++++ 2 files changed, 149 insertions(+) create mode 100644 yt_dlp/extractor/taptap.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 42034275b9de..b1828ef29bca 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1906,6 +1906,7 @@ from .syfy import SyfyIE from .sztvhu import SztvHuIE from .tagesschau import TagesschauIE +from .taptap import TapTapIE from .tass import TassIE from .tbs import TBSIE from .tbsjp import ( diff --git a/yt_dlp/extractor/taptap.py b/yt_dlp/extractor/taptap.py new file mode 100644 index 000000000000..beba65a83cfc --- /dev/null +++ b/yt_dlp/extractor/taptap.py @@ -0,0 +1,148 @@ +from .common import InfoExtractor +from ..utils import ( + clean_html, + int_or_none, + parse_qs, + str_or_none, + traverse_obj, + url_or_none, +) + + +class TapTapIE(InfoExtractor): + _VALID_URL = r'https?://www\.taptap\.cn/(?P
moment|app)/(?P\d+)' + _TESTS = [{ + 'url': 'https://www.taptap.cn/moment/194618230982052443', + 'info_dict': { + 'id': 'moment_194618230982052443', + "title": "《崩坏3》开放世界「后崩坏书」新篇章 于淹没之地仰视辰星", + "description": "md5:cf66f7819d413641b8b28c8543f4ecda", + "timestamp": 1633453402, + "upload_date": "20211005", + "uploader": "乌酱", + }, + 'playlist_count': 1, + 'playlist': [{ + 'info_dict': { + 'id': '2202584', + "ext": "mp4", + "title": "《崩坏3》开放世界「后崩坏书」新篇章 于淹没之地仰视辰星", + "description": "md5:cf66f7819d413641b8b28c8543f4ecda", + "duration": 66, + "timestamp": 1633453402, + "upload_date": "20211005", + "uploader": "乌酱", + 'thumbnail': r're:^https?://.*\.(png|jpg)', + } + }] + }, { + 'url': 'https://www.taptap.cn/moment/521630629209573493', + 'info_dict': { + 'id': 'moment_521630629209573493', + "title": "《崩坏:星穹铁道》黄泉角色PV——「你的颜色」", + "description": "md5:2c81245da864428c904d53ae4ad2182b", + "timestamp": 1711425600, + "upload_date": "20240326", + "uploader": "崩坏:星穹铁道", + }, + 'playlist_count': 1, + 'playlist': [{ + 'info_dict': { + 'id': '4006511', + "ext": "mp4", + "title": "《崩坏:星穹铁道》黄泉角色PV——「你的颜色」", + "description": "md5:2c81245da864428c904d53ae4ad2182b", + "duration": 173, + "timestamp": 1711425600, + "upload_date": "20240326", + "uploader": "崩坏:星穹铁道", + 'thumbnail': r're:^https?://.*\.(png|jpg)', + } + }] + }, { + 'url': 'https://www.taptap.cn/app/168332', + 'info_dict': { + 'id': 'app_168332', + "title": "原神", + "description": "md5:e345f39a5fea5de2a46923f70d5f76ab", + }, + 'playlist_count': 2, + 'playlist': [{ + 'info_dict': { + 'id': '4058443', + "ext": "mp4", + "title": "原神", + "description": "md5:e345f39a5fea5de2a46923f70d5f76ab", + "duration": 26, + 'thumbnail': r're:^https?://.*\.(png|jpg)', + } + }, { + 'info_dict': { + 'id': '4058462', + "ext": "mp4", + "title": "原神", + "description": "md5:e345f39a5fea5de2a46923f70d5f76ab", + "duration": 295, + 'thumbnail': r're:^https?://.*\.(png|jpg)', + } + }] + + }] + + def _deserialize_nuxt_data(self, serialized_nuxt): + for row in serialized_nuxt: + if isinstance(row, dict): + for key, value_or_ref in row.items(): + if isinstance(value_or_ref, int): + row[key] = serialized_nuxt[value_or_ref] + elif isinstance(row, list): + for index, value_or_ref in tuple(enumerate(row)): + if isinstance(value_or_ref, int): + row[index] = serialized_nuxt[value_or_ref] + return serialized_nuxt[0] + + def _extract_video(self, video_id, x_ua): + data = self._download_json( + 'https://www.taptap.cn/webapiv2/video-resource/v1/multi-get', video_id, + query={'video_ids': video_id, 'X-UA': x_ua}) + + video = traverse_obj(data, ('data', 'list', 0, { + 'id': ('video_id', {str_or_none}), + 'url': ('play_url', ('url', 'url_h265'), {url_or_none}), + 'duration': ('info', 'duration', {int_or_none}), + 'thumbnail': ('thumbnail', ('original_url', 'url'), {url_or_none}), + }), get_all=False) + if '.m3u8' in video['url']: + video['formats'] = self._extract_m3u8_formats(video.pop('url'), video_id) + return video + + def _real_extract(self, url): + section, list_id = self._match_valid_url(url).groups() + list_id = f'{section}_{list_id}' + + webpage = self._download_webpage(url, list_id) + nuxt_data = self._deserialize_nuxt_data(self._search_json( + r']+\bid=["\']__NUXT_DATA__["\'][^>]*>', webpage, + 'nuxt data', list_id, contains_pattern=r'\[(?s:.+)\]'))[1] + x_ua = traverse_obj(nuxt_data, ( + 'state', '$sbff', ..., {lambda x: parse_qs(x)['X-UA']}, ...), get_all=False) + + if section == 'moment': + moment_data = traverse_obj(nuxt_data, ('data', ..., 'moment'), get_all=False) + video_ids = traverse_obj(moment_data, ('topic', (('videos', ...), 'pin_video'), 'video_id')) + metainfo = traverse_obj(moment_data, { + 'timestamp': ('created_time', {int_or_none}), + 'uploader': ('author', 'user', 'name', {str}), + 'title': ('topic', 'title', {str}), + 'description': ('topic', 'summary', {str}), + }) + elif section == 'app': + video_ids = traverse_obj(nuxt_data, ('data', ..., ('app_videos', 'videos'), ..., 'video_id')) + metainfo = traverse_obj(nuxt_data, ('data', ..., { + 'title': ('title', {str}), + 'description': ('description', 'text', {str}, {clean_html}), + }), get_all=False) + + entries = [self._extract_video(video_id, x_ua) for video_id in set(video_ids)] + + return self.playlist_result([{**metainfo, **e} for e in entries], **metainfo, id=list_id) From 83e2e407902446dd2647b3bdb7c0834a04ee31e7 Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Fri, 10 May 2024 19:27:03 -0400 Subject: [PATCH 02/10] change to single quote --- yt_dlp/extractor/taptap.py | 68 +++++++++++++++++++------------------- 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/yt_dlp/extractor/taptap.py b/yt_dlp/extractor/taptap.py index beba65a83cfc..a36fe6bca150 100644 --- a/yt_dlp/extractor/taptap.py +++ b/yt_dlp/extractor/taptap.py @@ -15,23 +15,23 @@ class TapTapIE(InfoExtractor): 'url': 'https://www.taptap.cn/moment/194618230982052443', 'info_dict': { 'id': 'moment_194618230982052443', - "title": "《崩坏3》开放世界「后崩坏书」新篇章 于淹没之地仰视辰星", - "description": "md5:cf66f7819d413641b8b28c8543f4ecda", - "timestamp": 1633453402, - "upload_date": "20211005", - "uploader": "乌酱", + 'title': '《崩坏3》开放世界「后崩坏书」新篇章 于淹没之地仰视辰星', + 'description': 'md5:cf66f7819d413641b8b28c8543f4ecda', + 'timestamp': 1633453402, + 'upload_date': '20211005', + 'uploader': '乌酱', }, 'playlist_count': 1, 'playlist': [{ 'info_dict': { 'id': '2202584', - "ext": "mp4", - "title": "《崩坏3》开放世界「后崩坏书」新篇章 于淹没之地仰视辰星", - "description": "md5:cf66f7819d413641b8b28c8543f4ecda", - "duration": 66, - "timestamp": 1633453402, - "upload_date": "20211005", - "uploader": "乌酱", + 'ext': 'mp4', + 'title': '《崩坏3》开放世界「后崩坏书」新篇章 于淹没之地仰视辰星', + 'description': 'md5:cf66f7819d413641b8b28c8543f4ecda', + 'duration': 66, + 'timestamp': 1633453402, + 'upload_date': '20211005', + 'uploader': '乌酱', 'thumbnail': r're:^https?://.*\.(png|jpg)', } }] @@ -39,23 +39,23 @@ class TapTapIE(InfoExtractor): 'url': 'https://www.taptap.cn/moment/521630629209573493', 'info_dict': { 'id': 'moment_521630629209573493', - "title": "《崩坏:星穹铁道》黄泉角色PV——「你的颜色」", - "description": "md5:2c81245da864428c904d53ae4ad2182b", - "timestamp": 1711425600, - "upload_date": "20240326", - "uploader": "崩坏:星穹铁道", + 'title': '《崩坏:星穹铁道》黄泉角色PV——「你的颜色」', + 'description': 'md5:2c81245da864428c904d53ae4ad2182b', + 'timestamp': 1711425600, + 'upload_date': '20240326', + 'uploader': '崩坏:星穹铁道', }, 'playlist_count': 1, 'playlist': [{ 'info_dict': { 'id': '4006511', - "ext": "mp4", - "title": "《崩坏:星穹铁道》黄泉角色PV——「你的颜色」", - "description": "md5:2c81245da864428c904d53ae4ad2182b", - "duration": 173, - "timestamp": 1711425600, - "upload_date": "20240326", - "uploader": "崩坏:星穹铁道", + 'ext': 'mp4', + 'title': '《崩坏:星穹铁道》黄泉角色PV——「你的颜色」', + 'description': 'md5:2c81245da864428c904d53ae4ad2182b', + 'duration': 173, + 'timestamp': 1711425600, + 'upload_date': '20240326', + 'uploader': '崩坏:星穹铁道', 'thumbnail': r're:^https?://.*\.(png|jpg)', } }] @@ -63,26 +63,26 @@ class TapTapIE(InfoExtractor): 'url': 'https://www.taptap.cn/app/168332', 'info_dict': { 'id': 'app_168332', - "title": "原神", - "description": "md5:e345f39a5fea5de2a46923f70d5f76ab", + 'title': '原神', + 'description': 'md5:e345f39a5fea5de2a46923f70d5f76ab', }, 'playlist_count': 2, 'playlist': [{ 'info_dict': { 'id': '4058443', - "ext": "mp4", - "title": "原神", - "description": "md5:e345f39a5fea5de2a46923f70d5f76ab", - "duration": 26, + 'ext': 'mp4', + 'title': '原神', + 'description': 'md5:e345f39a5fea5de2a46923f70d5f76ab', + 'duration': 26, 'thumbnail': r're:^https?://.*\.(png|jpg)', } }, { 'info_dict': { 'id': '4058462', - "ext": "mp4", - "title": "原神", - "description": "md5:e345f39a5fea5de2a46923f70d5f76ab", - "duration": 295, + 'ext': 'mp4', + 'title': '原神', + 'description': 'md5:e345f39a5fea5de2a46923f70d5f76ab', + 'duration': 295, 'thumbnail': r're:^https?://.*\.(png|jpg)', } }] From 5b35ca73332a477a95b37ae30ab19c938e3c5ca8 Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Fri, 10 May 2024 23:18:27 -0400 Subject: [PATCH 03/10] split extractors --- yt_dlp/extractor/_extractors.py | 7 +- yt_dlp/extractor/taptap.py | 187 +++++++++++++++++++++----------- 2 files changed, 130 insertions(+), 64 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index b1828ef29bca..610ddb8f6f6e 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1906,7 +1906,12 @@ from .syfy import SyfyIE from .sztvhu import SztvHuIE from .tagesschau import TagesschauIE -from .taptap import TapTapIE +from .taptap import ( + TapTapMomentIE, + TapTapAppIE, + TapTapAppIntlIE, + TapTapPostIntlIE, +) from .tass import TassIE from .tbs import TBSIE from .tbsjp import ( diff --git a/yt_dlp/extractor/taptap.py b/yt_dlp/extractor/taptap.py index a36fe6bca150..29bafeae07cb 100644 --- a/yt_dlp/extractor/taptap.py +++ b/yt_dlp/extractor/taptap.py @@ -1,20 +1,84 @@ +import random +import re + from .common import InfoExtractor from ..utils import ( clean_html, int_or_none, - parse_qs, - str_or_none, + join_nonempty, traverse_obj, url_or_none, ) -class TapTapIE(InfoExtractor): - _VALID_URL = r'https?://www\.taptap\.cn/(?P
moment|app)/(?P\d+)' +class TapTapBaseIE(InfoExtractor): + _X_UA = 'V=1&PN=WebApp&LANG=zh_CN&VN_CODE=102&LOC=CN&PLT=PC&DS=Android&UID={uuid}&OS=Windows&OSV=10&DT=PC' + _VIDEO_API = 'https://www.taptap.cn/webapiv2/video-resource/v1/multi-get' + _INFO_API = None + _INFO_QUERY_KEY = 'id' + _DATA_PATH = ('data') + _ID_PATH = None + _META_PATH = None + + def _get_api(self, url, video_id, query, **kwargs): + rand_hex = lambda digits: ''.join(f'{random.randint(0, 15):x}' for _ in range(digits)) + uuid = '-'.join(rand_hex(digits) for digits in [8, 4, 4, 4, 12]) + query = {**query, 'X-UA': self._X_UA.format(uuid=uuid)} + return self._download_json(url, video_id, query=query, **kwargs) + + def _extract_video(self, video_id, is_intl=False): + video_data = self._get_api(self._VIDEO_API, video_id, query={'video_ids': video_id})['data']['list'][0] + + video_url = traverse_obj(video_data, ('play_url', ('url_h265', 'url'), {url_or_none}))[0] + formats = self._extract_m3u8_formats(video_url, video_id) + for format in formats: + if re.search(r'^(hev|hvc|hvt)\d', format.get('vcodec', '')): + format['format_id'] = join_nonempty(format.get('format_id'), 'h265', delim='_') + + return { + 'id': str(video_id), + 'formats': formats, + **traverse_obj(video_data, ({ + 'duration': ('info', 'duration', {int_or_none}), + 'thumbnail': ('thumbnail', ('original_url', 'url'), {url_or_none}), + }), get_all=False) + } + + def _extract_entries(self, video_ids, metainfo, list_id): + entries = [{**metainfo, **self._extract_video(id)} for id in set(video_ids)] + return self.playlist_result(entries, **metainfo, id=list_id) + + def _real_extract(self, url): + video_id = self._match_id(url) + query = {self._INFO_QUERY_KEY: video_id} + + data = traverse_obj( + self._get_api(self._INFO_API, video_id, query=query), self._DATA_PATH) + + video_ids = traverse_obj(data, self._ID_PATH) + metainfo = traverse_obj(data, self._META_PATH) + return self._extract_entries(video_ids, metainfo, video_id) + + +class TapTapIntlBase(TapTapBaseIE): + _X_UA = 'V=1&PN=WebAppIntl2&LANG=zh_TW&VN_CODE=115&VN=0.1.0&LOC=CN&PLT=PC&DS=Android&UID={uuid}&CURR=&DT=PC&OS=Windows&OSV=NT%208.0.0' + _VIDEO_API = 'https://www.taptap.io/webapiv2/video-resource/v1/multi-get' + + +class TapTapMomentIE(TapTapBaseIE): + _VALID_URL = r'https?://www\.taptap\.cn/moment/(?P\d+)' + _INFO_API = 'https://www.taptap.cn/webapiv2/moment/v3/detail' + _ID_PATH = ('moment', 'topic', (('videos', ...), 'pin_video'), 'video_id') + _META_PATH = ('moment', { + 'timestamp': ('created_time', {int_or_none}), + 'uploader': ('author', 'user', 'name', {str}), + 'title': ('topic', 'title', {str}), + 'description': ('topic', 'summary', {str}), + }) _TESTS = [{ 'url': 'https://www.taptap.cn/moment/194618230982052443', 'info_dict': { - 'id': 'moment_194618230982052443', + 'id': '194618230982052443', 'title': '《崩坏3》开放世界「后崩坏书」新篇章 于淹没之地仰视辰星', 'description': 'md5:cf66f7819d413641b8b28c8543f4ecda', 'timestamp': 1633453402, @@ -38,7 +102,7 @@ class TapTapIE(InfoExtractor): }, { 'url': 'https://www.taptap.cn/moment/521630629209573493', 'info_dict': { - 'id': 'moment_521630629209573493', + 'id': '521630629209573493', 'title': '《崩坏:星穹铁道》黄泉角色PV——「你的颜色」', 'description': 'md5:2c81245da864428c904d53ae4ad2182b', 'timestamp': 1711425600, @@ -59,10 +123,21 @@ class TapTapIE(InfoExtractor): 'thumbnail': r're:^https?://.*\.(png|jpg)', } }] - }, { + }] + + +class TapTapAppIE(TapTapBaseIE): + _VALID_URL = r'https?://www\.taptap\.cn/app/(?P\d+)' + _INFO_API = 'https://www.taptap.cn/webapiv2/app/v4/detail' + _ID_PATH = (('app_videos', 'videos'), ..., 'video_id') + _META_PATH = { + 'title': ('title', {str}), + 'description': ('description', 'text', {str}, {clean_html}), + } + _TESTS = [{ 'url': 'https://www.taptap.cn/app/168332', 'info_dict': { - 'id': 'app_168332', + 'id': '168332', 'title': '原神', 'description': 'md5:e345f39a5fea5de2a46923f70d5f76ab', }, @@ -86,63 +161,49 @@ class TapTapIE(InfoExtractor): 'thumbnail': r're:^https?://.*\.(png|jpg)', } }] - }] - def _deserialize_nuxt_data(self, serialized_nuxt): - for row in serialized_nuxt: - if isinstance(row, dict): - for key, value_or_ref in row.items(): - if isinstance(value_or_ref, int): - row[key] = serialized_nuxt[value_or_ref] - elif isinstance(row, list): - for index, value_or_ref in tuple(enumerate(row)): - if isinstance(value_or_ref, int): - row[index] = serialized_nuxt[value_or_ref] - return serialized_nuxt[0] - - def _extract_video(self, video_id, x_ua): - data = self._download_json( - 'https://www.taptap.cn/webapiv2/video-resource/v1/multi-get', video_id, - query={'video_ids': video_id, 'X-UA': x_ua}) - - video = traverse_obj(data, ('data', 'list', 0, { - 'id': ('video_id', {str_or_none}), - 'url': ('play_url', ('url', 'url_h265'), {url_or_none}), - 'duration': ('info', 'duration', {int_or_none}), - 'thumbnail': ('thumbnail', ('original_url', 'url'), {url_or_none}), - }), get_all=False) - if '.m3u8' in video['url']: - video['formats'] = self._extract_m3u8_formats(video.pop('url'), video_id) - return video - def _real_extract(self, url): - section, list_id = self._match_valid_url(url).groups() - list_id = f'{section}_{list_id}' - - webpage = self._download_webpage(url, list_id) - nuxt_data = self._deserialize_nuxt_data(self._search_json( - r']+\bid=["\']__NUXT_DATA__["\'][^>]*>', webpage, - 'nuxt data', list_id, contains_pattern=r'\[(?s:.+)\]'))[1] - x_ua = traverse_obj(nuxt_data, ( - 'state', '$sbff', ..., {lambda x: parse_qs(x)['X-UA']}, ...), get_all=False) - - if section == 'moment': - moment_data = traverse_obj(nuxt_data, ('data', ..., 'moment'), get_all=False) - video_ids = traverse_obj(moment_data, ('topic', (('videos', ...), 'pin_video'), 'video_id')) - metainfo = traverse_obj(moment_data, { - 'timestamp': ('created_time', {int_or_none}), - 'uploader': ('author', 'user', 'name', {str}), - 'title': ('topic', 'title', {str}), - 'description': ('topic', 'summary', {str}), - }) - elif section == 'app': - video_ids = traverse_obj(nuxt_data, ('data', ..., ('app_videos', 'videos'), ..., 'video_id')) - metainfo = traverse_obj(nuxt_data, ('data', ..., { - 'title': ('title', {str}), - 'description': ('description', 'text', {str}, {clean_html}), - }), get_all=False) +class TapTapAppIntlIE(TapTapAppIE, TapTapIntlBase): + _VALID_URL = r'https?://www\.taptap\.io/app/(?P\d+)' + _INFO_API = 'https://www.taptap.io/webapiv2/i/app/v5/detail' + _DATA_PATH = ('data', 'app') - entries = [self._extract_video(video_id, x_ua) for video_id in set(video_ids)] - return self.playlist_result([{**metainfo, **e} for e in entries], **metainfo, id=list_id) +class TapTapPostIntlIE(TapTapAppIntlIE): + _VALID_URL = r'https?://www\.taptap\.io/post/(?P\d+)' + _INFO_API = 'https://www.taptap.io/webapiv2/creation/post/v1/detail' + _INFO_QUERY_KEY = 'id_str' + _DATA_PATH = ('data', 'post') + _ID_PATH = ((('videos', ...), 'pin_video'), 'video_id') + _META_PATH = { + 'timestamp': ('published_time', {int_or_none}), + 'uploader': ('user', 'name', {str}), + 'title': ('title', {str}), + 'description': ('list_fields', 'summary', {str}), + } + _TESTS = [{ + 'url': 'https://www.taptap.io/post/571785', + 'info_dict': { + 'id': '571785', + 'title': 'Arknights x Rainbow Six Siege | Event PV', + 'description': 'md5:f7717c13f6d3108e22db7303e6690bf7', + 'timestamp': 1614664951, + 'upload_date': '20210302', + 'uploader': 'TapTap Editor', + }, + 'playlist_count': 1, + 'playlist': [{ + 'info_dict': { + 'id': '2149491903', + 'ext': 'mp4', + 'title': 'Arknights x Rainbow Six Siege | Event PV', + 'description': 'md5:f7717c13f6d3108e22db7303e6690bf7', + 'duration': 122, + 'timestamp': 1614664951, + 'upload_date': '20210302', + 'uploader': 'TapTap Editor', + 'thumbnail': r're:^https?://.*\.(png|jpg)', + } + }] + }] From 847b1df127aa81d6e4c0ff6b49a4f160b3d68fc7 Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Wed, 22 May 2024 01:50:41 -0400 Subject: [PATCH 04/10] Update yt_dlp/extractor/taptap.py Co-authored-by: sepro <4618135+seproDev@users.noreply.github.com> --- yt_dlp/extractor/taptap.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/yt_dlp/extractor/taptap.py b/yt_dlp/extractor/taptap.py index 29bafeae07cb..01aba6cb6599 100644 --- a/yt_dlp/extractor/taptap.py +++ b/yt_dlp/extractor/taptap.py @@ -21,10 +21,8 @@ class TapTapBaseIE(InfoExtractor): _META_PATH = None def _get_api(self, url, video_id, query, **kwargs): - rand_hex = lambda digits: ''.join(f'{random.randint(0, 15):x}' for _ in range(digits)) - uuid = '-'.join(rand_hex(digits) for digits in [8, 4, 4, 4, 12]) - query = {**query, 'X-UA': self._X_UA.format(uuid=uuid)} - return self._download_json(url, video_id, query=query, **kwargs) + query = {**query, 'X-UA': self._X_UA.format(uuid=uuid.uuid4())} + return self._download_json(url, video_id, query=query, **kwargs)['data'] def _extract_video(self, video_id, is_intl=False): video_data = self._get_api(self._VIDEO_API, video_id, query={'video_ids': video_id})['data']['list'][0] From bf460863b20edab9505ce2a146ffdda6869698aa Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Wed, 22 May 2024 01:52:43 -0400 Subject: [PATCH 05/10] Apply suggestions from code review Co-authored-by: sepro <4618135+seproDev@users.noreply.github.com> --- yt_dlp/extractor/taptap.py | 88 +++++++++++++++++++++++++++++++++----- 1 file changed, 77 insertions(+), 11 deletions(-) diff --git a/yt_dlp/extractor/taptap.py b/yt_dlp/extractor/taptap.py index 01aba6cb6599..63e51c82a205 100644 --- a/yt_dlp/extractor/taptap.py +++ b/yt_dlp/extractor/taptap.py @@ -16,7 +16,7 @@ class TapTapBaseIE(InfoExtractor): _VIDEO_API = 'https://www.taptap.cn/webapiv2/video-resource/v1/multi-get' _INFO_API = None _INFO_QUERY_KEY = 'id' - _DATA_PATH = ('data') + _DATA_PATH = None _ID_PATH = None _META_PATH = None @@ -24,8 +24,8 @@ def _get_api(self, url, video_id, query, **kwargs): query = {**query, 'X-UA': self._X_UA.format(uuid=uuid.uuid4())} return self._download_json(url, video_id, query=query, **kwargs)['data'] - def _extract_video(self, video_id, is_intl=False): - video_data = self._get_api(self._VIDEO_API, video_id, query={'video_ids': video_id})['data']['list'][0] + def _extract_video(self, video_id): + video_data = self._get_api(self._VIDEO_API, video_id, query={'video_ids': video_id})['list'][0] video_url = traverse_obj(video_data, ('play_url', ('url_h265', 'url'), {url_or_none}))[0] formats = self._extract_m3u8_formats(video_url, video_id) @@ -69,7 +69,9 @@ class TapTapMomentIE(TapTapBaseIE): _ID_PATH = ('moment', 'topic', (('videos', ...), 'pin_video'), 'video_id') _META_PATH = ('moment', { 'timestamp': ('created_time', {int_or_none}), + 'modified_timestamp': ('edited_time', {int_or_none}), 'uploader': ('author', 'user', 'name', {str}), + 'uploader_id': ('author', 'user', 'id', {int}, {str_or_none}), 'title': ('topic', 'title', {str}), 'description': ('topic', 'summary', {str}), }) @@ -81,7 +83,10 @@ class TapTapMomentIE(TapTapBaseIE): 'description': 'md5:cf66f7819d413641b8b28c8543f4ecda', 'timestamp': 1633453402, 'upload_date': '20211005', + 'modified_timestamp': 1633453402, + 'modified_date': '20211005', 'uploader': '乌酱', + 'uploader_id': '532896', }, 'playlist_count': 1, 'playlist': [{ @@ -93,10 +98,14 @@ class TapTapMomentIE(TapTapBaseIE): 'duration': 66, 'timestamp': 1633453402, 'upload_date': '20211005', + 'modified_timestamp': 1633453402, + 'modified_date': '20211005', 'uploader': '乌酱', + 'uploader_id': '532896', 'thumbnail': r're:^https?://.*\.(png|jpg)', } - }] + }], + 'params': {'skip_download': 'm3u8'}, }, { 'url': 'https://www.taptap.cn/moment/521630629209573493', 'info_dict': { @@ -105,7 +114,10 @@ class TapTapMomentIE(TapTapBaseIE): 'description': 'md5:2c81245da864428c904d53ae4ad2182b', 'timestamp': 1711425600, 'upload_date': '20240326', + 'modified_timestamp': 1711425600, + 'modified_date': '20240326', 'uploader': '崩坏:星穹铁道', + 'uploader_id': '414732580', }, 'playlist_count': 1, 'playlist': [{ @@ -117,10 +129,29 @@ class TapTapMomentIE(TapTapBaseIE): 'duration': 173, 'timestamp': 1711425600, 'upload_date': '20240326', + 'modified_timestamp': 1711425600, + 'modified_date': '20240326', 'uploader': '崩坏:星穹铁道', + 'uploader_id': '414732580', 'thumbnail': r're:^https?://.*\.(png|jpg)', } - }] + }], + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.taptap.cn/moment/540493587511511299', + 'playlist_count': 2, + 'info_dict': { + 'id': '540493587511511299', + 'title': '中式民俗解谜《纸嫁衣7》、新系列《纸不语》公布!', + 'description': 'md5:d60842350e686ddb242291ddfb8e39c9', + 'timestamp': 1715920200, + 'upload_date': '20240517', + 'modified_timestamp': 1715942225, + 'modified_date': '20240517', + 'uploader': 'TapTap 编辑', + 'uploader_id': '7159244', + }, + 'params': {'skip_download': 'm3u8'}, }] @@ -158,25 +189,53 @@ class TapTapAppIE(TapTapBaseIE): 'duration': 295, 'thumbnail': r're:^https?://.*\.(png|jpg)', } - }] + }], + 'params': {'skip_download': 'm3u8'}, }] -class TapTapAppIntlIE(TapTapAppIE, TapTapIntlBase): +class TapTapAppIntlIE(TapTapIntlBase): _VALID_URL = r'https?://www\.taptap\.io/app/(?P\d+)' _INFO_API = 'https://www.taptap.io/webapiv2/i/app/v5/detail' - _DATA_PATH = ('data', 'app') + _DATA_PATH = 'app' + _ID_PATH = (('app_videos', 'videos'), ..., 'video_id') + _META_PATH = { + 'title': ('title', {str}), + 'description': ('description', 'text', {str}, {clean_html}), + } + _TESTS = [{ + 'url': 'https://www.taptap.io/app/233287', + 'info_dict': { + 'id': '233287', + 'title': '《虹彩六號 M》', + 'description': 'md5:418285f9c15347fc3cf3e3a3c649f182', + }, + 'playlist_count': 1, + 'playlist': [{ + 'info_dict': { + 'id': '2149708997', + 'ext': 'mp4', + 'title': '《虹彩六號 M》', + 'description': 'md5:418285f9c15347fc3cf3e3a3c649f182', + 'duration': 78, + 'thumbnail': r're:^https?://.*\.(png|jpg)', + } + }], + 'params': {'skip_download': 'm3u8'}, + }] -class TapTapPostIntlIE(TapTapAppIntlIE): +class TapTapPostIntlIE(TapTapIntlBase): _VALID_URL = r'https?://www\.taptap\.io/post/(?P\d+)' _INFO_API = 'https://www.taptap.io/webapiv2/creation/post/v1/detail' _INFO_QUERY_KEY = 'id_str' - _DATA_PATH = ('data', 'post') + _DATA_PATH = 'post' _ID_PATH = ((('videos', ...), 'pin_video'), 'video_id') _META_PATH = { 'timestamp': ('published_time', {int_or_none}), + 'modified_timestamp': ('edited_time', {int_or_none}), 'uploader': ('user', 'name', {str}), + 'uploader_id': ('user', 'id', {int}, {str_or_none}), 'title': ('title', {str}), 'description': ('list_fields', 'summary', {str}), } @@ -188,7 +247,10 @@ class TapTapPostIntlIE(TapTapAppIntlIE): 'description': 'md5:f7717c13f6d3108e22db7303e6690bf7', 'timestamp': 1614664951, 'upload_date': '20210302', + 'modified_timestamp': 1614664951, + 'modified_date': '20210302', 'uploader': 'TapTap Editor', + 'uploader_id': '80224473', }, 'playlist_count': 1, 'playlist': [{ @@ -200,8 +262,12 @@ class TapTapPostIntlIE(TapTapAppIntlIE): 'duration': 122, 'timestamp': 1614664951, 'upload_date': '20210302', + 'modified_timestamp': 1614664951, + 'modified_date': '20210302', 'uploader': 'TapTap Editor', + 'uploader_id': '80224473', 'thumbnail': r're:^https?://.*\.(png|jpg)', } - }] + }], + 'params': {'skip_download': 'm3u8'}, }] From fd0ccd294ae955f05dfa3e146a4267eeb979c1ed Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Wed, 22 May 2024 01:53:18 -0400 Subject: [PATCH 06/10] Apply suggestions from code review Co-authored-by: sepro <4618135+seproDev@users.noreply.github.com> --- yt_dlp/extractor/taptap.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/yt_dlp/extractor/taptap.py b/yt_dlp/extractor/taptap.py index 63e51c82a205..d3911273ede4 100644 --- a/yt_dlp/extractor/taptap.py +++ b/yt_dlp/extractor/taptap.py @@ -27,6 +27,7 @@ def _get_api(self, url, video_id, query, **kwargs): def _extract_video(self, video_id): video_data = self._get_api(self._VIDEO_API, video_id, query={'video_ids': video_id})['list'][0] + # h265 playlist contains both h265 and h264 formats video_url = traverse_obj(video_data, ('play_url', ('url_h265', 'url'), {url_or_none}))[0] formats = self._extract_m3u8_formats(video_url, video_id) for format in formats: @@ -42,10 +43,6 @@ def _extract_video(self, video_id): }), get_all=False) } - def _extract_entries(self, video_ids, metainfo, list_id): - entries = [{**metainfo, **self._extract_video(id)} for id in set(video_ids)] - return self.playlist_result(entries, **metainfo, id=list_id) - def _real_extract(self, url): video_id = self._match_id(url) query = {self._INFO_QUERY_KEY: video_id} @@ -53,9 +50,13 @@ def _real_extract(self, url): data = traverse_obj( self._get_api(self._INFO_API, video_id, query=query), self._DATA_PATH) - video_ids = traverse_obj(data, self._ID_PATH) metainfo = traverse_obj(data, self._META_PATH) - return self._extract_entries(video_ids, metainfo, video_id) + entries = [{ + **metainfo, + **self._extract_video(id) + } for id in set(traverse_obj(data, self._ID_PATH))] + + return self.playlist_result(entries, **metainfo, id=video_id) class TapTapIntlBase(TapTapBaseIE): From c28475eddc90df23bb5715dc7ee5b94aa26b26c1 Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Wed, 22 May 2024 01:55:19 -0400 Subject: [PATCH 07/10] Update yt_dlp/extractor/taptap.py Co-authored-by: sepro <4618135+seproDev@users.noreply.github.com> --- yt_dlp/extractor/taptap.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/taptap.py b/yt_dlp/extractor/taptap.py index d3911273ede4..d567636e71bc 100644 --- a/yt_dlp/extractor/taptap.py +++ b/yt_dlp/extractor/taptap.py @@ -1,14 +1,15 @@ -import random import re +import uuid from .common import InfoExtractor from ..utils import ( clean_html, int_or_none, join_nonempty, - traverse_obj, + str_or_none, url_or_none, ) +from ..utils.traversal import traverse_obj class TapTapBaseIE(InfoExtractor): From 546c144ed723ecd41a0c4ea4c030fbf537c91574 Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Thu, 23 May 2024 01:45:38 -0400 Subject: [PATCH 08/10] moving base class --- yt_dlp/extractor/taptap.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/yt_dlp/extractor/taptap.py b/yt_dlp/extractor/taptap.py index d567636e71bc..c9b8458e391d 100644 --- a/yt_dlp/extractor/taptap.py +++ b/yt_dlp/extractor/taptap.py @@ -60,11 +60,6 @@ def _real_extract(self, url): return self.playlist_result(entries, **metainfo, id=video_id) -class TapTapIntlBase(TapTapBaseIE): - _X_UA = 'V=1&PN=WebAppIntl2&LANG=zh_TW&VN_CODE=115&VN=0.1.0&LOC=CN&PLT=PC&DS=Android&UID={uuid}&CURR=&DT=PC&OS=Windows&OSV=NT%208.0.0' - _VIDEO_API = 'https://www.taptap.io/webapiv2/video-resource/v1/multi-get' - - class TapTapMomentIE(TapTapBaseIE): _VALID_URL = r'https?://www\.taptap\.cn/moment/(?P\d+)' _INFO_API = 'https://www.taptap.cn/webapiv2/moment/v3/detail' @@ -196,6 +191,11 @@ class TapTapAppIE(TapTapBaseIE): }] +class TapTapIntlBase(TapTapBaseIE): + _X_UA = 'V=1&PN=WebAppIntl2&LANG=zh_TW&VN_CODE=115&VN=0.1.0&LOC=CN&PLT=PC&DS=Android&UID={uuid}&CURR=&DT=PC&OS=Windows&OSV=NT%208.0.0' + _VIDEO_API = 'https://www.taptap.io/webapiv2/video-resource/v1/multi-get' + + class TapTapAppIntlIE(TapTapIntlBase): _VALID_URL = r'https?://www\.taptap\.io/app/(?P\d+)' _INFO_API = 'https://www.taptap.io/webapiv2/i/app/v5/detail' From 147d7a328670513e5be9e98a72255d9fab561cba Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Thu, 23 May 2024 01:52:49 -0400 Subject: [PATCH 09/10] Update yt_dlp/extractor/taptap.py Co-authored-by: Simon Sawicki --- yt_dlp/extractor/taptap.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/taptap.py b/yt_dlp/extractor/taptap.py index c9b8458e391d..b88807388c89 100644 --- a/yt_dlp/extractor/taptap.py +++ b/yt_dlp/extractor/taptap.py @@ -29,7 +29,7 @@ def _extract_video(self, video_id): video_data = self._get_api(self._VIDEO_API, video_id, query={'video_ids': video_id})['list'][0] # h265 playlist contains both h265 and h264 formats - video_url = traverse_obj(video_data, ('play_url', ('url_h265', 'url'), {url_or_none}))[0] + video_url = traverse_obj(video_data, ('play_url', ('url_h265', 'url'), {url_or_none}, any)) formats = self._extract_m3u8_formats(video_url, video_id) for format in formats: if re.search(r'^(hev|hvc|hvt)\d', format.get('vcodec', '')): From 31bb05cd5e65938b58ff4418f2a447eef50accae Mon Sep 17 00:00:00 2001 From: sepro <4618135+seproDev@users.noreply.github.com> Date: Thu, 23 May 2024 20:10:07 +0200 Subject: [PATCH 10/10] Make m3u8 extraction non fatal Co-authored-by: bashonly <88596187+bashonly@users.noreply.github.com> --- yt_dlp/extractor/taptap.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/taptap.py b/yt_dlp/extractor/taptap.py index b88807388c89..56f2f0ef4b6c 100644 --- a/yt_dlp/extractor/taptap.py +++ b/yt_dlp/extractor/taptap.py @@ -30,7 +30,7 @@ def _extract_video(self, video_id): # h265 playlist contains both h265 and h264 formats video_url = traverse_obj(video_data, ('play_url', ('url_h265', 'url'), {url_or_none}, any)) - formats = self._extract_m3u8_formats(video_url, video_id) + formats = self._extract_m3u8_formats(video_url, video_id, fatal=False) for format in formats: if re.search(r'^(hev|hvc|hvt)\d', format.get('vcodec', '')): format['format_id'] = join_nonempty(format.get('format_id'), 'h265', delim='_')