Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ie/taptap] Add extractors #9776

Merged
merged 10 commits into from
May 23, 2024
6 changes: 6 additions & 0 deletions yt_dlp/extractor/_extractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -1906,6 +1906,12 @@
from .syfy import SyfyIE
from .sztvhu import SztvHuIE
from .tagesschau import TagesschauIE
from .taptap import (
TapTapMomentIE,
TapTapAppIE,
TapTapAppIntlIE,
TapTapPostIntlIE,
)
from .tass import TassIE
from .tbs import TBSIE
from .tbsjp import (
Expand Down
275 changes: 275 additions & 0 deletions yt_dlp/extractor/taptap.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,275 @@
import re
import uuid

from .common import InfoExtractor
from ..utils import (
clean_html,
int_or_none,
join_nonempty,
str_or_none,
url_or_none,
)
from ..utils.traversal import traverse_obj


class TapTapBaseIE(InfoExtractor):
    """Shared extraction logic for the TapTap extractors.

    Subclasses configure the class attributes below; ``_real_extract`` then
    queries the detail API and returns a playlist with one entry per
    referenced video.
    """

    # Client descriptor sent as the ``X-UA`` query parameter of every API
    # request; ``{uuid}`` is filled with a freshly generated UUID per call.
    _X_UA = 'V=1&PN=WebApp&LANG=zh_CN&VN_CODE=102&LOC=CN&PLT=PC&DS=Android&UID={uuid}&OS=Windows&OSV=10&DT=PC'
    # Endpoint queried with ``video_ids`` to resolve a single video.
    _VIDEO_API = 'https://www.taptap.cn/webapiv2/video-resource/v1/multi-get'
    # Detail endpoint for the page being extracted (set by subclasses).
    _INFO_API = None
    # Name of the query parameter carrying the ID matched from the URL.
    _INFO_QUERY_KEY = 'id'
    # traverse_obj path applied to the detail response's ``data`` object.
    _DATA_PATH = None
    # traverse_obj path (relative to the result of _DATA_PATH) to the video IDs.
    _ID_PATH = None
    # traverse_obj path (relative to the result of _DATA_PATH) producing the
    # metadata that is applied to the playlist and to every entry.
    _META_PATH = None

    def _get_api(self, url, video_id, query, **kwargs):
        """Request *url* as JSON and return the ``data`` member of the response.

        *query* is extended with the ``X-UA`` parameter; extra keyword
        arguments are forwarded to ``_download_json``.
        """
        query = {**query, 'X-UA': self._X_UA.format(uuid=uuid.uuid4())}
        return self._download_json(url, video_id, query=query, **kwargs)['data']

    def _extract_video(self, video_id):
        """Return an info dict (id, formats, duration, thumbnail) for *video_id*."""
        video_data = self._get_api(self._VIDEO_API, video_id, query={'video_ids': video_id})['list'][0]

        # h265 playlist contains both h265 and h264 formats
        video_url = traverse_obj(video_data, ('play_url', ('url_h265', 'url'), {url_or_none}, any))
        formats = self._extract_m3u8_formats(video_url, video_id, fatal=False)
        for fmt in formats:
            # ``vcodec`` may be missing or None; either way it is not tagged as h265
            if re.search(r'^(hev|hvc|hvt)\d', fmt.get('vcodec') or ''):
                fmt['format_id'] = join_nonempty(fmt.get('format_id'), 'h265', delim='_')

        return {
            'id': str(video_id),
            'formats': formats,
            **traverse_obj(video_data, {
                'duration': ('info', 'duration', {int_or_none}),
                'thumbnail': ('thumbnail', ('original_url', 'url'), {url_or_none}),
            }, get_all=False),
        }

    def _real_extract(self, url):
        """Build a playlist of every video referenced by the page at *url*."""
        video_id = self._match_id(url)
        query = {self._INFO_QUERY_KEY: video_id}

        data = traverse_obj(
            self._get_api(self._INFO_API, video_id, query=query), self._DATA_PATH)

        metainfo = traverse_obj(data, self._META_PATH)
        # _ID_PATH branches over several locations, so an ID can show up more
        # than once. dict.fromkeys() drops the duplicates while keeping the
        # order in which the API listed the videos (a set would not).
        unique_ids = dict.fromkeys(traverse_obj(data, self._ID_PATH))
        entries = [{
            **metainfo,
            **self._extract_video(vid),
        } for vid in unique_ids]

        return self.playlist_result(entries, **metainfo, id=video_id)


class TapTapMomentIE(TapTapBaseIE):
    # Extractor for "moment" pages on www.taptap.cn
    _VALID_URL = r'https?://www\.taptap\.cn/moment/(?P<id>\d+)'
    _INFO_API = 'https://www.taptap.cn/webapiv2/moment/v3/detail'
    # Collect video IDs from both the topic's ``videos`` list and its ``pin_video``
    _ID_PATH = ('moment', 'topic', (('videos', ...), 'pin_video'), 'video_id')
    # All metadata is read from below the ``moment`` object
    _META_PATH = ('moment', {
        'timestamp': ('created_time', {int_or_none}),
        'modified_timestamp': ('edited_time', {int_or_none}),
        'uploader': ('author', 'user', 'name', {str}),
        'uploader_id': ('author', 'user', 'id', {int}, {str_or_none}),
        'title': ('topic', 'title', {str}),
        'description': ('topic', 'summary', {str}),
    })
    _TESTS = [{
        'url': 'https://www.taptap.cn/moment/194618230982052443',
        'info_dict': {
            'id': '194618230982052443',
            'title': '《崩坏3》开放世界「后崩坏书」新篇章 于淹没之地仰视辰星',
            'description': 'md5:cf66f7819d413641b8b28c8543f4ecda',
            'timestamp': 1633453402,
            'upload_date': '20211005',
            'modified_timestamp': 1633453402,
            'modified_date': '20211005',
            'uploader': '乌酱',
            'uploader_id': '532896',
        },
        'playlist_count': 1,
        'playlist': [{
            'info_dict': {
                'id': '2202584',
                'ext': 'mp4',
                'title': '《崩坏3》开放世界「后崩坏书」新篇章 于淹没之地仰视辰星',
                'description': 'md5:cf66f7819d413641b8b28c8543f4ecda',
                'duration': 66,
                'timestamp': 1633453402,
                'upload_date': '20211005',
                'modified_timestamp': 1633453402,
                'modified_date': '20211005',
                'uploader': '乌酱',
                'uploader_id': '532896',
                'thumbnail': r're:^https?://.*\.(png|jpg)',
            },
        }],
        'params': {'skip_download': 'm3u8'},
    }, {
        'url': 'https://www.taptap.cn/moment/521630629209573493',
        'info_dict': {
            'id': '521630629209573493',
            'title': '《崩坏:星穹铁道》黄泉角色PV——「你的颜色」',
            'description': 'md5:2c81245da864428c904d53ae4ad2182b',
            'timestamp': 1711425600,
            'upload_date': '20240326',
            'modified_timestamp': 1711425600,
            'modified_date': '20240326',
            'uploader': '崩坏:星穹铁道',
            'uploader_id': '414732580',
        },
        'playlist_count': 1,
        'playlist': [{
            'info_dict': {
                'id': '4006511',
                'ext': 'mp4',
                'title': '《崩坏:星穹铁道》黄泉角色PV——「你的颜色」',
                'description': 'md5:2c81245da864428c904d53ae4ad2182b',
                'duration': 173,
                'timestamp': 1711425600,
                'upload_date': '20240326',
                'modified_timestamp': 1711425600,
                'modified_date': '20240326',
                'uploader': '崩坏:星穹铁道',
                'uploader_id': '414732580',
                'thumbnail': r're:^https?://.*\.(png|jpg)',
            },
        }],
        'params': {'skip_download': 'm3u8'},
    }, {
        # Moment that references two videos
        'url': 'https://www.taptap.cn/moment/540493587511511299',
        'playlist_count': 2,
        'info_dict': {
            'id': '540493587511511299',
            'title': '中式民俗解谜《纸嫁衣7》、新系列《纸不语》公布!',
            'description': 'md5:d60842350e686ddb242291ddfb8e39c9',
            'timestamp': 1715920200,
            'upload_date': '20240517',
            'modified_timestamp': 1715942225,
            'modified_date': '20240517',
            'uploader': 'TapTap 编辑',
            'uploader_id': '7159244',
        },
        'params': {'skip_download': 'm3u8'},
    }]


class TapTapAppIE(TapTapBaseIE):
    # Extractor for app pages on www.taptap.cn
    _VALID_URL = r'https?://www\.taptap\.cn/app/(?P<id>\d+)'
    _INFO_API = 'https://www.taptap.cn/webapiv2/app/v4/detail'
    # Video IDs are gathered from every item of ``app_videos`` and ``videos``
    _ID_PATH = (('app_videos', 'videos'), ..., 'video_id')
    _META_PATH = {
        'title': ('title', {str}),
        # The description text is passed through clean_html
        'description': ('description', 'text', {str}, {clean_html}),
    }
    _TESTS = [{
        'url': 'https://www.taptap.cn/app/168332',
        'info_dict': {
            'id': '168332',
            'title': '原神',
            'description': 'md5:e345f39a5fea5de2a46923f70d5f76ab',
        },
        'playlist_count': 2,
        'playlist': [{
            'info_dict': {
                'id': '4058443',
                'ext': 'mp4',
                'title': '原神',
                'description': 'md5:e345f39a5fea5de2a46923f70d5f76ab',
                'duration': 26,
                'thumbnail': r're:^https?://.*\.(png|jpg)',
            },
        }, {
            'info_dict': {
                'id': '4058462',
                'ext': 'mp4',
                'title': '原神',
                'description': 'md5:e345f39a5fea5de2a46923f70d5f76ab',
                'duration': 295,
                'thumbnail': r're:^https?://.*\.(png|jpg)',
            },
        }],
        'params': {'skip_download': 'm3u8'},
    }]


class TapTapIntlBase(TapTapBaseIE):
    # Base for the www.taptap.io extractors: overrides the X-UA descriptor
    # and points the video lookup at the taptap.io endpoint.
    _X_UA = 'V=1&PN=WebAppIntl2&LANG=zh_TW&VN_CODE=115&VN=0.1.0&LOC=CN&PLT=PC&DS=Android&UID={uuid}&CURR=&DT=PC&OS=Windows&OSV=NT%208.0.0'
    _VIDEO_API = 'https://www.taptap.io/webapiv2/video-resource/v1/multi-get'


class TapTapAppIntlIE(TapTapIntlBase):
    # Extractor for app pages on www.taptap.io
    _VALID_URL = r'https?://www\.taptap\.io/app/(?P<id>\d+)'
    _INFO_API = 'https://www.taptap.io/webapiv2/i/app/v5/detail'
    # The ID and metadata paths below are resolved relative to ``app``
    _DATA_PATH = 'app'
    _ID_PATH = (('app_videos', 'videos'), ..., 'video_id')
    _META_PATH = {
        'title': ('title', {str}),
        # The description text is passed through clean_html
        'description': ('description', 'text', {str}, {clean_html}),
    }
    _TESTS = [{
        'url': 'https://www.taptap.io/app/233287',
        'info_dict': {
            'id': '233287',
            'title': '《虹彩六號 M》',
            'description': 'md5:418285f9c15347fc3cf3e3a3c649f182',
        },
        'playlist_count': 1,
        'playlist': [{
            'info_dict': {
                'id': '2149708997',
                'ext': 'mp4',
                'title': '《虹彩六號 M》',
                'description': 'md5:418285f9c15347fc3cf3e3a3c649f182',
                'duration': 78,
                'thumbnail': r're:^https?://.*\.(png|jpg)',
            },
        }],
        'params': {'skip_download': 'm3u8'},
    }]

c-basalt marked this conversation as resolved.
Show resolved Hide resolved

class TapTapPostIntlIE(TapTapIntlBase):
    # Extractor for post pages on www.taptap.io
    _VALID_URL = r'https?://www\.taptap\.io/post/(?P<id>\d+)'
    _INFO_API = 'https://www.taptap.io/webapiv2/creation/post/v1/detail'
    # This endpoint takes the post ID as ``id_str`` rather than ``id``
    _INFO_QUERY_KEY = 'id_str'
    # The ID and metadata paths below are resolved relative to ``post``
    _DATA_PATH = 'post'
    # Collect video IDs from both the ``videos`` list and ``pin_video``
    _ID_PATH = ((('videos', ...), 'pin_video'), 'video_id')
    _META_PATH = {
        'timestamp': ('published_time', {int_or_none}),
        'modified_timestamp': ('edited_time', {int_or_none}),
        'uploader': ('user', 'name', {str}),
        'uploader_id': ('user', 'id', {int}, {str_or_none}),
        'title': ('title', {str}),
        'description': ('list_fields', 'summary', {str}),
    }
    _TESTS = [{
        'url': 'https://www.taptap.io/post/571785',
        'info_dict': {
            'id': '571785',
            'title': 'Arknights x Rainbow Six Siege | Event PV',
            'description': 'md5:f7717c13f6d3108e22db7303e6690bf7',
            'timestamp': 1614664951,
            'upload_date': '20210302',
            'modified_timestamp': 1614664951,
            'modified_date': '20210302',
            'uploader': 'TapTap Editor',
            'uploader_id': '80224473',
        },
        'playlist_count': 1,
        'playlist': [{
            'info_dict': {
                'id': '2149491903',
                'ext': 'mp4',
                'title': 'Arknights x Rainbow Six Siege | Event PV',
                'description': 'md5:f7717c13f6d3108e22db7303e6690bf7',
                'duration': 122,
                'timestamp': 1614664951,
                'upload_date': '20210302',
                'modified_timestamp': 1614664951,
                'modified_date': '20210302',
                'uploader': 'TapTap Editor',
                'uploader_id': '80224473',
                'thumbnail': r're:^https?://.*\.(png|jpg)',
            },
        }],
        'params': {'skip_download': 'm3u8'},
    }]