Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[extractor/douyutv] fix broken DouyuTV and DouyuShow #7652

Merged
merged 29 commits into from
Sep 21, 2023
Merged
Changes from 16 commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
5db628a
port WhiteMinds/DouYuRecorder for #7295
c-basalt Jul 18, 2023
d180f98
format_id to str
c-basalt Jul 18, 2023
ac5d068
detect replays
c-basalt Jul 18, 2023
42d3bde
extract all formats available
c-basalt Jul 18, 2023
9900be1
improve log consistency
c-basalt Jul 19, 2023
f3da783
rewrite DouyuShow
c-basalt Jul 21, 2023
4d06427
rewrite DouyuShow
c-basalt Jul 21, 2023
0e6f535
clean up old code
c-basalt Jul 21, 2023
bf4f367
Merge branch 'douyu' of github-cb:c-basalt/yt-dlp into douyu
c-basalt Jul 21, 2023
0bd9507
fixes and improvement
c-basalt Jul 21, 2023
b710536
improve js_sign_func extraction
c-basalt Jul 23, 2023
729571f
cache cryptojs and add more cdn urls
c-basalt Jul 23, 2023
ef33903
minor edit
c-basalt Jul 23, 2023
788bdfb
use disk cache for js lib
c-basalt Jul 23, 2023
ffb4114
clean up
c-basalt Jul 23, 2023
0d2681e
improve regex
c-basalt Aug 2, 2023
feaab13
extract common to base ie
c-basalt Aug 2, 2023
de6e7db
fix
c-basalt Aug 2, 2023
057d0e6
flake8
c-basalt Aug 2, 2023
e239988
Apply suggestions for sign functions
c-basalt Sep 19, 2023
16f0161
Apply suggestions for regex
c-basalt Sep 19, 2023
ca8dd68
Apply suggestions for traversal
c-basalt Sep 19, 2023
6eaaa7f
sort imports
c-basalt Sep 19, 2023
0fd095b
flake8
c-basalt Sep 19, 2023
71d51ee
changing ext to mp4
c-basalt Sep 19, 2023
ca9e174
making metadata non-fatal
c-basalt Sep 19, 2023
2d5718a
Update yt_dlp/extractor/douyutv.py
c-basalt Sep 19, 2023
836e057
Update yt_dlp/extractor/douyutv.py
c-basalt Sep 19, 2023
0d831d4
Update yt_dlp/extractor/douyutv.py
c-basalt Sep 21, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
271 changes: 182 additions & 89 deletions yt_dlp/extractor/douyutv.py
Original file line number Diff line number Diff line change
@@ -1,31 +1,65 @@
import time
import hashlib
import re
import urllib
import uuid

from .common import InfoExtractor
from .openload import PhantomJSwrapper
from ..utils import (
ExtractorError,
UserNotLive,
int_or_none,
str_or_none,
url_or_none,
traverse_obj,
determine_ext,
parse_resolution,
urlencode_postdata,
unescapeHTML,
unified_strdate,
js_to_json,
urljoin,
)
c-basalt marked this conversation as resolved.
Show resolved Hide resolved


class DouyuTVIE(InfoExtractor):
IE_DESC = '斗鱼'
class DouyuBaseIE(InfoExtractor):
def _download_cryptojs_md5(self, video_id):
for url in [
'https://cdnjs.cloudflare.com/ajax/libs/crypto-js/3.1.2/rollups/md5.js',
'https://cdn.bootcdn.net/ajax/libs/crypto-js/3.1.2/rollups/md5.js',
]:
js_code = self._download_webpage(
url, video_id, note='Downloading signing dependency', fatal=False)
if js_code:
self.cache.store('douyu', 'crypto-js-md5', js_code)
return js_code
raise ExtractorError('Unable to download JS dependency (crypto-js/md5)')

def _get_cryptojs_md5(self, video_id):
return self.cache.load('douyu', 'crypto-js-md5') or self._download_cryptojs_md5(video_id)

def _calc_sign(self, sign_func, a, b, c, video_id):
js_script = f'{self._get_cryptojs_md5(video_id)};{sign_func};console.log(ub98484234("{a}","{b}","{c}"))'
phantom = PhantomJSwrapper(self)
result = phantom.execute(js_script, video_id,
note='Executing JS signing script').strip()
return {i: v[0] for i, v in urllib.parse.parse_qs(result).items()}
bashonly marked this conversation as resolved.
Show resolved Hide resolved


class DouyuTVIE(DouyuBaseIE):
IE_DESC = '斗鱼直播'
_VALID_URL = r'https?://(?:www\.)?douyu(?:tv)?\.com/(topic/\w+\?rid=|(?:[^/]+/))*(?P<id>[A-Za-z0-9]+)'
_TESTS = [{
'url': 'http://www.douyutv.com/iseven',
'url': 'https://www.douyu.com/pigff',
'info_dict': {
'id': '17732',
'display_id': 'iseven',
'ext': 'flv',
'title': 're:^清晨醒脑!根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
'description': r're:.*m7show@163\.com.*',
'thumbnail': r're:^https?://.*\.png',
'uploader': '7师傅',
'id': '24422',
'display_id': 'pigff',
'ext': 'ts',
'title': 're:^【PIGFF】.* [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
'description': r'≥15级牌子看鱼吧置顶帖进粉丝vx群',
'thumbnail': str,
'uploader': 'pigff',
'is_live': True,
'live_status': 'is_live',
},
'params': {
'skip_download': True,
Expand Down Expand Up @@ -85,15 +119,51 @@
'only_matching': True,
}]

def _sign(self, js_sign_func, room_id, video_id, params={}):
params.update(self._calc_sign(
js_sign_func, room_id, uuid.uuid4().hex, round(time.time()), video_id))
return params

def _get_sign_func(self, room_id, video_id):
return self._download_json(
f'https://www.douyu.com/swf_api/homeH5Enc?rids={room_id}', video_id,
note='Getting signing script')['data'][f'room{room_id}']

def _extract_stream_formats(self, stream_formats):
formats = []
for stream_info in traverse_obj(stream_formats, (..., 'data')):
stream_url = urljoin(
traverse_obj(stream_info, 'rtmp_url'), traverse_obj(stream_info, 'rtmp_live'))
if stream_url:
rate_id = traverse_obj(stream_info, ('rate', {int_or_none}))
rate_info = traverse_obj(stream_info, ('multirates', lambda _, v: v.get('rate') == rate_id), get_all=False)
c-basalt marked this conversation as resolved.
Show resolved Hide resolved
formats.append({
'url': stream_url,
'format_id': str_or_none(rate_id),
'ext': 'ts' if '.m3u8' in stream_url else determine_ext(stream_url),
bashonly marked this conversation as resolved.
Show resolved Hide resolved
'quality': rate_id % -10000 if rate_id is not None else None,
**traverse_obj(rate_info, {
'format': ('name', {str_or_none}),
'tbr': ('bit', {int_or_none}),
}),
})
return formats

def _real_extract(self, url):
video_id = self._match_id(url)

if video_id.isdigit():
room_id = video_id
else:
page = self._download_webpage(url, video_id)
room_id = self._html_search_regex(
r'"room_id\\?"\s*:\s*(\d+),', page, 'room id')
webpage = self._download_webpage(url, video_id)
room_id = self._search_regex(r'\$ROOM\.room_id\s*=\s*(\d+)', webpage, 'room id')

if '"videoLoop":1,' in webpage:
raise UserNotLive('The channel is auto-playing VODs', video_id=video_id)
if '$ROOM.show_status =2;' in webpage:
raise UserNotLive(video_id=video_id)
c-basalt marked this conversation as resolved.
Show resolved Hide resolved

# use preceeding greedy non-capture to achieve non-greedy in backward direction
js_sign_func = self._search_regex(
r'(?:<script.*)*<script[^>]*>(.*?ub98484234.*?)</script>', webpage, 'JS sign func', fatal=False
Fixed Show fixed Hide fixed
) or self._get_sign_func(room_id, video_id)

# Grab metadata from API
params = {
Expand All @@ -102,110 +172,133 @@
'time': int(time.time()),
}
params['auth'] = hashlib.md5(
f'room/{video_id}?{urllib.parse.urlencode(params)}zNzMV1y4EMxOHS6I5WKm'.encode()).hexdigest()
f'room/{room_id}?{urllib.parse.urlencode(params)}zNzMV1y4EMxOHS6I5WKm'.encode()).hexdigest()
room = self._download_json(
f'http://www.douyutv.com/api/v1/room/{room_id}', video_id,
note='Downloading room info', query=params)['data']

# 1 = live, 2 = offline
if room.get('show_status') == '2':
raise ExtractorError('Live stream is offline', expected=True)
raise UserNotLive(video_id=video_id)

video_url = urljoin('https://hls3-akm.douyucdn.cn/', self._search_regex(r'(live/.*)', room['hls_url'], 'URL'))
formats, subs = self._extract_m3u8_formats_and_subtitles(video_url, room_id)
form_data = self._sign(js_sign_func, room_id, video_id, {'rate': 0})
stream_formats = [self._download_json(
f'https://www.douyu.com/lapi/live/getH5Play/{room_id}',
video_id, note="Downloading livestream format",
data=urlencode_postdata(form_data))]

title = unescapeHTML(room['room_name'])
description = room.get('show_details')
thumbnail = room.get('room_src')
uploader = room.get('nickname')
for rate_id in traverse_obj(stream_formats[0], ('data', 'multirates', ..., 'rate')):
if rate_id != traverse_obj(stream_formats[0], ('data', 'rate')):
form_data['rate'] = rate_id
stream_formats.append(self._download_json(
f'https://www.douyu.com/lapi/live/getH5Play/{room_id}',
video_id, note=f'Downloading livestream format {rate_id}',
data=urlencode_postdata(form_data)))

return {
'id': room_id,
'display_id': video_id,
'title': title,
'description': description,
'thumbnail': thumbnail,
'uploader': uploader,
'formats': self._extract_stream_formats(stream_formats),
'is_live': True,
'subtitles': subs,
'formats': formats,
**traverse_obj(room, {
'display_id': ('url', {str_or_none}, {lambda i: i[1:] if i else None}),
'title': ('room_name', {str_or_none}, {unescapeHTML}),
'description': ('show_details', {str_or_none}),
'uploader': ('nickname', {str_or_none}),
'thumbnail': ('room_src', {url_or_none}),
c-basalt marked this conversation as resolved.
Show resolved Hide resolved
})
}


class DouyuShowIE(InfoExtractor):
class DouyuShowIE(DouyuBaseIE):
_VALID_URL = r'https?://v(?:mobile)?\.douyu\.com/show/(?P<id>[0-9a-zA-Z]+)'

_TESTS = [{
'url': 'https://v.douyu.com/show/rjNBdvnVXNzvE2yw',
'md5': '0c2cfd068ee2afe657801269b2d86214',
'url': 'https://v.douyu.com/show/mPyq7oVNe5Yv1gLY',
'info_dict': {
'id': 'rjNBdvnVXNzvE2yw',
'ext': 'mp4',
'title': '陈一发儿:砒霜 我有个室友系列!04-01 22点场',
'duration': 7150.08,
'thumbnail': r're:^https?://.*\.jpg$',
'uploader': '陈一发儿',
'uploader_id': 'XrZwYelr5wbK',
'uploader_url': 'https://v.douyu.com/author/XrZwYelr5wbK',
'upload_date': '20170402',
'id': 'mPyq7oVNe5Yv1gLY',
'ext': 'ts',
'title': '四川人小时候的味道“蒜苗回锅肉”,传统菜不能丢,要常做来吃',
'duration': 633,
'thumbnail': str,
'uploader': '美食作家王刚V',
'uploader_id': 'OVAO4NVx1m7Q',
'timestamp': 1661850002,
'upload_date': '20220830',
'view_count': int,
'tags': ['美食', '美食综合'],
},
}, {
'url': 'https://vmobile.douyu.com/show/rjNBdvnVXNzvE2yw',
'only_matching': True,
}]

_FORMATS = {
'super': '原画',
'high': '超清',
'normal': '高清',
}

_QUALITIES = {
'super': -1,
'high': -2,
'normal': -3,
}

_RESOLUTIONS = {
'super': '1920x1080',
'high': '1280x720',
'normal': '852x480',
}

def _sign(self, sign_func, vid, video_id):
return self._calc_sign(sign_func, vid, uuid.uuid4().hex, round(time.time()), video_id)

def _real_extract(self, url):
url = url.replace('vmobile.', 'v.')
video_id = self._match_id(url)

webpage = self._download_webpage(url, video_id)

room_info = self._parse_json(self._search_regex(
r'var\s+\$ROOM\s*=\s*({.+});', webpage, 'room info'), video_id)

video_info = None

for trial in range(5):
# Sometimes Douyu rejects our request. Let's try it more times
try:
video_info = self._download_json(
'https://vmobile.douyu.com/video/getInfo', video_id,
query={'vid': video_id},
headers={
'Referer': url,
'x-requested-with': 'XMLHttpRequest',
})
break
except ExtractorError:
self._sleep(1, video_id)

if not video_info:
raise ExtractorError('Can\'t fetch video info')

formats = self._extract_m3u8_formats(
video_info['data']['video_url'], video_id,
entry_protocol='m3u8_native', ext='mp4')

upload_date = unified_strdate(self._html_search_regex(
r'<em>上传时间:</em><span>([^<]+)</span>', webpage,
'upload date', fatal=False))

uploader = uploader_id = uploader_url = None
mobj = re.search(
r'(?m)<a[^>]+href="/author/([0-9a-zA-Z]+)".+?<strong[^>]+title="([^"]+)"',
webpage)
if mobj:
uploader_id, uploader = mobj.groups()
uploader_url = urljoin(url, '/author/' + uploader_id)
video_info = self._search_json(
r'<script>window\.\$DATA=', webpage,
c-basalt marked this conversation as resolved.
Show resolved Hide resolved
'video info', video_id, transform_source=js_to_json)

js_sign_func = self._search_regex(r'(?:<script.*)*<script[^>]*>(.*?ub98484234.*?)</script>', webpage, 'JS sign func')
Fixed Show fixed Hide fixed
form_data = {
'vid': video_id,
**self._sign(js_sign_func, video_info['ROOM']['point_id'], video_id),
}
url_info = self._download_json(
'https://v.douyu.com/api/stream/getStreamUrl', video_id,
data=urlencode_postdata(form_data), note="Downloading video formats")

formats = []
for name, url in traverse_obj(url_info, ('data', 'thumb_video')).items():
c-basalt marked this conversation as resolved.
Show resolved Hide resolved
video_url = traverse_obj(url, ('url', {url_or_none}))
if video_url:
formats.append({
'format': self._FORMATS.get(name),
'format_id': name,
'url': video_url,
'quality': self._QUALITIES.get(name),
'ext': 'ts' if '.m3u8' in video_url else determine_ext(video_url),
bashonly marked this conversation as resolved.
Show resolved Hide resolved
**parse_resolution(self._RESOLUTIONS.get(name))
})
else:
self.to_screen(
f'"{self._FORMATS.get(name, name)}" format may require logging in. {self._login_hint()}')

return {
'id': video_id,
'title': room_info['name'],
'formats': formats,
'duration': room_info.get('duration'),
'thumbnail': room_info.get('pic'),
'upload_date': upload_date,
'uploader': uploader,
'uploader_id': uploader_id,
'uploader_url': uploader_url,
**traverse_obj(video_info, ('DATA', {
'title': ('content', 'title', {str_or_none}),
'uploader': ('content', 'author', {str_or_none}),
'uploader_id': ('content', 'up_id', {str_or_none}),
'duration': ('content', 'video_duration', {int_or_none}),
'thumbnail': ('content', 'video_pic', {url_or_none}),
'timestamp': ('content', 'create_time', {int_or_none}),
'view_count': ('content', 'view_num', {int_or_none}),
'tags': ('videoTag', ..., 'tagName', {str_or_none}),
c-basalt marked this conversation as resolved.
Show resolved Hide resolved
}))
}