[duboku] add new extractor #26467

Open · wants to merge 8 commits into master
242 changes: 242 additions & 0 deletions youtube_dl/extractor/duboku.py
@@ -0,0 +1,242 @@
# coding: utf-8
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..compat import compat_urlparse
from ..utils import (
    clean_html,
    extract_attributes,
    ExtractorError,
    get_elements_by_class,
    int_or_none,
    js_to_json,
    smuggle_url,
    unescapeHTML,
)


def _get_elements_by_tag_and_attrib(html, tag=None, attribute=None, value=None, escape_value=True):
"""Return the content of the tag with the specified attribute in the passed HTML document"""

if tag is None:
tag = '[a-zA-Z0-9:._-]+'
if attribute is None:
attribute = ''
else:
attribute = r'\s+(?P<attribute>%s)' % re.escape(attribute)
if value is None:
value = ''
else:
value = re.escape(value) if escape_value else value
value = '=[\'"]?(?P<value>%s)[\'"]?' % value

retlist = []
for m in re.finditer(r'''(?xs)
<(?P<tag>%s)
(?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
%s%s
(?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
\s*>
(?P<content>.*?)
</\1>
''' % (tag, attribute, value), html):
retlist.append(m)

return retlist


def _get_element_by_tag_and_attrib(html, tag=None, attribute=None, value=None, escape_value=True):
    retval = _get_elements_by_tag_and_attrib(html, tag, attribute, value, escape_value)
    return retval[0] if retval else None
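
# Illustrative note (not from the original patch): both helpers return re.Match
# objects with named groups. For example, assuming this simple input,
#     _get_element_by_tag_and_attrib('<h1 class="title">Foo</h1>', 'h1', 'class', 'title')
# would yield a match whose .group('content') is 'Foo' and .group('value') is 'title'.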


class DubokuIE(InfoExtractor):
    IE_NAME = 'duboku'
    IE_DESC = 'www.duboku.co'

    _VALID_URL = r'(?:https?://[^/]+\.duboku\.co/vodplay/)(?P<id>[0-9]+-[0-9-]+)\.html.*'
Contributor
Require n-n-n in id field; no need to match the tail:

Suggested change
    _VALID_URL = r'(?:https?://[^/]+\.duboku\.co/vodplay/)(?P<id>[0-9]+-[0-9-]+)\.html.*'
    _VALID_URL = r'(?:https?://[^/]+\.duboku\.co/vodplay/)(?P<id>(?:[0-9]+-){2}[0-9]+)\.html'
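
For illustration (a standalone check with plain re, not part of the patch), the tightened pattern still matches the test URLs while rejecting ids that lack three numeric components:

    import re

    pattern = r'(?:https?://[^/]+\.duboku\.co/vodplay/)(?P<id>(?:[0-9]+-){2}[0-9]+)\.html'
    assert re.match(pattern, 'https://www.duboku.co/vodplay/1575-1-1.html').group('id') == '1575-1-1'
    assert re.match(pattern, 'https://www.duboku.co/vodplay/1575-1.html') is None  # only two components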

    _TESTS = [{
        'url': 'https://www.duboku.co/vodplay/1575-1-1.html',
        'info_dict': {
            'id': '1575-1-1',
            'ext': 'ts',
Contributor
Fix test:

Suggested change
            'ext': 'ts',
            'ext': 'mp4',

            'series': '白色月光',
            'title': 'contains:白色月光',
            'season_number': 1,
            'episode_number': 1,
        },
        'params': {
            'skip_download': 'm3u8 download',
        },
    }, {
        'url': 'https://www.duboku.co/vodplay/1588-1-1.html',
        'info_dict': {
            'id': '1588-1-1',
            'ext': 'ts',
Contributor
Fix test:

Suggested change
            'ext': 'ts',
            'ext': 'mp4',

            'series': '亲爱的自己',
            'title': 'contains:预告片',
Contributor
Page has changed:

Suggested change
            'title': 'contains:预告片',
            'title': '亲爱的自己 第1集',

            'season_number': 1,
            'episode_number': 1,
        },
        'params': {
            'skip_download': 'm3u8 download',
        },
    }]

    _PLAYER_DATA_PATTERN = r'player_data\s*=\s*(\{\s*(.*)})\s*;?\s*</script'

    def _real_extract(self, url):
        video_id = self._match_id(url)
        temp = video_id.split('-')
        series_id = temp[0]
        season_id = temp[1]
        episode_id = temp[2]
Comment on lines +92 to +95
Contributor
Simpler:

Suggested change
        temp = video_id.split('-')
        series_id = temp[0]
        season_id = temp[1]
        episode_id = temp[2]
        series_id, season_id, episode_id = video_id.split('-')


        webpage_url = 'https://www.duboku.co/vodplay/%s.html' % video_id
        webpage_html = self._download_webpage(webpage_url, video_id)

        # extract video url

        player_data = self._search_regex(
            self._PLAYER_DATA_PATTERN, webpage_html, 'player_data')
        player_data = self._parse_json(player_data, video_id, js_to_json)

        # extract title

        temp = get_elements_by_class('title', webpage_html)
        series_title = None
        title = None
        for html in temp:
Comment on lines +108 to +111
Contributor
Simplify:

Suggested change
        temp = get_elements_by_class('title', webpage_html)
        series_title = None
        title = None
        for html in temp:
        series_title, title = None, None
        for html in get_elements_by_class('title', webpage_html):

            mobj = re.search(r'<a\s+.*>(.*)</a>', html)
Contributor
Use the function defined earlier:

Suggested change
            mobj = re.search(r'<a\s+.*>(.*)</a>', html)
            mobj = _get_element_by_tag_and_attrib(html, tag='a')

            if mobj:
                href = extract_attributes(mobj.group(0)).get('href')
                if href:
                    mobj1 = re.search(r'/(\d+)\.html', href)
                    if mobj1 and mobj1.group(1) == series_id:
                        series_title = clean_html(mobj.group(0))
                        series_title = re.sub(r'[\s\r\n\t]+', ' ', series_title)
                        title = clean_html(html)
                        title = re.sub(r'[\s\r\n\t]+', ' ', title)
                        break
Comment on lines +114 to +122
Contributor

  • use the resulting match object
  • avoid excessive indentation
  • r'\s' includes any whitespace
  • simplify clean_html() expressions

Suggested change
                href = extract_attributes(mobj.group(0)).get('href')
                if href:
                    mobj1 = re.search(r'/(\d+)\.html', href)
                    if mobj1 and mobj1.group(1) == series_id:
                        series_title = clean_html(mobj.group(0))
                        series_title = re.sub(r'[\s\r\n\t]+', ' ', series_title)
                        title = clean_html(html)
                        title = re.sub(r'[\s\r\n\t]+', ' ', title)
                        break
                href = extract_attributes(html[mobj.start(0):mobj.start('content')]).get('href')
                if not href:
                    continue
                mobj1 = re.search(r'/(?P<s_id>\d+)\.html', href)
                if mobj1 and mobj1.group('s_id') == series_id:
                    series_title = clean_html(re.sub(r'\s+', ' ', mobj.group('content')))
                    title = clean_html(re.sub(r'\s+', ' ', html))
                    break
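
A quick one-liner for the third bullet (plain Python, for reference only): r'\s' already matches spaces, tabs, CR and LF, so the explicit character class is redundant:

    import re
    re.sub(r'\s+', ' ', 'foo \r\n\t bar')  # -> 'foo bar'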


        data_url = player_data.get('url')
        if not data_url:
            raise ExtractorError('Cannot find url in player_data')
        data_from = player_data.get('from')

        # if it is an embedded iframe, maybe it's an external source
        if data_from == 'iframe':
            # use _type url_transparent to retain the meaningful details
            # of the video.
            return {
                '_type': 'url_transparent',
                'url': smuggle_url(data_url, {'http_headers': {'Referer': webpage_url}}),
                'id': video_id,
                'title': title,
                'series': series_title,
                'season_number': int_or_none(season_id),
                'season_id': season_id,
                'episode_number': int_or_none(episode_id),
                'episode_id': episode_id,
            }

        formats = self._extract_m3u8_formats(data_url, video_id, 'mp4')
Contributor
Pass Referer header to avoid 403:

Suggested change
        formats = self._extract_m3u8_formats(data_url, video_id, 'mp4')
        headers = {'Referer': 'https://www.duboku.co/static/player/videojs.html'}
        formats = self._extract_m3u8_formats(data_url, video_id, 'mp4', headers=headers)


        return {
            'id': video_id,
            'title': title,
            'series': series_title,
            'season_number': int_or_none(season_id),
            'season_id': season_id,
            'episode_number': int_or_none(episode_id),
            'episode_id': episode_id,
            'formats': formats,
            'http_headers': {'Referer': 'https://www.duboku.co/static/player/videojs.html'}
Contributor
Use headers as introduced above:

Suggested change
            'http_headers': {'Referer': 'https://www.duboku.co/static/player/videojs.html'}
            'http_headers': headers,

        }


class DubokuPlaylistIE(InfoExtractor):
    IE_NAME = 'duboku:list'
    IE_DESC = 'www.duboku.co entire series'

    _VALID_URL = r'(?:https?://[^/]+\.duboku\.co/voddetail/)(?P<id>[0-9]+)\.html.*'
    _TESTS = [{
        'url': 'https://www.duboku.co/voddetail/1575.html',
        'info_dict': {
            'id': 'startswith:1575',
            'title': '白色月光',
        },
        'playlist_count': 12,
    }, {
        'url': 'https://www.duboku.co/voddetail/1554.html',
        'info_dict': {
            'id': 'startswith:1554',
            'title': '以家人之名',
        },
        'playlist_mincount': 30,
    }, {
        'url': 'https://www.duboku.co/voddetail/1554.html#playlist2',
        'info_dict': {
            'id': '1554#playlist2',
Comment on lines +180 to +182
Contributor
#playlist2 has gone: use #playlist1 instead:

Suggested change
        'url': 'https://www.duboku.co/voddetail/1554.html#playlist2',
        'info_dict': {
            'id': '1554#playlist2',
        'url': 'https://www.duboku.co/voddetail/1554.html#playlist1',
        'info_dict': {
            'id': '1554#playlist1',

            'title': '以家人之名',
        },
        'playlist_mincount': 27,
    }]

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        series_id = mobj.group('id')
Comment on lines +189 to +192
Contributor
Simplify:

Suggested change
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        series_id = mobj.group('id')
        series_id = self._match_id(url)

        fragment = compat_urlparse.urlparse(url).fragment

        webpage_url = 'https://www.duboku.co/voddetail/%s.html' % series_id
        webpage_html = self._download_webpage(webpage_url, series_id)

        # extract title

        title = _get_element_by_tag_and_attrib(webpage_html, 'h1', 'class', 'title')
        title = unescapeHTML(title.group('content')) if title else None
        if not title:
            title = self._html_search_meta('keywords', webpage_html)
            if not title:
                title = _get_element_by_tag_and_attrib(webpage_html, 'title')
                title = unescapeHTML(title.group('content')) if title else None

        # extract playlists

        playlists = {}
        for div in _get_elements_by_tag_and_attrib(
                webpage_html, attribute='id', value='playlist\\d+', escape_value=False):
            playlist_id = div.group('value')
            playlist = []
            for a in _get_elements_by_tag_and_attrib(
                    div.group('content'), 'a', 'href', value='[^\'"]+?', escape_value=False):
                playlist.append({
                    'href': unescapeHTML(a.group('value')),
                    'title': unescapeHTML(a.group('content'))
                })
            playlists[playlist_id] = playlist

        # select the specified playlist if url fragment exists
        playlist = None
        playlist_id = None
        if fragment:
            playlist = playlists.get(fragment)
            playlist_id = fragment
        else:
            first = next(iter(playlists.items()), None)
            if first:
                (playlist_id, playlist) = first
        if not playlist:
            raise ExtractorError(
                'Cannot find %s' % fragment if fragment else 'Cannot extract playlist')

        # return url results
        return self.playlist_result([
            self.url_result(
                compat_urlparse.urljoin('https://www.duboku.co', x['href']),
                ie=DubokuIE.ie_key(), video_title=x.get('title'))
            for x in playlist], series_id + '#' + playlist_id, title)
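
As a rough smoke test (hypothetical, not part of the patch; it assumes this branch is installed and that the site is reachable), the two extractors can be exercised through the public YoutubeDL API:

    # Hypothetical check: resolve a series page into its per-episode entries.
    import youtube_dl

    opts = {'quiet': True, 'extract_flat': 'in_playlist'}  # list entries without fetching each episode
    with youtube_dl.YoutubeDL(opts) as ydl:
        info = ydl.extract_info('https://www.duboku.co/voddetail/1575.html', download=False)
        print(info.get('title'), len(info.get('entries') or []))  # series title, episode count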
4 changes: 4 additions & 0 deletions youtube_dl/extractor/extractors.py
@@ -282,6 +282,10 @@
)
from .dtube import DTubeIE
from .dvtv import DVTVIE
from .duboku import (
    DubokuIE,
    DubokuPlaylistIE
)
from .dumpert import DumpertIE
from .defense import DefenseGouvFrIE
from .discovery import DiscoveryIE