Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Generalized framework for webpage-based extraction #4307

Merged
merged 6 commits into from
Aug 1, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,6 @@ Some of yt-dlp's default options are different from that of youtube-dl and youtu
* Some metadata are embedded into different fields when using `--add-metadata` as compared to youtube-dl. Most notably, `comment` field contains the `webpage_url` and `synopsis` contains the `description`. You can [use `--parse-metadata`](#modifying-metadata) to modify this to your liking or use `--compat-options embed-metadata` to revert this
* `playlist_index` behaves differently when used with options like `--playlist-reverse` and `--playlist-items`. See [#302](https://github.com/yt-dlp/yt-dlp/issues/302) for details. You can use `--compat-options playlist-index` if you want to keep the earlier behavior
* The output of `-F` is listed in a new format. Use `--compat-options list-formats` to revert this
* All *experiences* of a funimation episode are considered as a single video. This behavior breaks existing archives. Use `--compat-options seperate-video-versions` to extract information from only the default player
* Live chats (if available) are considered as subtitles. Use `--sub-langs all,-live_chat` to download all subtitles except live chat. You can also use `--compat-options no-live-chat` to prevent any live chat/danmaku from downloading
* Youtube channel URLs are automatically redirected to `/video`. Append a `/featured` to the URL to download only the videos in the home page. If the channel does not have a videos tab, we try to download the equivalent `UU` playlist instead. For all other tabs, if the channel does not show the requested tab, an error will be raised. Also, `/live` URLs raise an error if there are no live videos instead of silently downloading the entire channel. You may use `--compat-options no-youtube-channel-redirect` to revert all these redirections
* Unavailable videos are also listed for youtube playlists. Use `--compat-options no-youtube-unavailable-videos` to remove this
Expand Down Expand Up @@ -2132,6 +2131,7 @@ These options may no longer work as intended
--no-include-ads Default
--write-annotations No supported site has annotations now
--no-write-annotations Default
--compat-options seperate-video-versions No longer needed

#### Removed
These options were deprecated since 2014 and have now been entirely removed
Expand Down
6 changes: 4 additions & 2 deletions devscripts/lazy_load_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,13 @@
write_string,
)

# These bloat the lazy_extractors, so allow them to passthrough silently
ALLOWED_CLASSMETHODS = {'get_testcases', 'extract_from_webpage'}


class LazyLoadMetaClass(type):
def __getattr__(cls, name):
# "_TESTS" bloat the lazy_extractors
if '_real_class' not in cls.__dict__ and name != 'get_testcases':
if '_real_class' not in cls.__dict__ and name not in ALLOWED_CLASSMETHODS:
write_string(
'WARNING: Falling back to normal extractor since lazy extractor '
f'{cls.__name__} does not have attribute {name}{bug_reports_message()}\n')
Expand Down
7 changes: 1 addition & 6 deletions devscripts/make_lazy_extractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from inspect import getsource

NO_ATTR = object()
STATIC_CLASS_PROPERTIES = ['IE_NAME', 'IE_DESC', 'SEARCH_KEY', '_WORKING', '_NETRC_MACHINE', 'age_limit']
STATIC_CLASS_PROPERTIES = ['IE_NAME', 'IE_DESC', 'SEARCH_KEY', '_VALID_URL', '_WORKING', '_NETRC_MACHINE', 'age_limit']
CLASS_METHODS = [
'ie_key', 'working', 'description', 'suitable', '_match_valid_url', '_match_id', 'get_temp_id', 'is_suitable'
]
Expand Down Expand Up @@ -116,11 +116,6 @@ def build_lazy_ie(ie, name, attr_base):
}.get(base.__name__, base.__name__) for base in ie.__bases__)

s = IE_TEMPLATE.format(name=name, module=ie.__module__, bases=bases)
valid_url = getattr(ie, '_VALID_URL', None)
if not valid_url and hasattr(ie, '_make_valid_url'):
valid_url = ie._make_valid_url()
if valid_url:
s += f' _VALID_URL = {valid_url!r}\n'
return s + '\n'.join(extra_ie_code(ie, attr_base))


Expand Down
7 changes: 7 additions & 0 deletions test/helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,13 @@ def gettestcases(include_onlymatching=False):
yield from ie.get_testcases(include_onlymatching)


def getwebpagetestcases():
for ie in yt_dlp.extractor.gen_extractors():
for tc in ie.get_webpage_testcases():
tc.setdefault('add_ie', []).append('Generic')
yield tc


md5 = lambda s: hashlib.md5(s.encode()).hexdigest()


Expand Down
70 changes: 34 additions & 36 deletions test/test_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))


import collections
import hashlib
import http.client
import json
Expand All @@ -20,6 +21,7 @@
expect_warnings,
get_params,
gettestcases,
getwebpagetestcases,
is_download_test,
report_warning,
try_rm,
Expand All @@ -32,6 +34,7 @@
ExtractorError,
UnavailableVideoError,
format_bytes,
join_nonempty,
)

RETRIES = 3
Expand All @@ -57,7 +60,9 @@ def _file_md5(fn):
return hashlib.md5(f.read()).hexdigest()


defs = gettestcases()
normal_test_cases = gettestcases()
webpage_test_cases = getwebpagetestcases()
tests_counter = collections.defaultdict(collections.Counter)


@is_download_test
Expand All @@ -72,24 +77,13 @@ class TestDownload(unittest.TestCase):

def __str__(self):
"""Identify each test with the `add_ie` attribute, if available."""
cls, add_ie = type(self), getattr(self, self._testMethodName).add_ie
return f'{self._testMethodName} ({cls.__module__}.{cls.__name__}){f" [{add_ie}]" if add_ie else ""}:'

def strclass(cls):
"""From 2.7's unittest; 2.6 had _strclass so we can't import it."""
return f'{cls.__module__}.{cls.__name__}'

add_ie = getattr(self, self._testMethodName).add_ie
return '%s (%s)%s:' % (self._testMethodName,
strclass(self.__class__),
' [%s]' % add_ie if add_ie else '')

def setUp(self):
self.defs = defs

# Dynamically generate tests


def generator(test_case, tname):

def test_template(self):
if self.COMPLETED_TESTS.get(tname):
return
Expand Down Expand Up @@ -255,39 +249,43 @@ def try_rm_tcs_files(tcs=None):


# And add them to TestDownload
tests_counter = {}
for test_case in defs:
name = test_case['name']
i = tests_counter.get(name, 0)
tests_counter[name] = i + 1
tname = f'test_{name}_{i}' if i else f'test_{name}'
test_method = generator(test_case, tname)
test_method.__name__ = str(tname)
ie_list = test_case.get('add_ie')
test_method.add_ie = ie_list and ','.join(ie_list)
setattr(TestDownload, test_method.__name__, test_method)
del test_method
def inject_tests(test_cases, label=''):
for test_case in test_cases:
name = test_case['name']
tname = join_nonempty('test', name, label, tests_counter[name][label], delim='_')
tests_counter[name][label] += 1

test_method = generator(test_case, tname)
test_method.__name__ = tname
test_method.add_ie = ','.join(test_case.get('add_ie', []))
setattr(TestDownload, test_method.__name__, test_method)

def batch_generator(name, num_tests):

inject_tests(normal_test_cases)

# TODO: disable redirection to the IE to ensure we are actually testing the webpage extraction
inject_tests(webpage_test_cases, 'webpage')


def batch_generator(name):
def test_template(self):
for i in range(num_tests):
test_name = f'test_{name}_{i}' if i else f'test_{name}'
try:
getattr(self, test_name)()
except unittest.SkipTest:
print(f'Skipped {test_name}')
for label, num_tests in tests_counter[name].items():
for i in range(num_tests):
test_name = join_nonempty('test', name, label, i, delim='_')
try:
getattr(self, test_name)()
except unittest.SkipTest:
print(f'Skipped {test_name}')

return test_template


for name, num_tests in tests_counter.items():
test_method = batch_generator(name, num_tests)
for name in tests_counter:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should we add a test_[ie]_webpage_all?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure. The current implementation is to have test_(ie)_all to run both normal and webpage tests. I don't personally use the _all tests much (instead using pytest's pattern matching), so not sure whether others would find more helpful to have it unified or split

test_method = batch_generator(name)
test_method.__name__ = f'test_{name}_all'
test_method.add_ie = ''
setattr(TestDownload, test_method.__name__, test_method)
del test_method
del test_method


if __name__ == '__main__':
Expand Down
4 changes: 2 additions & 2 deletions test/test_http.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ def test_nocheckcertificate(self):

ydl = YoutubeDL({'logger': FakeLogger(), 'nocheckcertificate': True})
r = ydl.extract_info('https://127.0.0.1:%d/video.html' % self.port)
self.assertEqual(r['entries'][0]['url'], 'https://127.0.0.1:%d/vid.mp4' % self.port)
self.assertEqual(r['url'], 'https://127.0.0.1:%d/vid.mp4' % self.port)


class TestClientCert(unittest.TestCase):
Expand Down Expand Up @@ -113,7 +113,7 @@ def _run_test(self, **params):
**params,
})
r = ydl.extract_info('https://127.0.0.1:%d/video.html' % self.port)
self.assertEqual(r['entries'][0]['url'], 'https://127.0.0.1:%d/vid.mp4' % self.port)
self.assertEqual(r['url'], 'https://127.0.0.1:%d/vid.mp4' % self.port)

def test_certificate_combined_nopass(self):
self._run_test(client_certificate=os.path.join(self.certdir, 'clientwithkey.crt'))
Expand Down
11 changes: 5 additions & 6 deletions yt_dlp/YoutubeDL.py
Original file line number Diff line number Diff line change
Expand Up @@ -1566,7 +1566,8 @@ def process_ie_result(self, ie_result, download=True, extra_info=None):
result_type = ie_result.get('_type', 'video')

if result_type in ('url', 'url_transparent'):
ie_result['url'] = sanitize_url(ie_result['url'])
ie_result['url'] = sanitize_url(
ie_result['url'], scheme='http' if self.params.get('prefer_insecure') else 'https')
if ie_result.get('original_url'):
extra_info.setdefault('original_url', ie_result['original_url'])

Expand Down Expand Up @@ -3454,11 +3455,9 @@ def in_download_archive(self, info_dict):
if fn is None:
return False

vid_id = self._make_archive_id(info_dict)
if not vid_id:
return False # Incomplete video information

return vid_id in self.archive
vid_ids = [self._make_archive_id(info_dict)]
vid_ids.extend(info_dict.get('_old_archive_ids', []))
return any(id_ in self.archive for id_ in vid_ids)

def record_download_archive(self, info_dict):
fn = self.params.get('download_archive')
Expand Down
5 changes: 4 additions & 1 deletion yt_dlp/extractor/_extractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,7 @@
CamdemyFolderIE
)
from .cammodels import CamModelsIE
from .camtasia import CamtasiaEmbedIE
from .camwithher import CamWithHerIE
from .canalalpha import CanalAlphaIE
from .canalplus import CanalplusIE
Expand Down Expand Up @@ -445,7 +446,7 @@
DWIE,
DWArticleIE,
)
from .eagleplatform import EaglePlatformIE
from .eagleplatform import EaglePlatformIE, ClipYouEmbedIE
from .ebaumsworld import EbaumsWorldIE
from .echomsk import EchoMskIE
from .egghead import (
Expand Down Expand Up @@ -661,6 +662,7 @@
HSEShowIE,
HSEProductIE,
)
from .genericembeds import HTML5MediaEmbedIE
from .huajiao import HuajiaoIE
from .huya import HuyaLiveIE
from .huffpost import HuffPostIE
Expand Down Expand Up @@ -1553,6 +1555,7 @@
SharedIE,
VivoIE,
)
from .sharevideos import ShareVideosEmbedIE
from .shemaroome import ShemarooMeIE
from .showroomlive import ShowRoomLiveIE
from .simplecast import (
Expand Down
1 change: 1 addition & 0 deletions yt_dlp/extractor/adobetv.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,7 @@ def _real_extract(self, url):
class AdobeTVVideoIE(AdobeTVBaseIE):
IE_NAME = 'adobetv:video'
_VALID_URL = r'https?://video\.tv\.adobe\.com/v/(?P<id>\d+)'
_EMBED_REGEX = [r'<iframe[^>]+src=[\'"](?P<url>(?:https?:)?//video\.tv\.adobe\.com/v/\d+[^"]+)[\'"]']

_TEST = {
# From https://helpx.adobe.com/acrobat/how-to/new-experience-acrobat-dc.html?set=acrobat--get-started--essential-beginners
Expand Down
15 changes: 2 additions & 13 deletions yt_dlp/extractor/ant1newsgr.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import re
import urllib.parse

from .common import InfoExtractor
Expand All @@ -7,7 +6,6 @@
ExtractorError,
determine_ext,
scale_thumbnails_to_max_format_width,
unescapeHTML,
)


Expand Down Expand Up @@ -91,7 +89,7 @@ def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
info = self._search_json_ld(webpage, video_id, expected_type='NewsArticle')
embed_urls = list(Ant1NewsGrEmbedIE._extract_urls(webpage))
embed_urls = list(Ant1NewsGrEmbedIE._extract_embed_urls(url, webpage))
if not embed_urls:
raise ExtractorError('no videos found for %s' % video_id, expected=True)
return self.playlist_from_matches(
Expand All @@ -104,6 +102,7 @@ class Ant1NewsGrEmbedIE(Ant1NewsGrBaseIE):
IE_DESC = 'ant1news.gr embedded videos'
_BASE_PLAYER_URL_RE = r'(?:https?:)?//(?:[a-zA-Z0-9\-]+\.)?(?:antenna|ant1news)\.gr/templates/pages/player'
_VALID_URL = rf'{_BASE_PLAYER_URL_RE}\?([^#]+&)?cid=(?P<id>[^#&]+)'
_EMBED_REGEX = [rf'<iframe[^>]+?src=(?P<_q1>["\'])(?P<url>{_BASE_PLAYER_URL_RE}\?(?:(?!(?P=_q1)).)+)(?P=_q1)']
_API_PATH = '/news/templates/data/jsonPlayer'

_TESTS = [{
Expand All @@ -117,16 +116,6 @@ class Ant1NewsGrEmbedIE(Ant1NewsGrBaseIE):
},
}]

@classmethod
def _extract_urls(cls, webpage):
_EMBED_URL_RE = rf'{cls._BASE_PLAYER_URL_RE}\?(?:(?!(?P=_q1)).)+'
_EMBED_RE = rf'<iframe[^>]+?src=(?P<_q1>["\'])(?P<url>{_EMBED_URL_RE})(?P=_q1)'
for mobj in re.finditer(_EMBED_RE, webpage):
url = unescapeHTML(mobj.group('url'))
if not cls.suitable(url):
continue
yield url

def _real_extract(self, url):
video_id = self._match_id(url)

Expand Down
30 changes: 8 additions & 22 deletions yt_dlp/extractor/anvato.py
Original file line number Diff line number Diff line change
Expand Up @@ -340,30 +340,16 @@ def _get_anvato_videos(self, access_key, video_id):
'subtitles': subtitles,
}

@staticmethod
def _extract_urls(ie, webpage, video_id):
entries = []
for mobj in re.finditer(AnvatoIE._ANVP_RE, webpage):
anvplayer_data = ie._parse_json(
mobj.group('anvp'), video_id, transform_source=unescapeHTML,
fatal=False)
if not anvplayer_data:
continue
video = anvplayer_data.get('video')
if not isinstance(video, compat_str) or not video.isdigit():
continue
access_key = anvplayer_data.get('accessKey')
if not access_key:
mcp = anvplayer_data.get('mcp')
if mcp:
access_key = AnvatoIE._MCP_TO_ACCESS_KEY_TABLE.get(
mcp.lower())
@classmethod
def _extract_from_webpage(cls, url, webpage):
for mobj in re.finditer(cls._ANVP_RE, webpage):
anvplayer_data = unescapeHTML(json.loads(mobj.group('anvp'))) or {}
video_id, access_key = anvplayer_data.get('video'), anvplayer_data.get('accessKey')
if not access_key:
access_key = cls._MCP_TO_ACCESS_KEY_TABLE.get((anvplayer_data.get('mcp') or '').lower())
if not (video_id or '').isdigit() or not access_key:
continue
entries.append(ie.url_result(
'anvato:%s:%s' % (access_key, video), ie=AnvatoIE.ie_key(),
video_id=video))
return entries
yield cls.url_result(f'anvato:{access_key}:{video_id}', AnvatoIE, video_id)

def _extract_anvato_videos(self, webpage, video_id):
anvplayer_data = self._parse_json(
Expand Down
11 changes: 1 addition & 10 deletions yt_dlp/extractor/apa.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
import re

from .common import InfoExtractor
from ..utils import (
determine_ext,
Expand All @@ -10,6 +8,7 @@

class APAIE(InfoExtractor):
_VALID_URL = r'(?P<base_url>https?://[^/]+\.apa\.at)/embed/(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'
_EMBED_REGEX = [r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//[^/]+\.apa\.at/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}.*?)\1']
_TESTS = [{
'url': 'http://uvp.apa.at/embed/293f6d17-692a-44e3-9fd5-7b178f3a1029',
'md5': '2b12292faeb0a7d930c778c7a5b4759b',
Expand All @@ -30,14 +29,6 @@ class APAIE(InfoExtractor):
'only_matching': True,
}]

@staticmethod
def _extract_urls(webpage):
return [
mobj.group('url')
for mobj in re.finditer(
r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//[^/]+\.apa\.at/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}.*?)\1',
webpage)]

def _real_extract(self, url):
mobj = self._match_valid_url(url)
video_id, base_url = mobj.group('id', 'base_url')
Expand Down