yt-dlp · pukkandan · Aug 1, 2022 · Aug 1, 2022 · Jul 8, 2022 · Aug 1, 2022
diff --git a/README.md b/README.md
@@ -138,7 +138,6 @@ Some of yt-dlp's default options are different from that of youtube-dl and youtu
 * Some metadata are embedded into different fields when using `--add-metadata` as compared to youtube-dl. Most notably, `comment` field contains the `webpage_url` and `synopsis` contains the `description`. You can [use `--parse-metadata`](#modifying-metadata) to modify this to your liking or use `--compat-options embed-metadata` to revert this
 * `playlist_index` behaves differently when used with options like `--playlist-reverse` and `--playlist-items`. See [#302](https://github.com/yt-dlp/yt-dlp/issues/302) for details. You can use `--compat-options playlist-index` if you want to keep the earlier behavior
 * The output of `-F` is listed in a new format. Use `--compat-options list-formats` to revert this
-* All *experiences* of a funimation episode are considered as a single video. This behavior breaks existing archives. Use `--compat-options seperate-video-versions` to extract information from only the default player
 * Live chats (if available) are considered as subtitles. Use `--sub-langs all,-live_chat` to download all subtitles except live chat. You can also use `--compat-options no-live-chat` to prevent any live chat/danmaku from downloading
 * Youtube channel URLs are automatically redirected to `/video`. Append a `/featured` to the URL to download only the videos in the home page. If the channel does not have a videos tab, we try to download the equivalent `UU` playlist instead. For all other tabs, if the channel does not show the requested tab, an error will be raised. Also, `/live` URLs raise an error if there are no live videos instead of silently downloading the entire channel. You may use `--compat-options no-youtube-channel-redirect` to revert all these redirections
 * Unavailable videos are also listed for youtube playlists. Use `--compat-options no-youtube-unavailable-videos` to remove this
@@ -2132,6 +2131,7 @@ These options may no longer work as intended
     --no-include-ads                 Default
     --write-annotations              No supported site has annotations now
     --no-write-annotations           Default
+    --compat-options seperate-video-versions  No longer needed
 
 #### Removed
 These options were deprecated since 2014 and have now been entirely removed

diff --git a/devscripts/lazy_load_template.py b/devscripts/lazy_load_template.py
@@ -9,11 +9,13 @@
     write_string,
 )
 
+# These bloat the lazy_extractors, so allow them to pass through silently
+ALLOWED_CLASSMETHODS = {'get_testcases', 'extract_from_webpage'}
+
 
 class LazyLoadMetaClass(type):
     def __getattr__(cls, name):
-        # "_TESTS" bloat the lazy_extractors
-        if '_real_class' not in cls.__dict__ and name != 'get_testcases':
+        if '_real_class' not in cls.__dict__ and name not in ALLOWED_CLASSMETHODS:
             write_string(
                 'WARNING: Falling back to normal extractor since lazy extractor '
                 f'{cls.__name__} does not have attribute {name}{bug_reports_message()}\n')

diff --git a/devscripts/make_lazy_extractors.py b/devscripts/make_lazy_extractors.py
@@ -11,7 +11,7 @@
 from inspect import getsource
 
 NO_ATTR = object()
-STATIC_CLASS_PROPERTIES = ['IE_NAME', 'IE_DESC', 'SEARCH_KEY', '_WORKING', '_NETRC_MACHINE', 'age_limit']
+STATIC_CLASS_PROPERTIES = ['IE_NAME', 'IE_DESC', 'SEARCH_KEY', '_VALID_URL', '_WORKING', '_NETRC_MACHINE', 'age_limit']
 CLASS_METHODS = [
     'ie_key', 'working', 'description', 'suitable', '_match_valid_url', '_match_id', 'get_temp_id', 'is_suitable'
 ]
@@ -116,11 +116,6 @@ def build_lazy_ie(ie, name, attr_base):
     }.get(base.__name__, base.__name__) for base in ie.__bases__)
 
     s = IE_TEMPLATE.format(name=name, module=ie.__module__, bases=bases)
-    valid_url = getattr(ie, '_VALID_URL', None)
-    if not valid_url and hasattr(ie, '_make_valid_url'):
-        valid_url = ie._make_valid_url()
-    if valid_url:
-        s += f'    _VALID_URL = {valid_url!r}\n'
     return s + '\n'.join(extra_ie_code(ie, attr_base))
 
 

diff --git a/test/helper.py b/test/helper.py
@@ -92,6 +92,18 @@ def gettestcases(include_onlymatching=False):
         yield from ie.get_testcases(include_onlymatching)
 
 
+def getwebpagetestcases():
+    """Yield the webpage test cases of every extractor, each marked as additionally needing the Generic IE"""
+    for ie in yt_dlp.extractor.gen_extractors():
+        for tc in ie.get_webpage_testcases():
+            # The test case dict is modified in place, so do not add 'Generic'
+            # again if it is already listed (e.g. when the cases are generated twice)
+            add_ie = tc.setdefault('add_ie', [])
+            if 'Generic' not in add_ie:
+                add_ie.append('Generic')
+            yield tc
+
+
 md5 = lambda s: hashlib.md5(s.encode()).hexdigest()
 
 

diff --git a/test/test_download.py b/test/test_download.py
@@ -8,6 +8,7 @@
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
 
+import collections
 import hashlib
 import http.client
 import json
@@ -20,6 +21,7 @@
     expect_warnings,
     get_params,
     gettestcases,
+    getwebpagetestcases,
     is_download_test,
     report_warning,
     try_rm,
@@ -32,6 +34,7 @@
     ExtractorError,
     UnavailableVideoError,
     format_bytes,
+    join_nonempty,
 )
 
 RETRIES = 3
@@ -57,7 +60,9 @@ def _file_md5(fn):
         return hashlib.md5(f.read()).hexdigest()
 
 
-defs = gettestcases()
+normal_test_cases = gettestcases()
+webpage_test_cases = getwebpagetestcases()
+tests_counter = collections.defaultdict(collections.Counter)
 
 
 @is_download_test
@@ -72,24 +77,13 @@ class TestDownload(unittest.TestCase):
 
     def __str__(self):
         """Identify each test with the `add_ie` attribute, if available."""
+        cls, add_ie = type(self), getattr(self, self._testMethodName).add_ie
+        return f'{self._testMethodName} ({cls.__module__}.{cls.__name__}){f" [{add_ie}]" if add_ie else ""}:'
 
-        def strclass(cls):
-            """From 2.7's unittest; 2.6 had _strclass so we can't import it."""
-            return f'{cls.__module__}.{cls.__name__}'
-
-        add_ie = getattr(self, self._testMethodName).add_ie
-        return '%s (%s)%s:' % (self._testMethodName,
-                               strclass(self.__class__),
-                               ' [%s]' % add_ie if add_ie else '')
-
-    def setUp(self):
-        self.defs = defs
 
 # Dynamically generate tests
 
-
 def generator(test_case, tname):
-
     def test_template(self):
         if self.COMPLETED_TESTS.get(tname):
             return
@@ -255,39 +249,50 @@ def try_rm_tcs_files(tcs=None):
 
 
 # And add them to TestDownload
-tests_counter = {}
-for test_case in defs:
-    name = test_case['name']
-    i = tests_counter.get(name, 0)
-    tests_counter[name] = i + 1
-    tname = f'test_{name}_{i}' if i else f'test_{name}'
-    test_method = generator(test_case, tname)
-    test_method.__name__ = str(tname)
-    ie_list = test_case.get('add_ie')
-    test_method.add_ie = ie_list and ','.join(ie_list)
-    setattr(TestDownload, test_method.__name__, test_method)
-    del test_method
+def inject_tests(test_cases, label=''):
+    """Create a test method for every test case and attach it to TestDownload
+
+    The method name is built by join_nonempty from 'test', the test case name,
+    `label` and a running index kept per (name, label) in tests_counter
+    (presumably empty parts such as index 0 are dropped - confirm in utils)
+    """
+    for test_case in test_cases:
+        name = test_case['name']
+        tname = join_nonempty('test', name, label, tests_counter[name][label], delim='_')
+        tests_counter[name][label] += 1
 
+        test_method = generator(test_case, tname)
+        test_method.__name__ = tname
+        test_method.add_ie = ','.join(test_case.get('add_ie', []))
+        setattr(TestDownload, test_method.__name__, test_method)
 
-def batch_generator(name, num_tests):
 
+inject_tests(normal_test_cases)
+
+# TODO: disable redirection to the IE to ensure we are actually testing the webpage extraction
+inject_tests(webpage_test_cases, 'webpage')
+
+
+def batch_generator(name):
+    """Return a test method that runs every generated test of `name`, printing the ones that are skipped"""
     def test_template(self):
-        for i in range(num_tests):
-            test_name = f'test_{name}_{i}' if i else f'test_{name}'
-            try:
-                getattr(self, test_name)()
-            except unittest.SkipTest:
-                print(f'Skipped {test_name}')
+        for label, num_tests in tests_counter[name].items():
+            for i in range(num_tests):
+                test_name = join_nonempty('test', name, label, i, delim='_')
+                try:
+                    getattr(self, test_name)()
+                except unittest.SkipTest:
+                    print(f'Skipped {test_name}')
 
     return test_template
 
 
-for name, num_tests in tests_counter.items():
-    test_method = batch_generator(name, num_tests)
+for name in tests_counter:
+    test_method = batch_generator(name)
     test_method.__name__ = f'test_{name}_all'
     test_method.add_ie = ''
     setattr(TestDownload, test_method.__name__, test_method)
     del test_method
 
 
 if __name__ == '__main__':

diff --git a/test/test_http.py b/test/test_http.py
@@ -85,7 +85,7 @@ def test_nocheckcertificate(self):
 
         ydl = YoutubeDL({'logger': FakeLogger(), 'nocheckcertificate': True})
         r = ydl.extract_info('https://127.0.0.1:%d/video.html' % self.port)
-        self.assertEqual(r['entries'][0]['url'], 'https://127.0.0.1:%d/vid.mp4' % self.port)
+        self.assertEqual(r['url'], 'https://127.0.0.1:%d/vid.mp4' % self.port)
 
 
 class TestClientCert(unittest.TestCase):
@@ -113,7 +113,7 @@ def _run_test(self, **params):
             **params,
         })
         r = ydl.extract_info('https://127.0.0.1:%d/video.html' % self.port)
-        self.assertEqual(r['entries'][0]['url'], 'https://127.0.0.1:%d/vid.mp4' % self.port)
+        self.assertEqual(r['url'], 'https://127.0.0.1:%d/vid.mp4' % self.port)
 
     def test_certificate_combined_nopass(self):
         self._run_test(client_certificate=os.path.join(self.certdir, 'clientwithkey.crt'))

diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py
@@ -1566,7 +1566,8 @@ def process_ie_result(self, ie_result, download=True, extra_info=None):
         result_type = ie_result.get('_type', 'video')
 
         if result_type in ('url', 'url_transparent'):
-            ie_result['url'] = sanitize_url(ie_result['url'])
+            ie_result['url'] = sanitize_url(
+                ie_result['url'], scheme='http' if self.params.get('prefer_insecure') else 'https')
             if ie_result.get('original_url'):
                 extra_info.setdefault('original_url', ie_result['original_url'])
 
@@ -3454,11 +3455,9 @@ def in_download_archive(self, info_dict):
         if fn is None:
             return False
 
-        vid_id = self._make_archive_id(info_dict)
-        if not vid_id:
-            return False  # Incomplete video information
-
-        return vid_id in self.archive
+        vid_ids = [self._make_archive_id(info_dict)]
+        vid_ids.extend(info_dict.get('_old_archive_ids', []))
+        return any(id_ in self.archive for id_ in vid_ids)
 
     def record_download_archive(self, info_dict):
         fn = self.params.get('download_archive')

diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py
@@ -219,6 +219,7 @@
     CamdemyFolderIE
 )
 from .cammodels import CamModelsIE
+from .camtasia import CamtasiaEmbedIE
 from .camwithher import CamWithHerIE
 from .canalalpha import CanalAlphaIE
 from .canalplus import CanalplusIE
@@ -445,7 +446,7 @@
     DWIE,
     DWArticleIE,
 )
-from .eagleplatform import EaglePlatformIE
+from .eagleplatform import EaglePlatformIE, ClipYouEmbedIE
 from .ebaumsworld import EbaumsWorldIE
 from .echomsk import EchoMskIE
 from .egghead import (
@@ -661,6 +662,7 @@
     HSEShowIE,
     HSEProductIE,
 )
+from .genericembeds import HTML5MediaEmbedIE
 from .huajiao import HuajiaoIE
 from .huya import HuyaLiveIE
 from .huffpost import HuffPostIE
@@ -1553,6 +1555,7 @@
     SharedIE,
     VivoIE,
 )
+from .sharevideos import ShareVideosEmbedIE
 from .shemaroome import ShemarooMeIE
 from .showroomlive import ShowRoomLiveIE
 from .simplecast import (

diff --git a/yt_dlp/extractor/adobetv.py b/yt_dlp/extractor/adobetv.py
@@ -232,6 +232,8 @@ def _real_extract(self, url):
 class AdobeTVVideoIE(AdobeTVBaseIE):
     IE_NAME = 'adobetv:video'
     _VALID_URL = r'https?://video\.tv\.adobe\.com/v/(?P<id>\d+)'
+    # The URL ends at the same quote character that opened the `src` attribute
+    _EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//video\.tv\.adobe\.com/v/\d+(?:(?!\1).)*)\1']
 
     _TEST = {
         # From https://helpx.adobe.com/acrobat/how-to/new-experience-acrobat-dc.html?set=acrobat--get-started--essential-beginners

diff --git a/yt_dlp/extractor/ant1newsgr.py b/yt_dlp/extractor/ant1newsgr.py
@@ -1,4 +1,3 @@
-import re
 import urllib.parse
 
 from .common import InfoExtractor
@@ -7,7 +6,6 @@
     ExtractorError,
     determine_ext,
     scale_thumbnails_to_max_format_width,
-    unescapeHTML,
 )
 
 
@@ -91,7 +89,7 @@ def _real_extract(self, url):
         video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)
         info = self._search_json_ld(webpage, video_id, expected_type='NewsArticle')
-        embed_urls = list(Ant1NewsGrEmbedIE._extract_urls(webpage))
+        embed_urls = list(Ant1NewsGrEmbedIE._extract_embed_urls(url, webpage))
         if not embed_urls:
             raise ExtractorError('no videos found for %s' % video_id, expected=True)
         return self.playlist_from_matches(
@@ -104,6 +102,7 @@ class Ant1NewsGrEmbedIE(Ant1NewsGrBaseIE):
     IE_DESC = 'ant1news.gr embedded videos'
     _BASE_PLAYER_URL_RE = r'(?:https?:)?//(?:[a-zA-Z0-9\-]+\.)?(?:antenna|ant1news)\.gr/templates/pages/player'
     _VALID_URL = rf'{_BASE_PLAYER_URL_RE}\?([^#]+&)?cid=(?P<id>[^#&]+)'
+    _EMBED_REGEX = [rf'<iframe[^>]+?src=(?P<_q1>["\'])(?P<url>{_BASE_PLAYER_URL_RE}\?(?:(?!(?P=_q1)).)+)(?P=_q1)']
     _API_PATH = '/news/templates/data/jsonPlayer'
 
     _TESTS = [{
@@ -117,16 +116,6 @@ class Ant1NewsGrEmbedIE(Ant1NewsGrBaseIE):
         },
     }]
 
-    @classmethod
-    def _extract_urls(cls, webpage):
-        _EMBED_URL_RE = rf'{cls._BASE_PLAYER_URL_RE}\?(?:(?!(?P=_q1)).)+'
-        _EMBED_RE = rf'<iframe[^>]+?src=(?P<_q1>["\'])(?P<url>{_EMBED_URL_RE})(?P=_q1)'
-        for mobj in re.finditer(_EMBED_RE, webpage):
-            url = unescapeHTML(mobj.group('url'))
-            if not cls.suitable(url):
-                continue
-            yield url
-
     def _real_extract(self, url):
         video_id = self._match_id(url)
 

diff --git a/yt_dlp/extractor/anvato.py b/yt_dlp/extractor/anvato.py
@@ -340,30 +340,23 @@ def _get_anvato_videos(self, access_key, video_id):
             'subtitles': subtitles,
         }
 
-    @staticmethod
-    def _extract_urls(ie, webpage, video_id):
-        entries = []
-        for mobj in re.finditer(AnvatoIE._ANVP_RE, webpage):
-            anvplayer_data = ie._parse_json(
-                mobj.group('anvp'), video_id, transform_source=unescapeHTML,
-                fatal=False)
-            if not anvplayer_data:
-                continue
-            video = anvplayer_data.get('video')
-            if not isinstance(video, compat_str) or not video.isdigit():
-                continue
-            access_key = anvplayer_data.get('accessKey')
-            if not access_key:
-                mcp = anvplayer_data.get('mcp')
-                if mcp:
-                    access_key = AnvatoIE._MCP_TO_ACCESS_KEY_TABLE.get(
-                        mcp.lower())
+    @classmethod
+    def _extract_from_webpage(cls, url, webpage):
+        """Yield an Anvato url_result for every player config matched by _ANVP_RE in the webpage"""
+        for mobj in re.finditer(cls._ANVP_RE, webpage):
+            try:
+                # Unescape before parsing, as the old transform_source=unescapeHTML did
+                anvplayer_data = json.loads(unescapeHTML(mobj.group('anvp')))
+            except ValueError:  # Malformed JSON; skip this embed like the old fatal=False did
+                continue
+            if not isinstance(anvplayer_data, dict):
+                continue
+            video_id, access_key = anvplayer_data.get('video'), anvplayer_data.get('accessKey')
             if not access_key:
+                access_key = cls._MCP_TO_ACCESS_KEY_TABLE.get((anvplayer_data.get('mcp') or '').lower())
+            if not isinstance(video_id, str) or not video_id.isdigit() or not access_key:
                 continue
-            entries.append(ie.url_result(
-                'anvato:%s:%s' % (access_key, video), ie=AnvatoIE.ie_key(),
-                video_id=video))
-        return entries
+            yield cls.url_result(f'anvato:{access_key}:{video_id}', AnvatoIE, video_id)
 
     def _extract_anvato_videos(self, webpage, video_id):
         anvplayer_data = self._parse_json(

diff --git a/yt_dlp/extractor/apa.py b/yt_dlp/extractor/apa.py
@@ -1,5 +1,3 @@
-import re
-
 from .common import InfoExtractor
 from ..utils import (
     determine_ext,
@@ -10,6 +8,7 @@
 
 class APAIE(InfoExtractor):
     _VALID_URL = r'(?P<base_url>https?://[^/]+\.apa\.at)/embed/(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'
+    _EMBED_REGEX = [r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//[^/]+\.apa\.at/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}.*?)\1']
     _TESTS = [{
         'url': 'http://uvp.apa.at/embed/293f6d17-692a-44e3-9fd5-7b178f3a1029',
         'md5': '2b12292faeb0a7d930c778c7a5b4759b',
@@ -30,14 +29,6 @@ class APAIE(InfoExtractor):
         'only_matching': True,
     }]
 
-    @staticmethod
-    def _extract_urls(webpage):
-        return [
-            mobj.group('url')
-            for mobj in re.finditer(
-                r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//[^/]+\.apa\.at/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}.*?)\1',
-                webpage)]
-
     def _real_extract(self, url):
         mobj = self._match_valid_url(url)
         video_id, base_url = mobj.group('id', 'base_url')