Skip to content

Commit

Permalink
[extractor] Framework for embed detection (#4307)
Browse files Browse the repository at this point in the history
  • Loading branch information
pukkandan committed Aug 1, 2022
1 parent 47304e0 commit 8f97a15
Show file tree
Hide file tree
Showing 8 changed files with 149 additions and 77 deletions.
6 changes: 4 additions & 2 deletions devscripts/lazy_load_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,13 @@
write_string,
)

# These bloat the lazy_extractors, so allow them to passthrough silently
ALLOWED_CLASSMETHODS = {'get_testcases', 'extract_from_webpage'}


class LazyLoadMetaClass(type):
def __getattr__(cls, name):
# "_TESTS" bloat the lazy_extractors
if '_real_class' not in cls.__dict__ and name != 'get_testcases':
if '_real_class' not in cls.__dict__ and name not in ALLOWED_CLASSMETHODS:
write_string(
'WARNING: Falling back to normal extractor since lazy extractor '
f'{cls.__name__} does not have attribute {name}{bug_reports_message()}\n')
Expand Down
7 changes: 1 addition & 6 deletions devscripts/make_lazy_extractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from inspect import getsource

NO_ATTR = object()
STATIC_CLASS_PROPERTIES = ['IE_NAME', 'IE_DESC', 'SEARCH_KEY', '_WORKING', '_NETRC_MACHINE', 'age_limit']
STATIC_CLASS_PROPERTIES = ['IE_NAME', 'IE_DESC', 'SEARCH_KEY', '_VALID_URL', '_WORKING', '_NETRC_MACHINE', 'age_limit']
CLASS_METHODS = [
'ie_key', 'working', 'description', 'suitable', '_match_valid_url', '_match_id', 'get_temp_id', 'is_suitable'
]
Expand Down Expand Up @@ -116,11 +116,6 @@ def build_lazy_ie(ie, name, attr_base):
}.get(base.__name__, base.__name__) for base in ie.__bases__)

s = IE_TEMPLATE.format(name=name, module=ie.__module__, bases=bases)
valid_url = getattr(ie, '_VALID_URL', None)
if not valid_url and hasattr(ie, '_make_valid_url'):
valid_url = ie._make_valid_url()
if valid_url:
s += f' _VALID_URL = {valid_url!r}\n'
return s + '\n'.join(extra_ie_code(ie, attr_base))


Expand Down
3 changes: 2 additions & 1 deletion yt_dlp/YoutubeDL.py
Original file line number Diff line number Diff line change
Expand Up @@ -1566,7 +1566,8 @@ def process_ie_result(self, ie_result, download=True, extra_info=None):
result_type = ie_result.get('_type', 'video')

if result_type in ('url', 'url_transparent'):
ie_result['url'] = sanitize_url(ie_result['url'])
ie_result['url'] = sanitize_url(
ie_result['url'], scheme='http' if self.params.get('prefer_insecure') else 'https')
if ie_result.get('original_url'):
extra_info.setdefault('original_url', ie_result['original_url'])

Expand Down
4 changes: 2 additions & 2 deletions yt_dlp/extractor/brightcove.py
Original file line number Diff line number Diff line change
Expand Up @@ -402,11 +402,11 @@ class BrightcoveNewIE(AdobePassIE):

@staticmethod
def _extract_url(ie, webpage):
urls = BrightcoveNewIE._extract_urls(ie, webpage)
urls = BrightcoveNewIE._extract_brightcove_urls(ie, webpage)
return urls[0] if urls else None

@staticmethod
def _extract_urls(ie, webpage):
def _extract_brightcove_urls(ie, webpage):
# Reference:
# 1. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideoiniframe
# 2. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#tag
Expand Down
99 changes: 77 additions & 22 deletions yt_dlp/extractor/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import re
import sys
import time
import types
import urllib.parse
import urllib.request
import xml.etree.ElementTree
Expand All @@ -23,6 +24,7 @@
from ..downloader import FileDownloader
from ..downloader.f4m import get_base_url, remove_encrypted_media
from ..utils import (
IDENTITY,
JSON_LD_RE,
NO_DEFAULT,
ExtractorError,
Expand Down Expand Up @@ -59,6 +61,7 @@
parse_m3u8_attributes,
parse_resolution,
sanitize_filename,
sanitize_url,
sanitized_Request,
str_or_none,
str_to_int,
Expand Down Expand Up @@ -431,14 +434,26 @@ class InfoExtractor:
title, description etc.
Subclasses of this should define a _VALID_URL regexp and, re-define the
_real_extract() and (optionally) _real_initialize() methods.
Probably, they should also be added to the list of extractors.
Subclasses of this should also be added to the list of extractors and
should define a _VALID_URL regexp and re-define the _real_extract() and
(optionally) _real_initialize() methods.
Subclasses may also override suitable() if necessary, but ensure the function
signature is preserved and that this function imports everything it needs
(except other extractors), so that lazy_extractors works correctly.
Subclasses can define a list of _EMBED_REGEX, which will be searched for in
the HTML of Generic webpages. It may also override _extract_embed_urls
or _extract_from_webpage as necessary. While these are normally classmethods,
_extract_from_webpage is allowed to be an instance method.
_extract_from_webpage may raise self.StopExtraction() to stop further
processing of the webpage and obtain exclusive rights to it. This is useful
when the extractor cannot reliably be matched using just the URL.
Eg: invidious/peertube instances
Embed-only extractors can be defined by setting _VALID_URL = False.
To support username + password (or netrc) login, the extractor must define a
_NETRC_MACHINE and re-define _perform_login(username, password) and
(optionally) _initialize_pre_login() methods. The _perform_login method will
Expand Down Expand Up @@ -476,6 +491,8 @@ class InfoExtractor:
_NETRC_MACHINE = None
IE_DESC = None
SEARCH_KEY = None
_VALID_URL = None
_EMBED_REGEX = []

def _login_hint(self, method=NO_DEFAULT, netrc=None):
password_hint = f'--username and --password, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'
Expand All @@ -499,12 +516,12 @@ def __init__(self, downloader=None):

@classmethod
def _match_valid_url(cls, url):
if cls._VALID_URL is False:
return None
# This does not use has/getattr intentionally - we want to know whether
# we have cached the regexp for *this* class, whereas getattr would also
# match the superclass
if '_VALID_URL_RE' not in cls.__dict__:
if '_VALID_URL' not in cls.__dict__:
cls._VALID_URL = cls._make_valid_url()
cls._VALID_URL_RE = re.compile(cls._VALID_URL)
return cls._VALID_URL_RE.match(url)

Expand Down Expand Up @@ -1143,10 +1160,12 @@ def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent
'url': url,
}

def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None, video_kwargs=None, **kwargs):
urls = (self.url_result(self._proto_relative_url(m), ie, **(video_kwargs or {}))
for m in orderedSet(map(getter, matches) if getter else matches))
return self.playlist_result(urls, playlist_id, playlist_title, **kwargs)
@classmethod
def playlist_from_matches(cls, matches, playlist_id=None, playlist_title=None,
                          getter=IDENTITY, ie=None, video_kwargs=None, **kwargs):
    """Build a playlist result from an iterable of matched URLs.

    Each match is passed through ``getter``, lazily de-duplicated, and
    wrapped into a url_result entry; remaining kwargs are forwarded to
    playlist_result.
    """
    entry_kwargs = video_kwargs or {}
    entries = (cls.url_result(match, ie, **entry_kwargs)
               for match in orderedSet(map(getter, matches), lazy=True))
    return cls.playlist_result(entries, playlist_id, playlist_title, **kwargs)

@staticmethod
def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
Expand Down Expand Up @@ -1353,12 +1372,20 @@ def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs
def _dc_search_uploader(self, html):
return self._html_search_meta('dc.creator', html, 'uploader')

def _rta_search(self, html):
@staticmethod
def _rta_search(html):
# See http://www.rtalabel.org/index.php?content=howtofaq#single
if re.search(r'(?ix)<meta\s+name="rating"\s+'
r' content="RTA-5042-1996-1400-1577-RTA"',
html):
return 18

# And then there are the jokers who advertise that they use RTA, but actually don't.
AGE_LIMIT_MARKERS = [
r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',
]
if any(re.search(marker, html) for marker in AGE_LIMIT_MARKERS):
return 18
return 0

def _media_rating_search(self, html):
Expand Down Expand Up @@ -1965,14 +1992,9 @@ def http_scheme(self):
else 'https:')

def _proto_relative_url(self, url, scheme=None):
if url is None:
return url
if url.startswith('//'):
if scheme is None:
scheme = self.http_scheme()
return scheme + url
else:
return url
scheme = scheme or self.http_scheme()
assert scheme.endswith(':')
return sanitize_url(url, scheme=scheme[:-1])

def _sleep(self, timeout, video_id, msg_template=None):
if msg_template is None:
Expand Down Expand Up @@ -3767,10 +3789,12 @@ def geo_verification_headers(self):
headers['Ytdl-request-proxy'] = geo_verification_proxy
return headers

def _generic_id(self, url):
@staticmethod
def _generic_id(url):
return urllib.parse.unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])

def _generic_title(self, url):
@staticmethod
def _generic_title(url):
    """Derive a best-effort title from the basename of *url*
    (extension stripped, percent-escapes decoded)."""
    stem, _ext = os.path.splitext(url_basename(url))
    return urllib.parse.unquote(stem)

@staticmethod
Expand Down Expand Up @@ -3816,6 +3840,37 @@ def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_l
self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
return True

@classmethod
def extract_from_webpage(cls, ydl, url, webpage):
    """Yield results that this extractor can pull out of *webpage*.

    Dispatches to _extract_from_webpage; when that is defined as an
    instance method (allowed per the class docstring), a real extractor
    instance is obtained from *ydl* instead of using the bare class.
    """
    if isinstance(cls._extract_from_webpage, types.MethodType):
        extractor = cls
    else:
        extractor = ydl.get_info_extractor(cls.ie_key())
    yield from extractor._extract_from_webpage(url, webpage) or []

@classmethod
def _extract_from_webpage(cls, url, webpage):
    """Yield a url_result entry for every embed URL found on the webpage."""
    embed_urls = cls._extract_embed_urls(url, webpage) or []
    for embed_url in orderedSet(embed_urls, lazy=True):
        yield cls.url_result(embed_url, cls)

@classmethod
def _extract_embed_urls(cls, url, webpage):
    """@returns all the embed urls on the webpage"""
    # Compile and cache _EMBED_REGEX on first use; the cache lives in this
    # class's own __dict__ so subclasses get their own compiled tuple
    if '_EMBED_URL_RE' not in cls.__dict__:
        assert isinstance(cls._EMBED_REGEX, (list, tuple))
        for idx, regex in enumerate(cls._EMBED_REGEX):
            assert regex.count('(?P<url>') == 1, \
                f'{cls.__name__}._EMBED_REGEX[{idx}] must have exactly 1 url group\n\t{regex}'
        cls._EMBED_URL_RE = tuple(re.compile(pattern) for pattern in cls._EMBED_REGEX)

    for pattern in cls._EMBED_URL_RE:
        for match in pattern.finditer(webpage):
            candidate = urllib.parse.urljoin(url, unescapeHTML(match.group('url')))
            # Embed-only extractors (_VALID_URL is False) accept everything;
            # otherwise the candidate must pass the extractor's own suitable()
            if cls._VALID_URL is False or cls.suitable(candidate):
                yield candidate

class StopExtraction(Exception):
    # Raised from _extract_from_webpage to stop further processing of the
    # webpage and give the raising extractor exclusive rights to it
    pass


class SearchInfoExtractor(InfoExtractor):
"""
Expand All @@ -3826,8 +3881,8 @@ class SearchInfoExtractor(InfoExtractor):

_MAX_RESULTS = float('inf')

@classmethod
def _make_valid_url(cls):
@classproperty
def _VALID_URL(cls):
    # Matches "<SEARCH_KEY><prefix>:<query>" where prefix is empty (single
    # result), a positive result count, or "all"
    return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

def _real_extract(self, query):
Expand Down

0 comments on commit 8f97a15

Please sign in to comment.