[Misc] Under-the-hood fixes, improvements and tests (pt 3) #32490

Merged · 4 commits · Aug 1, 2023
2 changes: 1 addition & 1 deletion youtube_dl/compat.py
@@ -36,7 +36,7 @@
     )
 except NameError:
     compat_str, compat_basestring, compat_chr = (
-        str, str, chr
+        str, (str, bytes), chr
     )

 # casefold
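
Reviewer note (illustration, not part of the patch): with this change compat_basestring becomes (str, bytes) on Python 3, mirroring Python 2's basestring, so isinstance() checks against compat_basestring accept both text and byte strings. A minimal sketch of the effect:

# Illustration only: isinstance() against the new compat_basestring.
# On Python 3, compat_basestring == (str, bytes); on Python 2 it stays basestring.
from youtube_dl.compat import compat_basestring

for value in ('text', b'bytes', 42):
    # 'text' and b'bytes' match; 42 does not
    print(type(value).__name__, isinstance(value, compat_basestring))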
2 changes: 1 addition & 1 deletion youtube_dl/swfinterp.py
@@ -727,7 +727,7 @@ def resfunc(args):
                         stack.append(res)
                         continue

-                    assert isinstance(obj, (dict, _ScopeDict)),\
+                    assert isinstance(obj, (dict, _ScopeDict)), \
                         'Accessing member %r on %r' % (pname, obj)
                     res = obj.get(pname, undefined)
                     stack.append(res)
177 changes: 89 additions & 88 deletions youtube_dl/utils.py
@@ -1678,9 +1678,7 @@ def random_user_agent():

 std_headers = {
     'User-Agent': random_user_agent(),
-    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-    'Accept-Encoding': 'gzip, deflate',
     'Accept-Language': 'en-us,en;q=0.5',
 }

@@ -1826,11 +1824,11 @@ def write_json_file(obj, fn):
     if sys.version_info < (3, 0) and sys.platform != 'win32':
         encoding = get_filesystem_encoding()
         # os.path.basename returns a bytes object, but NamedTemporaryFile
-        # will fail if the filename contains non ascii characters unless we
+        # will fail if the filename contains non-ascii characters unless we
         # use a unicode object
-        path_basename = lambda f: os.path.basename(fn).decode(encoding)
+        path_basename = lambda f: os.path.basename(f).decode(encoding)
         # the same for os.path.dirname
-        path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
+        path_dirname = lambda f: os.path.dirname(f).decode(encoding)
     else:
         path_basename = os.path.basename
         path_dirname = os.path.dirname
@@ -1894,10 +1892,10 @@ def find_xpath_attr(node, xpath, key, val=None):
             return f
     return None

+
 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 # the namespace parameter

-
 def xpath_with_ns(path, ns_map):
     components = [c.split(':') for c in path.split('/')]
     replaced = []
@@ -1914,7 +1912,7 @@ def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
     def _find_xpath(xpath):
         return node.find(compat_xpath(xpath))

-    if isinstance(xpath, (str, compat_str)):
+    if isinstance(xpath, compat_basestring):
         n = _find_xpath(xpath)
     else:
         for xp in xpath:
@@ -2235,7 +2233,7 @@ def _htmlentity_transform(entity_with_semicolon):
 def unescapeHTML(s):
     if s is None:
         return None
-    assert type(s) == compat_str
+    assert isinstance(s, compat_str)

     return re.sub(
         r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
@@ -2262,39 +2260,32 @@ def get_subprocess_encoding():
     return encoding


-def encodeFilename(s, for_subprocess=False):
-    """
-    @param s The name of the file
-    """
-
-    assert type(s) == compat_str
-
-    # Python 3 has a Unicode API
-    if sys.version_info >= (3, 0):
-        return s
-
-    # Pass '' directly to use Unicode APIs on Windows 2000 and up
-    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
-    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
-    if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
-        return s
-
-    # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
-    if sys.platform.startswith('java'):
-        return s
-
-    return s.encode(get_subprocess_encoding(), 'ignore')
-
-
-def decodeFilename(b, for_subprocess=False):
-
-    if sys.version_info >= (3, 0):
-        return b
-
-    if not isinstance(b, bytes):
-        return b
-
-    return b.decode(get_subprocess_encoding(), 'ignore')
+# Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
+if sys.version_info < (3, 0) and not sys.platform.startswith('java'):
+
+    def encodeFilename(s, for_subprocess=False):
+        """
+        @param s The name of the file
+        """
+
+        # Pass '' directly to use Unicode APIs on Windows 2000 and up
+        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
+        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
+        if (not for_subprocess
+                and sys.platform == 'win32'
+                and sys.getwindowsversion()[0] >= 5
+                and isinstance(s, compat_str)):
+            return s
+
+        return _encode_compat_str(s, get_subprocess_encoding(), 'ignore')
+
+    def decodeFilename(b, for_subprocess=False):
+        return _decode_compat_str(b, get_subprocess_encoding(), 'ignore')
+
+else:
+
+    # Python 3 has a Unicode API
+    encodeFilename = decodeFilename = lambda *s, **k: s[0]


 def encodeArgument(s):
@@ -2313,11 +2304,7 @@ def decodeArgument(b):
 def decodeOption(optval):
     if optval is None:
         return optval
-    if isinstance(optval, bytes):
-        optval = optval.decode(preferredencoding())
-
-    assert isinstance(optval, compat_str)
-    return optval
+    return _decode_compat_str(optval)


 def formatSeconds(secs):
@@ -2363,7 +2350,7 @@ def set_alpn_protocols(ctx):

     if sys.version_info < (3, 2):
         return YoutubeDLHTTPSHandler(params, **kwargs)
-    else:  # Python < 3.4
+    else:  # Python3 < 3.4
         context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
         context.verify_mode = (ssl.CERT_NONE
                                if opts_no_check_certificate
@@ -2597,7 +2584,7 @@ def handle_youtubedl_headers(headers):
     filtered_headers = headers

     if 'Youtubedl-no-compression' in filtered_headers:
-        filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
+        filtered_headers = filter_dict(filtered_headers, cndn=lambda k, _: k.lower() != 'accept-encoding')
         del filtered_headers['Youtubedl-no-compression']

     return filtered_headers
@@ -2735,6 +2722,13 @@ def http_request(self, req):
             if h.capitalize() not in req.headers:
                 req.add_header(h, v)

+        # Similarly, 'Accept-encoding'
+        if 'Accept-encoding' not in req.headers:
+            req.add_header(
+                'Accept-Encoding', join_nonempty(
+                    'gzip', 'deflate', brotli and 'br', ncompress and 'compress',
+                    delim=', '))
+
         req.headers = handle_youtubedl_headers(req.headers)

         if sys.version_info < (2, 7):
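
Reviewer note (illustration, not part of the patch): with Accept-Encoding dropped from std_headers above, this hunk builds the header per request instead. A minimal sketch of the value it should produce, assuming join_nonempty() skips falsy entries and joins the rest with the given delimiter; the join_nonempty stand-in and the brotli/ncompress flags below are local assumptions, not the module-level names used by utils.py:

# Illustration only: the kind of Accept-Encoding value the new code builds.
def join_nonempty(*values, **kwargs):
    # stand-in with the assumed semantics: drop falsy values, join the rest
    delim = kwargs.get('delim', '-')
    return delim.join(str(v) for v in values if v)


brotli = True       # hypothetical: brotli support available
ncompress = False   # hypothetical: ncompress not available

accept_encoding = join_nonempty(
    'gzip', 'deflate', brotli and 'br', ncompress and 'compress',
    delim=', ')
print(accept_encoding)  # gzip, deflate, br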
@@ -2818,8 +2812,7 @@ def encodings(hdrs):
                 location_escaped = escape_url(location_fixed)
                 if location != location_escaped:
                     del resp.headers['Location']
-                    # if sys.version_info < (3, 0):
-                    if not isinstance(location_escaped, str):
+                    if not isinstance(location_escaped, str):  # Py 2 case
                         location_escaped = location_escaped.encode('utf-8')
                     resp.headers['Location'] = location_escaped
         return resp
@@ -3086,8 +3079,7 @@ def redirect_request(self, req, fp, code, msg, headers, newurl):
         # On python 2 urlh.geturl() may sometimes return redirect URL
         # as a byte string instead of unicode. This workaround forces
         # it to return unicode.
-        if sys.version_info[0] < 3:
-            newurl = compat_str(newurl)
+        newurl = _decode_compat_str(newurl)

         # Be conciliant with URIs containing a space. This is mainly
         # redundant with the more complete encoding done in http_error_302(),
@@ -3115,9 +3107,7 @@ def redirect_request(self, req, fp, code, msg, headers, newurl):
             new_data = None
             remove_headers.extend(['Content-Length', 'Content-Type'])

-        # NB: don't use dict comprehension for python 2.6 compatibility
-        new_headers = dict((k, v) for k, v in req.headers.items()
-                           if k.title() not in remove_headers)
+        new_headers = filter_dict(req.headers, cndn=lambda k, _: k.title() not in remove_headers)

         return compat_urllib_request.Request(
             newurl, headers=new_headers, origin_req_host=req.origin_req_host,
@@ -3333,11 +3323,7 @@ def __eq__(self, other):
 def platform_name():
     """ Returns the platform name as a compat_str """
     res = platform.platform()
-    if isinstance(res, bytes):
-        res = res.decode(preferredencoding())
-
-    assert isinstance(res, compat_str)
-    return res
+    return _decode_compat_str(res)


 def _windows_write_string(s, out):
@@ -3418,7 +3404,7 @@ def next_nonbmp_pos(s):
 def write_string(s, out=None, encoding=None):
     if out is None:
         out = sys.stderr
-    assert type(s) == compat_str
+    assert isinstance(s, compat_str)

     if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
         if _windows_write_string(s, out):
@@ -3567,9 +3553,8 @@ def shell_quote(args):
     quoted_args = []
     encoding = get_filesystem_encoding()
     for a in args:
-        if isinstance(a, bytes):
-            # We may get a filename encoded with 'encodeFilename'
-            a = a.decode(encoding)
+        # We may get a filename encoded with 'encodeFilename'
+        a = _decode_compat_str(a, encoding)
         quoted_args.append(compat_shlex_quote(a))
     return ' '.join(quoted_args)

@@ -3733,8 +3718,9 @@ def parse_resolution(s):


 def parse_bitrate(s):
-    if not isinstance(s, compat_str):
-        return
+    s = txt_or_none(s)
+    if not s:
+        return None
     mobj = re.search(r'\b(\d+)\s*kbps', s)
     if mobj:
         return int(mobj.group(1))
@@ -3822,18 +3808,17 @@ def base_url(url):


 def urljoin(base, path):
-    if isinstance(path, bytes):
-        path = path.decode('utf-8')
-    if not isinstance(path, compat_str) or not path:
+    path = _decode_compat_str(path, encoding='utf-8', or_none=True)
+    if not path:
         return None
     if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
         return path
-    if isinstance(base, bytes):
-        base = base.decode('utf-8')
-    if not isinstance(base, compat_str) or not re.match(
-            r'^(?:https?:)?//', base):
+    base = _decode_compat_str(base, encoding='utf-8', or_none=True)
+    if not base:
         return None
-    return compat_urllib_parse.urljoin(base, path)
+    return (
+        re.match(r'^(?:https?:)?//', base)
+        and compat_urllib_parse.urljoin(base, path))


 class HEADRequest(compat_urllib_request.Request):
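
Reviewer note (illustration, not part of the patch): the rewritten urljoin() still returns None for an unusable path or base, and the final re.match(...) and ... expression keeps the old rule of only joining onto http(s) or protocol-relative bases. A small sketch of the expected results, based only on the logic visible in this hunk (run against a tree with the patch applied):

# Illustration only: expected behaviour of the rewritten urljoin().
from youtube_dl.utils import urljoin

print(urljoin('https://example.com/a/', 'b.mp4'))  # https://example.com/a/b.mp4
print(urljoin('https://example.com/a/', None))     # None (no usable path)
print(urljoin('ftp://example.com/a/', 'b.mp4'))    # None (base is not http(s) or //)
print(urljoin(None, '//example.com/b.mp4'))        # //example.com/b.mp4 (absolute path wins)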
@@ -3998,8 +3983,7 @@ def get_exe_version(exe, args=['--version'],
             stdout=subprocess.PIPE, stderr=subprocess.STDOUT))
     except OSError:
         return False
-    if isinstance(out, bytes):  # Python 2.x
-        out = out.decode('ascii', 'ignore')
+    out = _decode_compat_str(out, 'ascii', 'ignore')
     return detect_exe_version(out, version_re, unrecognized)


@@ -4218,8 +4202,8 @@ def lowercase_escape(s):

 def escape_rfc3986(s):
     """Escape non-ASCII characters as suggested by RFC 3986"""
-    if sys.version_info < (3, 0) and isinstance(s, compat_str):
-        s = s.encode('utf-8')
+    if sys.version_info < (3, 0):
+        s = _encode_compat_str(s, 'utf-8')
     # ensure unicode: after quoting, it can always be converted
     return compat_str(compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]"))

@@ -4242,8 +4226,7 @@ def parse_qs(url, **kwargs):

 def read_batch_urls(batch_fd):
     def fixup(url):
-        if not isinstance(url, compat_str):
-            url = url.decode('utf-8', 'replace')
+        url = _decode_compat_str(url, 'utf-8', 'replace')
         BOM_UTF8 = '\xef\xbb\xbf'
         if url.startswith(BOM_UTF8):
             url = url[len(BOM_UTF8):]
@@ -4305,10 +4288,8 @@ def _multipart_encode_impl(data, boundary):
     out = b''
     for k, v in data.items():
         out += b'--' + boundary.encode('ascii') + b'\r\n'
-        if isinstance(k, compat_str):
-            k = k.encode('utf-8')
-        if isinstance(v, compat_str):
-            v = v.encode('utf-8')
+        k = _encode_compat_str(k, 'utf-8')
+        v = _encode_compat_str(v, 'utf-8')
         # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
         # suggests sending UTF-8 directly. Firefox sends UTF-8, too
         content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
@@ -4399,6 +4380,11 @@ def try_get(src, getter, expected_type=None):
                 return v


+def filter_dict(dct, cndn=lambda _, v: v is not None):
+    # NB: don't use dict comprehension for python 2.6 compatibility
+    return dict((k, v) for k, v in dct.items() if cndn(k, v))
+
+
 def merge_dicts(*dicts, **kwargs):
     """
     Merge the `dict`s in `dicts` using the first valid value for each key.
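
Reviewer note (illustration, not part of the patch): filter_dict() factors out the dict((k, v) ...) filtering pattern used in handle_youtubedl_headers() and redirect_request() above. A minimal usage sketch, self-contained by repeating the definition from this hunk:

# Illustration only: filter_dict() keeps items whose (key, value) satisfy cndn;
# the default condition drops None values.
def filter_dict(dct, cndn=lambda _, v: v is not None):
    # NB: don't use dict comprehension for python 2.6 compatibility
    return dict((k, v) for k, v in dct.items() if cndn(k, v))


headers = {'Accept-Encoding': 'gzip', 'User-Agent': 'UA', 'X-Empty': None}

# default condition: drop None values
print(filter_dict(headers))
# {'Accept-Encoding': 'gzip', 'User-Agent': 'UA'}

# as used in handle_youtubedl_headers(): drop a header by name
print(filter_dict(headers, cndn=lambda k, _: k.lower() != 'accept-encoding'))
# {'User-Agent': 'UA', 'X-Empty': None}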
@@ -4435,8 +4421,26 @@ def can_merge_str(k, v, to_dict):
     return merged


-def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
-    return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
+# very poor choice of name, as if Python string encodings weren't confusing enough
+def encode_compat_str(s, encoding=preferredencoding(), errors='strict'):
+    assert isinstance(s, compat_basestring)
+    return s if isinstance(s, compat_str) else compat_str(s, encoding, errors)
+
+
+# what it could have been
+def _decode_compat_str(s, encoding=preferredencoding(), errors='strict', or_none=False):
+    if not or_none:
+        assert isinstance(s, compat_basestring)
+    return (
+        s if isinstance(s, compat_str)
+        else compat_str(s, encoding, errors) if isinstance(s, compat_basestring)
+        else None)
+
+
+# the real encode_compat_str, but only for internal use
+def _encode_compat_str(s, encoding=preferredencoding(), errors='strict'):
+    assert isinstance(s, compat_basestring)
+    return s.encode(encoding, errors) if isinstance(s, compat_str) else s


 US_RATINGS = {
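
Reviewer note (illustration, not part of the patch): a quick sketch of how the three helpers differ, assuming a Python 3 run (compat_str is str, compat_basestring is (str, bytes)) against a tree with the patch applied:

# Illustration only: what each helper returns on Python 3.
# encode_compat_str():  str or bytes -> str (despite its name)
# _decode_compat_str(): bytes -> str, str passes through; None allowed with or_none=True
# _encode_compat_str(): str -> bytes, bytes pass through
from youtube_dl.utils import encode_compat_str, _decode_compat_str, _encode_compat_str

print(encode_compat_str(b'caf\xc3\xa9', 'utf-8'))   # café
print(_decode_compat_str(b'caf\xc3\xa9', 'utf-8'))  # café
print(_decode_compat_str('already text'))           # already text
print(_decode_compat_str(None, or_none=True))       # None (no AssertionError)
print(_encode_compat_str(u'caf\xe9', 'utf-8'))      # b'caf\xc3\xa9'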
@@ -4459,8 +4463,10 @@ def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):


 def parse_age_limit(s):
-    if type(s) == int:
-        return s if 0 <= s <= 21 else None
+    if not isinstance(s, bool):
+        age = int_or_none(s)
+        if age is not None:
+            return age if 0 <= age <= 21 else None
     if not isinstance(s, compat_basestring):
         return None
     m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
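
Reviewer note (illustration, not part of the patch): the integer branch now goes through int_or_none(), so numeric strings and floats are clamped like real ints, while bools are explicitly excluded. A small sketch of the expected behaviour of the code shown in this hunk (int_or_none() accepting floats and numeric strings is an assumption about the existing helper):

# Illustration only: expected results of the updated parse_age_limit().
from youtube_dl.utils import parse_age_limit

print(parse_age_limit(18))     # 18
print(parse_age_limit('17'))   # 17  (int_or_none() handles the numeric string)
print(parse_age_limit(18.0))   # 18  (and floats)
print(parse_age_limit(99))     # None (outside 0..21)
print(parse_age_limit(True))   # None (bools are not ages)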
@@ -4637,12 +4643,7 @@ def args_to_str(args):


 def error_to_compat_str(err):
-    err_str = str(err)
-    # On python 2 error byte string must be decoded with proper
-    # encoding rather than ascii
-    if sys.version_info[0] < 3:
-        err_str = err_str.decode(preferredencoding())
-    return err_str
+    return _decode_compat_str(str(err))


 def mimetype2ext(mt):