[Misc] Under-the-hood fixes, improvements and tests (pt 3) #32490

Merged · 4 commits · Aug 1, 2023
2 changes: 1 addition & 1 deletion youtube_dl/compat.py
@@ -36,7 +36,7 @@
     )
 except NameError:
     compat_str, compat_basestring, compat_chr = (
-        str, str, chr
+        str, (str, bytes), chr
     )

 # casefold
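
Reviewer note (illustration, not part of the patch): with this change compat_basestring becomes (str, bytes) on Python 3, mirroring Python 2's basestring, so isinstance() checks against compat_basestring accept both text and byte strings. A minimal sketch of the effect:

# Illustration only: isinstance() against the new compat_basestring.
# On Python 3, compat_basestring == (str, bytes); on Python 2 it stays basestring.
from youtube_dl.compat import compat_basestring

for value in ('text', b'bytes', 42):
    # 'text' and b'bytes' match; 42 does not
    print(type(value).__name__, isinstance(value, compat_basestring))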
2 changes: 1 addition & 1 deletion youtube_dl/swfinterp.py
@@ -727,7 +727,7 @@ def resfunc(args):
                         stack.append(res)
                         continue

-                    assert isinstance(obj, (dict, _ScopeDict)),\
+                    assert isinstance(obj, (dict, _ScopeDict)), \
                         'Accessing member %r on %r' % (pname, obj)
                     res = obj.get(pname, undefined)
                     stack.append(res)
177 changes: 89 additions & 88 deletions youtube_dl/utils.py
@@ -1678,9 +1678,7 @@ def random_user_agent():

 std_headers = {
     'User-Agent': random_user_agent(),
-    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-    'Accept-Encoding': 'gzip, deflate',
     'Accept-Language': 'en-us,en;q=0.5',
 }

@@ -1826,11 +1824,11 @@ def write_json_file(obj, fn):
     if sys.version_info < (3, 0) and sys.platform != 'win32':
         encoding = get_filesystem_encoding()
         # os.path.basename returns a bytes object, but NamedTemporaryFile
-        # will fail if the filename contains non ascii characters unless we
+        # will fail if the filename contains non-ascii characters unless we
         # use a unicode object
-        path_basename = lambda f: os.path.basename(fn).decode(encoding)
+        path_basename = lambda f: os.path.basename(f).decode(encoding)
         # the same for os.path.dirname
-        path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
+        path_dirname = lambda f: os.path.dirname(f).decode(encoding)
     else:
         path_basename = os.path.basename
         path_dirname = os.path.dirname
@@ -1894,10 +1892,10 @@ def find_xpath_attr(node, xpath, key, val=None):
             return f
     return None

+
 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 # the namespace parameter

-
 def xpath_with_ns(path, ns_map):
     components = [c.split(':') for c in path.split('/')]
     replaced = []
@@ -1914,7 +1912,7 @@ def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
     def _find_xpath(xpath):
         return node.find(compat_xpath(xpath))

-    if isinstance(xpath, (str, compat_str)):
+    if isinstance(xpath, compat_basestring):
         n = _find_xpath(xpath)
     else:
         for xp in xpath:
@@ -2235,7 +2233,7 @@ def _htmlentity_transform(entity_with_semicolon):
 def unescapeHTML(s):
     if s is None:
         return None
-    assert type(s) == compat_str
+    assert isinstance(s, compat_str)

     return re.sub(
         r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
@@ -2262,39 +2260,32 @@ def get_subprocess_encoding():
     return encoding


-def encodeFilename(s, for_subprocess=False):
-    """
-    @param s The name of the file
-    """
-
-    assert type(s) == compat_str
-
-    # Python 3 has a Unicode API
-    if sys.version_info >= (3, 0):
-        return s
-
-    # Pass '' directly to use Unicode APIs on Windows 2000 and up
-    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
-    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
-    if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
-        return s
-
-    # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
-    if sys.platform.startswith('java'):
-        return s
-
-    return s.encode(get_subprocess_encoding(), 'ignore')
-
-
-def decodeFilename(b, for_subprocess=False):
-
-    if sys.version_info >= (3, 0):
-        return b
-
-    if not isinstance(b, bytes):
-        return b
-
-    return b.decode(get_subprocess_encoding(), 'ignore')
+# Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
+if sys.version_info < (3, 0) and not sys.platform.startswith('java'):
+
+    def encodeFilename(s, for_subprocess=False):
+        """
+        @param s The name of the file
+        """
+
+        # Pass '' directly to use Unicode APIs on Windows 2000 and up
+        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
+        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
+        if (not for_subprocess
+                and sys.platform == 'win32'
+                and sys.getwindowsversion()[0] >= 5
+                and isinstance(s, compat_str)):
+            return s
+
+        return _encode_compat_str(s, get_subprocess_encoding(), 'ignore')
+
+    def decodeFilename(b, for_subprocess=False):
+        return _decode_compat_str(b, get_subprocess_encoding(), 'ignore')
+
+else:
+
+    # Python 3 has a Unicode API
+    encodeFilename = decodeFilename = lambda *s, **k: s[0]


 def encodeArgument(s):
@@ -2313,11 +2304,7 @@ def decodeArgument(b):
 def decodeOption(optval):
     if optval is None:
         return optval
-    if isinstance(optval, bytes):
-        optval = optval.decode(preferredencoding())
-
-    assert isinstance(optval, compat_str)
-    return optval
+    return _decode_compat_str(optval)


 def formatSeconds(secs):
@@ -2363,7 +2350,7 @@ def set_alpn_protocols(ctx):

     if sys.version_info < (3, 2):
         return YoutubeDLHTTPSHandler(params, **kwargs)
-    else:  # Python < 3.4
+    else:  # Python3 < 3.4
         context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
         context.verify_mode = (ssl.CERT_NONE
                                if opts_no_check_certificate
@@ -2597,7 +2584,7 @@ def handle_youtubedl_headers(headers):
     filtered_headers = headers

     if 'Youtubedl-no-compression' in filtered_headers:
-        filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
+        filtered_headers = filter_dict(filtered_headers, cndn=lambda k, _: k.lower() != 'accept-encoding')
         del filtered_headers['Youtubedl-no-compression']

     return filtered_headers
@@ -2735,6 +2722,13 @@ def http_request(self, req):
             if h.capitalize() not in req.headers:
                 req.add_header(h, v)

+        # Similarly, 'Accept-encoding'
+        if 'Accept-encoding' not in req.headers:
+            req.add_header(
+                'Accept-Encoding', join_nonempty(
+                    'gzip', 'deflate', brotli and 'br', ncompress and 'compress',
+                    delim=', '))
+
         req.headers = handle_youtubedl_headers(req.headers)

         if sys.version_info < (2, 7):
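
Reviewer note (illustration, not part of the patch): with Accept-Encoding dropped from std_headers above, this hunk builds the header per request instead. A minimal sketch of the value it should produce, assuming join_nonempty() skips falsy entries and joins the rest with the given delimiter; the join_nonempty stand-in and the brotli/ncompress flags below are local assumptions, not the module-level names used by utils.py:

# Illustration only: the kind of Accept-Encoding value the new code builds.
def join_nonempty(*values, **kwargs):
    # stand-in with the assumed semantics: drop falsy values, join the rest
    delim = kwargs.get('delim', '-')
    return delim.join(str(v) for v in values if v)


brotli = True       # hypothetical: brotli support available
ncompress = False   # hypothetical: ncompress not available

accept_encoding = join_nonempty(
    'gzip', 'deflate', brotli and 'br', ncompress and 'compress',
    delim=', ')
print(accept_encoding)  # gzip, deflate, br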
@@ -2818,8 +2812,7 @@ def encodings(hdrs):
                 location_escaped = escape_url(location_fixed)
                 if location != location_escaped:
                     del resp.headers['Location']
-                    # if sys.version_info < (3, 0):
-                    if not isinstance(location_escaped, str):
+                    if not isinstance(location_escaped, str):  # Py 2 case
                         location_escaped = location_escaped.encode('utf-8')
                     resp.headers['Location'] = location_escaped
         return resp
@@ -3086,8 +3079,7 @@ def redirect_request(self, req, fp, code, msg, headers, newurl):
         # On python 2 urlh.geturl() may sometimes return redirect URL
         # as a byte string instead of unicode. This workaround forces
         # it to return unicode.
-        if sys.version_info[0] < 3:
-            newurl = compat_str(newurl)
+        newurl = _decode_compat_str(newurl)

         # Be conciliant with URIs containing a space. This is mainly
         # redundant with the more complete encoding done in http_error_302(),
@@ -3115,9 +3107,7 @@ def redirect_request(self, req, fp, code, msg, headers, newurl):
             new_data = None
             remove_headers.extend(['Content-Length', 'Content-Type'])

-        # NB: don't use dict comprehension for python 2.6 compatibility
-        new_headers = dict((k, v) for k, v in req.headers.items()
-                           if k.title() not in remove_headers)
+        new_headers = filter_dict(req.headers, cndn=lambda k, _: k.title() not in remove_headers)

         return compat_urllib_request.Request(
             newurl, headers=new_headers, origin_req_host=req.origin_req_host,
@@ -3333,11 +3323,7 @@ def __eq__(self, other):
 def platform_name():
     """ Returns the platform name as a compat_str """
     res = platform.platform()
-    if isinstance(res, bytes):
-        res = res.decode(preferredencoding())
-
-    assert isinstance(res, compat_str)
-    return res
+    return _decode_compat_str(res)


 def _windows_write_string(s, out):
@@ -3418,7 +3404,7 @@ def next_nonbmp_pos(s):
 def write_string(s, out=None, encoding=None):
     if out is None:
         out = sys.stderr
-    assert type(s) == compat_str
+    assert isinstance(s, compat_str)

     if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
         if _windows_write_string(s, out):
@@ -3567,9 +3553,8 @@ def shell_quote(args):
     quoted_args = []
     encoding = get_filesystem_encoding()
     for a in args:
-        if isinstance(a, bytes):
-            # We may get a filename encoded with 'encodeFilename'
-            a = a.decode(encoding)
+        # We may get a filename encoded with 'encodeFilename'
+        a = _decode_compat_str(a, encoding)
         quoted_args.append(compat_shlex_quote(a))
     return ' '.join(quoted_args)

@@ -3733,8 +3718,9 @@ def parse_resolution(s):


 def parse_bitrate(s):
-    if not isinstance(s, compat_str):
-        return
+    s = txt_or_none(s)
+    if not s:
+        return None
     mobj = re.search(r'\b(\d+)\s*kbps', s)
     if mobj:
         return int(mobj.group(1))
@@ -3822,18 +3808,17 @@ def base_url(url):


 def urljoin(base, path):
-    if isinstance(path, bytes):
-        path = path.decode('utf-8')
-    if not isinstance(path, compat_str) or not path:
+    path = _decode_compat_str(path, encoding='utf-8', or_none=True)
+    if not path:
         return None
     if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
         return path
-    if isinstance(base, bytes):
-        base = base.decode('utf-8')
-    if not isinstance(base, compat_str) or not re.match(
-            r'^(?:https?:)?//', base):
+    base = _decode_compat_str(base, encoding='utf-8', or_none=True)
+    if not base:
         return None
-    return compat_urllib_parse.urljoin(base, path)
+    return (
+        re.match(r'^(?:https?:)?//', base)
+        and compat_urllib_parse.urljoin(base, path))


 class HEADRequest(compat_urllib_request.Request):
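
Reviewer note (illustration, not part of the patch): the rewritten urljoin() still returns None for an unusable path or base, and the final re.match(...) and ... expression keeps the old rule of only joining onto http(s) or protocol-relative bases. A small sketch of the expected results, based only on the logic visible in this hunk (run against a tree with the patch applied):

# Illustration only: expected behaviour of the rewritten urljoin().
from youtube_dl.utils import urljoin

print(urljoin('https://example.com/a/', 'b.mp4'))  # https://example.com/a/b.mp4
print(urljoin('https://example.com/a/', None))     # None (no usable path)
print(urljoin('ftp://example.com/a/', 'b.mp4'))    # None (base is not http(s) or //)
print(urljoin(None, '//example.com/b.mp4'))        # //example.com/b.mp4 (absolute path wins)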
@@ -3998,8 +3983,7 @@ def get_exe_version(exe, args=['--version'],
             stdout=subprocess.PIPE, stderr=subprocess.STDOUT))
     except OSError:
         return False
-    if isinstance(out, bytes):  # Python 2.x
-        out = out.decode('ascii', 'ignore')
+    out = _decode_compat_str(out, 'ascii', 'ignore')
     return detect_exe_version(out, version_re, unrecognized)


@@ -4218,8 +4202,8 @@ def lowercase_escape(s):

 def escape_rfc3986(s):
     """Escape non-ASCII characters as suggested by RFC 3986"""
-    if sys.version_info < (3, 0) and isinstance(s, compat_str):
-        s = s.encode('utf-8')
+    if sys.version_info < (3, 0):
+        s = _encode_compat_str(s, 'utf-8')
     # ensure unicode: after quoting, it can always be converted
     return compat_str(compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]"))

@@ -4242,8 +4226,7 @@ def parse_qs(url, **kwargs):

 def read_batch_urls(batch_fd):
     def fixup(url):
-        if not isinstance(url, compat_str):
-            url = url.decode('utf-8', 'replace')
+        url = _decode_compat_str(url, 'utf-8', 'replace')
         BOM_UTF8 = '\xef\xbb\xbf'
         if url.startswith(BOM_UTF8):
             url = url[len(BOM_UTF8):]
@@ -4305,10 +4288,8 @@ def _multipart_encode_impl(data, boundary):
     out = b''
     for k, v in data.items():
         out += b'--' + boundary.encode('ascii') + b'\r\n'
-        if isinstance(k, compat_str):
-            k = k.encode('utf-8')
-        if isinstance(v, compat_str):
-            v = v.encode('utf-8')
+        k = _encode_compat_str(k, 'utf-8')
+        v = _encode_compat_str(v, 'utf-8')
         # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
         # suggests sending UTF-8 directly. Firefox sends UTF-8, too
         content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
@@ -4399,6 +4380,11 @@ def try_get(src, getter, expected_type=None):
                 return v


+def filter_dict(dct, cndn=lambda _, v: v is not None):
+    # NB: don't use dict comprehension for python 2.6 compatibility
+    return dict((k, v) for k, v in dct.items() if cndn(k, v))
+
+
 def merge_dicts(*dicts, **kwargs):
     """
     Merge the `dict`s in `dicts` using the first valid value for each key.
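
Reviewer note (illustration, not part of the patch): filter_dict() factors out the dict((k, v) ...) filtering pattern used in handle_youtubedl_headers() and redirect_request() above. A minimal usage sketch, self-contained by repeating the definition from this hunk:

# Illustration only: filter_dict() keeps items whose (key, value) satisfy cndn;
# the default condition drops None values.
def filter_dict(dct, cndn=lambda _, v: v is not None):
    # NB: don't use dict comprehension for python 2.6 compatibility
    return dict((k, v) for k, v in dct.items() if cndn(k, v))


headers = {'Accept-Encoding': 'gzip', 'User-Agent': 'UA', 'X-Empty': None}

# default condition: drop None values
print(filter_dict(headers))
# {'Accept-Encoding': 'gzip', 'User-Agent': 'UA'}

# as used in handle_youtubedl_headers(): drop a header by name
print(filter_dict(headers, cndn=lambda k, _: k.lower() != 'accept-encoding'))
# {'User-Agent': 'UA', 'X-Empty': None}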
@@ -4435,8 +4421,26 @@ def can_merge_str(k, v, to_dict):
     return merged


-def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
-    return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
+# very poor choice of name, as if Python string encodings weren't confusing enough
+def encode_compat_str(s, encoding=preferredencoding(), errors='strict'):
+    assert isinstance(s, compat_basestring)
+    return s if isinstance(s, compat_str) else compat_str(s, encoding, errors)
+
+
+# what it could have been
+def _decode_compat_str(s, encoding=preferredencoding(), errors='strict', or_none=False):
+    if not or_none:
+        assert isinstance(s, compat_basestring)
+    return (
+        s if isinstance(s, compat_str)
+        else compat_str(s, encoding, errors) if isinstance(s, compat_basestring)
+        else None)
+
+
+# the real encode_compat_str, but only for internal use
+def _encode_compat_str(s, encoding=preferredencoding(), errors='strict'):
+    assert isinstance(s, compat_basestring)
+    return s.encode(encoding, errors) if isinstance(s, compat_str) else s


 US_RATINGS = {
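
Reviewer note (illustration, not part of the patch): a quick sketch of how the three helpers differ, assuming a Python 3 run (compat_str is str, compat_basestring is (str, bytes)) against a tree with the patch applied:

# Illustration only: what each helper returns on Python 3.
# encode_compat_str():  str or bytes -> str (despite its name)
# _decode_compat_str(): bytes -> str, str passes through; None allowed with or_none=True
# _encode_compat_str(): str -> bytes, bytes pass through
from youtube_dl.utils import encode_compat_str, _decode_compat_str, _encode_compat_str

print(encode_compat_str(b'caf\xc3\xa9', 'utf-8'))   # café
print(_decode_compat_str(b'caf\xc3\xa9', 'utf-8'))  # café
print(_decode_compat_str('already text'))           # already text
print(_decode_compat_str(None, or_none=True))       # None (no AssertionError)
print(_encode_compat_str(u'caf\xe9', 'utf-8'))      # b'caf\xc3\xa9'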
@@ -4459,8 +4463,10 @@ def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):


 def parse_age_limit(s):
-    if type(s) == int:
-        return s if 0 <= s <= 21 else None
+    if not isinstance(s, bool):
+        age = int_or_none(s)
+        if age is not None:
+            return age if 0 <= age <= 21 else None
     if not isinstance(s, compat_basestring):
         return None
     m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
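
Reviewer note (illustration, not part of the patch): the integer branch now goes through int_or_none(), so numeric strings and floats are clamped like real ints, while bools are explicitly excluded. A small sketch of the expected behaviour of the code shown in this hunk (int_or_none() accepting floats and numeric strings is an assumption about the existing helper):

# Illustration only: expected results of the updated parse_age_limit().
from youtube_dl.utils import parse_age_limit

print(parse_age_limit(18))     # 18
print(parse_age_limit('17'))   # 17  (int_or_none() handles the numeric string)
print(parse_age_limit(18.0))   # 18  (and floats)
print(parse_age_limit(99))     # None (outside 0..21)
print(parse_age_limit(True))   # None (bools are not ages)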
@@ -4637,12 +4643,7 @@ def args_to_str(args):


 def error_to_compat_str(err):
-    err_str = str(err)
-    # On python 2 error byte string must be decoded with proper
-    # encoding rather than ascii
-    if sys.version_info[0] < 3:
-        err_str = err_str.decode(preferredencoding())
-    return err_str
+    return _decode_compat_str(str(err))


 def mimetype2ext(mt):