urllib3 · sethmlarson · Apr 21, 2019 · Jan 27, 2019 · Jan 27, 2019 · Jan 29, 2019
diff --git a/src/urllib3/connectionpool.py b/src/urllib3/connectionpool.py
@@ -26,6 +26,7 @@
 from .packages.ssl_match_hostname import CertificateError
 from .packages import six
 from .packages.six.moves import queue
+from .packages.rfc3986.normalizers import normalize_host
 from .connection import (
     port_by_scheme,
     DummyConnection,
@@ -65,7 +66,7 @@ def __init__(self, host, port=None):
         if not host:
             raise LocationValueError("No host specified.")
 
-        self.host = _ipv6_host(host, self.scheme)
+        self.host = _normalize_host(host, scheme=self.scheme)
         self._proxy_host = host.lower()
         self.port = port
 
@@ -432,8 +433,8 @@ def is_same_host(self, url):
 
         # TODO: Add optional support for socket.gethostbyname checking.
         scheme, host, port = get_host(url)
-
-        host = _ipv6_host(host, self.scheme)
+        if host is not None:
+            host = _normalize_host(host, scheme=scheme)
 
         # Use explicit default port for comparison when none is given
         if self.port and not port:
@@ -876,9 +877,9 @@ def connection_from_url(url, **kw):
         return HTTPConnectionPool(host, port=port, **kw)
 
 
-def _ipv6_host(host, scheme):
+def _normalize_host(host, scheme):
     """
-    Process IPv6 address literals
+    Normalize hosts for comparisons and use with sockets.
     """
 
     # httplib doesn't like it when we include brackets in IPv6 addresses
@@ -887,11 +888,8 @@ def _ipv6_host(host, scheme):
     # Instead, we need to make sure we never pass ``None`` as the port.
     # However, for backward compatibility reasons we can't actually
     # *assert* that.  See http://bugs.python.org/issue28539
-    #
-    # Also if an IPv6 address literal has a zone identifier, the
-    # percent sign might be URIencoded, convert it back into ASCII
     if host.startswith('[') and host.endswith(']'):
-        host = host.replace('%25', '%').strip('[]')
+        host = host.strip('[]')
     if scheme in NORMALIZABLE_SCHEMES:
-        host = host.lower()
+        host = normalize_host(host)
     return host
diff --git a/src/urllib3/packages/rfc3986/abnf_regexp.py b/src/urllib3/packages/rfc3986/abnf_regexp.py
@@ -115,14 +115,14 @@
     UNRESERVED_RE + SUB_DELIMITERS_RE + ':'
 )
 
-
 # RFC 6874 Zone ID ABNF
 ZONE_ID = '(?:[' + UNRESERVED_RE + ']|' + PCT_ENCODED + ')+'
-IPv6_ADDRZ_RE = IPv6_RE + '%25' + ZONE_ID
 
-IP_LITERAL_RE = r'\[({0}|(?:{1})|{2})\]'.format(
-    IPv6_RE,
-    IPv6_ADDRZ_RE,
+IPv6_ADDRZ_RFC4007_RE = IPv6_RE + '(?:(?:%25|%)' + ZONE_ID + ')?'
+IPv6_ADDRZ_RE = IPv6_RE + '(?:%25' + ZONE_ID + ')?'
+
+IP_LITERAL_RE = r'\[({0}|{1})\]'.format(
+    IPv6_ADDRZ_RFC4007_RE,
     IPv_FUTURE_RE,
 )
 

diff --git a/src/urllib3/packages/rfc3986/misc.py b/src/urllib3/packages/rfc3986/misc.py
@@ -58,7 +58,14 @@
              abnf_regexp.PORT_RE))
 
 
+HOST_MATCHER = re.compile('^' + abnf_regexp.HOST_RE + '$')
 IPv4_MATCHER = re.compile('^' + abnf_regexp.IPv4_RE + '$')
+IPv6_MATCHER = re.compile(r'^\[' + abnf_regexp.IPv6_ADDRZ_RFC4007_RE + r'\]$')
+
+# Used by host validator
+IPv6_NO_RFC4007_MATCHER = re.compile(r'^\[%s\]$' % (
+    abnf_regexp.IPv6_ADDRZ_RE
+))
 
 # Matcher used to validate path components
 PATH_MATCHER = re.compile(abnf_regexp.PATH_RE)

diff --git a/src/urllib3/packages/rfc3986/normalizers.py b/src/urllib3/packages/rfc3986/normalizers.py
@@ -49,6 +49,21 @@ def normalize_password(password):
 
 def normalize_host(host):
     """Normalize a host string."""
+    if misc.IPv6_MATCHER.match(host):
+        percent = host.find('%')
+        if percent != -1:
+            percent_25 = host.find('%25')
+
+            # Replace RFC 4007 IPv6 Zone ID delimiter '%' with '%25'
+            # from RFC 6874. If the host is '[<IPv6 addr>%25]' then we
+            # assume RFC 4007 and normalize to '[<IPV6 addr>%2525]'
+            if percent_25 == -1 or percent < percent_25 or \
+                    (percent == percent_25 and percent_25 == len(host) - 4):
+                host = host.replace('%', '%25', 1)
+
+            # Don't normalize the casing of the Zone ID
+            return host[:percent].lower() + host[percent:]
+
     return host.lower()
 
 

diff --git a/src/urllib3/packages/rfc3986/validators.py b/src/urllib3/packages/rfc3986/validators.py
@@ -304,8 +304,28 @@ def authority_is_valid(authority, host=None, require=False):
         bool
     """
     validated = is_valid(authority, misc.SUBAUTHORITY_MATCHER, require)
+    if validated and host is not None:
+        return host_is_valid(host, require)
+    return validated
+
+
+def host_is_valid(host, require=False):
+    """Determine if the host string is valid.
+
+    :param str host:
+        The host to validate.
+    :param bool require:
+        (optional) Specify if host must not be None.
+    :returns:
+        ``True`` if valid, ``False`` otherwise
+    :rtype:
+        bool
+    """
+    validated = is_valid(host, misc.HOST_MATCHER, require)
     if validated and host is not None and misc.IPv4_MATCHER.match(host):
         return valid_ipv4_host_address(host)
+    elif validated and host is not None and misc.IPv6_MATCHER.match(host):
+        return misc.IPv6_NO_RFC4007_MATCHER.match(host) is not None
     return validated
 
 
@@ -395,7 +415,9 @@ def subauthority_component_is_valid(uri, component):
 
     # If we can parse the authority into sub-components and we're not
     # validating the port, we can assume it's valid.
-    if component != 'port':
+    if component == 'host':
+        return host_is_valid(subauthority_dict['host'])
+    elif component != 'port':
         return True
 
     try:

diff --git a/src/urllib3/util/url.py b/src/urllib3/util/url.py
@@ -4,7 +4,9 @@
 
 from ..exceptions import LocationParseError
 from ..packages import six, rfc3986
-from ..packages.rfc3986.exceptions import RFC3986Exception
+from ..packages.rfc3986.exceptions import RFC3986Exception, ValidationError, InvalidAuthority
+from ..packages.rfc3986.validators import Validator
+from ..packages.rfc3986.normalizers import normalize_host
 
 
 url_attrs = ['scheme', 'auth', 'host', 'port', 'path', 'query', 'fragment']
@@ -14,12 +16,12 @@
 NORMALIZABLE_SCHEMES = ('http', 'https', None)
 
 # Regex for detecting URLs with schemes. RFC 3986 Section 3.1
-SCHEME_REGEX = re.compile(r"^[a-zA-Z][a-zA-Z0-9+\-.]*://")
+SCHEME_REGEX = re.compile(r"^[a-zA-Z][a-zA-Z0-9+\-]*:")
 
 
 class Url(namedtuple('Url', url_attrs)):
     """
-    Datastructure for representing an HTTP URL. Used as a return value for
+    Data structure for representing an HTTP URL. Used as a return value for
     :func:`parse_url`. Both the scheme and host are normalized as they are
     both case-insensitive according to RFC 3986.
     """
@@ -29,10 +31,8 @@ def __new__(cls, scheme=None, auth=None, host=None, port=None, path=None,
                 query=None, fragment=None):
         if path and not path.startswith('/'):
             path = '/' + path
-        if scheme:
+        if scheme is not None:
             scheme = scheme.lower()
-        if host and scheme in NORMALIZABLE_SCHEMES:
-            host = host.lower()
         return super(Url, cls).__new__(cls, scheme, auth, host, port, path,
                                        query, fragment)
 
@@ -104,7 +104,7 @@ def __str__(self):
 
 def split_first(s, delims):
     """
-    Deprecated. No longer used by parse_url().
+    .. deprecated:: 1.25
 
     Given a string and an iterable of delimiters, split on the first found
     delimiter. Return two split parts and the matched delimiter.
@@ -171,22 +171,50 @@ def parse_url(url):
         url = "//" + url
 
     try:
-        parse_result = rfc3986.urlparse(url, encoding="utf-8")
+        uri_ref = rfc3986.URIReference.from_string(url, encoding="utf-8")
     except (ValueError, RFC3986Exception):
         raise LocationParseError(url)
 
-    # RFC 3986 doesn't assert ports must be non-negative.
-    if parse_result.port and parse_result.port < 0:
+    # Find all components that the URI has before normalization.
+    # If there's an invalid authority already it'd be stripped
+    # off by normalization.
+    try:
+        required_components = [
+            k for k, v in six.iteritems(uri_ref.authority_info())
+            if v is not None
+        ]
+    except InvalidAuthority:
+        raise LocationParseError(url)
+
+    required_components.extend([
+        k for k in ['path', 'query', 'fragment']
+        if getattr(uri_ref, k) is not None
+    ])
+
+    if uri_ref.scheme is None or uri_ref.scheme.lower() in NORMALIZABLE_SCHEMES:
+        uri_ref = uri_ref.normalize()
+
+    # Validate all URIReference components and ensure that all
+    # components that were set before are still set after
+    # normalization has completed.
+    validator = Validator()
+    try:
+        validator.check_validity_of(
+            *validator.COMPONENT_NAMES
+        ).require_presence_of(
+            *required_components
+        ).validate(uri_ref)
+    except ValidationError:
         raise LocationParseError(url)
 
     # For the sake of backwards compatibility we put empty
     # string values for path if there are any defined values
     # beyond the path in the URL.
     # TODO: Remove this when we break backwards compatibility.
-    path = parse_result.path
+    path = uri_ref.path
     if not path:
-        if (parse_result.query is not None
-                or parse_result.fragment is not None):
+        if (uri_ref.query is not None
+                or uri_ref.fragment is not None):
             path = ""
         else:
             path = None
@@ -196,20 +224,18 @@ def parse_url(url):
     def to_input_type(x):
         if x is None:
             return None
-        elif is_string and isinstance(x, six.binary_type):
-            return x.decode('utf-8')
         elif not is_string and not isinstance(x, six.binary_type):
             return x.encode('utf-8')
         return x
 
     return Url(
-        scheme=to_input_type(parse_result.scheme),
-        auth=to_input_type(parse_result.userinfo),
-        host=to_input_type(parse_result.hostname),
-        port=parse_result.port,
+        scheme=to_input_type(uri_ref.scheme),
+        auth=to_input_type(uri_ref.userinfo),
+        host=to_input_type(uri_ref.host),
+        port=int(uri_ref.port) if uri_ref.port is not None else None,
         path=to_input_type(path),
-        query=to_input_type(parse_result.query),
-        fragment=to_input_type(parse_result.fragment)
+        query=to_input_type(uri_ref.query),
+        fragment=to_input_type(uri_ref.fragment)
     )
 
 

diff --git a/test/test_util.py b/test/test_util.py
@@ -131,12 +131,24 @@ def test_invalid_host(self, location):
         with pytest.raises(LocationParseError):
             get_host(location)
 
+    @pytest.mark.parametrize('url', [
+        'http://user\\@google.com',
+        'http://google\\.com',
+        'user\\@google.com',
+        'http://google.com#fragment#',
+        'http://user@user@google.com/',
+    ])
+    def test_invalid_url(self, url):
+        with pytest.raises(LocationParseError):
+            parse_url(url)
+
     @pytest.mark.parametrize('url, expected_normalized_url', [
         ('HTTP://GOOGLE.COM/MAIL/', 'http://google.com/MAIL/'),
         ('HTTP://JeremyCline:Hunter2@Example.com:8080/',
          'http://JeremyCline:Hunter2@example.com:8080/'),
         ('HTTPS://Example.Com/?Key=Value', 'https://example.com/?Key=Value'),
         ('Https://Example.Com/#Fragment', 'https://example.com/#Fragment'),
+        ('[::Ff%etH0%Ff]/%ab%Af', '[::ff%25etH0%Ff]/%AB%AF'),
     ])
     def test_parse_url_normalization(self, url, expected_normalized_url):
         """Assert parse_url normalizes the scheme/host, and only the scheme/host"""
@@ -155,8 +167,8 @@ def test_parse_url_normalization(self, url, expected_normalized_url):
         # Path/query/fragment
         ('', Url()),
         ('/', Url(path='/')),
-        ('/abc/../def', Url(path="/abc/../def")),
-        ('#?/!google.com/?foo#bar', Url(path='', fragment='?/!google.com/?foo#bar')),
+        ('/abc/../def', Url(path="/def")),
+        ('#?/!google.com/?foo', Url(path='', fragment='?/!google.com/?foo')),
         ('/foo', Url(path='/foo')),
         ('/foo?bar=baz', Url(path='/foo', query='bar=baz')),
         ('/foo?bar=baz#banana?apple/orange', Url(path='/foo',
@@ -173,10 +185,10 @@ def test_parse_url_normalization(self, url, expected_normalized_url):
         # Auth
         ('http://foo:bar@localhost/', Url('http', auth='foo:bar', host='localhost', path='/')),
         ('http://foo@localhost/', Url('http', auth='foo', host='localhost', path='/')),
-        ('http://foo:bar@baz@localhost/', Url('http',
-                                              auth='foo:bar@baz',
-                                              host='localhost',
-                                              path='/')),
+        ('http://foo:bar@localhost/', Url('http',
+                                          auth='foo:bar',
+                                          host='localhost',
+                                          path='/')),
 
         # Unicode type (Python 2.x)
         (u'http://foo:bar@localhost/', Url(u'http',
@@ -209,6 +221,10 @@ def test_parse_url(self, url, expected_url):
 
     @pytest.mark.parametrize('url, expected_url', parse_url_host_map)
     def test_unparse_url(self, url, expected_url):
+
+        if '/../' in url:
+            url = url.replace('/abc/../', '/')
+
         assert url == expected_url.url
 
     def test_parse_url_invalid_IPv6(self):
@@ -260,12 +276,16 @@ def test_netloc(self, url, expected_netloc):
 
         # CVE-2016-5699
         ("http://127.0.0.1%0d%0aConnection%3a%20keep-alive",
-         Url("http", host="127.0.0.1%0d%0aConnection%3a%20keep-alive")),
+         Url("http", host="127.0.0.1%0d%0aconnection%3a%20keep-alive")),
 
         # NodeJS unicode -> double dot
         (u"http://google.com/\uff2e\uff2e/abc", Url("http",
                                                     host="google.com",
-                                                    path='/%ef%bc%ae%ef%bc%ae/abc'))
+                                                    path='/%EF%BC%AE%EF%BC%AE/abc')),
+
+        # Scheme without ://
+        ("javascript:a='@google.com:12345/';alert(0)", Url(scheme="javascript",
+                                                           path="a='@google.com:12345/';alert(0)")),
     ]
 
     @pytest.mark.parametrize("url, expected_url", url_vulnerabilities)