urllib3 · sethmlarson · Apr 21, 2019 · Jan 27, 2019 · Jan 27, 2019 · Jan 29, 2019
diff --git a/src/urllib3/packages/rfc3986/misc.py b/src/urllib3/packages/rfc3986/misc.py
@@ -58,6 +58,7 @@
              abnf_regexp.PORT_RE))
 
 
+HOST_MATCHER = re.compile('^' + abnf_regexp.HOST_RE + '$')
 IPv4_MATCHER = re.compile('^' + abnf_regexp.IPv4_RE + '$')
 IPv6_MATCHER = re.compile(r'^\[' + abnf_regexp.IPv6_ADDRZ_RFC4007_RE + r'\]$')
 

diff --git a/src/urllib3/packages/rfc3986/validators.py b/src/urllib3/packages/rfc3986/validators.py
@@ -304,6 +304,24 @@ def authority_is_valid(authority, host=None, require=False):
         bool
     """
     validated = is_valid(authority, misc.SUBAUTHORITY_MATCHER, require)
+    if validated and host is not None:
+        return host_is_valid(host, require)
+    return validated
+
+
+def host_is_valid(host, require=False):
+    """Determine if the host string is valid.
+
+    :param str host:
+        The host to validate.
+    :param bool require:
+        (optional) Specify if host must not be None.
+    :returns:
+        ``True`` if valid, ``False`` otherwise
+    :rtype:
+        bool
+    """
+    validated = is_valid(host, misc.HOST_MATCHER, require)
     if validated and host is not None and misc.IPv4_MATCHER.match(host):
         return valid_ipv4_host_address(host)
     elif validated and host is not None and misc.IPv6_MATCHER.match(host):
@@ -395,19 +413,11 @@ def subauthority_component_is_valid(uri, component):
     except exceptions.InvalidAuthority:
         return False
 
-    # Check host here
-    if component == 'host':
-        host = subauthority_dict.get('host')
-        if (host and misc.IPv6_MATCHER.match(host) and not
-                misc.IPv6_NO_RFC4007_MATCHER.match(host)):
-            # If it's an IPv6 address that has RFC 4007 IPv6
-            # Zone IDs then it's invalid.
-            return False
-
-
     # If we can parse the authority into sub-components and we're not
     # validating the port, we can assume it's valid.
-    if component != 'port':
+    if component == 'host':
+        return host_is_valid(subauthority_dict['host'])
+    elif component != 'port':
         return True
 
     try:

diff --git a/src/urllib3/util/url.py b/src/urllib3/util/url.py
@@ -4,7 +4,7 @@
 
 from ..exceptions import LocationParseError
 from ..packages import six, rfc3986
-from ..packages.rfc3986.exceptions import RFC3986Exception, ValidationError
+from ..packages.rfc3986.exceptions import RFC3986Exception, ValidationError, InvalidAuthority
 from ..packages.rfc3986.validators import Validator
 from ..packages.rfc3986.normalizers import normalize_host
 
@@ -31,10 +31,8 @@ def __new__(cls, scheme=None, auth=None, host=None, port=None, path=None,
                 query=None, fragment=None):
         if path and not path.startswith('/'):
             path = '/' + path
-        if scheme:
+        if scheme is not None:
             scheme = scheme.lower()
-        if host and scheme in NORMALIZABLE_SCHEMES:
-            host = host.lower()
         return super(Url, cls).__new__(cls, scheme, auth, host, port, path,
                                        query, fragment)
 
@@ -106,7 +104,7 @@ def __str__(self):
 
 def split_first(s, delims):
     """
-    Deprecated. No longer used by parse_url().
+    .. deprecated:: 1.25
 
     Given a string and an iterable of delimiters, split on the first found
     delimiter. Return two split parts and the matched delimiter.
@@ -177,14 +175,34 @@ def parse_url(url):
     except (ValueError, RFC3986Exception):
         raise LocationParseError(url)
 
-    if uri_ref.scheme in NORMALIZABLE_SCHEMES:
+    # Find all components that the URI has before normalization.
+    # If there's an invalid authority already it'd be stripped
+    # off by normalization.
+    try:
+        required_components = [
+            k for k, v in six.iteritems(uri_ref.authority_info())
+            if v is not None
+        ]
+    except InvalidAuthority:
+        raise LocationParseError(url)
+
+    required_components.extend([
+        k for k in ['path', 'query', 'fragment']
+        if getattr(uri_ref, k) is not None
+    ])
+
+    if uri_ref.scheme is None or uri_ref.scheme.lower() in NORMALIZABLE_SCHEMES:
         uri_ref = uri_ref.normalize()
 
-    # Run validator on the internal URIReference within the ParseResult
+    # Validate all URIReference components and ensure that all
+    # components that were set before are still set after
+    # normalization has completed.
     validator = Validator()
     try:
         validator.check_validity_of(
             *validator.COMPONENT_NAMES
+        ).require_presence_of(
+            *required_components
         ).validate(uri_ref)
     except ValidationError:
         raise LocationParseError(url)

diff --git a/test/test_util.py b/test/test_util.py
@@ -148,6 +148,7 @@ def test_invalid_url(self, url):
          'http://JeremyCline:Hunter2@example.com:8080/'),
         ('HTTPS://Example.Com/?Key=Value', 'https://example.com/?Key=Value'),
         ('Https://Example.Com/#Fragment', 'https://example.com/#Fragment'),
+        ('[::Ff%etH0%Ff]/%ab%Af', '[::ff%25etH0%Ff]/%AB%AF'),
     ])
     def test_parse_url_normalization(self, url, expected_normalized_url):
         """Assert parse_url normalizes the scheme/host, and only the scheme/host"""
@@ -166,7 +167,7 @@ def test_parse_url_normalization(self, url, expected_normalized_url):
         # Path/query/fragment
         ('', Url()),
         ('/', Url(path='/')),
-        ('/abc/../def', Url(path="/abc/../def")),
+        ('/abc/../def', Url(path="/def")),
         ('#?/!google.com/?foo', Url(path='', fragment='?/!google.com/?foo')),
         ('/foo', Url(path='/foo')),
         ('/foo?bar=baz', Url(path='/foo', query='bar=baz')),
@@ -220,6 +221,10 @@ def test_parse_url(self, url, expected_url):
 
     @pytest.mark.parametrize('url, expected_url', parse_url_host_map)
     def test_unparse_url(self, url, expected_url):
+
+        if '/../' in url:
+            url = url.replace('/abc/../', '/')
+
         assert url == expected_url.url
 
     def test_parse_url_invalid_IPv6(self):
@@ -271,12 +276,12 @@ def test_netloc(self, url, expected_netloc):
 
         # CVE-2016-5699
         ("http://127.0.0.1%0d%0aConnection%3a%20keep-alive",
-         Url("http", host="127.0.0.1%0d%0aConnection%3a%20keep-alive")),
+         Url("http", host="127.0.0.1%0d%0aconnection%3a%20keep-alive")),
 
         # NodeJS unicode -> double dot
         (u"http://google.com/\uff2e\uff2e/abc", Url("http",
                                                     host="google.com",
-                                                    path='/%ef%bc%ae%ef%bc%ae/abc')),
+                                                    path='/%EF%BC%AE%EF%BC%AE/abc')),
 
         # Scheme without ://
         ("javascript:a='@google.com:12345/';alert(0)", Url(scheme="javascript",