urllib3 · sethmlarson · Apr 21, 2019 · Jan 27, 2019 · Jan 27, 2019 · Jan 29, 2019
diff --git a/src/urllib3/connectionpool.py b/src/urllib3/connectionpool.py
@@ -26,6 +26,7 @@
 from .packages.ssl_match_hostname import CertificateError
 from .packages import six
 from .packages.six.moves import queue
+from .packages.rfc3986.normalizers import normalize_host
 from .connection import (
     port_by_scheme,
     DummyConnection,
@@ -65,7 +66,7 @@ def __init__(self, host, port=None):
         if not host:
             raise LocationValueError("No host specified.")
 
-        self.host = _ipv6_host(host, self.scheme)
+        self.host = _normalize_host(host, scheme=self.scheme)
         self._proxy_host = host.lower()
         self.port = port
 
@@ -432,8 +433,8 @@ def is_same_host(self, url):
 
         # TODO: Add optional support for socket.gethostbyname checking.
         scheme, host, port = get_host(url)
-
-        host = _ipv6_host(host, self.scheme)
+        if host is not None:
+            host = _normalize_host(host, scheme=scheme)
 
         # Use explicit default port for comparison when none is given
         if self.port and not port:
@@ -876,9 +877,9 @@ def connection_from_url(url, **kw):
         return HTTPConnectionPool(host, port=port, **kw)
 
 
-def _ipv6_host(host, scheme):
+def _normalize_host(host, scheme):
     """
-    Process IPv6 address literals
+    Normalize hosts for comparisons and use with sockets.
     """
 
     # httplib doesn't like it when we include brackets in IPv6 addresses
@@ -887,11 +888,8 @@ def _ipv6_host(host, scheme):
     # Instead, we need to make sure we never pass ``None`` as the port.
     # However, for backward compatibility reasons we can't actually
     # *assert* that.  See http://bugs.python.org/issue28539
-    #
-    # Also if an IPv6 address literal has a zone identifier, the
-    # percent sign might be URIencoded, convert it back into ASCII
     if host.startswith('[') and host.endswith(']'):
-        host = host.replace('%25', '%').strip('[]')
+        host = host.strip('[]')
     if scheme in NORMALIZABLE_SCHEMES:
-        host = host.lower()
+        host = normalize_host(host)
     return host
diff --git a/src/urllib3/util/url.py b/src/urllib3/util/url.py
@@ -4,7 +4,9 @@
 
 from ..exceptions import LocationParseError
 from ..packages import six, rfc3986
-from ..packages.rfc3986.exceptions import RFC3986Exception
+from ..packages.rfc3986.exceptions import RFC3986Exception, ValidationError
+from ..packages.rfc3986.validators import Validator
+from ..packages.rfc3986.normalizers import normalize_host
 
 
 url_attrs = ['scheme', 'auth', 'host', 'port', 'path', 'query', 'fragment']
@@ -14,12 +16,12 @@
 NORMALIZABLE_SCHEMES = ('http', 'https', None)
 
 # Regex for detecting URLs with schemes. RFC 3986 Section 3.1
-SCHEME_REGEX = re.compile(r"^[a-zA-Z][a-zA-Z0-9+\-.]*://")
+SCHEME_REGEX = re.compile(r"^[a-zA-Z][a-zA-Z0-9+\-]*:")
 
 
 class Url(namedtuple('Url', url_attrs)):
     """
-    Datastructure for representing an HTTP URL. Used as a return value for
+    Data structure for representing an HTTP URL. Used as a return value for
     :func:`parse_url`. Both the scheme and host are normalized as they are
     both case-insensitive according to RFC 3986.
     """
@@ -171,22 +173,30 @@ def parse_url(url):
         url = "//" + url
 
     try:
-        parse_result = rfc3986.urlparse(url, encoding="utf-8")
+        uri_ref = rfc3986.URIReference.from_string(url, encoding="utf-8")
     except (ValueError, RFC3986Exception):
         raise LocationParseError(url)
 
-    # RFC 3986 doesn't assert ports must be non-negative.
-    if parse_result.port and parse_result.port < 0:
+    if uri_ref.scheme in NORMALIZABLE_SCHEMES:
+        uri_ref = uri_ref.normalize()
+
+    # Run validator on the internal URIReference within the ParseResult
+    validator = Validator()
+    try:
+        validator.check_validity_of(
+            *validator.COMPONENT_NAMES
+        ).validate(uri_ref)
+    except ValidationError:
         raise LocationParseError(url)
 
     # For the sake of backwards compatibility we put empty
     # string values for path if there are any defined values
     # beyond the path in the URL.
     # TODO: Remove this when we break backwards compatibility.
-    path = parse_result.path
+    path = uri_ref.path
     if not path:
-        if (parse_result.query is not None
-                or parse_result.fragment is not None):
+        if (uri_ref.query is not None
+                or uri_ref.fragment is not None):
             path = ""
         else:
             path = None
@@ -196,20 +206,18 @@ def parse_url(url):
     def to_input_type(x):
         if x is None:
             return None
-        elif is_string and isinstance(x, six.binary_type):
-            return x.decode('utf-8')
         elif not is_string and not isinstance(x, six.binary_type):
             return x.encode('utf-8')
         return x
 
     return Url(
-        scheme=to_input_type(parse_result.scheme),
-        auth=to_input_type(parse_result.userinfo),
-        host=to_input_type(parse_result.hostname),
-        port=parse_result.port,
+        scheme=to_input_type(uri_ref.scheme),
+        auth=to_input_type(uri_ref.userinfo),
+        host=to_input_type(uri_ref.host),
+        port=int(uri_ref.port) if uri_ref.port is not None else None,
         path=to_input_type(path),
-        query=to_input_type(parse_result.query),
-        fragment=to_input_type(parse_result.fragment)
+        query=to_input_type(uri_ref.query),
+        fragment=to_input_type(uri_ref.fragment)
     )
 
 

diff --git a/test/test_util.py b/test/test_util.py
@@ -131,6 +131,17 @@ def test_invalid_host(self, location):
         with pytest.raises(LocationParseError):
             get_host(location)
 
+    @pytest.mark.parametrize('url', [
+        'http://user\\@google.com',
+        'http://google\\.com',
+        'user\\@google.com',
+        'http://google.com#fragment#',
+        'http://user@user@google.com/',
+    ])
+    def test_invalid_url(self, url):
+        with pytest.raises(LocationParseError):
+            parse_url(url)
+
     @pytest.mark.parametrize('url, expected_normalized_url', [
         ('HTTP://GOOGLE.COM/MAIL/', 'http://google.com/MAIL/'),
         ('HTTP://JeremyCline:Hunter2@Example.com:8080/',
@@ -156,7 +167,7 @@ def test_parse_url_normalization(self, url, expected_normalized_url):
         ('', Url()),
         ('/', Url(path='/')),
         ('/abc/../def', Url(path="/abc/../def")),
-        ('#?/!google.com/?foo#bar', Url(path='', fragment='?/!google.com/?foo#bar')),
+        ('#?/!google.com/?foo', Url(path='', fragment='?/!google.com/?foo')),
         ('/foo', Url(path='/foo')),
         ('/foo?bar=baz', Url(path='/foo', query='bar=baz')),
         ('/foo?bar=baz#banana?apple/orange', Url(path='/foo',
@@ -173,10 +184,10 @@ def test_parse_url_normalization(self, url, expected_normalized_url):
         # Auth
         ('http://foo:bar@localhost/', Url('http', auth='foo:bar', host='localhost', path='/')),
         ('http://foo@localhost/', Url('http', auth='foo', host='localhost', path='/')),
-        ('http://foo:bar@baz@localhost/', Url('http',
-                                              auth='foo:bar@baz',
-                                              host='localhost',
-                                              path='/')),
+        ('http://foo:bar@localhost/', Url('http',
+                                          auth='foo:bar',
+                                          host='localhost',
+                                          path='/')),
 
         # Unicode type (Python 2.x)
         (u'http://foo:bar@localhost/', Url(u'http',
@@ -265,7 +276,11 @@ def test_netloc(self, url, expected_netloc):
         # NodeJS unicode -> double dot
         (u"http://google.com/\uff2e\uff2e/abc", Url("http",
                                                     host="google.com",
-                                                    path='/%ef%bc%ae%ef%bc%ae/abc'))
+                                                    path='/%ef%bc%ae%ef%bc%ae/abc')),
+
+        # Scheme without ://
+        ("javascript:a='@google.com:12345/';alert(0)", Url(scheme="javascript",
+                                                           path="a='@google.com:12345/';alert(0)")),
     ]
 
     @pytest.mark.parametrize("url, expected_url", url_vulnerabilities)