Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use rfc3986.validator.Validator for parse_url #1531

Merged
merged 16 commits into from
Apr 21, 2019
Merged
18 changes: 8 additions & 10 deletions src/urllib3/connectionpool.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from .packages.ssl_match_hostname import CertificateError
from .packages import six
from .packages.six.moves import queue
from .packages.rfc3986.normalizers import normalize_host
from .connection import (
port_by_scheme,
DummyConnection,
Expand Down Expand Up @@ -65,7 +66,7 @@ def __init__(self, host, port=None):
if not host:
raise LocationValueError("No host specified.")

self.host = _ipv6_host(host, self.scheme)
self.host = _normalize_host(host, scheme=self.scheme)
self._proxy_host = host.lower()
self.port = port

Expand Down Expand Up @@ -432,8 +433,8 @@ def is_same_host(self, url):

# TODO: Add optional support for socket.gethostbyname checking.
scheme, host, port = get_host(url)

host = _ipv6_host(host, self.scheme)
if host is not None:
host = _normalize_host(host, scheme=scheme)

# Use explicit default port for comparison when none is given
if self.port and not port:
Expand Down Expand Up @@ -876,9 +877,9 @@ def connection_from_url(url, **kw):
return HTTPConnectionPool(host, port=port, **kw)


def _ipv6_host(host, scheme):
def _normalize_host(host, scheme):
"""
Process IPv6 address literals
Normalize hosts for comparisons and use with sockets.
"""

# httplib doesn't like it when we include brackets in IPv6 addresses
Expand All @@ -887,11 +888,8 @@ def _ipv6_host(host, scheme):
# Instead, we need to make sure we never pass ``None`` as the port.
# However, for backward compatibility reasons we can't actually
# *assert* that. See http://bugs.python.org/issue28539
#
# Also if an IPv6 address literal has a zone identifier, the
# percent sign might be URIencoded, convert it back into ASCII
if host.startswith('[') and host.endswith(']'):
host = host.replace('%25', '%').strip('[]')
host = host.strip('[]')
if scheme in NORMALIZABLE_SCHEMES:
host = host.lower()
host = normalize_host(host)
return host
42 changes: 25 additions & 17 deletions src/urllib3/util/url.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@

from ..exceptions import LocationParseError
from ..packages import six, rfc3986
from ..packages.rfc3986.exceptions import RFC3986Exception
from ..packages.rfc3986.exceptions import RFC3986Exception, ValidationError
from ..packages.rfc3986.validators import Validator
from ..packages.rfc3986.normalizers import normalize_host


url_attrs = ['scheme', 'auth', 'host', 'port', 'path', 'query', 'fragment']
Expand All @@ -14,12 +16,12 @@
NORMALIZABLE_SCHEMES = ('http', 'https', None)

# Regex for detecting URLs with schemes. RFC 3986 Section 3.1
SCHEME_REGEX = re.compile(r"^[a-zA-Z][a-zA-Z0-9+\-.]*://")
Copy link
Member Author

@sethmlarson sethmlarson Jan 28, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@sigmavirus24 What do you think of taking this . out of this scheme regex? I did this because I don't think we support any scheme that has this . here but we support a lot of schemeless "URLs" where the authority section looks like a scheme (www.google.com is a valid "scheme"). Should we get even more strict and only support schemes that start with http? I'm not sure.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So, I don't understand the purpose the . is serving. We can't, however, limit ourselves to what we think of as normal schemes because we (fortunately, or not) support http+unix://

Copy link
Member Author

@sethmlarson sethmlarson Jan 28, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The . is part of the scheme spec but I couldn't find an example scheme that contained a period in it. The three schemes I know of that we support are http, https, and http+unix case insensitive.

Removing the period prevents a few issues like us thinking that google.com:433/path is scheme=google.com, host=433, path=/path and instead forcing a parse on ://google.com:433/path which gives us a correct result.

SCHEME_REGEX = re.compile(r"^[a-zA-Z][a-zA-Z0-9+\-]*:")


class Url(namedtuple('Url', url_attrs)):
"""
Datastructure for representing an HTTP URL. Used as a return value for
Data structure for representing an HTTP URL. Used as a return value for
:func:`parse_url`. Both the scheme and host are normalized as they are
both case-insensitive according to RFC 3986.
"""
Expand Down Expand Up @@ -171,22 +173,30 @@ def parse_url(url):
url = "//" + url

try:
parse_result = rfc3986.urlparse(url, encoding="utf-8")
uri_ref = rfc3986.URIReference.from_string(url, encoding="utf-8")
except (ValueError, RFC3986Exception):
raise LocationParseError(url)

# RFC 3986 doesn't assert ports must be non-negative.
if parse_result.port and parse_result.port < 0:
if uri_ref.scheme in NORMALIZABLE_SCHEMES:
uri_ref = uri_ref.normalize()

# Run validator on the internal URIReference within the ParseResult
validator = Validator()
try:
validator.check_validity_of(
*validator.COMPONENT_NAMES
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So you want to validate literally everything, yes? I wonder if we could make a better API for this.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah validate all components. Leaving the hard work to you I can whip up a patch, tests, and docs if you can think of a name for the interface. ;)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

validate_all_the_components_<emoji> :trollface:

).validate(uri_ref)
except ValidationError:
raise LocationParseError(url)

# For the sake of backwards compatibility we put empty
# string values for path if there are any defined values
# beyond the path in the URL.
# TODO: Remove this when we break backwards compatibility.
path = parse_result.path
path = uri_ref.path
if not path:
if (parse_result.query is not None
or parse_result.fragment is not None):
if (uri_ref.query is not None
or uri_ref.fragment is not None):
path = ""
else:
path = None
Expand All @@ -196,20 +206,18 @@ def parse_url(url):
def to_input_type(x):
if x is None:
return None
elif is_string and isinstance(x, six.binary_type):
return x.decode('utf-8')
elif not is_string and not isinstance(x, six.binary_type):
return x.encode('utf-8')
return x

return Url(
scheme=to_input_type(parse_result.scheme),
auth=to_input_type(parse_result.userinfo),
host=to_input_type(parse_result.hostname),
port=parse_result.port,
scheme=to_input_type(uri_ref.scheme),
auth=to_input_type(uri_ref.userinfo),
host=to_input_type(uri_ref.host),
port=int(uri_ref.port) if uri_ref.port is not None else None,
path=to_input_type(path),
query=to_input_type(parse_result.query),
fragment=to_input_type(parse_result.fragment)
query=to_input_type(uri_ref.query),
fragment=to_input_type(uri_ref.fragment)
)


Expand Down
27 changes: 21 additions & 6 deletions test/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,17 @@ def test_invalid_host(self, location):
with pytest.raises(LocationParseError):
get_host(location)

@pytest.mark.parametrize('url', [
'http://user\\@google.com',
'http://google\\.com',
'user\\@google.com',
'http://google.com#fragment#',
'http://user@user@google.com/',
])
def test_invalid_url(self, url):
with pytest.raises(LocationParseError):
parse_url(url)

@pytest.mark.parametrize('url, expected_normalized_url', [
('HTTP://GOOGLE.COM/MAIL/', 'http://google.com/MAIL/'),
('HTTP://JeremyCline:Hunter2@Example.com:8080/',
Expand All @@ -156,7 +167,7 @@ def test_parse_url_normalization(self, url, expected_normalized_url):
('', Url()),
('/', Url(path='/')),
('/abc/../def', Url(path="/abc/../def")),
('#?/!google.com/?foo#bar', Url(path='', fragment='?/!google.com/?foo#bar')),
('#?/!google.com/?foo', Url(path='', fragment='?/!google.com/?foo')),
('/foo', Url(path='/foo')),
('/foo?bar=baz', Url(path='/foo', query='bar=baz')),
('/foo?bar=baz#banana?apple/orange', Url(path='/foo',
Expand All @@ -173,10 +184,10 @@ def test_parse_url_normalization(self, url, expected_normalized_url):
# Auth
('http://foo:bar@localhost/', Url('http', auth='foo:bar', host='localhost', path='/')),
('http://foo@localhost/', Url('http', auth='foo', host='localhost', path='/')),
('http://foo:bar@baz@localhost/', Url('http',
auth='foo:bar@baz',
host='localhost',
path='/')),
('http://foo:bar@localhost/', Url('http',
auth='foo:bar',
host='localhost',
path='/')),

# Unicode type (Python 2.x)
(u'http://foo:bar@localhost/', Url(u'http',
Expand Down Expand Up @@ -265,7 +276,11 @@ def test_netloc(self, url, expected_netloc):
# NodeJS unicode -> double dot
(u"http://google.com/\uff2e\uff2e/abc", Url("http",
host="google.com",
path='/%ef%bc%ae%ef%bc%ae/abc'))
path='/%ef%bc%ae%ef%bc%ae/abc')),

# Scheme without ://
("javascript:a='@google.com:12345/';alert(0)", Url(scheme="javascript",
path="a='@google.com:12345/';alert(0)")),
]

@pytest.mark.parametrize("url, expected_url", url_vulnerabilities)
Expand Down