Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use rfc3986.validator.Validator for parse_url #1531

Merged
merged 16 commits into from
Apr 21, 2019
Merged
18 changes: 8 additions & 10 deletions src/urllib3/connectionpool.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from .packages.ssl_match_hostname import CertificateError
from .packages import six
from .packages.six.moves import queue
from .packages.rfc3986.normalizers import normalize_host
from .connection import (
port_by_scheme,
DummyConnection,
Expand Down Expand Up @@ -65,7 +66,7 @@ def __init__(self, host, port=None):
if not host:
raise LocationValueError("No host specified.")

self.host = _ipv6_host(host, self.scheme)
self.host = _normalize_host(host, scheme=self.scheme)
self._proxy_host = host.lower()
self.port = port

Expand Down Expand Up @@ -432,8 +433,8 @@ def is_same_host(self, url):

# TODO: Add optional support for socket.gethostbyname checking.
scheme, host, port = get_host(url)

host = _ipv6_host(host, self.scheme)
if host is not None:
host = _normalize_host(host, scheme=scheme)

# Use explicit default port for comparison when none is given
if self.port and not port:
Expand Down Expand Up @@ -876,9 +877,9 @@ def connection_from_url(url, **kw):
return HTTPConnectionPool(host, port=port, **kw)


def _ipv6_host(host, scheme):
def _normalize_host(host, scheme):
"""
Process IPv6 address literals
Normalize hosts for comparisons and use with sockets.
"""

# httplib doesn't like it when we include brackets in IPv6 addresses
Expand All @@ -887,11 +888,8 @@ def _ipv6_host(host, scheme):
# Instead, we need to make sure we never pass ``None`` as the port.
# However, for backward compatibility reasons we can't actually
# *assert* that. See http://bugs.python.org/issue28539
#
# Also if an IPv6 address literal has a zone identifier, the
# percent sign might be URIencoded, convert it back into ASCII
if host.startswith('[') and host.endswith(']'):
host = host.replace('%25', '%').strip('[]')
host = host.strip('[]')
if scheme in NORMALIZABLE_SCHEMES:
host = host.lower()
host = normalize_host(host)
return host
10 changes: 5 additions & 5 deletions src/urllib3/packages/rfc3986/abnf_regexp.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,14 +115,14 @@
UNRESERVED_RE + SUB_DELIMITERS_RE + ':'
)


# RFC 6874 Zone ID ABNF
ZONE_ID = '(?:[' + UNRESERVED_RE + ']|' + PCT_ENCODED + ')+'
IPv6_ADDRZ_RE = IPv6_RE + '%25' + ZONE_ID

IP_LITERAL_RE = r'\[({0}|(?:{1})|{2})\]'.format(
IPv6_RE,
IPv6_ADDRZ_RE,
IPv6_ADDRZ_RFC4007_RE = IPv6_RE + '(?:(?:%25|%)' + ZONE_ID + ')?'
IPv6_ADDRZ_RE = IPv6_RE + '(?:%25' + ZONE_ID + ')?'

IP_LITERAL_RE = r'\[({0}|{1})\]'.format(
IPv6_ADDRZ_RFC4007_RE,
IPv_FUTURE_RE,
)

Expand Down
7 changes: 7 additions & 0 deletions src/urllib3/packages/rfc3986/misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,14 @@
abnf_regexp.PORT_RE))


HOST_MATCHER = re.compile('^' + abnf_regexp.HOST_RE + '$')
IPv4_MATCHER = re.compile('^' + abnf_regexp.IPv4_RE + '$')
IPv6_MATCHER = re.compile(r'^\[' + abnf_regexp.IPv6_ADDRZ_RFC4007_RE + r'\]$')

# Used by host validator
IPv6_NO_RFC4007_MATCHER = re.compile(r'^\[%s\]$' % (
abnf_regexp.IPv6_ADDRZ_RE
))

# Matcher used to validate path components
PATH_MATCHER = re.compile(abnf_regexp.PATH_RE)
Expand Down
15 changes: 15 additions & 0 deletions src/urllib3/packages/rfc3986/normalizers.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,21 @@ def normalize_password(password):

def normalize_host(host):
"""Normalize a host string."""
if misc.IPv6_MATCHER.match(host):
percent = host.find('%')
if percent != -1:
percent_25 = host.find('%25')

# Replace RFC 4007 IPv6 Zone ID delimiter '%' with '%25'
# from RFC 6874. If the host is '[<IPv6 addr>%25]' then we
# assume RFC 4007 and normalize to '[<IPV6 addr>%2525]'
if percent_25 == -1 or percent < percent_25 or \
(percent == percent_25 and percent_25 == len(host) - 4):
host = host.replace('%', '%25', 1)

# Don't normalize the casing of the Zone ID
return host[:percent].lower() + host[percent:]

return host.lower()


Expand Down
24 changes: 23 additions & 1 deletion src/urllib3/packages/rfc3986/validators.py
Original file line number Diff line number Diff line change
Expand Up @@ -304,8 +304,28 @@ def authority_is_valid(authority, host=None, require=False):
bool
"""
validated = is_valid(authority, misc.SUBAUTHORITY_MATCHER, require)
if validated and host is not None:
return host_is_valid(host, require)
return validated


def host_is_valid(host, require=False):
"""Determine if the host string is valid.

:param str host:
The host to validate.
:param bool require:
(optional) Specify if host must not be None.
:returns:
``True`` if valid, ``False`` otherwise
:rtype:
bool
"""
validated = is_valid(host, misc.HOST_MATCHER, require)
if validated and host is not None and misc.IPv4_MATCHER.match(host):
return valid_ipv4_host_address(host)
elif validated and host is not None and misc.IPv6_MATCHER.match(host):
return misc.IPv6_NO_RFC4007_MATCHER.match(host) is not None
return validated


Expand Down Expand Up @@ -395,7 +415,9 @@ def subauthority_component_is_valid(uri, component):

# If we can parse the authority into sub-components and we're not
# validating the port, we can assume it's valid.
if component != 'port':
if component == 'host':
return host_is_valid(subauthority_dict['host'])
elif component != 'port':
return True

try:
Expand Down
68 changes: 47 additions & 21 deletions src/urllib3/util/url.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@

from ..exceptions import LocationParseError
from ..packages import six, rfc3986
from ..packages.rfc3986.exceptions import RFC3986Exception
from ..packages.rfc3986.exceptions import RFC3986Exception, ValidationError, InvalidAuthority
from ..packages.rfc3986.validators import Validator
from ..packages.rfc3986.normalizers import normalize_host


url_attrs = ['scheme', 'auth', 'host', 'port', 'path', 'query', 'fragment']
Expand All @@ -14,12 +16,12 @@
NORMALIZABLE_SCHEMES = ('http', 'https', None)

# Regex for detecting URLs with schemes. RFC 3986 Section 3.1
SCHEME_REGEX = re.compile(r"^[a-zA-Z][a-zA-Z0-9+\-.]*://")
Copy link
Member Author

@sethmlarson sethmlarson Jan 28, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@sigmavirus24 What do you think of taking this . out of this scheme regex? I did this because I don't think we support any scheme that has this . here but we support a lot of schemeless "URLs" where the authority section looks like a scheme (www.google.com is a valid "scheme"). Should we get even more strict and only support schemes that start with http? I'm not sure.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So, I don't understand the purpose the . is serving. We can't, however, limit ourselves to what we think of as normal schemes because we (fortunately, or not) support http+unix://

Copy link
Member Author

@sethmlarson sethmlarson Jan 28, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The . is part of the scheme spec but I couldn't find an example scheme that contained a period in it. The three schemes I know of that we support are http, https, and http+unix case insensitive.

Removing the period prevents a few issues like us thinking that google.com:433/path is scheme=google.com, host=433, path=/path and instead forcing a parse on ://google.com:433/path which gives us a correct result.

SCHEME_REGEX = re.compile(r"^[a-zA-Z][a-zA-Z0-9+\-]*:")


class Url(namedtuple('Url', url_attrs)):
"""
Datastructure for representing an HTTP URL. Used as a return value for
Data structure for representing an HTTP URL. Used as a return value for
:func:`parse_url`. Both the scheme and host are normalized as they are
both case-insensitive according to RFC 3986.
"""
Expand All @@ -29,10 +31,8 @@ def __new__(cls, scheme=None, auth=None, host=None, port=None, path=None,
query=None, fragment=None):
if path and not path.startswith('/'):
path = '/' + path
if scheme:
if scheme is not None:
scheme = scheme.lower()
if host and scheme in NORMALIZABLE_SCHEMES:
host = host.lower()
return super(Url, cls).__new__(cls, scheme, auth, host, port, path,
query, fragment)

Expand Down Expand Up @@ -104,7 +104,7 @@ def __str__(self):

def split_first(s, delims):
"""
Deprecated. No longer used by parse_url().
.. deprecated:: 1.25

Given a string and an iterable of delimiters, split on the first found
delimiter. Return two split parts and the matched delimiter.
Expand Down Expand Up @@ -171,22 +171,50 @@ def parse_url(url):
url = "//" + url

try:
parse_result = rfc3986.urlparse(url, encoding="utf-8")
uri_ref = rfc3986.URIReference.from_string(url, encoding="utf-8")
except (ValueError, RFC3986Exception):
raise LocationParseError(url)

# RFC 3986 doesn't assert ports must be non-negative.
if parse_result.port and parse_result.port < 0:
# Find all components that the URI has before normalization.
# If there's an invalid authority already it'd be stripped
# off by normalization.
try:
required_components = [
k for k, v in six.iteritems(uri_ref.authority_info())
if v is not None
]
except InvalidAuthority:
raise LocationParseError(url)

required_components.extend([
k for k in ['path', 'query', 'fragment']
if getattr(uri_ref, k) is not None
])

if uri_ref.scheme is None or uri_ref.scheme.lower() in NORMALIZABLE_SCHEMES:
uri_ref = uri_ref.normalize()

# Validate all URIReference components and ensure that all
# components that were set before are still set after
# normalization has completed.
validator = Validator()
try:
validator.check_validity_of(
*validator.COMPONENT_NAMES
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So you want to validate literally everything, yes? I wonder if we could make a better API for this.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah validate all components. Leaving the hard work to you I can whip up a patch, tests, and docs if you can think of a name for the interface. ;)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

validate_all_the_components_<emoji> :trollface:

).require_presence_of(
*required_components
).validate(uri_ref)
except ValidationError:
raise LocationParseError(url)

# For the sake of backwards compatibility we put empty
# string values for path if there are any defined values
# beyond the path in the URL.
# TODO: Remove this when we break backwards compatibility.
path = parse_result.path
path = uri_ref.path
if not path:
if (parse_result.query is not None
or parse_result.fragment is not None):
if (uri_ref.query is not None
or uri_ref.fragment is not None):
path = ""
else:
path = None
Expand All @@ -196,20 +224,18 @@ def parse_url(url):
def to_input_type(x):
if x is None:
return None
elif is_string and isinstance(x, six.binary_type):
return x.decode('utf-8')
elif not is_string and not isinstance(x, six.binary_type):
return x.encode('utf-8')
return x

return Url(
scheme=to_input_type(parse_result.scheme),
auth=to_input_type(parse_result.userinfo),
host=to_input_type(parse_result.hostname),
port=parse_result.port,
scheme=to_input_type(uri_ref.scheme),
auth=to_input_type(uri_ref.userinfo),
host=to_input_type(uri_ref.host),
port=int(uri_ref.port) if uri_ref.port is not None else None,
path=to_input_type(path),
query=to_input_type(parse_result.query),
fragment=to_input_type(parse_result.fragment)
query=to_input_type(uri_ref.query),
fragment=to_input_type(uri_ref.fragment)
)


Expand Down
36 changes: 28 additions & 8 deletions test/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,12 +131,24 @@ def test_invalid_host(self, location):
with pytest.raises(LocationParseError):
get_host(location)

@pytest.mark.parametrize('url', [
'http://user\\@google.com',
'http://google\\.com',
'user\\@google.com',
'http://google.com#fragment#',
'http://user@user@google.com/',
])
def test_invalid_url(self, url):
with pytest.raises(LocationParseError):
parse_url(url)

@pytest.mark.parametrize('url, expected_normalized_url', [
('HTTP://GOOGLE.COM/MAIL/', 'http://google.com/MAIL/'),
('HTTP://JeremyCline:Hunter2@Example.com:8080/',
'http://JeremyCline:Hunter2@example.com:8080/'),
('HTTPS://Example.Com/?Key=Value', 'https://example.com/?Key=Value'),
('Https://Example.Com/#Fragment', 'https://example.com/#Fragment'),
('[::Ff%etH0%Ff]/%ab%Af', '[::ff%25etH0%Ff]/%AB%AF'),
])
def test_parse_url_normalization(self, url, expected_normalized_url):
"""Assert parse_url normalizes the scheme/host, and only the scheme/host"""
Expand All @@ -155,8 +167,8 @@ def test_parse_url_normalization(self, url, expected_normalized_url):
# Path/query/fragment
('', Url()),
('/', Url(path='/')),
('/abc/../def', Url(path="/abc/../def")),
('#?/!google.com/?foo#bar', Url(path='', fragment='?/!google.com/?foo#bar')),
('/abc/../def', Url(path="/def")),
('#?/!google.com/?foo', Url(path='', fragment='?/!google.com/?foo')),
('/foo', Url(path='/foo')),
('/foo?bar=baz', Url(path='/foo', query='bar=baz')),
('/foo?bar=baz#banana?apple/orange', Url(path='/foo',
Expand All @@ -173,10 +185,10 @@ def test_parse_url_normalization(self, url, expected_normalized_url):
# Auth
('http://foo:bar@localhost/', Url('http', auth='foo:bar', host='localhost', path='/')),
('http://foo@localhost/', Url('http', auth='foo', host='localhost', path='/')),
('http://foo:bar@baz@localhost/', Url('http',
auth='foo:bar@baz',
host='localhost',
path='/')),
('http://foo:bar@localhost/', Url('http',
auth='foo:bar',
host='localhost',
path='/')),

# Unicode type (Python 2.x)
(u'http://foo:bar@localhost/', Url(u'http',
Expand Down Expand Up @@ -209,6 +221,10 @@ def test_parse_url(self, url, expected_url):

@pytest.mark.parametrize('url, expected_url', parse_url_host_map)
def test_unparse_url(self, url, expected_url):

if '/../' in url:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is odd

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree, maybe I can split it out into a different test case about path normalization / unsplitting.

url = url.replace('/abc/../', '/')

assert url == expected_url.url

def test_parse_url_invalid_IPv6(self):
Expand Down Expand Up @@ -260,12 +276,16 @@ def test_netloc(self, url, expected_netloc):

# CVE-2016-5699
("http://127.0.0.1%0d%0aConnection%3a%20keep-alive",
Url("http", host="127.0.0.1%0d%0aConnection%3a%20keep-alive")),
Url("http", host="127.0.0.1%0d%0aconnection%3a%20keep-alive")),

# NodeJS unicode -> double dot
(u"http://google.com/\uff2e\uff2e/abc", Url("http",
host="google.com",
path='/%ef%bc%ae%ef%bc%ae/abc'))
path='/%EF%BC%AE%EF%BC%AE/abc')),

# Scheme without ://
("javascript:a='@google.com:12345/';alert(0)", Url(scheme="javascript",
path="a='@google.com:12345/';alert(0)")),
]

@pytest.mark.parametrize("url, expected_url", url_vulnerabilities)
Expand Down