Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use rfc3986.validator.Validator for parse_url #1531

Merged
merged 16 commits into from
Apr 21, 2019
Merged
1 change: 1 addition & 0 deletions src/urllib3/packages/rfc3986/misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@
abnf_regexp.PORT_RE))


HOST_MATCHER = re.compile('^' + abnf_regexp.HOST_RE + '$')
IPv4_MATCHER = re.compile('^' + abnf_regexp.IPv4_RE + '$')
IPv6_MATCHER = re.compile(r'^\[' + abnf_regexp.IPv6_ADDRZ_RFC4007_RE + r'\]$')

Expand Down
32 changes: 21 additions & 11 deletions src/urllib3/packages/rfc3986/validators.py
Original file line number Diff line number Diff line change
Expand Up @@ -304,6 +304,24 @@ def authority_is_valid(authority, host=None, require=False):
bool
"""
validated = is_valid(authority, misc.SUBAUTHORITY_MATCHER, require)
if validated and host is not None:
return host_is_valid(host, require)
return validated


def host_is_valid(host, require=False):
"""Determine if the host string is valid.

:param str host:
The host to validate.
:param bool require:
(optional) Specify if host must not be None.
:returns:
``True`` if valid, ``False`` otherwise
:rtype:
bool
"""
validated = is_valid(host, misc.HOST_MATCHER, require)
if validated and host is not None and misc.IPv4_MATCHER.match(host):
return valid_ipv4_host_address(host)
elif validated and host is not None and misc.IPv6_MATCHER.match(host):
Expand Down Expand Up @@ -395,19 +413,11 @@ def subauthority_component_is_valid(uri, component):
except exceptions.InvalidAuthority:
return False

# Check host here
if component == 'host':
host = subauthority_dict.get('host')
if (host and misc.IPv6_MATCHER.match(host) and not
misc.IPv6_NO_RFC4007_MATCHER.match(host)):
# If it's an IPv6 address that has RFC 4007 IPv6
# Zone IDs then it's invalid.
return False


# If we can parse the authority into sub-components and we're not
# validating the port, we can assume it's valid.
if component != 'port':
if component == 'host':
return host_is_valid(subauthority_dict['host'])
elif component != 'port':
return True

try:
Expand Down
32 changes: 25 additions & 7 deletions src/urllib3/util/url.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

from ..exceptions import LocationParseError
from ..packages import six, rfc3986
from ..packages.rfc3986.exceptions import RFC3986Exception, ValidationError
from ..packages.rfc3986.exceptions import RFC3986Exception, ValidationError, InvalidAuthority
from ..packages.rfc3986.validators import Validator
from ..packages.rfc3986.normalizers import normalize_host

Expand All @@ -31,10 +31,8 @@ def __new__(cls, scheme=None, auth=None, host=None, port=None, path=None,
query=None, fragment=None):
if path and not path.startswith('/'):
path = '/' + path
if scheme:
if scheme is not None:
scheme = scheme.lower()
if host and scheme in NORMALIZABLE_SCHEMES:
host = host.lower()
return super(Url, cls).__new__(cls, scheme, auth, host, port, path,
query, fragment)

Expand Down Expand Up @@ -106,7 +104,7 @@ def __str__(self):

def split_first(s, delims):
"""
Deprecated. No longer used by parse_url().
.. deprecated:: 1.25

Given a string and an iterable of delimiters, split on the first found
delimiter. Return two split parts and the matched delimiter.
Expand Down Expand Up @@ -177,14 +175,34 @@ def parse_url(url):
except (ValueError, RFC3986Exception):
raise LocationParseError(url)

if uri_ref.scheme in NORMALIZABLE_SCHEMES:
# Find all components that the URI has before normalization.
# If there's an invalid authority already it'd be stripped
# off by normalization.
try:
required_components = [
k for k, v in six.iteritems(uri_ref.authority_info())
if v is not None
]
except InvalidAuthority:
raise LocationParseError(url)

required_components.extend([
k for k in ['path', 'query', 'fragment']
if getattr(uri_ref, k) is not None
])

if uri_ref.scheme is None or uri_ref.scheme.lower() in NORMALIZABLE_SCHEMES:
uri_ref = uri_ref.normalize()

# Run validator on the internal URIReference within the ParseResult
# Validate all URIReference components and ensure that all
# components that were set before are still set after
# normalization has completed.
validator = Validator()
try:
validator.check_validity_of(
*validator.COMPONENT_NAMES
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So you want to validate literally everything, yes? I wonder if we could make a better API for this.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah validate all components. Leaving the hard work to you I can whip up a patch, tests, and docs if you can think of a name for the interface. ;)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

validate_all_the_components_<emoji> :trollface:

).require_presence_of(
*required_components
).validate(uri_ref)
except ValidationError:
raise LocationParseError(url)
Expand Down
11 changes: 8 additions & 3 deletions test/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,7 @@ def test_invalid_url(self, url):
'http://JeremyCline:Hunter2@example.com:8080/'),
('HTTPS://Example.Com/?Key=Value', 'https://example.com/?Key=Value'),
('Https://Example.Com/#Fragment', 'https://example.com/#Fragment'),
('[::Ff%etH0%Ff]/%ab%Af', '[::ff%25etH0%Ff]/%AB%AF'),
])
def test_parse_url_normalization(self, url, expected_normalized_url):
"""Assert parse_url normalizes the scheme/host, and only the scheme/host"""
Expand All @@ -166,7 +167,7 @@ def test_parse_url_normalization(self, url, expected_normalized_url):
# Path/query/fragment
('', Url()),
('/', Url(path='/')),
('/abc/../def', Url(path="/abc/../def")),
('/abc/../def', Url(path="/def")),
('#?/!google.com/?foo', Url(path='', fragment='?/!google.com/?foo')),
('/foo', Url(path='/foo')),
('/foo?bar=baz', Url(path='/foo', query='bar=baz')),
Expand Down Expand Up @@ -220,6 +221,10 @@ def test_parse_url(self, url, expected_url):

@pytest.mark.parametrize('url, expected_url', parse_url_host_map)
def test_unparse_url(self, url, expected_url):

if '/../' in url:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is odd

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree, maybe I can split it out into a different test case about path normalization / unsplitting.

url = url.replace('/abc/../', '/')

assert url == expected_url.url

def test_parse_url_invalid_IPv6(self):
Expand Down Expand Up @@ -271,12 +276,12 @@ def test_netloc(self, url, expected_netloc):

# CVE-2016-5699
("http://127.0.0.1%0d%0aConnection%3a%20keep-alive",
Url("http", host="127.0.0.1%0d%0aConnection%3a%20keep-alive")),
Url("http", host="127.0.0.1%0d%0aconnection%3a%20keep-alive")),

# NodeJS unicode -> double dot
(u"http://google.com/\uff2e\uff2e/abc", Url("http",
host="google.com",
path='/%ef%bc%ae%ef%bc%ae/abc')),
path='/%EF%BC%AE%EF%BC%AE/abc')),

# Scheme without ://
("javascript:a='@google.com:12345/';alert(0)", Url(scheme="javascript",
Expand Down