-
-
Notifications
You must be signed in to change notification settings - Fork 1.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Use rfc3986.validator.Validator for parse_url #1531
Changes from 3 commits
5135adf
40b6e20
637bd13
a05d157
821c108
134367e
65f6cb2
cc82882
323eb56
7866572
0ef8b81
1e6681e
357d701
0b1727e
c2367d8
15be9ca
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,7 +4,9 @@ | |
|
||
from ..exceptions import LocationParseError | ||
from ..packages import six, rfc3986 | ||
from ..packages.rfc3986.exceptions import RFC3986Exception | ||
from ..packages.rfc3986.exceptions import RFC3986Exception, ValidationError | ||
from ..packages.rfc3986.validators import Validator | ||
from ..packages.rfc3986.normalizers import normalize_host | ||
|
||
|
||
url_attrs = ['scheme', 'auth', 'host', 'port', 'path', 'query', 'fragment'] | ||
|
@@ -14,12 +16,12 @@ | |
NORMALIZABLE_SCHEMES = ('http', 'https', None) | ||
|
||
# Regex for detecting URLs with schemes. RFC 3986 Section 3.1 | ||
SCHEME_REGEX = re.compile(r"^[a-zA-Z][a-zA-Z0-9+\-.]*://") | ||
SCHEME_REGEX = re.compile(r"^[a-zA-Z][a-zA-Z0-9+\-]*:") | ||
|
||
|
||
class Url(namedtuple('Url', url_attrs)): | ||
""" | ||
Datastructure for representing an HTTP URL. Used as a return value for | ||
Data structure for representing an HTTP URL. Used as a return value for | ||
:func:`parse_url`. Both the scheme and host are normalized as they are | ||
both case-insensitive according to RFC 3986. | ||
""" | ||
|
@@ -171,22 +173,30 @@ def parse_url(url): | |
url = "//" + url | ||
|
||
try: | ||
parse_result = rfc3986.urlparse(url, encoding="utf-8") | ||
uri_ref = rfc3986.URIReference.from_string(url, encoding="utf-8") | ||
except (ValueError, RFC3986Exception): | ||
raise LocationParseError(url) | ||
|
||
# RFC 3986 doesn't assert ports must be non-negative. | ||
if parse_result.port and parse_result.port < 0: | ||
if uri_ref.scheme in NORMALIZABLE_SCHEMES: | ||
uri_ref = uri_ref.normalize() | ||
|
||
# Run validator on the internal URIReference within the ParseResult | ||
validator = Validator() | ||
try: | ||
validator.check_validity_of( | ||
*validator.COMPONENT_NAMES | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
So you want to validate literally everything, yes? I wonder if we could make a better API for this.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah, validate all components. Leaving the hard work to you. I can whip up a patch, tests, and docs if you can think of a name for the interface. ;)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
|
||
).validate(uri_ref) | ||
except ValidationError: | ||
raise LocationParseError(url) | ||
|
||
# For the sake of backwards compatibility we put empty | ||
# string values for path if there are any defined values | ||
# beyond the path in the URL. | ||
# TODO: Remove this when we break backwards compatibility. | ||
path = parse_result.path | ||
path = uri_ref.path | ||
if not path: | ||
if (parse_result.query is not None | ||
or parse_result.fragment is not None): | ||
if (uri_ref.query is not None | ||
or uri_ref.fragment is not None): | ||
path = "" | ||
else: | ||
path = None | ||
|
@@ -196,20 +206,18 @@ def parse_url(url): | |
def to_input_type(x): | ||
if x is None: | ||
return None | ||
elif is_string and isinstance(x, six.binary_type): | ||
return x.decode('utf-8') | ||
elif not is_string and not isinstance(x, six.binary_type): | ||
return x.encode('utf-8') | ||
return x | ||
|
||
return Url( | ||
scheme=to_input_type(parse_result.scheme), | ||
auth=to_input_type(parse_result.userinfo), | ||
host=to_input_type(parse_result.hostname), | ||
port=parse_result.port, | ||
scheme=to_input_type(uri_ref.scheme), | ||
auth=to_input_type(uri_ref.userinfo), | ||
host=to_input_type(uri_ref.host), | ||
port=int(uri_ref.port) if uri_ref.port is not None else None, | ||
path=to_input_type(path), | ||
query=to_input_type(parse_result.query), | ||
fragment=to_input_type(parse_result.fragment) | ||
query=to_input_type(uri_ref.query), | ||
fragment=to_input_type(uri_ref.fragment) | ||
) | ||
|
||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@sigmavirus24 What do you think of taking this `.` out of this scheme regex? I did this because I don't think we support any scheme that has this `.` here, but we support a lot of schemeless "URLs" where the authority section looks like a scheme (`www.google.com` is a valid "scheme"). Should we get even more strict and only support schemes that start with `http`? I'm not sure.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
So, I don't understand the purpose the `.` is serving. We can't, however, limit ourselves to what we think of as normal schemes because we (fortunately, or not) support `http+unix://`.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The `.` is part of the scheme spec, but I couldn't find an example scheme that contained a period in it. The three schemes I know of that we support are `http`, `https`, and `http+unix`, case insensitive.
Removing the period prevents a few issues, like us thinking that `google.com:433/path` is `scheme=google.com`, `host=433`, `path=/path`, and instead forces a parse on: `//google.com:433/path`, which gives us a correct result.