diff --git a/src/urllib3/connectionpool.py b/src/urllib3/connectionpool.py index 57502c3345..157568a395 100644 --- a/src/urllib3/connectionpool.py +++ b/src/urllib3/connectionpool.py @@ -26,6 +26,7 @@ from .packages.ssl_match_hostname import CertificateError from .packages import six from .packages.six.moves import queue +from .packages.rfc3986.normalizers import normalize_host from .connection import ( port_by_scheme, DummyConnection, @@ -65,7 +66,7 @@ def __init__(self, host, port=None): if not host: raise LocationValueError("No host specified.") - self.host = _ipv6_host(host, self.scheme) + self.host = _normalize_host(host, scheme=self.scheme) self._proxy_host = host.lower() self.port = port @@ -434,8 +435,8 @@ def is_same_host(self, url): # TODO: Add optional support for socket.gethostbyname checking. scheme, host, port = get_host(url) - - host = _ipv6_host(host, self.scheme) + if host is not None: + host = _normalize_host(host, scheme=scheme) # Use explicit default port for comparison when none is given if self.port and not port: @@ -878,9 +879,9 @@ def connection_from_url(url, **kw): return HTTPConnectionPool(host, port=port, **kw) -def _ipv6_host(host, scheme): +def _normalize_host(host, scheme): """ - Process IPv6 address literals + Normalize hosts for comparisons and use with sockets. """ # httplib doesn't like it when we include brackets in IPv6 addresses @@ -889,11 +890,8 @@ def _ipv6_host(host, scheme): # Instead, we need to make sure we never pass ``None`` as the port. # However, for backward compatibility reasons we can't actually # *assert* that. 
See http://bugs.python.org/issue28539 - # - # Also if an IPv6 address literal has a zone identifier, the - # percent sign might be URIencoded, convert it back into ASCII if host.startswith('[') and host.endswith(']'): - host = host.replace('%25', '%').strip('[]') + host = host.strip('[]') if scheme in NORMALIZABLE_SCHEMES: - host = host.lower() + host = normalize_host(host) return host diff --git a/src/urllib3/packages/rfc3986/__init__.py b/src/urllib3/packages/rfc3986/__init__.py index 9719d6f7ec..13a786dfb0 100644 --- a/src/urllib3/packages/rfc3986/__init__.py +++ b/src/urllib3/packages/rfc3986/__init__.py @@ -22,6 +22,8 @@ :license: Apache v2.0, see LICENSE for details """ +from .api import iri_reference +from .api import IRIReference from .api import is_valid_uri from .api import normalize_uri from .api import uri_reference @@ -34,14 +36,16 @@ __author_email__ = 'graffatcolmingov@gmail.com' __license__ = 'Apache v2.0' __copyright__ = 'Copyright 2014 Rackspace' -__version__ = '1.2.0' +__version__ = '1.3.0' __all__ = ( 'ParseResult', 'URIReference', + 'IRIReference', 'is_valid_uri', 'normalize_uri', 'uri_reference', + 'iri_reference', 'urlparse', '__title__', '__author__', diff --git a/src/urllib3/packages/rfc3986/_mixin.py b/src/urllib3/packages/rfc3986/_mixin.py new file mode 100644 index 0000000000..543925cdbc --- /dev/null +++ b/src/urllib3/packages/rfc3986/_mixin.py @@ -0,0 +1,353 @@ +"""Module containing the implementation of the URIMixin class.""" +import warnings + +from . import exceptions as exc +from . import misc +from . import normalizers +from . import validators + + +class URIMixin(object): + """Mixin with all shared methods for URIs and IRIs.""" + + __hash__ = tuple.__hash__ + + def authority_info(self): + """Return a dictionary with the ``userinfo``, ``host``, and ``port``. + + If the authority is not valid, it will raise a + :class:`~rfc3986.exceptions.InvalidAuthority` Exception. 
+ + :returns: + ``{'userinfo': 'username:password', 'host': 'www.example.com', + 'port': '80'}`` + :rtype: dict + :raises rfc3986.exceptions.InvalidAuthority: + If the authority is not ``None`` and can not be parsed. + """ + if not self.authority: + return {'userinfo': None, 'host': None, 'port': None} + + match = self._match_subauthority() + + if match is None: + # In this case, we have an authority that was parsed from the URI + # Reference, but it cannot be further parsed by our + # misc.SUBAUTHORITY_MATCHER. In this case it must not be a valid + # authority. + raise exc.InvalidAuthority(self.authority.encode(self.encoding)) + + # We had a match, now let's ensure that it is actually a valid host + # address if it is IPv4 + matches = match.groupdict() + host = matches.get('host') + + if (host and misc.IPv4_MATCHER.match(host) and not + validators.valid_ipv4_host_address(host)): + # If we have a host, it appears to be IPv4 and it does not have + # valid bytes, it is an InvalidAuthority. + raise exc.InvalidAuthority(self.authority.encode(self.encoding)) + + return matches + + def _match_subauthority(self): + return misc.SUBAUTHORITY_MATCHER.match(self.authority) + + @property + def host(self): + """If present, a string representing the host.""" + try: + authority = self.authority_info() + except exc.InvalidAuthority: + return None + return authority['host'] + + @property + def port(self): + """If present, the port extracted from the authority.""" + try: + authority = self.authority_info() + except exc.InvalidAuthority: + return None + return authority['port'] + + @property + def userinfo(self): + """If present, the userinfo extracted from the authority.""" + try: + authority = self.authority_info() + except exc.InvalidAuthority: + return None + return authority['userinfo'] + + def is_absolute(self): + """Determine if this URI Reference is an absolute URI. + + See http://tools.ietf.org/html/rfc3986#section-4.3 for explanation. 
+ + :returns: ``True`` if it is an absolute URI, ``False`` otherwise. + :rtype: bool + """ + return bool(misc.ABSOLUTE_URI_MATCHER.match(self.unsplit())) + + def is_valid(self, **kwargs): + """Determine if the URI is valid. + + .. deprecated:: 1.1.0 + + Use the :class:`~rfc3986.validators.Validator` object instead. + + :param bool require_scheme: Set to ``True`` if you wish to require the + presence of the scheme component. + :param bool require_authority: Set to ``True`` if you wish to require + the presence of the authority component. + :param bool require_path: Set to ``True`` if you wish to require the + presence of the path component. + :param bool require_query: Set to ``True`` if you wish to require the + presence of the query component. + :param bool require_fragment: Set to ``True`` if you wish to require + the presence of the fragment component. + :returns: ``True`` if the URI is valid. ``False`` otherwise. + :rtype: bool + """ + warnings.warn("Please use rfc3986.validators.Validator instead. " + "This method will be eventually removed.", + DeprecationWarning) + validators = [ + (self.scheme_is_valid, kwargs.get('require_scheme', False)), + (self.authority_is_valid, kwargs.get('require_authority', False)), + (self.path_is_valid, kwargs.get('require_path', False)), + (self.query_is_valid, kwargs.get('require_query', False)), + (self.fragment_is_valid, kwargs.get('require_fragment', False)), + ] + return all(v(r) for v, r in validators) + + def authority_is_valid(self, require=False): + """Determine if the authority component is valid. + + .. deprecated:: 1.1.0 + + Use the :class:`~rfc3986.validators.Validator` object instead. + + :param bool require: + Set to ``True`` to require the presence of this component. + :returns: + ``True`` if the authority is valid. ``False`` otherwise. + :rtype: + bool + """ + warnings.warn("Please use rfc3986.validators.Validator instead. 
" + "This method will be eventually removed.", + DeprecationWarning) + try: + self.authority_info() + except exc.InvalidAuthority: + return False + + return validators.authority_is_valid( + self.authority, + host=self.host, + require=require, + ) + + def scheme_is_valid(self, require=False): + """Determine if the scheme component is valid. + + .. deprecated:: 1.1.0 + + Use the :class:`~rfc3986.validators.Validator` object instead. + + :param str require: Set to ``True`` to require the presence of this + component. + :returns: ``True`` if the scheme is valid. ``False`` otherwise. + :rtype: bool + """ + warnings.warn("Please use rfc3986.validators.Validator instead. " + "This method will be eventually removed.", + DeprecationWarning) + return validators.scheme_is_valid(self.scheme, require) + + def path_is_valid(self, require=False): + """Determine if the path component is valid. + + .. deprecated:: 1.1.0 + + Use the :class:`~rfc3986.validators.Validator` object instead. + + :param str require: Set to ``True`` to require the presence of this + component. + :returns: ``True`` if the path is valid. ``False`` otherwise. + :rtype: bool + """ + warnings.warn("Please use rfc3986.validators.Validator instead. " + "This method will be eventually removed.", + DeprecationWarning) + return validators.path_is_valid(self.path, require) + + def query_is_valid(self, require=False): + """Determine if the query component is valid. + + .. deprecated:: 1.1.0 + + Use the :class:`~rfc3986.validators.Validator` object instead. + + :param str require: Set to ``True`` to require the presence of this + component. + :returns: ``True`` if the query is valid. ``False`` otherwise. + :rtype: bool + """ + warnings.warn("Please use rfc3986.validators.Validator instead. " + "This method will be eventually removed.", + DeprecationWarning) + return validators.query_is_valid(self.query, require) + + def fragment_is_valid(self, require=False): + """Determine if the fragment component is valid. + + .. 
deprecated:: 1.1.0 + + Use the Validator object instead. + + :param str require: Set to ``True`` to require the presence of this + component. + :returns: ``True`` if the fragment is valid. ``False`` otherwise. + :rtype: bool + """ + warnings.warn("Please use rfc3986.validators.Validator instead. " + "This method will be eventually removed.", + DeprecationWarning) + return validators.fragment_is_valid(self.fragment, require) + + def normalized_equality(self, other_ref): + """Compare this URIReference to another URIReference. + + :param URIReference other_ref: (required), The reference with which + we're comparing. + :returns: ``True`` if the references are equal, ``False`` otherwise. + :rtype: bool + """ + return tuple(self.normalize()) == tuple(other_ref.normalize()) + + def resolve_with(self, base_uri, strict=False): + """Use an absolute URI Reference to resolve this relative reference. + + Assuming this is a relative reference that you would like to resolve, + use the provided base URI to resolve it. + + See http://tools.ietf.org/html/rfc3986#section-5 for more information. + + :param base_uri: Either a string or URIReference. It must be an + absolute URI or it will raise an exception. + :returns: A new URIReference which is the result of resolving this + reference using ``base_uri``. + :rtype: :class:`URIReference` + :raises rfc3986.exceptions.ResolutionError: + If the ``base_uri`` is not an absolute URI. 
+ """ + if not isinstance(base_uri, URIMixin): + base_uri = type(self).from_string(base_uri) + + if not base_uri.is_absolute(): + raise exc.ResolutionError(base_uri) + + # This is optional per + # http://tools.ietf.org/html/rfc3986#section-5.2.1 + base_uri = base_uri.normalize() + + # The reference we're resolving + resolving = self + + if not strict and resolving.scheme == base_uri.scheme: + resolving = resolving.copy_with(scheme=None) + + # http://tools.ietf.org/html/rfc3986#page-32 + if resolving.scheme is not None: + target = resolving.copy_with( + path=normalizers.normalize_path(resolving.path) + ) + else: + if resolving.authority is not None: + target = resolving.copy_with( + scheme=base_uri.scheme, + path=normalizers.normalize_path(resolving.path) + ) + else: + if resolving.path is None: + if resolving.query is not None: + query = resolving.query + else: + query = base_uri.query + target = resolving.copy_with( + scheme=base_uri.scheme, + authority=base_uri.authority, + path=base_uri.path, + query=query + ) + else: + if resolving.path.startswith('/'): + path = normalizers.normalize_path(resolving.path) + else: + path = normalizers.normalize_path( + misc.merge_paths(base_uri, resolving.path) + ) + target = resolving.copy_with( + scheme=base_uri.scheme, + authority=base_uri.authority, + path=path, + query=resolving.query + ) + return target + + def unsplit(self): + """Create a URI string from the components. + + :returns: The URI Reference reconstituted as a string. 
+ :rtype: str + """ + # See http://tools.ietf.org/html/rfc3986#section-5.3 + result_list = [] + if self.scheme: + result_list.extend([self.scheme, ':']) + if self.authority: + result_list.extend(['//', self.authority]) + if self.path: + result_list.append(self.path) + if self.query is not None: + result_list.extend(['?', self.query]) + if self.fragment is not None: + result_list.extend(['#', self.fragment]) + return ''.join(result_list) + + def copy_with(self, scheme=misc.UseExisting, authority=misc.UseExisting, + path=misc.UseExisting, query=misc.UseExisting, + fragment=misc.UseExisting): + """Create a copy of this reference with the new components. + + :param str scheme: + (optional) The scheme to use for the new reference. + :param str authority: + (optional) The authority to use for the new reference. + :param str path: + (optional) The path to use for the new reference. + :param str query: + (optional) The query to use for the new reference. + :param str fragment: + (optional) The fragment to use for the new reference. + :returns: + New URIReference with provided components. + :rtype: + URIReference + """ + attributes = { + 'scheme': scheme, + 'authority': authority, + 'path': path, + 'query': query, + 'fragment': fragment, + } + for key, value in list(attributes.items()): + if value is misc.UseExisting: + del attributes[key] + uri = self._replace(**attributes) + uri.encoding = self.encoding + return uri diff --git a/src/urllib3/packages/rfc3986/abnf_regexp.py b/src/urllib3/packages/rfc3986/abnf_regexp.py index 5b6da1771b..24c9c3d00a 100644 --- a/src/urllib3/packages/rfc3986/abnf_regexp.py +++ b/src/urllib3/packages/rfc3986/abnf_regexp.py @@ -13,6 +13,8 @@ # limitations under the License. 
"""Module for the regular expressions crafted from ABNF.""" +import sys + # https://tools.ietf.org/html/rfc3986#page-13 GEN_DELIMS = GENERIC_DELIMITERS = ":/?#[]@" GENERIC_DELIMITERS_SET = set(GENERIC_DELIMITERS) @@ -25,7 +27,7 @@ ALPHA = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz' DIGIT = '0123456789' # https://tools.ietf.org/html/rfc3986#section-2.3 -UNRESERVED = UNRESERVED_CHARS = ALPHA + DIGIT + '._!-' +UNRESERVED = UNRESERVED_CHARS = ALPHA + DIGIT + r'._!-' UNRESERVED_CHARS_SET = set(UNRESERVED_CHARS) NON_PCT_ENCODED_SET = RESERVED_CHARS_SET.union(UNRESERVED_CHARS_SET) # We need to escape the '-' in this case: @@ -75,7 +77,7 @@ '%[0-9A-Fa-f]{2}', SUB_DELIMITERS_RE + UNRESERVED_RE ) # The pattern for an IPv4 address, e.g., 192.168.255.255, 127.0.0.1, -IPv4_RE = '([0-9]{1,3}.){3}[0-9]{1,3}' +IPv4_RE = r'([0-9]{1,3}\.){3}[0-9]{1,3}' # Hexadecimal characters used in each piece of an IPv6 address HEXDIG_RE = '[0-9A-Fa-f]{1,4}' # Least-significant 32 bits of an IPv6 address @@ -111,18 +113,18 @@ *variations ) -IPv_FUTURE_RE = 'v[0-9A-Fa-f]+.[%s]+' % ( +IPv_FUTURE_RE = r'v[0-9A-Fa-f]+\.[%s]+' % ( UNRESERVED_RE + SUB_DELIMITERS_RE + ':' ) - # RFC 6874 Zone ID ABNF ZONE_ID = '(?:[' + UNRESERVED_RE + ']|' + PCT_ENCODED + ')+' -IPv6_ADDRZ_RE = IPv6_RE + '%25' + ZONE_ID -IP_LITERAL_RE = r'\[({0}|(?:{1})|{2})\]'.format( - IPv6_RE, - IPv6_ADDRZ_RE, +IPv6_ADDRZ_RFC4007_RE = IPv6_RE + '(?:(?:%25|%)' + ZONE_ID + ')?' +IPv6_ADDRZ_RE = IPv6_RE + '(?:%25' + ZONE_ID + ')?' 
+ +IP_LITERAL_RE = r'\[({0}|{1})\]'.format( + IPv6_ADDRZ_RFC4007_RE, IPv_FUTURE_RE, ) @@ -186,3 +188,80 @@ PATH_ROOTLESS, PATH_EMPTY, ) + +# ############### +# IRIs / RFC 3987 +# ############### + +# Only wide-unicode gets the high-ranges of UCSCHAR +if sys.maxunicode > 0xFFFF: # pragma: no cover + IPRIVATE = u'\uE000-\uF8FF\U000F0000-\U000FFFFD\U00100000-\U0010FFFD' + UCSCHAR_RE = ( + u'\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF' + u'\U00010000-\U0001FFFD\U00020000-\U0002FFFD' + u'\U00030000-\U0003FFFD\U00040000-\U0004FFFD' + u'\U00050000-\U0005FFFD\U00060000-\U0006FFFD' + u'\U00070000-\U0007FFFD\U00080000-\U0008FFFD' + u'\U00090000-\U0009FFFD\U000A0000-\U000AFFFD' + u'\U000B0000-\U000BFFFD\U000C0000-\U000CFFFD' + u'\U000D0000-\U000DFFFD\U000E1000-\U000EFFFD' + ) +else: # pragma: no cover + IPRIVATE = u'\uE000-\uF8FF' + UCSCHAR_RE = ( + u'\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF' + ) + +IUNRESERVED_RE = u'A-Za-z0-9\\._~\\-' + UCSCHAR_RE +IPCHAR = u'([' + IUNRESERVED_RE + SUB_DELIMITERS_RE + u':@]|%s)' % PCT_ENCODED + +isegments = { + 'isegment': IPCHAR + u'*', + # Non-zero length segment + 'isegment-nz': IPCHAR + u'+', + # Non-zero length segment without ":" + 'isegment-nz-nc': IPCHAR.replace(':', '') + u'+' +} + +IPATH_ROOTLESS = u'%(isegment-nz)s(/%(isegment)s)*' % isegments +IPATH_NOSCHEME = u'%(isegment-nz-nc)s(/%(isegment)s)*' % isegments +IPATH_ABSOLUTE = u'/(?:%s)?' 
% IPATH_ROOTLESS +IPATH_ABEMPTY = u'(?:/%(isegment)s)*' % isegments +IPATH_RE = u'^(?:%s|%s|%s|%s|%s)$' % ( + IPATH_ABEMPTY, IPATH_ABSOLUTE, IPATH_NOSCHEME, IPATH_ROOTLESS, PATH_EMPTY +) + +IREGULAR_NAME_RE = IREG_NAME = u'(?:{0}|[{1}])*'.format( + u'%[0-9A-Fa-f]{2}', SUB_DELIMITERS_RE + IUNRESERVED_RE +) + +IHOST_RE = IHOST_PATTERN = u'({0}|{1}|{2})'.format( + IREG_NAME, + IPv4_RE, + IP_LITERAL_RE, +) + +IUSERINFO_RE = u'^(?:[' + IUNRESERVED_RE + SUB_DELIMITERS_RE + u':]|%s)+' % ( + PCT_ENCODED +) + +IFRAGMENT_RE = (u'^(?:[/?:@' + IUNRESERVED_RE + SUB_DELIMITERS_RE + + u']|%s)*$' % PCT_ENCODED) +IQUERY_RE = (u'^(?:[/?:@' + IUNRESERVED_RE + SUB_DELIMITERS_RE + + IPRIVATE + u']|%s)*$' % PCT_ENCODED) + +IRELATIVE_PART_RE = u'(//%s%s|%s|%s|%s)' % ( + COMPONENT_PATTERN_DICT['authority'], + IPATH_ABEMPTY, + IPATH_ABSOLUTE, + IPATH_NOSCHEME, + PATH_EMPTY, +) + +IHIER_PART_RE = u'(//%s%s|%s|%s|%s)' % ( + COMPONENT_PATTERN_DICT['authority'], + IPATH_ABEMPTY, + IPATH_ABSOLUTE, + IPATH_ROOTLESS, + PATH_EMPTY, +) diff --git a/src/urllib3/packages/rfc3986/api.py b/src/urllib3/packages/rfc3986/api.py index 17f4daf927..ddc4a1cd28 100644 --- a/src/urllib3/packages/rfc3986/api.py +++ b/src/urllib3/packages/rfc3986/api.py @@ -19,6 +19,7 @@ and classes of rfc3986. """ +from .iri import IRIReference from .parseresult import ParseResult from .uri import URIReference @@ -37,6 +38,20 @@ def uri_reference(uri, encoding='utf-8'): return URIReference.from_string(uri, encoding) +def iri_reference(iri, encoding='utf-8'): + """Parse a IRI string into an IRIReference. + + This is a convenience function. You could achieve the same end by using + ``IRIReference.from_string(iri)``. + + :param str iri: The IRI which needs to be parsed into a reference. 
+ :param str encoding: The encoding of the string provided + :returns: A parsed IRI + :rtype: :class:`IRIReference` + """ + return IRIReference.from_string(iri, encoding) + + def is_valid_uri(uri, encoding='utf-8', **kwargs): """Determine if the URI given is valid. diff --git a/src/urllib3/packages/rfc3986/exceptions.py b/src/urllib3/packages/rfc3986/exceptions.py index e0886a5ff0..da8ca7cb1f 100644 --- a/src/urllib3/packages/rfc3986/exceptions.py +++ b/src/urllib3/packages/rfc3986/exceptions.py @@ -1,6 +1,8 @@ # -*- coding: utf-8 -*- """Exceptions module for rfc3986.""" +from . import compat + class RFC3986Exception(Exception): """Base class for all rfc3986 exception classes.""" @@ -14,7 +16,8 @@ class InvalidAuthority(RFC3986Exception): def __init__(self, authority): """Initialize the exception with the invalid authority.""" super(InvalidAuthority, self).__init__( - "The authority ({0}) is not valid.".format(authority)) + u"The authority ({0}) is not valid.".format( + compat.to_str(authority))) class InvalidPort(RFC3986Exception): @@ -109,3 +112,7 @@ def __init__(self, uri, *component_names): uri, self.components, ) + + +class MissingDependencyError(RFC3986Exception): + """Exception raised when an IRI is encoded without the 'idna' module.""" diff --git a/src/urllib3/packages/rfc3986/iri.py b/src/urllib3/packages/rfc3986/iri.py new file mode 100644 index 0000000000..2c708d853a --- /dev/null +++ b/src/urllib3/packages/rfc3986/iri.py @@ -0,0 +1,143 @@ +"""Module containing the implementation of the IRIReference class.""" +# -*- coding: utf-8 -*- +# Copyright (c) 2014 Rackspace +# Copyright (c) 2015 Ian Stapleton Cordasco +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from collections import namedtuple + +from . import compat +from . import exceptions +from . import misc +from . import normalizers +from . import uri + + +try: + import idna +except ImportError: # pragma: no cover + idna = None + + +class IRIReference(namedtuple('IRIReference', misc.URI_COMPONENTS), + uri.URIMixin): + """Immutable object representing a parsed IRI Reference. + + Can be encoded into an URIReference object via the procedure + specified in RFC 3987 Section 3.1 + + .. note:: + The IRI submodule is a new interface and may possibly change in + the future. Check for changes to the interface when upgrading. 
+ """ + + slots = () + + def __new__(cls, scheme, authority, path, query, fragment, + encoding='utf-8'): + """Create a new IRIReference.""" + ref = super(IRIReference, cls).__new__( + cls, + scheme or None, + authority or None, + path or None, + query, + fragment) + ref.encoding = encoding + return ref + + def __eq__(self, other): + """Compare this reference to another.""" + other_ref = other + if isinstance(other, tuple): + other_ref = self.__class__(*other) + elif not isinstance(other, IRIReference): + try: + other_ref = self.__class__.from_string(other) + except TypeError: + raise TypeError( + 'Unable to compare {0}() to {1}()'.format( + type(self).__name__, type(other).__name__)) + + # See http://tools.ietf.org/html/rfc3986#section-6.2 + return tuple(self) == tuple(other_ref) + + def _match_subauthority(self): + return misc.ISUBAUTHORITY_MATCHER.match(self.authority) + + @classmethod + def from_string(cls, iri_string, encoding='utf-8'): + """Parse a IRI reference from the given unicode IRI string. + + :param str iri_string: Unicode IRI to be parsed into a reference. + :param str encoding: The encoding of the string provided + :returns: :class:`IRIReference` or subclass thereof + """ + iri_string = compat.to_str(iri_string, encoding) + + split_iri = misc.IRI_MATCHER.match(iri_string).groupdict() + return cls( + split_iri['scheme'], split_iri['authority'], + normalizers.encode_component(split_iri['path'], encoding), + normalizers.encode_component(split_iri['query'], encoding), + normalizers.encode_component(split_iri['fragment'], encoding), + encoding, + ) + + def encode(self, idna_encoder=None): + """Encode an IRIReference into a URIReference instance. + + If the ``idna`` module is installed or the ``rfc3986[idna]`` + extra is used then unicode characters in the IRI host + component will be encoded with IDNA2008. 
+ + :param idna_encoder: + Function that encodes each part of the host component + If not given will raise an exception if the IRI + contains a host component. + :rtype: uri.URIReference + :returns: A URI reference + """ + authority = self.authority + if authority: + if idna_encoder is None: + if idna is None: # pragma: no cover + raise exceptions.MissingDependencyError( + "Could not import the 'idna' module " + "and the IRI hostname requires encoding" + ) + else: + def idna_encoder(x): + try: + return idna.encode(x, strict=True, std3_rules=True).lower() + except idna.IDNAError: + raise exceptions.InvalidAuthority(self.authority) + + authority = "" + if self.host: + authority = ".".join([compat.to_str(idna_encoder(part)) + for part in self.host.split(".")]) + + if self.userinfo is not None: + authority = (normalizers.encode_component( + self.userinfo, self.encoding) + '@' + authority) + + if self.port is not None: + authority += ":" + str(self.port) + + return uri.URIReference(self.scheme, + authority, + path=self.path, + query=self.query, + fragment=self.fragment, + encoding=self.encoding) diff --git a/src/urllib3/packages/rfc3986/misc.py b/src/urllib3/packages/rfc3986/misc.py index 697039a98c..00f9f3b94d 100644 --- a/src/urllib3/packages/rfc3986/misc.py +++ b/src/urllib3/packages/rfc3986/misc.py @@ -58,7 +58,14 @@ abnf_regexp.PORT_RE)) +HOST_MATCHER = re.compile('^' + abnf_regexp.HOST_RE + '$') IPv4_MATCHER = re.compile('^' + abnf_regexp.IPv4_RE + '$') +IPv6_MATCHER = re.compile(r'^\[' + abnf_regexp.IPv6_ADDRZ_RFC4007_RE + r'\]$') + +# Used by host validator +IPv6_NO_RFC4007_MATCHER = re.compile(r'^\[%s\]$' % ( + abnf_regexp.IPv6_ADDRZ_RE +)) # Matcher used to validate path components PATH_MATCHER = re.compile(abnf_regexp.PATH_RE) @@ -76,7 +83,8 @@ SCHEME_MATCHER = re.compile('^{0}$'.format(abnf_regexp.SCHEME_RE)) RELATIVE_REF_MATCHER = re.compile(r'^%s(\?%s)?(#%s)?$' % ( - abnf_regexp.RELATIVE_PART_RE, abnf_regexp.QUERY_RE, + abnf_regexp.RELATIVE_PART_RE, + 
abnf_regexp.QUERY_RE, abnf_regexp.FRAGMENT_RE, )) @@ -87,6 +95,42 @@ abnf_regexp.QUERY_RE[1:-1], )) +# ############### +# IRIs / RFC 3987 +# ############### + +IRI_MATCHER = re.compile(abnf_regexp.URL_PARSING_RE, re.UNICODE) + +ISUBAUTHORITY_MATCHER = re.compile(( + u'^(?:(?P<userinfo>{0})@)?' # iuserinfo + u'(?P<host>{1})' # ihost + u':?(?P<port>{2})?$' # port + ).format(abnf_regexp.IUSERINFO_RE, + abnf_regexp.IHOST_RE, + abnf_regexp.PORT_RE), re.UNICODE) + + +IHOST_MATCHER = re.compile('^' + abnf_regexp.IHOST_RE + '$', re.UNICODE) + +IPATH_MATCHER = re.compile(abnf_regexp.IPATH_RE, re.UNICODE) + +IQUERY_MATCHER = re.compile(abnf_regexp.IQUERY_RE, re.UNICODE) + +IFRAGMENT_MATCHER = re.compile(abnf_regexp.IFRAGMENT_RE, re.UNICODE) + + +RELATIVE_IRI_MATCHER = re.compile(u'^%s(?:\\?%s)?(?:%s)?$' % ( + abnf_regexp.IRELATIVE_PART_RE, + abnf_regexp.IQUERY_RE, + abnf_regexp.IFRAGMENT_RE +), re.UNICODE) + +ABSOLUTE_IRI_MATCHER = re.compile(u'^%s:%s(?:\\?%s)?$' % ( + abnf_regexp.COMPONENT_PATTERN_DICT['scheme'], + abnf_regexp.IHIER_PART_RE, + abnf_regexp.IQUERY_RE[1:-1] +), re.UNICODE) + # Path merger as defined in http://tools.ietf.org/html/rfc3986#section-5.2.3 def merge_paths(base_uri, relative_path): diff --git a/src/urllib3/packages/rfc3986/normalizers.py b/src/urllib3/packages/rfc3986/normalizers.py index ea6c6e18aa..2eb1bb36f7 100644 --- a/src/urllib3/packages/rfc3986/normalizers.py +++ b/src/urllib3/packages/rfc3986/normalizers.py @@ -49,6 +49,21 @@ def normalize_password(password): def normalize_host(host): """Normalize a host string.""" + if misc.IPv6_MATCHER.match(host): + percent = host.find('%') + if percent != -1: + percent_25 = host.find('%25') + + # Replace RFC 4007 IPv6 Zone ID delimiter '%' with '%25' + # from RFC 6874. 
If the host is '[%25]' then we + # assume RFC 4007 and normalize to '[%2525]' + if percent_25 == -1 or percent < percent_25 or \ + (percent == percent_25 and percent_25 == len(host) - 4): + host = host.replace('%', '%25', 1) + + # Don't normalize the casing of the Zone ID + return host[:percent].lower() + host[percent:] + return host.lower() @@ -147,6 +162,6 @@ def encode_component(uri_component, encoding): or (byte_ord < 128 and byte.decode() in misc.NON_PCT_ENCODED)): encoded_uri.extend(byte) continue - encoded_uri.extend('%{0:02x}'.format(byte_ord).encode()) + encoded_uri.extend('%{0:02x}'.format(byte_ord).encode().upper()) return encoded_uri.decode(encoding) diff --git a/src/urllib3/packages/rfc3986/uri.py b/src/urllib3/packages/rfc3986/uri.py index 244fff5565..d1d71505e2 100644 --- a/src/urllib3/packages/rfc3986/uri.py +++ b/src/urllib3/packages/rfc3986/uri.py @@ -15,16 +15,14 @@ # See the License for the specific language governing permissions and # limitations under the License. from collections import namedtuple -import warnings from . import compat -from . import exceptions as exc from . import misc from . import normalizers -from . import validators +from ._mixin import URIMixin -class URIReference(namedtuple('URIReference', misc.URI_COMPONENTS)): +class URIReference(namedtuple('URIReference', misc.URI_COMPONENTS), URIMixin): """Immutable object representing a parsed URI Reference. .. note:: @@ -116,228 +114,6 @@ def __eq__(self, other): naive_equality = tuple(self) == tuple(other_ref) return naive_equality or self.normalized_equality(other_ref) - @classmethod - def from_string(cls, uri_string, encoding='utf-8'): - """Parse a URI reference from the given unicode URI string. - - :param str uri_string: Unicode URI to be parsed into a reference. 
- :param str encoding: The encoding of the string provided - :returns: :class:`URIReference` or subclass thereof - """ - uri_string = compat.to_str(uri_string, encoding) - - split_uri = misc.URI_MATCHER.match(uri_string).groupdict() - return cls( - split_uri['scheme'], split_uri['authority'], - normalizers.encode_component(split_uri['path'], encoding), - normalizers.encode_component(split_uri['query'], encoding), - normalizers.encode_component(split_uri['fragment'], encoding), - encoding, - ) - - def authority_info(self): - """Return a dictionary with the ``userinfo``, ``host``, and ``port``. - - If the authority is not valid, it will raise a - :class:`~rfc3986.exceptions.InvalidAuthority` Exception. - - :returns: - ``{'userinfo': 'username:password', 'host': 'www.example.com', - 'port': '80'}`` - :rtype: dict - :raises rfc3986.exceptions.InvalidAuthority: - If the authority is not ``None`` and can not be parsed. - """ - if not self.authority: - return {'userinfo': None, 'host': None, 'port': None} - - match = misc.SUBAUTHORITY_MATCHER.match(self.authority) - - if match is None: - # In this case, we have an authority that was parsed from the URI - # Reference, but it cannot be further parsed by our - # misc.SUBAUTHORITY_MATCHER. In this case it must not be a valid - # authority. - raise exc.InvalidAuthority(self.authority.encode(self.encoding)) - - # We had a match, now let's ensure that it is actually a valid host - # address if it is IPv4 - matches = match.groupdict() - host = matches.get('host') - - if (host and misc.IPv4_MATCHER.match(host) and not - validators.valid_ipv4_host_address(host)): - # If we have a host, it appears to be IPv4 and it does not have - # valid bytes, it is an InvalidAuthority. 
- raise exc.InvalidAuthority(self.authority.encode(self.encoding)) - - return matches - - @property - def host(self): - """If present, a string representing the host.""" - try: - authority = self.authority_info() - except exc.InvalidAuthority: - return None - return authority['host'] - - @property - def port(self): - """If present, the port extracted from the authority.""" - try: - authority = self.authority_info() - except exc.InvalidAuthority: - return None - return authority['port'] - - @property - def userinfo(self): - """If present, the userinfo extracted from the authority.""" - try: - authority = self.authority_info() - except exc.InvalidAuthority: - return None - return authority['userinfo'] - - def is_absolute(self): - """Determine if this URI Reference is an absolute URI. - - See http://tools.ietf.org/html/rfc3986#section-4.3 for explanation. - - :returns: ``True`` if it is an absolute URI, ``False`` otherwise. - :rtype: bool - """ - return bool(misc.ABSOLUTE_URI_MATCHER.match(self.unsplit())) - - def is_valid(self, **kwargs): - """Determine if the URI is valid. - - .. deprecated:: 1.1.0 - - Use the :class:`~rfc3986.validators.Validator` object instead. - - :param bool require_scheme: Set to ``True`` if you wish to require the - presence of the scheme component. - :param bool require_authority: Set to ``True`` if you wish to require - the presence of the authority component. - :param bool require_path: Set to ``True`` if you wish to require the - presence of the path component. - :param bool require_query: Set to ``True`` if you wish to require the - presence of the query component. - :param bool require_fragment: Set to ``True`` if you wish to require - the presence of the fragment component. - :returns: ``True`` if the URI is valid. ``False`` otherwise. - :rtype: bool - """ - warnings.warn("Please use rfc3986.validators.Validator instead. 
" - "This method will be eventually removed.", - DeprecationWarning) - validators = [ - (self.scheme_is_valid, kwargs.get('require_scheme', False)), - (self.authority_is_valid, kwargs.get('require_authority', False)), - (self.path_is_valid, kwargs.get('require_path', False)), - (self.query_is_valid, kwargs.get('require_query', False)), - (self.fragment_is_valid, kwargs.get('require_fragment', False)), - ] - return all(v(r) for v, r in validators) - - def authority_is_valid(self, require=False): - """Determine if the authority component is valid. - - .. deprecated:: 1.1.0 - - Use the :class:`~rfc3986.validators.Validator` object instead. - - :param bool require: - Set to ``True`` to require the presence of this component. - :returns: - ``True`` if the authority is valid. ``False`` otherwise. - :rtype: - bool - """ - warnings.warn("Please use rfc3986.validators.Validator instead. " - "This method will be eventually removed.", - DeprecationWarning) - try: - self.authority_info() - except exc.InvalidAuthority: - return False - - return validators.authority_is_valid( - self.authority, - host=self.host, - require=require, - ) - - def scheme_is_valid(self, require=False): - """Determine if the scheme component is valid. - - .. deprecated:: 1.1.0 - - Use the :class:`~rfc3986.validators.Validator` object instead. - - :param str require: Set to ``True`` to require the presence of this - component. - :returns: ``True`` if the scheme is valid. ``False`` otherwise. - :rtype: bool - """ - warnings.warn("Please use rfc3986.validators.Validator instead. " - "This method will be eventually removed.", - DeprecationWarning) - return validators.scheme_is_valid(self.scheme, require) - - def path_is_valid(self, require=False): - """Determine if the path component is valid. - - .. deprecated:: 1.1.0 - - Use the :class:`~rfc3986.validators.Validator` object instead. - - :param str require: Set to ``True`` to require the presence of this - component. 
- :returns: ``True`` if the path is valid. ``False`` otherwise. - :rtype: bool - """ - warnings.warn("Please use rfc3986.validators.Validator instead. " - "This method will be eventually removed.", - DeprecationWarning) - return validators.path_is_valid(self.path, require) - - def query_is_valid(self, require=False): - """Determine if the query component is valid. - - .. deprecated:: 1.1.0 - - Use the :class:`~rfc3986.validators.Validator` object instead. - - :param str require: Set to ``True`` to require the presence of this - component. - :returns: ``True`` if the query is valid. ``False`` otherwise. - :rtype: bool - """ - warnings.warn("Please use rfc3986.validators.Validator instead. " - "This method will be eventually removed.", - DeprecationWarning) - return validators.query_is_valid(self.query, require) - - def fragment_is_valid(self, require=False): - """Determine if the fragment component is valid. - - .. deprecated:: 1.1.0 - - Use the Validator object instead. - - :param str require: Set to ``True`` to require the presence of this - component. - :returns: ``True`` if the fragment is valid. ``False`` otherwise. - :rtype: bool - """ - warnings.warn("Please use rfc3986.validators.Validator instead. " - "This method will be eventually removed.", - DeprecationWarning) - return validators.fragment_is_valid(self.fragment, require) - def normalize(self): """Normalize this reference as described in Section 6.2.2. @@ -357,136 +133,21 @@ def normalize(self): normalizers.normalize_fragment(self.fragment), self.encoding) - def normalized_equality(self, other_ref): - """Compare this URIReference to another URIReference. + @classmethod + def from_string(cls, uri_string, encoding='utf-8'): + """Parse a URI reference from the given unicode URI string. - :param URIReference other_ref: (required), The reference with which - we're comparing. - :returns: ``True`` if the references are equal, ``False`` otherwise. 
- :rtype: bool + :param str uri_string: Unicode URI to be parsed into a reference. + :param str encoding: The encoding of the string provided + :returns: :class:`URIReference` or subclass thereof """ - return tuple(self.normalize()) == tuple(other_ref.normalize()) - - def resolve_with(self, base_uri, strict=False): - """Use an absolute URI Reference to resolve this relative reference. - - Assuming this is a relative reference that you would like to resolve, - use the provided base URI to resolve it. - - See http://tools.ietf.org/html/rfc3986#section-5 for more information. + uri_string = compat.to_str(uri_string, encoding) - :param base_uri: Either a string or URIReference. It must be an - absolute URI or it will raise an exception. - :returns: A new URIReference which is the result of resolving this - reference using ``base_uri``. - :rtype: :class:`URIReference` - :raises rfc3986.exceptions.ResolutionError: - If the ``base_uri`` is not an absolute URI. - """ - if not isinstance(base_uri, URIReference): - base_uri = URIReference.from_string(base_uri) - - if not base_uri.is_absolute(): - raise exc.ResolutionError(base_uri) - - # This is optional per - # http://tools.ietf.org/html/rfc3986#section-5.2.1 - base_uri = base_uri.normalize() - - # The reference we're resolving - resolving = self - - if not strict and resolving.scheme == base_uri.scheme: - resolving = resolving.copy_with(scheme=None) - - # http://tools.ietf.org/html/rfc3986#page-32 - if resolving.scheme is not None: - target = resolving.copy_with( - path=normalizers.normalize_path(resolving.path) - ) - else: - if resolving.authority is not None: - target = resolving.copy_with( - scheme=base_uri.scheme, - path=normalizers.normalize_path(resolving.path) - ) - else: - if resolving.path is None: - if resolving.query is not None: - query = resolving.query - else: - query = base_uri.query - target = resolving.copy_with( - scheme=base_uri.scheme, - authority=base_uri.authority, - path=base_uri.path, - query=query 
- ) - else: - if resolving.path.startswith('/'): - path = normalizers.normalize_path(resolving.path) - else: - path = normalizers.normalize_path( - misc.merge_paths(base_uri, resolving.path) - ) - target = resolving.copy_with( - scheme=base_uri.scheme, - authority=base_uri.authority, - path=path, - query=resolving.query - ) - return target - - def unsplit(self): - """Create a URI string from the components. - - :returns: The URI Reference reconstituted as a string. - :rtype: str - """ - # See http://tools.ietf.org/html/rfc3986#section-5.3 - result_list = [] - if self.scheme: - result_list.extend([self.scheme, ':']) - if self.authority: - result_list.extend(['//', self.authority]) - if self.path: - result_list.append(self.path) - if self.query is not None: - result_list.extend(['?', self.query]) - if self.fragment is not None: - result_list.extend(['#', self.fragment]) - return ''.join(result_list) - - def copy_with(self, scheme=misc.UseExisting, authority=misc.UseExisting, - path=misc.UseExisting, query=misc.UseExisting, - fragment=misc.UseExisting): - """Create a copy of this reference with the new components. - - :param str scheme: - (optional) The scheme to use for the new reference. - :param str authority: - (optional) The authority to use for the new reference. - :param str path: - (optional) The path to use for the new reference. - :param str query: - (optional) The query to use for the new reference. - :param str fragment: - (optional) The fragment to use for the new reference. - :returns: - New URIReference with provided components. 
- :rtype: - URIReference - """ - attributes = { - 'scheme': scheme, - 'authority': authority, - 'path': path, - 'query': query, - 'fragment': fragment, - } - for key, value in list(attributes.items()): - if value is misc.UseExisting: - del attributes[key] - uri = self._replace(**attributes) - uri.encoding = self.encoding - return uri + split_uri = misc.URI_MATCHER.match(uri_string).groupdict() + return cls( + split_uri['scheme'], split_uri['authority'], + normalizers.encode_component(split_uri['path'], encoding), + normalizers.encode_component(split_uri['query'], encoding), + normalizers.encode_component(split_uri['fragment'], encoding), + encoding, + ) diff --git a/src/urllib3/packages/rfc3986/validators.py b/src/urllib3/packages/rfc3986/validators.py index c781325e03..7fc97215b1 100644 --- a/src/urllib3/packages/rfc3986/validators.py +++ b/src/urllib3/packages/rfc3986/validators.py @@ -304,8 +304,28 @@ def authority_is_valid(authority, host=None, require=False): bool """ validated = is_valid(authority, misc.SUBAUTHORITY_MATCHER, require) + if validated and host is not None: + return host_is_valid(host, require) + return validated + + +def host_is_valid(host, require=False): + """Determine if the host string is valid. + + :param str host: + The host to validate. + :param bool require: + (optional) Specify if host must not be None. + :returns: + ``True`` if valid, ``False`` otherwise + :rtype: + bool + """ + validated = is_valid(host, misc.HOST_MATCHER, require) if validated and host is not None and misc.IPv4_MATCHER.match(host): return valid_ipv4_host_address(host) + elif validated and host is not None and misc.IPv6_MATCHER.match(host): + return misc.IPv6_NO_RFC4007_MATCHER.match(host) is not None return validated @@ -395,7 +415,9 @@ def subauthority_component_is_valid(uri, component): # If we can parse the authority into sub-components and we're not # validating the port, we can assume it's valid. 
- if component != 'port': + if component == 'host': + return host_is_valid(subauthority_dict['host']) + elif component != 'port': return True try: diff --git a/src/urllib3/util/url.py b/src/urllib3/util/url.py index e12278b53d..0127e2fe5c 100644 --- a/src/urllib3/util/url.py +++ b/src/urllib3/util/url.py @@ -4,7 +4,8 @@ from ..exceptions import LocationParseError from ..packages import six, rfc3986 -from ..packages.rfc3986.exceptions import RFC3986Exception +from ..packages.rfc3986.exceptions import RFC3986Exception, ValidationError +from ..packages.rfc3986.validators import Validator url_attrs = ['scheme', 'auth', 'host', 'port', 'path', 'query', 'fragment'] @@ -14,12 +15,12 @@ NORMALIZABLE_SCHEMES = ('http', 'https', None) # Regex for detecting URLs with schemes. RFC 3986 Section 3.1 -SCHEME_REGEX = re.compile(r"^[a-zA-Z][a-zA-Z0-9+\-.]*://") +SCHEME_REGEX = re.compile(r"^(?:[a-zA-Z][a-zA-Z0-9+\-]*:|/)") class Url(namedtuple('Url', url_attrs)): """ - Datastructure for representing an HTTP URL. Used as a return value for + Data structure for representing an HTTP URL. Used as a return value for :func:`parse_url`. Both the scheme and host are normalized as they are both case-insensitive according to RFC 3986. 
""" @@ -29,10 +30,8 @@ def __new__(cls, scheme=None, auth=None, host=None, port=None, path=None, query=None, fragment=None): if path and not path.startswith('/'): path = '/' + path - if scheme: + if scheme is not None: scheme = scheme.lower() - if host and scheme in NORMALIZABLE_SCHEMES: - host = host.lower() return super(Url, cls).__new__(cls, scheme, auth, host, port, path, query, fragment) @@ -78,23 +77,23 @@ def url(self): 'http://username:password@host.com:80/path?query#fragment' """ scheme, auth, host, port, path, query, fragment = self - url = '' + url = u'' # We use "is not None" we want things to happen with empty strings (or 0 port) if scheme is not None: - url += scheme + '://' + url += scheme + u'://' if auth is not None: - url += auth + '@' + url += auth + u'@' if host is not None: url += host if port is not None: - url += ':' + str(port) + url += u':' + str(port) if path is not None: url += path if query is not None: - url += '?' + query + url += u'?' + query if fragment is not None: - url += '#' + fragment + url += u'#' + fragment return url @@ -104,7 +103,7 @@ def __str__(self): def split_first(s, delims): """ - Deprecated. No longer used by parse_url(). + .. deprecated:: 1.25 Given a string and an iterable of delimiters, split on the first found delimiter. Return two split parts and the matched delimiter. 
@@ -161,6 +160,8 @@ def parse_url(url): return Url() is_string = not isinstance(url, six.binary_type) + if not is_string: + url = url.decode("utf-8") # RFC 3986 doesn't like URLs that have a host but don't start # with a scheme and we support URLs like that so we need to @@ -171,22 +172,53 @@ def parse_url(url): url = "//" + url try: - parse_result = rfc3986.urlparse(url, encoding="utf-8") + iri_ref = rfc3986.IRIReference.from_string(url, encoding="utf-8") except (ValueError, RFC3986Exception): + six.raise_from(LocationParseError(url), None) + + def idna_encode(name): + if name and any([ord(x) > 128 for x in name]): + try: + import idna + except ImportError: + raise LocationParseError("Unable to parse URL without the 'idna' module") + try: + return idna.encode(name, strict=True, std3_rules=True).lower() + except idna.IDNAError: + raise LocationParseError(u"Name '%s' is not a valid IDNA label" % name) + return name + + has_authority = iri_ref.authority is not None + uri_ref = iri_ref.encode(idna_encoder=idna_encode) + + # rfc3986 strips the authority if it's invalid + if has_authority and uri_ref.authority is None: raise LocationParseError(url) - # RFC 3986 doesn't assert ports must be non-negative. - if parse_result.port and parse_result.port < 0: - raise LocationParseError(url) + # Only normalize schemes we understand to not break http+unix + # or other schemes that don't follow RFC 3986. + if uri_ref.scheme is None or uri_ref.scheme.lower() in NORMALIZABLE_SCHEMES: + uri_ref = uri_ref.normalize() + + # Validate all URIReference components and ensure that all + # components that were set before are still set after + # normalization has completed. 
+ validator = Validator() + try: + validator.check_validity_of( + *validator.COMPONENT_NAMES + ).validate(uri_ref) + except ValidationError: + six.raise_from(LocationParseError(url), None) # For the sake of backwards compatibility we put empty # string values for path if there are any defined values # beyond the path in the URL. # TODO: Remove this when we break backwards compatibility. - path = parse_result.path + path = uri_ref.path if not path: - if (parse_result.query is not None - or parse_result.fragment is not None): + if (uri_ref.query is not None + or uri_ref.fragment is not None): path = "" else: path = None @@ -201,13 +233,13 @@ def to_input_type(x): return x return Url( - scheme=to_input_type(parse_result.scheme), - auth=to_input_type(parse_result.userinfo), - host=to_input_type(parse_result.hostname), - port=parse_result.port, + scheme=to_input_type(uri_ref.scheme), + auth=to_input_type(uri_ref.userinfo), + host=to_input_type(uri_ref.host), + port=int(uri_ref.port) if uri_ref.port is not None else None, path=to_input_type(path), - query=to_input_type(parse_result.query), - fragment=to_input_type(parse_result.fragment) + query=to_input_type(uri_ref.query), + fragment=to_input_type(uri_ref.fragment) ) diff --git a/test/test_util.py b/test/test_util.py index ac527355a1..b8ab2e6862 100644 --- a/test/test_util.py +++ b/test/test_util.py @@ -131,12 +131,24 @@ def test_invalid_host(self, location): with pytest.raises(LocationParseError): get_host(location) + @pytest.mark.parametrize('url', [ + 'http://user\\@google.com', + 'http://google\\.com', + 'user\\@google.com', + 'http://google.com#fragment#', + 'http://user@user@google.com/', + ]) + def test_invalid_url(self, url): + with pytest.raises(LocationParseError): + parse_url(url) + @pytest.mark.parametrize('url, expected_normalized_url', [ ('HTTP://GOOGLE.COM/MAIL/', 'http://google.com/MAIL/'), ('HTTP://JeremyCline:Hunter2@Example.com:8080/', 'http://JeremyCline:Hunter2@example.com:8080/'), 
('HTTPS://Example.Com/?Key=Value', 'https://example.com/?Key=Value'), ('Https://Example.Com/#Fragment', 'https://example.com/#Fragment'), + ('[::Ff%etH0%Ff]/%ab%Af', '[::ff%25etH0%Ff]/%AB%AF'), ]) def test_parse_url_normalization(self, url, expected_normalized_url): """Assert parse_url normalizes the scheme/host, and only the scheme/host""" @@ -155,8 +167,7 @@ def test_parse_url_normalization(self, url, expected_normalized_url): # Path/query/fragment ('', Url()), ('/', Url(path='/')), - ('/abc/../def', Url(path="/abc/../def")), - ('#?/!google.com/?foo#bar', Url(path='', fragment='?/!google.com/?foo#bar')), + ('#?/!google.com/?foo', Url(path='', fragment='?/!google.com/?foo')), ('/foo', Url(path='/foo')), ('/foo?bar=baz', Url(path='/foo', query='bar=baz')), ('/foo?bar=baz#banana?apple/orange', Url(path='/foo', @@ -173,10 +184,10 @@ def test_parse_url_normalization(self, url, expected_normalized_url): # Auth ('http://foo:bar@localhost/', Url('http', auth='foo:bar', host='localhost', path='/')), ('http://foo@localhost/', Url('http', auth='foo', host='localhost', path='/')), - ('http://foo:bar@baz@localhost/', Url('http', - auth='foo:bar@baz', - host='localhost', - path='/')), + ('http://foo:bar@localhost/', Url('http', + auth='foo:bar', + host='localhost', + path='/')), # Unicode type (Python 2.x) (u'http://foo:bar@localhost/', Url(u'http', @@ -194,6 +205,9 @@ def test_parse_url_normalization(self, url, expected_normalized_url): ('?', Url(path='', query='')), ('#', Url(path='', fragment='')), + # Path normalization + ('/abc/../def', Url(path="/def")), + # Empty Port ('http://google.com:', Url('http', host='google.com')), ('http://google.com:/', Url('http', host='google.com', path='/')), @@ -211,6 +225,23 @@ def test_parse_url(self, url, expected_url): def test_unparse_url(self, url, expected_url): assert url == expected_url.url + @pytest.mark.parametrize( + ['url', 'expected_url'], + [ + # RFC 3986 5.2.4 + ('/abc/../def', Url(path="/def")), + ('/..', Url(path="/")), + 
('/./abc/./def/', Url(path='/abc/def/')), + ('/.', Url(path='/')), + ('/./', Url(path='/')), + ('/abc/./.././d/././e/.././f/./../../ghi', Url(path='/ghi')) + ] + ) + def test_parse_and_normalize_url_paths(self, url, expected_url): + actual_url = parse_url(url) + assert actual_url == expected_url + assert actual_url.url == expected_url.url + def test_parse_url_invalid_IPv6(self): with pytest.raises(LocationParseError): parse_url('[::1') @@ -260,12 +291,36 @@ def test_netloc(self, url, expected_netloc): # CVE-2016-5699 ("http://127.0.0.1%0d%0aConnection%3a%20keep-alive", - Url("http", host="127.0.0.1%0d%0aConnection%3a%20keep-alive")), + Url("http", host="127.0.0.1%0d%0aconnection%3a%20keep-alive")), # NodeJS unicode -> double dot (u"http://google.com/\uff2e\uff2e/abc", Url("http", host="google.com", - path='/%ef%bc%ae%ef%bc%ae/abc')) + path='/%EF%BC%AE%EF%BC%AE/abc')), + + # Scheme without :// + ("javascript:a='@google.com:12345/';alert(0)", + Url(scheme="javascript", + path="a='@google.com:12345/';alert(0)")), + + ("//google.com/a/b/c", Url(host="google.com", path="/a/b/c")), + + # International URLs + (u'http://ヒ:キ@ヒ.abc.ニ/ヒ?キ#ワ', Url(u'http', + host=u'xn--pdk.abc.xn--idk', + auth=u'%E3%83%92:%E3%82%AD', + path=u'/%E3%83%92', + query=u'%E3%82%AD', + fragment=u'%E3%83%AF')), + + # Injected headers (CVE-2016-5699, CVE-2019-9740, CVE-2019-9947) + ("10.251.0.83:7777?a=1 HTTP/1.1\r\nX-injected: header", + Url(host='10.251.0.83', port=7777, path='', + query='a=1%20HTTP/1.1%0D%0AX-injected:%20header')), + + ("http://127.0.0.1:6379?\r\nSET test failure12\r\n:8080/test/?test=a", + Url(scheme='http', host='127.0.0.1', port=6379, path='', + query='%0D%0ASET%20test%20failure12%0D%0A:8080/test/?test=a')), ] @pytest.mark.parametrize("url, expected_url", url_vulnerabilities)