Skip to content
Permalink
Browse files Browse the repository at this point in the history
Percent-encode invalid characters with request target (#1586)
  • Loading branch information
sethmlarson committed Apr 28, 2019
1 parent a0d2bfd commit a74c9cf
Show file tree
Hide file tree
Showing 3 changed files with 76 additions and 12 deletions.
3 changes: 3 additions & 0 deletions CHANGES.rst
Expand Up @@ -6,6 +6,9 @@ dev (master)

* Change ``is_ipaddress`` to not detect IPvFuture addresses. (Pull #1583)

* Change ``parse_url`` to percent-encode invalid characters within the
path, query, and fragment components. (Pull #1586)


1.25.1 (2019-04-24)
-------------------
Expand Down
58 changes: 48 additions & 10 deletions src/urllib3/util/url.py
Expand Up @@ -6,6 +6,7 @@
from ..packages import six, rfc3986
from ..packages.rfc3986.exceptions import RFC3986Exception, ValidationError
from ..packages.rfc3986.validators import Validator
from ..packages.rfc3986 import abnf_regexp, normalizers, compat, misc


url_attrs = ['scheme', 'auth', 'host', 'port', 'path', 'query', 'fragment']
Expand All @@ -17,6 +18,9 @@
# Regex for detecting URLs with schemes. RFC 3986 Section 3.1
SCHEME_REGEX = re.compile(r"^(?:[a-zA-Z][a-zA-Z0-9+\-]*:|/)")

# Characters that may appear unescaped in a path component: the sets that
# abnf_regexp names "unreserved" and "sub-delimiters", plus ':', '@' and '/'.
# Anything outside this set is percent-encoded by _encode_invalid_chars().
PATH_CHARS = abnf_regexp.UNRESERVED_CHARS_SET | abnf_regexp.SUB_DELIMITERS_SET | {':', '@', '/'}
# Query and fragment components accept everything a path does, plus '?'.
QUERY_CHARS = FRAGMENT_CHARS = PATH_CHARS | {'?'}


class Url(namedtuple('Url', url_attrs)):
"""
Expand Down Expand Up @@ -136,6 +140,37 @@ def split_first(s, delims):
return s[:min_idx], s[min_idx + 1:], min_delim


def _encode_invalid_chars(component, allowed_chars, encoding='utf-8'):
    """Percent-encode the characters of a URI component that are not allowed.

    Percent-encoding is not reapplied onto an already percent-encoded
    component: when the number of ``normalizers.PERCENT_MATCHER`` matches
    equals the number of '%' bytes in the component, every '%' is copied
    through unchanged. Otherwise '%' is treated like any other character and
    is encoded unless it appears in ``allowed_chars``.

    Based on rfc3986.normalizers.encode_component()

    :param component: The component to encode, or ``None``.
    :param allowed_chars: Container of single-character strings that may
        appear unescaped in the result.
    :param encoding: Codec used to turn a byte-string ``component`` into
        text and to decode the final result. The percent-encoded octets
        themselves are always the UTF-8 bytes of the component.
    :return: ``None`` when ``component`` is ``None``; otherwise the encoded
        component as text.
    """
    if component is None:
        return component

    # Convert to text once and use that value for both the percent-encoding
    # scan and the byte conversion below, so a byte-string component does not
    # fail on ``.encode()``.
    # NOTE(review): presumes compat.to_str() decodes byte strings with
    # ``encoding`` and returns text unchanged -- confirm against rfc3986.
    text = compat.to_str(component, encoding)

    # Try to see if the component we're encoding is already percent-encoded
    # so we can skip all '%' characters but still encode all others.
    percent_encodings = len(normalizers.PERCENT_MATCHER.findall(text))

    # 'surrogatepass' lets lone surrogates be turned into bytes (and then
    # percent-encoded) instead of raising during the conversion.
    uri_bytes = text.encode('utf-8', 'surrogatepass')
    is_percent_encoded = percent_encodings == uri_bytes.count(b'%')

    percent_ord = ord('%')
    encoded_component = bytearray()

    # Iterating a bytearray yields integers on both Python 2 & 3.
    for byte_ord in bytearray(uri_bytes):
        if ((is_percent_encoded and byte_ord == percent_ord)
                or (byte_ord < 128 and chr(byte_ord) in allowed_chars)):
            encoded_component.append(byte_ord)
            continue
        # Uppercase hex digits, e.g. '%5B'.
        encoded_component.extend('%{0:02X}'.format(byte_ord).encode('ascii'))

    return encoded_component.decode(encoding)


def parse_url(url):
"""
Given a url, return a parsed :class:`.Url` namedtuple. Best-effort is
Expand All @@ -160,8 +195,6 @@ def parse_url(url):
return Url()

is_string = not isinstance(url, six.binary_type)
if not is_string:
url = url.decode("utf-8")

# RFC 3986 doesn't like URLs that have a host but don't start
# with a scheme and we support URLs like that so we need to
Expand All @@ -171,11 +204,6 @@ def parse_url(url):
if not SCHEME_REGEX.search(url):
url = "//" + url

try:
iri_ref = rfc3986.IRIReference.from_string(url, encoding="utf-8")
except (ValueError, RFC3986Exception):
six.raise_from(LocationParseError(url), None)

def idna_encode(name):
if name and any([ord(x) > 128 for x in name]):
try:
Expand All @@ -188,8 +216,18 @@ def idna_encode(name):
raise LocationParseError(u"Name '%s' is not a valid IDNA label" % name)
return name

has_authority = iri_ref.authority is not None
uri_ref = iri_ref.encode(idna_encoder=idna_encode)
try:
split_iri = misc.IRI_MATCHER.match(compat.to_str(url)).groupdict()
iri_ref = rfc3986.IRIReference(
split_iri['scheme'], split_iri['authority'],
_encode_invalid_chars(split_iri['path'], PATH_CHARS),
_encode_invalid_chars(split_iri['query'], QUERY_CHARS),
_encode_invalid_chars(split_iri['fragment'], FRAGMENT_CHARS)
)
has_authority = iri_ref.authority is not None
uri_ref = iri_ref.encode(idna_encoder=idna_encode)
except (ValueError, RFC3986Exception):
return six.raise_from(LocationParseError(url), None)

# rfc3986 strips the authority if it's invalid
if has_authority and uri_ref.authority is None:
Expand All @@ -209,7 +247,7 @@ def idna_encode(name):
*validator.COMPONENT_NAMES
).validate(uri_ref)
except ValidationError:
six.raise_from(LocationParseError(url), None)
return six.raise_from(LocationParseError(url), None)

# For the sake of backwards compatibility we put empty
# string values for path if there are any defined values
Expand Down
27 changes: 25 additions & 2 deletions test/test_util.py
Expand Up @@ -135,8 +135,15 @@ def test_invalid_host(self, location):
'http://user\\@google.com',
'http://google\\.com',
'user\\@google.com',
'http://google.com#fragment#',
'http://user@user@google.com/',
# Invalid IDNA labels
u'http://\uD7FF.com',
u'http://❤️',
# Unicode surrogates
u'http://\uD800.com',
u'http://\uDC00.com',
])
def test_invalid_url(self, url):
with pytest.raises(LocationParseError):
Expand All @@ -149,6 +156,15 @@ def test_invalid_url(self, url):
('HTTPS://Example.Com/?Key=Value', 'https://example.com/?Key=Value'),
('Https://Example.Com/#Fragment', 'https://example.com/#Fragment'),
('[::Ff%etH0%Ff]/%ab%Af', '[::ff%25etH0%Ff]/%AB%AF'),
# Invalid characters for the query/fragment getting encoded
('http://google.com/p[]?parameter[]=\"hello\"#fragment#',
'http://google.com/p%5B%5D?parameter%5B%5D=%22hello%22#fragment%23'),
# Percent encoding isn't applied twice despite '%' being invalid
# but the percent encoding is still normalized.
('http://google.com/p%5B%5d?parameter%5b%5D=%22hello%22#fragment%23',
'http://google.com/p%5B%5D?parameter%5B%5D=%22hello%22#fragment%23')
])
def test_parse_url_normalization(self, url, expected_normalized_url):
"""Assert parse_url normalizes the scheme/host, and only the scheme/host"""
Expand Down Expand Up @@ -214,7 +230,14 @@ def test_parse_url_normalization(self, url, expected_normalized_url):

# Uppercase IRI
(u'http://Königsgäßchen.de/straße',
Url('http', host='xn--knigsgchen-b4a3dun.de', path='/stra%C3%9Fe'))
Url('http', host='xn--knigsgchen-b4a3dun.de', path='/stra%C3%9Fe')),

# Unicode Surrogates
(u'http://google.com/\uD800', Url('http', host='google.com', path='%ED%A0%80')),
(u'http://google.com?q=\uDC00',
Url('http', host='google.com', path='', query='q=%ED%B0%80')),
(u'http://google.com#\uDC00',
Url('http', host='google.com', path='', fragment='%ED%B0%80')),
]

@pytest.mark.parametrize(
Expand Down

0 comments on commit a74c9cf

Please sign in to comment.