Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We鈥檒l occasionally send you account related emails.

Already on GitHub? Sign in to your account

[networking] Remove dot segments during URL normalization #7662

Merged
merged 6 commits into from
Jul 28, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
21 changes: 21 additions & 0 deletions test/test_networking.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,12 @@ def do_GET(self):
self.send_header('Location', self.path)
self.send_header('Content-Length', '0')
self.end_headers()
elif self.path == '/redirect_dotsegments':
self.send_response(301)
# redirect to /headers but with dot segments before
self.send_header('Location', '/a/b/./../../headers')
self.send_header('Content-Length', '0')
self.end_headers()
elif self.path.startswith('/redirect_'):
self._redirect()
elif self.path.startswith('/method'):
Expand Down Expand Up @@ -355,6 +361,21 @@ def test_percent_encode(self, handler):
assert res.status == 200
res.close()

@pytest.mark.parametrize('handler', ['Urllib'], indirect=True)
def test_remove_dot_segments(self, handler):
with handler() as rh:
# This isn't a comprehensive test,
# but it should be enough to check whether the handler is removing dot segments
res = validate_and_send(rh, Request(f'http://127.0.0.1:{self.http_port}/a/b/./../../headers'))
assert res.status == 200
assert res.url == f'http://127.0.0.1:{self.http_port}/headers'
res.close()

res = validate_and_send(rh, Request(f'http://127.0.0.1:{self.http_port}/redirect_dotsegments'))
assert res.status == 200
assert res.url == f'http://127.0.0.1:{self.http_port}/headers'
res.close()

@pytest.mark.parametrize('handler', ['Urllib'], indirect=True)
def test_unicode_path_redirection(self, handler):
with handler() as rh:
Expand Down
42 changes: 33 additions & 9 deletions test/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,6 @@
encode_base_n,
encode_compat_str,
encodeFilename,
escape_rfc3986,
escape_url,
expand_path,
extract_attributes,
extract_basic_auth,
Expand Down Expand Up @@ -132,7 +130,12 @@
xpath_text,
xpath_with_ns,
)
from yt_dlp.utils.networking import HTTPHeaderDict
from yt_dlp.utils.networking import (
HTTPHeaderDict,
escape_rfc3986,
normalize_url,
remove_dot_segments,
)


class TestUtil(unittest.TestCase):
Expand Down Expand Up @@ -933,24 +936,45 @@ def test_escape_rfc3986(self):
self.assertEqual(escape_rfc3986('foo bar'), 'foo%20bar')
self.assertEqual(escape_rfc3986('foo%20bar'), 'foo%20bar')

def test_escape_url(self):
def test_normalize_url(self):
self.assertEqual(
escape_url('http://wowza.imust.org/srv/vod/telemb/new/UPLOAD/UPLOAD/20224_IncendieHavre虂_FD.mp4'),
normalize_url('http://wowza.imust.org/srv/vod/telemb/new/UPLOAD/UPLOAD/20224_IncendieHavre虂_FD.mp4'),
'http://wowza.imust.org/srv/vod/telemb/new/UPLOAD/UPLOAD/20224_IncendieHavre%CC%81_FD.mp4'
)
self.assertEqual(
escape_url('http://www.ardmediathek.de/tv/Sturm-der-Liebe/Folge-2036-Zu-Mann-und-Frau-erkl盲rt/Das-Erste/Video?documentId=22673108&bcastId=5290'),
normalize_url('http://www.ardmediathek.de/tv/Sturm-der-Liebe/Folge-2036-Zu-Mann-und-Frau-erkl盲rt/Das-Erste/Video?documentId=22673108&bcastId=5290'),
'http://www.ardmediathek.de/tv/Sturm-der-Liebe/Folge-2036-Zu-Mann-und-Frau-erkl%C3%A4rt/Das-Erste/Video?documentId=22673108&bcastId=5290'
)
self.assertEqual(
escape_url('http://褌械褋褌.褉褎/褎褉邪谐屑械薪褌'),
normalize_url('http://褌械褋褌.褉褎/褎褉邪谐屑械薪褌'),
'http://xn--e1aybc.xn--p1ai/%D1%84%D1%80%D0%B0%D0%B3%D0%BC%D0%B5%D0%BD%D1%82'
)
self.assertEqual(
escape_url('http://褌械褋褌.褉褎/邪斜胁?邪斜胁=邪斜胁#邪斜胁'),
normalize_url('http://褌械褋褌.褉褎/邪斜胁?邪斜胁=邪斜胁#邪斜胁'),
'http://xn--e1aybc.xn--p1ai/%D0%B0%D0%B1%D0%B2?%D0%B0%D0%B1%D0%B2=%D0%B0%D0%B1%D0%B2#%D0%B0%D0%B1%D0%B2'
)
self.assertEqual(escape_url('http://vimeo.com/56015672#at=0'), 'http://vimeo.com/56015672#at=0')
self.assertEqual(normalize_url('http://vimeo.com/56015672#at=0'), 'http://vimeo.com/56015672#at=0')

self.assertEqual(normalize_url('http://www.example.com/../a/b/../c/./d.html'), 'http://www.example.com/a/c/d.html')

def test_remove_dot_segments(self):
self.assertEqual(remove_dot_segments('/a/b/c/./../../g'), '/a/g')
self.assertEqual(remove_dot_segments('mid/content=5/../6'), 'mid/6')
self.assertEqual(remove_dot_segments('/ad/../cd'), '/cd')
self.assertEqual(remove_dot_segments('/ad/../cd/'), '/cd/')
self.assertEqual(remove_dot_segments('/..'), '/')
self.assertEqual(remove_dot_segments('/./'), '/')
self.assertEqual(remove_dot_segments('/./a'), '/a')
self.assertEqual(remove_dot_segments('/abc/./.././d/././e/.././f/./../../ghi'), '/ghi')
self.assertEqual(remove_dot_segments('/'), '/')
self.assertEqual(remove_dot_segments('/t'), '/t')
self.assertEqual(remove_dot_segments('t'), 't')
self.assertEqual(remove_dot_segments(''), '')
self.assertEqual(remove_dot_segments('/../a/b/c'), '/a/b/c')
self.assertEqual(remove_dot_segments('../a'), 'a')
self.assertEqual(remove_dot_segments('./a'), 'a')
self.assertEqual(remove_dot_segments('.'), '')
self.assertEqual(remove_dot_segments('////'), '////')

def test_js_to_json_vars_strings(self):
self.assertDictEqual(
Expand Down
6 changes: 3 additions & 3 deletions yt_dlp/cookies.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@
from .utils import (
Popen,
error_to_str,
escape_url,
expand_path,
is_path_like,
sanitize_url,
Expand All @@ -42,6 +41,7 @@
write_string,
)
from .utils._utils import _YDLLogger
from .utils.networking import normalize_url

CHROMIUM_BASED_BROWSERS = {'brave', 'chrome', 'chromium', 'edge', 'opera', 'vivaldi'}
SUPPORTED_BROWSERS = CHROMIUM_BASED_BROWSERS | {'firefox', 'safari'}
Expand Down Expand Up @@ -1308,7 +1308,7 @@ def prepare_line(line):

def get_cookie_header(self, url):
"""Generate a Cookie HTTP header for a given url"""
cookie_req = urllib.request.Request(escape_url(sanitize_url(url)))
cookie_req = urllib.request.Request(normalize_url(sanitize_url(url)))
self.add_cookie_header(cookie_req)
return cookie_req.get_header('Cookie')

Expand All @@ -1317,7 +1317,7 @@ def get_cookies_for_url(self, url):
# Policy `_now` attribute must be set before calling `_cookies_for_request`
# Ref: https://github.com/python/cpython/blob/3.7/Lib/http/cookiejar.py#L1360
self._policy._now = self._now = int(time.time())
return self._cookies_for_request(urllib.request.Request(escape_url(sanitize_url(url))))
return self._cookies_for_request(urllib.request.Request(normalize_url(sanitize_url(url))))

def clear(self, *args, **kwargs):
with contextlib.suppress(KeyError):
Expand Down
7 changes: 4 additions & 3 deletions yt_dlp/networking/_urllib.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,8 @@
from ..dependencies import brotli
from ..socks import ProxyError as SocksProxyError
from ..socks import sockssocket
from ..utils import escape_url, update_url_query
from ..utils import update_url_query
from ..utils.networking import normalize_url

SUPPORTED_ENCODINGS = ['gzip', 'deflate']
CONTENT_DECODE_ERRORS = [zlib.error, OSError]
Expand Down Expand Up @@ -179,7 +180,7 @@ def http_request(self, req):
# Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
# the code of this workaround has been moved here from YoutubeDL.urlopen()
url = req.get_full_url()
url_escaped = escape_url(url)
url_escaped = normalize_url(url)

# Substitute URL if any change after escaping
if url != url_escaped:
Expand Down Expand Up @@ -212,7 +213,7 @@ def http_response(self, req, resp):
if location:
# As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
location = location.encode('iso-8859-1').decode()
location_escaped = escape_url(location)
location_escaped = normalize_url(location)
if location != location_escaped:
del resp.headers['Location']
resp.headers['Location'] = location_escaped
Expand Down
5 changes: 2 additions & 3 deletions yt_dlp/networking/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,9 @@
classproperty,
deprecation_warning,
error_to_str,
escape_url,
update_url_query,
)
from ..utils.networking import HTTPHeaderDict
from ..utils.networking import HTTPHeaderDict, normalize_url

if typing.TYPE_CHECKING:
RequestData = bytes | Iterable[bytes] | typing.IO | None
Expand Down Expand Up @@ -368,7 +367,7 @@ def url(self, url):
raise TypeError('url must be a string')
elif url.startswith('//'):
url = 'http:' + url
self._url = escape_url(url)
self._url = normalize_url(url)

@property
def method(self):
Expand Down
4 changes: 3 additions & 1 deletion yt_dlp/utils/_legacy.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
import zlib

from ._utils import Popen, decode_base_n, preferredencoding
from .networking import escape_rfc3986 # noqa: F401
from .networking import normalize_url as escape_url # noqa: F401
from .traversal import traverse_obj
from ..dependencies import certifi, websockets
from ..networking._helper import make_ssl_context
Expand Down Expand Up @@ -197,7 +199,7 @@ def request_to_url(req):


def sanitized_Request(url, *args, **kwargs):
from ..utils import escape_url, extract_basic_auth, sanitize_url
from ..utils import extract_basic_auth, sanitize_url
url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
if auth_header is not None:
headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
Expand Down
17 changes: 0 additions & 17 deletions yt_dlp/utils/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2464,23 +2464,6 @@ def lowercase_escape(s):
s)


def escape_rfc3986(s):
"""Escape non-ASCII characters as suggested by RFC 3986"""
return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")


def escape_url(url):
"""Escape URL as suggested by RFC 3986"""
url_parsed = urllib.parse.urlparse(url)
return url_parsed._replace(
netloc=url_parsed.netloc.encode('idna').decode('ascii'),
path=escape_rfc3986(url_parsed.path),
params=escape_rfc3986(url_parsed.params),
query=escape_rfc3986(url_parsed.query),
fragment=escape_rfc3986(url_parsed.fragment)
).geturl()


def parse_qs(url, **kwargs):
return urllib.parse.parse_qs(urllib.parse.urlparse(url).query, **kwargs)

Expand Down
38 changes: 38 additions & 0 deletions yt_dlp/utils/networking.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,3 +115,41 @@ def clean_headers(headers: HTTPHeaderDict):
if 'Youtubedl-No-Compression' in headers: # compat
del headers['Youtubedl-No-Compression']
headers['Accept-Encoding'] = 'identity'


def remove_dot_segments(path):
# Implements RFC3986 5.2.4 remote_dot_segments
# Pseudo-code: https://tools.ietf.org/html/rfc3986#section-5.2.4
# https://github.com/urllib3/urllib3/blob/ba49f5c4e19e6bca6827282feb77a3c9f937e64b/src/urllib3/util/url.py#L263
output = []
segments = path.split('/')
for s in segments:
if s == '.':
continue
elif s == '..':
if output:
output.pop()
else:
output.append(s)
pukkandan marked this conversation as resolved.
Show resolved Hide resolved
if not segments[0] and (not output or output[0]):
output.insert(0, '')
if segments[-1] in ('.', '..'):
output.append('')
return '/'.join(output)


def escape_rfc3986(s):
"""Escape non-ASCII characters as suggested by RFC 3986"""
return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")


def normalize_url(url):
"""Normalize URL as suggested by RFC 3986"""
url_parsed = urllib.parse.urlparse(url)
return url_parsed._replace(
netloc=url_parsed.netloc.encode('idna').decode('ascii'),
path=escape_rfc3986(remove_dot_segments(url_parsed.path)),
params=escape_rfc3986(url_parsed.params),
query=escape_rfc3986(url_parsed.query),
fragment=escape_rfc3986(url_parsed.fragment)
).geturl()