From 8c7a43b4a4ca0c8d36d55f132daa2a43d06fe3c4 Mon Sep 17 00:00:00 2001
From: Jorge
Date: Thu, 12 Mar 2020 01:39:39 -0700
Subject: [PATCH] Add support for HTTPS connections to proxies. (#1679)

* Add support to talk HTTPS to proxies.

  Currently there's no way to validate the identity of the proxy you might be
  connecting to. Proxies supporting HTTPS endpoints are becoming more common
  and we need to extend the support for them. When an HTTPS proxy is provided,
  instead of doing the HTTP CONNECT, we'll forward any requests directly to
  the proxy and ultimately to the destination.

* Fix proxy_headers missing on HTTPS proxy connections.

* blackfmt missing files.

* Prevent usage of HTTPS proxies when fetching HTTPS resources.

  - Will be supported by default when we can do TLS within TLS.

* Update proxy documentation with more information.

* Renamed flag for HTTPS websites through HTTPS proxies.

* Added myself to contributors.

* Documentation and contributors fixes.

* Removed mention that TLS in TLS is being developed as requested.

* Space in between my name and the github page.

* Add flag to enable HTTPS proxy support.

  Now that we're adding support for HTTPS proxies we want to avoid a breaking
  change with clients that had an improper proxy configuration. For now, we're
  adding a warning and defaulting to the previous behavior. In the future
  we'll change the behavior to enable HTTPS proxies by default.

* Remove guard flag, error out on HTTPS/HTTPS.

  As requested in the last revision for the PR:

  - Removed the _enable_https_proxies flag. Instead the feature will be
    enabled and will error out on invalid configurations (HTTPS + HTTPS).
  - Other comments: rename a method, parentheses to clarify order of
    operations.
---
 CONTRIBUTORS.txt                               |  3 +
 docs/advanced-usage.rst                        | 26 +++++--
 dummyserver/proxy.py                           |  8 ++
 dummyserver/testcase.py                        |  9 +++
 src/urllib3/connection.py                      | 12 +--
 src/urllib3/connectionpool.py                  | 19 +++--
 src/urllib3/exceptions.py                      |  5 ++
 src/urllib3/poolmanager.py                     | 74 ++++++++++++++++---
 test/test_proxymanager.py                      | 18 ++++-
 .../test_proxy_poolmanager.py                  | 67 ++++++++++++++++-
 10 files changed, 211 insertions(+), 30 deletions(-)

diff --git a/CONTRIBUTORS.txt b/CONTRIBUTORS.txt
index 5eeaeb9013..5887c3aef7 100644
--- a/CONTRIBUTORS.txt
+++ b/CONTRIBUTORS.txt
@@ -282,6 +282,9 @@ In chronological order:
   * Remove a spurious TypeError from the exception chain inside
     HTTPConnectionPool._make_request(), also for BaseExceptions.
 
+* Jorge Lopez Silva
+  * Added support for forwarding requests through HTTPS proxies.
+
 * Benno Rice
   * Allow cadata parameter to be passed to underlying
     ``SSLContext.load_verify_locations()``.
diff --git a/docs/advanced-usage.rst b/docs/advanced-usage.rst
index c842757070..5bcc93adc5 100644
--- a/docs/advanced-usage.rst
+++ b/docs/advanced-usage.rst
@@ -72,7 +72,7 @@ Setting ``preload_content`` to ``False`` means that urllib3 will stream the
 response content. :meth:`~response.HTTPResponse.stream` lets you iterate over
 chunks of the response content.
 
-.. note:: When using ``preload_content=False``, you should call 
+.. note:: When using ``preload_content=False``, you should call
     :meth:`~response.HTTPResponse.release_conn` to release the http connection
     back to the connection pool so that it can be re-used.
 
@@ -87,7 +87,7 @@ a file-like object. This allows you to do buffering::
 
     b'\x88\x1f\x8b\xe5'
 
 Calls to :meth:`~response.HTTPResponse.read()` will block until more response
-data is available. 
+data is available.
 
     >>> import io
     >>> reader = io.BufferedReader(r, 8)
@@ -122,10 +122,24 @@ HTTP proxy::
 
 The usage of :class:`~poolmanager.ProxyManager` is the same as
 :class:`~poolmanager.PoolManager`.
 
-You can use :class:`~contrib.socks.SOCKSProxyManager` to connect to SOCKS4 or
-SOCKS5 proxies. In order to use SOCKS proxies you will need to install
-`PySocks `_ or install urllib3 with the
-``socks`` extra::
+You can connect to a proxy using HTTP, HTTPS or SOCKS. urllib3's behavior will
+be different depending on the type of proxy you selected and the destination
+you're contacting.
+
+When contacting an HTTP website through an HTTP or HTTPS proxy, the request will
+be forwarded with the `absolute URI `_.
+
+When contacting an HTTPS website through an HTTP proxy, a TCP tunnel will be
+established with an HTTP CONNECT. Afterward a TLS connection will be established
+with the destination and your request will be sent.
+
+Contacting HTTPS websites through HTTPS proxies is currently not supported.
+
+For SOCKS, you can use :class:`~contrib.socks.SOCKSProxyManager` to connect to
+SOCKS4 or SOCKS5 proxies. In order to use SOCKS proxies you will need to
+install `PySocks `_ or install urllib3 with
+the ``socks`` extra::
 
     pip install urllib3[socks]
 
diff --git a/dummyserver/proxy.py b/dummyserver/proxy.py
index c4f0b824f7..42f293104d 100755
--- a/dummyserver/proxy.py
+++ b/dummyserver/proxy.py
@@ -34,6 +34,7 @@
 import tornado.iostream
 import tornado.web
 import tornado.httpclient
+import ssl
 
 __all__ = ["ProxyHandler", "run_proxy"]
 
@@ -66,6 +67,12 @@ def handle_response(response):
                 self.write(response.body)
             self.finish()
 
+        upstream_ca_certs = self.application.settings.get("upstream_ca_certs", None)
+        ssl_options = None
+
+        if upstream_ca_certs:
+            ssl_options = ssl.create_default_context(cafile=upstream_ca_certs)
+
         req = tornado.httpclient.HTTPRequest(
             url=self.request.uri,
             method=self.request.method,
@@ -73,6 +80,7 @@ def handle_response(response):
             headers=self.request.headers,
             follow_redirects=False,
             allow_nonstandard_methods=True,
+            ssl_options=ssl_options,
         )
 
         client = tornado.httpclient.AsyncHTTPClient()
diff --git a/dummyserver/testcase.py b/dummyserver/testcase.py
index 90c6b2240c..4e3cf593c5 100644
--- a/dummyserver/testcase.py
+++ b/dummyserver/testcase.py
@@ -180,6 +180,14 @@ def setup_class(cls):
             app, cls.io_loop, None, "http", cls.proxy_host
         )
 
+        upstream_ca_certs = cls.https_certs.get("ca_certs", None)
+        app = web.Application(
+            [(r".*", ProxyHandler)], upstream_ca_certs=upstream_ca_certs
+        )
+        cls.https_proxy_server, cls.https_proxy_port = run_tornado_app(
+            app, cls.io_loop, cls.https_certs, "https", cls.proxy_host
+        )
+
         cls.server_thread = run_loop_in_thread(cls.io_loop)
 
     @classmethod
@@ -187,6 +195,7 @@ def teardown_class(cls):
         cls.io_loop.add_callback(cls.http_server.stop)
         cls.io_loop.add_callback(cls.https_server.stop)
         cls.io_loop.add_callback(cls.proxy_server.stop)
+        cls.io_loop.add_callback(cls.https_proxy_server.stop)
         cls.io_loop.add_callback(cls.io_loop.stop)
         cls.server_thread.join()
 
diff --git a/src/urllib3/connection.py b/src/urllib3/connection.py
index 6da1cf4b6d..ce94b25640 100644
--- a/src/urllib3/connection.py
+++ b/src/urllib3/connection.py
@@ -111,7 +111,6 @@ def __init__(self, *args, **kw):
         #: The socket options provided by the user. If no options are
         #: provided, we use the default options.
         self.socket_options = kw.pop("socket_options", self.default_socket_options)
-
         _HTTPConnection.__init__(self, *args, **kw)
 
     @property
@@ -174,10 +173,13 @@ def _new_conn(self):
 
         return conn
 
+    def _is_using_tunnel(self):
+        # Google App Engine's httplib does not define _tunnel_host
+        return getattr(self, "_tunnel_host", None)
+
     def _prepare_conn(self, conn):
         self.sock = conn
-        # Google App Engine's httplib does not define _tunnel_host
-        if getattr(self, "_tunnel_host", None):
+        if self._is_using_tunnel():
             # TODO: Fix tunnel so it doesn't depend on self.sock state.
             self._tunnel()
             # Mark this connection as not reusable
@@ -308,9 +310,9 @@ def connect(self):
         conn = self._new_conn()
         hostname = self.host
 
-        # Google App Engine's httplib does not define _tunnel_host
-        if getattr(self, "_tunnel_host", None):
+        if self._is_using_tunnel():
             self.sock = conn
+
             # Calls self._set_hostport(), so self.host is
             # self._tunnel_host below.
             self._tunnel()
diff --git a/src/urllib3/connectionpool.py b/src/urllib3/connectionpool.py
index d42eb7be67..a09c78f1f7 100644
--- a/src/urllib3/connectionpool.py
+++ b/src/urllib3/connectionpool.py
@@ -629,10 +629,10 @@ def urlopen(
         # [1]
         release_this_conn = release_conn
 
-        # Merge the proxy headers. Only do this in HTTP. We have to copy the
-        # headers dict so we can safely change it without those changes being
-        # reflected in anyone else's copy.
-        if self.scheme == "http":
+        # Merge the proxy headers. Only done when not using HTTP CONNECT. We
+        # have to copy the headers dict so we can safely change it without those
+        # changes being reflected in anyone else's copy.
+        if self.scheme == "http" or (self.proxy and self.proxy.scheme == "https"):
             headers = headers.copy()
             headers.update(self.proxy_headers)
 
@@ -941,10 +941,15 @@ def _prepare_conn(self, conn):
 
     def _prepare_proxy(self, conn):
         """
-        Establish tunnel connection early, because otherwise httplib
-        would improperly set Host: header to proxy's IP:port.
+        Establishes a tunnel connection through HTTP CONNECT.
+
+        Tunnel connection is established early because otherwise httplib would
+        improperly set Host: header to proxy's IP:port.
         """
-        conn.set_tunnel(self._proxy_host, self.port, self.proxy_headers)
+
+        if self.proxy.scheme != "https":
+            conn.set_tunnel(self._proxy_host, self.port, self.proxy_headers)
+
         conn.connect()
 
     def _new_conn(self):
diff --git a/src/urllib3/exceptions.py b/src/urllib3/exceptions.py
index 0a74c79b5e..202ba58a9f 100644
--- a/src/urllib3/exceptions.py
+++ b/src/urllib3/exceptions.py
@@ -242,6 +242,11 @@ def __init__(self, scheme):
         super(ProxySchemeUnknown, self).__init__(message)
 
 
+class ProxySchemeUnsupported(ValueError):
+    "Fetching HTTPS resources through HTTPS proxies is unsupported"
+    pass
+
+
 class HeaderParsingError(HTTPError):
     "Raised by assert_header_parsing, but we convert it to a log.warning statement."
diff --git a/src/urllib3/poolmanager.py b/src/urllib3/poolmanager.py
index 242a2f8203..d081d753b6 100644
--- a/src/urllib3/poolmanager.py
+++ b/src/urllib3/poolmanager.py
@@ -2,11 +2,18 @@
 import collections
 import functools
 import logging
+import warnings
 
 from ._collections import RecentlyUsedContainer
 from .connectionpool import HTTPConnectionPool, HTTPSConnectionPool
 from .connectionpool import port_by_scheme
-from .exceptions import LocationValueError, MaxRetryError, ProxySchemeUnknown
+from .exceptions import (
+    HTTPWarning,
+    LocationValueError,
+    MaxRetryError,
+    ProxySchemeUnknown,
+    ProxySchemeUnsupported,
+)
 from .packages import six
 from .packages.six.moves.urllib.parse import urljoin
 from .request import RequestMethods
@@ -17,6 +24,12 @@
 
 __all__ = ["PoolManager", "ProxyManager", "proxy_from_url"]
 
+class InvalidProxyConfigurationWarning(HTTPWarning):
+    """Raised when a user has an HTTPS proxy without enabling HTTPS proxies."""
+
+    pass
+
+
 log = logging.getLogger(__name__)
 
 SSL_KEYWORDS = (
@@ -306,6 +319,18 @@ def _merge_pool_kwargs(self, override):
                     base_pool_kwargs[key] = value
         return base_pool_kwargs
 
+    def _proxy_requires_url_absolute_form(self, parsed_url):
+        """
+        Indicates if the proxy requires the complete destination URL in the
+        request.
+
+        Normally this is only needed when not using an HTTP CONNECT tunnel.
+        """
+        if self.proxy is None:
+            return False
+
+        return parsed_url.scheme == "http" or self.proxy.scheme == "https"
+
     def urlopen(self, method, url, redirect=True, **kw):
         """
         Same as :meth:`urllib3.connectionpool.HTTPConnectionPool.urlopen`
@@ -324,7 +349,7 @@ def urlopen(self, method, url, redirect=True, **kw):
         if "headers" not in kw:
             kw["headers"] = self.headers.copy()
 
-        if self.proxy is not None and u.scheme == "http":
+        if self._proxy_requires_url_absolute_form(u):
             response = conn.urlopen(method, url, **kw)
         else:
             response = conn.urlopen(method, u.request_uri, **kw)
@@ -383,6 +408,12 @@ class ProxyManager(PoolManager):
         HTTPS/CONNECT case they are sent only once. Could be used for proxy
         authentication.
 
+    :param _allow_https_proxy_to_see_traffic:
+        Allows forwarding of HTTPS requests to HTTPS proxies. The proxy will
+        have visibility of all the traffic sent. ONLY USE IF YOU KNOW WHAT
+        YOU'RE DOING. This flag might be removed at any time in any future
+        update.
+
     Example:
 
         >>> proxy = urllib3.ProxyManager('http://localhost:3128/')
         >>> r1 = proxy.request('GET', 'http://google.com/')
@@ -402,6 +433,7 @@ def __init__(
         num_pools=10,
         headers=None,
         proxy_headers=None,
+        _allow_https_proxy_to_see_traffic=False,
         **connection_pool_kw
     ):
 
@@ -412,19 +444,22 @@ def __init__(
                 proxy_url.port,
             )
         proxy = parse_url(proxy_url)
-        if not proxy.port:
-            port = port_by_scheme.get(proxy.scheme, 80)
-            proxy = proxy._replace(port=port)
 
         if proxy.scheme not in ("http", "https"):
             raise ProxySchemeUnknown(proxy.scheme)
 
+        if not proxy.port:
+            port = port_by_scheme.get(proxy.scheme, 80)
+            proxy = proxy._replace(port=port)
+
         self.proxy = proxy
         self.proxy_headers = proxy_headers or {}
 
         connection_pool_kw["_proxy"] = self.proxy
         connection_pool_kw["_proxy_headers"] = self.proxy_headers
 
+        self.allow_insecure_proxy = _allow_https_proxy_to_see_traffic
+
         super(ProxyManager, self).__init__(num_pools, headers, **connection_pool_kw)
 
     def connection_from_host(self, host, port=None, scheme="http", pool_kwargs=None):
@@ -452,14 +487,35 @@ def _set_proxy_headers(self, url, headers=None):
             headers_.update(headers)
         return headers_
 
+    def _validate_proxy_scheme_url_selection(self, url_scheme):
+        if (
+            url_scheme == "https"
+            and self.proxy.scheme == "https"
+            and not self.allow_insecure_proxy
+        ):
+            warnings.warn(
+                "Your proxy configuration specified an HTTPS scheme for the proxy. "
+                "Are you sure you want to use HTTPS to contact the proxy? "
+                "This most likely indicates an error in your configuration. "
+                "If you are sure you want to use HTTPS to contact the proxy, enable "
+                "the _allow_https_proxy_to_see_traffic flag.",
+                InvalidProxyConfigurationWarning,
+            )
+
+            raise ProxySchemeUnsupported(
+                "Contacting HTTPS destinations through HTTPS proxies is not supported."
+            )
+
     def urlopen(self, method, url, redirect=True, **kw):
         "Same as HTTP(S)ConnectionPool.urlopen, ``url`` must be absolute."
         u = parse_url(url)
+        self._validate_proxy_scheme_url_selection(u.scheme)
 
-        if u.scheme == "http":
-            # For proxied HTTPS requests, httplib sets the necessary headers
-            # on the CONNECT to the proxy. For HTTP, we'll definitely
-            # need to set 'Host' at the very least.
+        if u.scheme == "http" or self.proxy.scheme == "https":
+            # For connections using HTTP CONNECT, httplib sets the necessary
+            # headers on the CONNECT to the proxy. For HTTP or when talking
+            # HTTPS to the proxy, we'll definitely need to set 'Host' at the
+            # very least.
             headers = kw.get("headers", self.headers)
             kw["headers"] = self._set_proxy_headers(url, headers)
 
diff --git a/test/test_proxymanager.py b/test/test_proxymanager.py
index 51adc4b805..fb947dae38 100644
--- a/test/test_proxymanager.py
+++ b/test/test_proxymanager.py
@@ -1,12 +1,15 @@
 import pytest
 
 from urllib3.poolmanager import ProxyManager
+from urllib3.util.url import parse_url
 
 
 class TestProxyManager(object):
-    def test_proxy_headers(self):
+    @pytest.mark.parametrize("proxy_scheme", ["http", "https"])
+    def test_proxy_headers(self, proxy_scheme):
         url = "http://pypi.org/project/urllib3/"
-        with ProxyManager("http://something:1234") as p:
+        proxy_url = "{}://something:1234".format(proxy_scheme)
+        with ProxyManager(proxy_url) as p:
             # Verify default headers
             default_headers = {"Accept": "*/*", "Host": "pypi.org"}
             headers = p._set_proxy_headers(url)
@@ -43,3 +46,14 @@ def test_invalid_scheme(self):
             ProxyManager("invalid://host/p")
         with pytest.raises(ValueError):
             ProxyManager("invalid://host/p")
+
+    def test_proxy_tunnel(self):
+        http_url = parse_url("http://example.com")
+        https_url = parse_url("https://example.com")
+        with ProxyManager("http://proxy:8080") as p:
+            assert p._proxy_requires_url_absolute_form(http_url)
+            assert p._proxy_requires_url_absolute_form(https_url) is False
+
+        with ProxyManager("https://proxy:8080") as p:
+            assert p._proxy_requires_url_absolute_form(http_url)
+            assert p._proxy_requires_url_absolute_form(https_url)
diff --git a/test/with_dummyserver/test_proxy_poolmanager.py b/test/with_dummyserver/test_proxy_poolmanager.py
index 063ab36021..feface09e2 100644
--- a/test/with_dummyserver/test_proxy_poolmanager.py
+++ b/test/with_dummyserver/test_proxy_poolmanager.py
@@ -14,7 +14,13 @@
 from urllib3._collections import HTTPHeaderDict
 from urllib3.poolmanager import proxy_from_url, ProxyManager
-from urllib3.exceptions import MaxRetryError, SSLError, ProxyError, ConnectTimeoutError
+from urllib3.exceptions import (
+    MaxRetryError,
+    SSLError,
+    ProxyError,
+    ConnectTimeoutError,
+    ProxySchemeUnsupported,
+)
 from urllib3.connectionpool import connection_from_url, VerifiedHTTPSConnection
 
 from test import SHORT_TIMEOUT, LONG_TIMEOUT
@@ -32,6 +38,7 @@ def setup_class(cls):
         cls.https_url = "https://%s:%d" % (cls.https_host, cls.https_port)
         cls.https_url_alt = "https://%s:%d" % (cls.https_host_alt, cls.https_port)
         cls.proxy_url = "http://%s:%d" % (cls.proxy_host, cls.proxy_port)
+        cls.https_proxy_url = "https://%s:%d" % (cls.proxy_host, cls.https_proxy_port,)
 
         # Generate another CA to test verification failure
         cls.certs_dir = tempfile.mkdtemp()
@@ -53,6 +60,23 @@ def test_basic_proxy(self):
             r = http.request("GET", "%s/" % self.https_url)
             assert r.status == 200
 
+    def test_https_proxy(self):
+        with proxy_from_url(self.https_proxy_url, ca_certs=DEFAULT_CA) as https:
+            r = https.request("GET", "%s/" % self.http_url)
+            assert r.status == 200
+
+            with pytest.raises(ProxySchemeUnsupported):
+                https.request("GET", "%s/" % self.https_url)
+
+        with proxy_from_url(
+            self.https_proxy_url,
+            ca_certs=DEFAULT_CA,
+            _allow_https_proxy_to_see_traffic=True,
+        ) as https:
+            r = https.request("GET", "%s/" % self.http_url)
+            https.request("GET", "%s/" % self.https_url)
+            assert r.status == 200
+
     def test_nagle_proxy(self):
         """ Test that proxy connections do not have TCP_NODELAY turned on """
         with ProxyManager(self.proxy_url) as http:
@@ -275,6 +299,47 @@ def test_headers(self):
             self.https_port,
         )
 
+    def test_https_headers(self):
+        with proxy_from_url(
+            self.https_proxy_url,
+            headers={"Foo": "bar"},
proxy_headers={"Hickory": "dickory"}, + ca_certs=DEFAULT_CA, + ) as http: + + r = http.request_encode_url("GET", "%s/headers" % self.http_url) + returned_headers = json.loads(r.data.decode()) + assert returned_headers.get("Foo") == "bar" + assert returned_headers.get("Hickory") == "dickory" + assert returned_headers.get("Host") == "%s:%s" % ( + self.http_host, + self.http_port, + ) + + r = http.request_encode_url("GET", "%s/headers" % self.http_url_alt) + returned_headers = json.loads(r.data.decode()) + assert returned_headers.get("Foo") == "bar" + assert returned_headers.get("Hickory") == "dickory" + assert returned_headers.get("Host") == "%s:%s" % ( + self.http_host_alt, + self.http_port, + ) + + with pytest.raises(ProxySchemeUnsupported): + http.request_encode_url("GET", "%s/headers" % self.https_url) + + r = http.request_encode_url( + "GET", "%s/headers" % self.http_url, headers={"Baz": "quux"} + ) + returned_headers = json.loads(r.data.decode()) + assert returned_headers.get("Foo") is None + assert returned_headers.get("Baz") == "quux" + assert returned_headers.get("Hickory") == "dickory" + assert returned_headers.get("Host") == "%s:%s" % ( + self.http_host, + self.http_port, + ) + def test_headerdict(self): default_headers = HTTPHeaderDict(a="b") proxy_headers = HTTPHeaderDict()