Skip to content

Commit

Permalink
SOCKS proxy (#281)
Browse files Browse the repository at this point in the history
warcserver: SOCKS proxy:
- add support for running warcserver through a socks proxy specified via SOCKS_HOST and SOCKS_PORT
- move socks patch setup, http max_header adjustment to http module
- logging: print stack trace only if debugging
- add pysocks to extra_requirements, enable in ci
- add simple test (not actual proxy) to check that connection through proxy is attempted
- docs: add SOCKS proxy section to docs
  • Loading branch information
ikreymer committed Jan 17, 2018
1 parent 4f34093 commit 131c5ff
Show file tree
Hide file tree
Showing 6 changed files with 131 additions and 8 deletions.
2 changes: 1 addition & 1 deletion appveyor.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ install:
- "pip install coverage pytest-cov coveralls"
- "pip install cffi"
- "pip install pyopenssl"
- "pip install certauth boto3 youtube-dl"
- "pip install certauth boto3 youtube-dl pysocks"
- "pip install codecov"

build_script:
Expand Down
10 changes: 10 additions & 0 deletions docs/manual/configuring.rst
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,16 @@ This configures the ``/live/`` route to point to the live web.
This collection can be useful for testing, or even more powerful, when combined with recording.


SOCKS Proxy for Live Web
""""""""""""""""""""""""

pywb can be configured to use a SOCKS5 proxy when connecting to the live web. This allows pywb to be used with `Tor <https://torproject.org/>`_ and other
services that require a SOCKS proxy.

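If the ``SOCKS_HOST`` environment variable is set, pywb will attempt to route all live web traffic through the SOCKS5 proxy running on that host. The proxy port is read from the optional ``SOCKS_PORT`` environment variable and defaults to ``9050``. The ``pysocks`` package must be installed; if it is missing, ``SOCKS_HOST`` is ignored and a message is printed.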
Note that, at this time, it is not possible to configure a SOCKS proxy per pywb collection -- all live web traffic will use the SOCKS proxy if enabled.


.. _auto-all:

Auto "All" Aggregate Collection
Expand Down
1 change: 1 addition & 0 deletions extra_requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ boto3
uwsgi
git+https://github.com/t0m/pyamf.git@python3
git+https://github.com/esnme/ultrajson.git
pysocks
69 changes: 69 additions & 0 deletions pywb/warcserver/http.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,78 @@
from requests.adapters import HTTPAdapter
import requests
import os

import six.moves.http_client
# Raise the http client's limit on the number of headers it will parse
# in a single message, so header-heavy responses do not fail.
six.moves.http_client._MAXHEADERS = 10000

# Proxy mapping ({'http': ..., 'https': ...}) assigned by patch_socks() and
# reset to None by unpatch_socks(); None means no SOCKS proxy is in use.
SOCKS_PROXIES = None
# The getaddrinfo function saved by patch_socks() so unpatch_socks() can restore it.
orig_getaddrinfo = None


#=============================================================================
class DefaultAdapters(object):
    """Holder for the two shared ``HTTPAdapter`` instances.

    ``live_adapter`` and ``remote_adapter`` are each created once, when the
    class body runs, with ``max_retries=3``.
    """

    # Turn off urllib3 warnings as soon as this class is defined.
    requests.packages.urllib3.disable_warnings()

    remote_adapter = HTTPAdapter(max_retries=3)
    live_adapter = HTTPAdapter(max_retries=3)


#=============================================================================
def patch_socks():
    """Enable the SOCKS5 proxy described by the environment.

    Reads ``SOCKS_HOST`` and ``SOCKS_PORT`` (default ``9050``), registers that
    proxy as the PySocks default, replaces ``socks.socket.getaddrinfo`` with a
    wrapper that leaves non-local hostnames unresolved, and stores the
    ``socks5h://`` proxy urls in the module-level ``SOCKS_PROXIES``.

    If PySocks cannot be imported, a message is printed and nothing is changed.

    The function may be called repeatedly: the original ``getaddrinfo`` is
    captured only on the first call, so ``unpatch_socks()`` always restores
    the real resolver.

    Raises:
        ValueError: if ``SOCKS_PORT`` is set to something that is not an integer.
    """
    try:
        import socks
    except ImportError:  #pragma: no cover
        print('Ignoring SOCKS_HOST: PySocks must be installed to use SOCKS proxy')
        return

    import socket

    socks_host = os.environ.get('SOCKS_HOST')
    # Environment values are strings; the proxy port must be numeric.
    socks_port = int(os.environ.get('SOCKS_PORT', 9050))

    # Register the proxy as the PySocks default (last argument: remote DNS)
    socks.set_default_proxy(socks.PROXY_TYPE_SOCKS5, socks_host, socks_port, True)

    # Store the original getaddrinfo only once. On a repeated call the
    # currently installed function is already our wrapper, and saving it
    # would make the wrapper call itself.
    global orig_getaddrinfo
    if orig_getaddrinfo is None:
        orig_getaddrinfo = socks.socket.getaddrinfo

    # Bind locally so the wrapper keeps working even after the module-level
    # reference has been cleared by unpatch_socks().
    real_getaddrinfo = orig_getaddrinfo

    def getaddrinfo(host, port, *args, **kwargs):
        """Resolve local names normally; pass all other hosts through unresolved."""
        if host in ('127.0.0.1', 'localhost'):
            return real_getaddrinfo(host, port, *args, **kwargs)

        # Hand back the hostname as-is so that the lookup is left to the proxy
        return [(socket.AF_INET, socket.SOCK_STREAM, 6, '', (host, port))]

    socks.socket.getaddrinfo = getaddrinfo

    socks_url = 'socks5h://{0}:{1}'.format(socks_host, socks_port)

    global SOCKS_PROXIES
    SOCKS_PROXIES = {'http': socks_url,
                     'https': socks_url}

# =============================================================================
def unpatch_socks():
    """Reverse the changes made by ``patch_socks()``.

    When a saved ``getaddrinfo`` exists it is put back on ``socks.socket``,
    and both ``orig_getaddrinfo`` and ``SOCKS_PROXIES`` are reset to ``None``.
    When nothing was saved, the function does nothing.
    """
    global orig_getaddrinfo, SOCKS_PROXIES

    if orig_getaddrinfo:
        import socks

        socks.socket.getaddrinfo = orig_getaddrinfo
        orig_getaddrinfo = None
        SOCKS_PROXIES = None


# =============================================================================
# Runs at import time: enable the SOCKS proxy whenever SOCKS_HOST is set
# (to a non-empty value) in the environment.
if os.environ.get('SOCKS_HOST'):
    patch_socks()



19 changes: 12 additions & 7 deletions pywb/warcserver/resource/responseloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from pywb.warcserver.resource.resolvingloader import ResolvingLoader
from pywb.warcserver.resource.pathresolvers import DefaultResolverMixin

from pywb.warcserver.http import DefaultAdapters
from pywb.warcserver.http import DefaultAdapters, SOCKS_PROXIES

from six.moves.urllib.parse import urlsplit, quote, unquote

Expand All @@ -30,9 +30,6 @@

from requests.models import PreparedRequest

import six.moves.http_client
six.moves.http_client._MAXHEADERS = 10000

logger = logging.getLogger('warcserver')


Expand Down Expand Up @@ -447,11 +444,15 @@ def _do_request_with_redir_check(self, method, load_url,

def _do_request(self, method, load_url, data, req_headers, params, is_live):
adapter = DefaultAdapters.live_adapter if is_live else DefaultAdapters.remote_adapter
pool = adapter.poolmanager
max_retries = adapter.max_retries

if SOCKS_PROXIES:
conn = adapter.get_connection(load_url, SOCKS_PROXIES)
else:
conn = adapter.poolmanager

try:
upstream_res = pool.urlopen(method=method,
upstream_res = conn.urlopen(method=method,
url=load_url,
body=data,
headers=req_headers,
Expand All @@ -465,7 +466,11 @@ def _do_request(self, method, load_url, data, req_headers, params, is_live):
return upstream_res

except Exception as e:
logger.debug('FAILED: ' + method + ' ' + load_url + ': ' + str(e))
if logger.isEnabledFor(logging.DEBUG):
import traceback
traceback.print_exc()
logger.debug('FAILED: ' + method + ' ' + load_url + ': ' + str(e))

raise LiveResourceException(load_url)

def get_custom_metadata(self, content_type, dt):
Expand Down
38 changes: 38 additions & 0 deletions tests/test_socks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
from .base_config_test import BaseConfigTest, fmod_sl

import pywb.warcserver.http as pywb_http
import os
import socket
import gevent
import pytest


# ============================================================================
class TestSOCKSProxy(BaseConfigTest):
    """Check that live requests are attempted through a configured SOCKS proxy.

    No real proxy is started: the class points pywb at ``localhost:8080`` and
    expects the live request to fail, which shows the proxy route was used.
    Everything changed in ``setup_class`` (environment variables, the socket
    patch, the proxy mapping copied into ``responseloader``) is undone in
    ``teardown_class`` so later test classes are unaffected.
    """

    # Environment variables overridden for the duration of this class
    SOCKS_ENV_VARS = ('SOCKS_HOST', 'SOCKS_PORT')

    @classmethod
    def setup_class(cls):
        # Remember the previous values so teardown can restore them
        cls._saved_env = dict((name, os.environ.get(name))
                              for name in cls.SOCKS_ENV_VARS)

        os.environ['SOCKS_HOST'] = 'localhost'
        os.environ['SOCKS_PORT'] = '8080'

        pywb_http.patch_socks()

        # responseloader imported SOCKS_PROXIES by value, so update its copy too
        import pywb.warcserver.resource.responseloader
        pywb.warcserver.resource.responseloader.SOCKS_PROXIES = pywb_http.SOCKS_PROXIES
        super(TestSOCKSProxy, cls).setup_class('config_test.yaml')

    @classmethod
    def teardown_class(cls):
        pywb_http.unpatch_socks()

        # Mirror the (now cleared) proxy mapping back into responseloader
        import pywb.warcserver.resource.responseloader
        pywb.warcserver.resource.responseloader.SOCKS_PROXIES = pywb_http.SOCKS_PROXIES

        # Put the environment back the way it was before setup_class
        for name, value in cls._saved_env.items():
            if value is None:
                os.environ.pop(name, None)
            else:
                os.environ[name] = value

        super(TestSOCKSProxy, cls).teardown_class()

    def test_socks_proxy_set(self):
        # patch_socks() is a no-op without PySocks, so there is nothing to check
        pytest.importorskip('socks')
        assert pywb_http.SOCKS_PROXIES == {'http': 'socks5h://localhost:8080',
                                           'https': 'socks5h://localhost:8080'
                                          }

    def test_socks_attempt_connect(self, fmod_sl):
        pytest.importorskip('socks')
        # the test starts no proxy on localhost:8080, so the request
        # is expected to fail if the SOCKS route is being used
        resp = self.get('/live/{0}http://httpbin.org/get', fmod_sl, status=400)
        assert resp.status_int == 400


0 comments on commit 131c5ff

Please sign in to comment.