introduce MAX_REDIRECTS config setting and fix urllib3 redirect handling

Fixes issue adbar#450

After setting `MAX_REDIRECTS` to 5, I could fetch the original URL from the issue:
`trafilatura -u https://www.hydrogeninsight.com/production/breaking-us-reveals-the-seven-regional-hydrogen-hubs-to-receive-7bn-of-government-funding/2-1-1534596`

I also fixed this old issue: adbar#128
The underlying urllib3 bug has not been fixed: urllib3/urllib3#2475

I had to pass the retry strategy to the actual request method: it doesn't propagate from the pool manager.
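
A minimal sketch of that workaround, assuming urllib3's public `Retry`/`PoolManager` API (the URL and the redirect budget here are examples, not values from this commit):

```python
import urllib3

# Retry strategy whose redirect budget would come from MAX_REDIRECTS
retry_strategy = urllib3.util.Retry(total=2, redirect=2, connect=0)

# The pool is created once with the strategy...
pool = urllib3.PoolManager(retries=retry_strategy)

# ...but because the setting doesn't reliably propagate from the pool,
# the strategy is passed again on the request itself so the redirect
# limit is actually enforced.
response = pool.request(
    "GET",
    "https://httpbun.com/redirect/2",
    retries=retry_strategy,
)
print(response.status)
```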
vbarbaresi committed Dec 30, 2023
1 parent d31c8d7 commit dfc03f6
Showing 4 changed files with 29 additions and 8 deletions.
1 change: 1 addition & 0 deletions docs/settings.rst
@@ -43,6 +43,7 @@ The default file included in the package is `settings.cfg <https://github.com/ad
* ``EXTENSIVE_DATE_SEARCH = on`` set to ``off`` to deactivate ``htmldate``'s opportunistic search (lower recall, higher precision)
- Navigation
* ``EXTERNAL_URLS = off`` do not take URLs from other websites in feeds and sitemaps (CLI mode)
+* ``MAX_REDIRECTS = 2``: maximum number of `URL redirections <https://en.wikipedia.org/wiki/URL_redirection>`_ to be followed. Set to 0 to not follow any redirection.


Using a custom file on the command-line
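Beyond the documented default, the setting can also be overridden programmatically, which is exactly what the new test below does. A minimal sketch, assuming a fresh process so the cached pool objects pick the value up (URL and value are examples):

```python
from trafilatura import fetch_url
from trafilatura.settings import use_config

config = use_config()  # fresh copy of the packaged settings, leaves the default untouched
config.set("DEFAULT", "MAX_REDIRECTS", "0")  # refuse to follow any redirect

# The target redirects once, so with a budget of 0 nothing is fetched.
result = fetch_url("http://httpbin.org/redirect/1", config=config)
assert result is None
```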
14 changes: 12 additions & 2 deletions tests/downloads_tests.py
@@ -33,7 +33,7 @@
_send_pycurl_request, _send_request,
_urllib3_is_live_page,
add_to_compressed_dict, fetch_url,
-is_live_page, load_download_buffer)
+is_live_page, load_download_buffer, _reset_global_objects)
from trafilatura.settings import DEFAULT_CONFIG, use_config
from trafilatura.utils import decode_response, load_html

@@ -94,7 +94,17 @@ def test_fetch():
    assert load_html(response) is not None
    # nothing to see here
    assert extract(response, url=response.url, config=ZERO_CONFIG) is None
-
+    # test handling redirects
+    res = fetch_url('https://httpbun.com/redirect/2')
+    assert len(res) > 100  # we followed redirects and downloaded something in the end
+    new_config = use_config()  # get a new config instance to avoid mutating the default one
+    # patch max redirects: limit to 0, so no page will be fetched as a result
+    new_config.set('DEFAULT', 'MAX_REDIRECTS', '0')
+    _reset_global_objects()  # force Retry strategy and PoolManager to be recreated with the new config value
+    res = fetch_url('http://httpbin.org/redirect/1', config=new_config)
+    assert res is None
+    # also test the max redirect implementation on pycurl
+    assert _send_pycurl_request('http://httpbin.org/redirect/1', True, new_config) is None

def test_config():
    '''Test how configuration options are read and stored.'''
20 changes: 14 additions & 6 deletions trafilatura/downloads.py
@@ -43,7 +43,6 @@
PKG_VERSION = version("trafilatura")

NUM_CONNECTIONS = 50
-MAX_REDIRECTS = 2

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
HTTP_POOL = None
@@ -90,8 +89,8 @@ def _send_request(url, no_ssl, config):
    global HTTP_POOL, NO_CERT_POOL, RETRY_STRATEGY
    if not RETRY_STRATEGY:
        RETRY_STRATEGY = urllib3.util.Retry(
-            total=0,
-            redirect=MAX_REDIRECTS,  # raise_on_redirect=False,
+            total=config.getint("DEFAULT", "MAX_REDIRECTS"),
+            redirect=config.getint("DEFAULT", "MAX_REDIRECTS"),  # raise_on_redirect=False,
            connect=0,
            backoff_factor=config.getint('DEFAULT', 'DOWNLOAD_TIMEOUT')/2,
            status_forcelist=[
@@ -107,13 +106,13 @@ def _send_request(url, no_ssl, config):
            if not HTTP_POOL:
                HTTP_POOL = urllib3.PoolManager(retries=RETRY_STRATEGY, timeout=config.getint('DEFAULT', 'DOWNLOAD_TIMEOUT'), ca_certs=certifi.where(), num_pools=NUM_CONNECTIONS)  # cert_reqs='CERT_REQUIRED'
            # execute request
-            response = HTTP_POOL.request('GET', url, headers=_determine_headers(config))
+            response = HTTP_POOL.request('GET', url, headers=_determine_headers(config), retries=RETRY_STRATEGY)
        else:
            # define pool
            if not NO_CERT_POOL:
                NO_CERT_POOL = urllib3.PoolManager(retries=RETRY_STRATEGY, timeout=config.getint('DEFAULT', 'DOWNLOAD_TIMEOUT'), cert_reqs='CERT_NONE', num_pools=NUM_CONNECTIONS)
            # execute request
-            response = NO_CERT_POOL.request('GET', url, headers=_determine_headers(config))
+            response = NO_CERT_POOL.request('GET', url, headers=_determine_headers(config), retries=RETRY_STRATEGY)
    except urllib3.exceptions.SSLError:
        LOGGER.warning('retrying after SSLError: %s', url)
        return _send_request(url, True, config)
@@ -275,7 +274,7 @@ def _send_pycurl_request(url, no_ssl, config):
    curl.setopt(pycurl.HTTPHEADER, headerlist)
    # curl.setopt(pycurl.USERAGENT, '')
    curl.setopt(pycurl.FOLLOWLOCATION, 1)
-    curl.setopt(pycurl.MAXREDIRS, MAX_REDIRECTS)
+    curl.setopt(pycurl.MAXREDIRS, config.getint('DEFAULT', 'MAX_REDIRECTS'))
    curl.setopt(pycurl.CONNECTTIMEOUT, config.getint('DEFAULT', 'DOWNLOAD_TIMEOUT'))
    curl.setopt(pycurl.TIMEOUT, config.getint('DEFAULT', 'DOWNLOAD_TIMEOUT'))
    curl.setopt(pycurl.NOSIGNAL, 1)
@@ -327,3 +326,12 @@ def _send_pycurl_request(url, no_ssl, config):
    # tidy up
    curl.close()
    return RawResponse(bufferbytes, respcode, effective_url)


+def _reset_global_objects():
+    """
+    Force global objects to be re-created.
+    Currently only used during tests.
+    """
+    global HTTP_POOL, NO_CERT_POOL, RETRY_STRATEGY
+    HTTP_POOL, NO_CERT_POOL, RETRY_STRATEGY = None, None, None
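
The reset helper is needed because the pool and retry objects are module-level singletons: they are built lazily from whatever config the first request sees, so a later config change would otherwise be ignored. A simplified sketch of that pattern (hypothetical names, not the module's actual code):

```python
_POOL = None  # module-level cache, built on first use

def _get_pool(max_redirects):
    """Create the object once; subsequent calls reuse the first configuration."""
    global _POOL
    if _POOL is None:
        _POOL = {"redirect_budget": max_redirects}
    return _POOL

def _reset():
    """Discard the cached object so the next call rebuilds it with fresh settings."""
    global _POOL
    _POOL = None

print(_get_pool(2))  # {'redirect_budget': 2}
print(_get_pool(5))  # still {'redirect_budget': 2} - the first value won
_reset()
print(_get_pool(5))  # {'redirect_budget': 5} after the reset
```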
2 changes: 2 additions & 0 deletions trafilatura/settings.cfg
@@ -12,6 +12,8 @@ SLEEP_TIME = 5
USER_AGENTS =
# cookie for HTTP requests
COOKIE =
+# Maximum number of redirects that we will follow
+MAX_REDIRECTS = 2

# Extraction
MIN_EXTRACTED_SIZE = 250
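To make the higher limit from the commit message permanent, the packaged file can be copied and edited, then loaded by path. A sketch assuming `use_config` accepts a settings file name, as the docs' custom-file section describes (the file name is an example):

```python
from trafilatura import fetch_url
from trafilatura.settings import use_config

# my_settings.cfg is a copy of the packaged settings.cfg with MAX_REDIRECTS = 5
my_config = use_config(filename="my_settings.cfg")
result = fetch_url(
    "https://www.hydrogeninsight.com/production/breaking-us-reveals-the-seven-regional-hydrogen-hubs-to-receive-7bn-of-government-funding/2-1-1534596",
    config=my_config,
)
```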
