Skip to content

Commit

Permalink
warcserver/cdx query: filter improvements (#285)
Browse files Browse the repository at this point in the history
- pywb.utils.format: add query_to_dict() to convert a query string to a dict, collecting repeated values into a list for certain params
- support multiple values for 'filter' cdx server param (fixes #284)
- pywb.utils.format: add to_bool() to convert string/int to bool (eg. for query args)
- fuzzymatch: add 'allowFuzzy' (default to true) to allow disabling fuzzy matcher
- tests: fuzzymatcher: test disabling fuzzy matcher with allowFuzzy=0
- tests: cdx-server api: add multiple filter tests, with and without fuzzy matching
  • Loading branch information
ikreymer committed Jan 29, 2018
1 parent cd304cc commit 273b3ee
Show file tree
Hide file tree
Showing 6 changed files with 98 additions and 21 deletions.
40 changes: 37 additions & 3 deletions pywb/utils/format.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from six.moves.urllib.parse import quote
from six.moves.urllib.parse import quote, parse_qsl
import string


#=============================================================================
# ============================================================================
class ParamFormatter(string.Formatter):
def __init__(self, params, name='', prefix='param.'):
self.params = params
Expand Down Expand Up @@ -33,7 +33,7 @@ def get_value(self, key, args, kwargs):
return value


#=============================================================================
# =============================================================================
def res_template(template, params, **extra_params):
formatter = params.get('_formatter')
if not formatter:
Expand All @@ -49,3 +49,37 @@ def res_template(template, params, **extra_params):
return res


# =============================================================================
def to_bool(val):
    """Interpret ``val`` (e.g. a query arg) as a boolean.

    Any falsy value (``None``, ``''``, ``0``, ...) is ``False``.
    A non-empty string is ``False`` only if, ignoring case, it is one of
    ``'0'``, ``'false'``, ``'f'`` or ``'off'``; every other non-empty
    string is ``True``. Non-string values use normal truthiness.
    """
    if isinstance(val, str) and val:
        # only these spellings turn a non-empty string into False
        return val.lower() not in ('0', 'false', 'f', 'off')

    return bool(val)


# =============================================================================
def query_to_dict(query_str, multi=None):
    """Convert a query string into a dict of params.

    :param query_str: the raw query string, parsed with ``parse_qsl``
    :param multi: optional collection of param names allowed to hold
        multiple values
    :return: dict mapping each name to its value. For a name in ``multi``
        that occurs more than once, the value is a list of all values in
        order of appearance; if it occurs once, it stays a plain string.
        For every other name, the last occurrence wins.
    """
    pairs = parse_qsl(query_str)
    if not multi:
        return dict(pairs)

    result = {}
    for name, value in pairs:
        if name in multi and name in result:
            # repeated multi-value param: grow (or start) the list
            current = result[name]
            if isinstance(current, list):
                current.append(value)
            else:
                result[name] = [current, value]
        else:
            # first occurrence, or a single-value param (last one wins)
            result[name] = value

    return result


4 changes: 2 additions & 2 deletions pywb/warcserver/basewarcserver.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from pywb.warcserver.inputrequest import DirectWSGIInputRequest, POSTInputRequest
from pywb.utils.format import query_to_dict

from werkzeug.routing import Map, Rule
from werkzeug.exceptions import HTTPException
Expand All @@ -7,7 +8,6 @@
import traceback
import json

from six.moves.urllib.parse import parse_qsl
import six

JSON_CT = 'application/json; charset=utf-8'
Expand Down Expand Up @@ -60,7 +60,7 @@ def _add_simple_route(self, path, func):
def get_query_dict(self, environ):
    """Parse the request's ``QUERY_STRING`` into a dict of params.

    :param environ: mapping that may contain ``'QUERY_STRING'``
    :return: ``{}`` if the query string is missing or empty; otherwise
        the result of ``query_to_dict`` with ``'filter'`` declared as a
        multi-value param, so a repeated ``filter`` is returned as a list.
    """
    query_str = environ.get('QUERY_STRING')
    if not query_str:
        return {}

    # the superseded ``dict(parse_qsl(query_str))`` return is dropped:
    # it made this call unreachable and kept only the last 'filter' value
    return query_to_dict(query_str, multi=['filter'])

Expand Down
12 changes: 8 additions & 4 deletions pywb/warcserver/index/fuzzymatcher.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from warcio.utils import to_native_str

from pywb.utils.loaders import load_yaml_config
from pywb.utils.format import to_bool
from pywb import DEFAULT_RULES_FILE

import re
Expand Down Expand Up @@ -65,7 +67,7 @@ def parse_fuzzy_rule(self, rule):

return FuzzyRule(url_prefix, regex, replace_after, filter_str, match_type, find_all)

def get_fuzzy_match(self, urlkey, params):
def get_fuzzy_match(self, urlkey, url, params):
filters = set()
matched_rule = None

Expand Down Expand Up @@ -93,8 +95,6 @@ def get_fuzzy_match(self, urlkey, params):
if not matched_rule:
return None

url = params['url']

# support matching w/o query if no additional filters
# don't include trailing '?' if no filters and replace_after '?'
no_filters = (filters == {'urlkey:'}) and (matched_rule.replace_after == '?')
Expand Down Expand Up @@ -161,10 +161,14 @@ def get_fuzzy_iter(self, cdx_iter, index_source, params):
if found:
return

# if fuzzy matching disabled
if not to_bool(params.get('allowFuzzy', True)):
return

url = params['url']
urlkey = to_native_str(params['key'], 'utf-8')

res = self.get_fuzzy_match(urlkey, params)
res = self.get_fuzzy_match(urlkey, url, params)
if not res:
return

Expand Down
13 changes: 3 additions & 10 deletions pywb/warcserver/index/query.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from six.moves.urllib.parse import urlencode
from pywb.warcserver.index.cdxobject import CDXException
from pywb.utils.canonicalize import calc_search_range
from pywb.utils.format import to_bool


#=================================================================
Expand Down Expand Up @@ -128,17 +129,9 @@ def page_size(self):
def page_count(self):
return self._get_bool('showNumPages')

def _get_bool(self, name, def_val=False):
    """Return the param ``name`` from ``self.params`` as a boolean.

    :param name: key to look up in ``self.params``
    :param def_val: value used when the param is missing or empty;
        retained so callers passing a default keep working
    :return: ``bool(def_val)`` if the param is missing/empty, otherwise
        the value as converted by ``to_bool``
    """
    v = self.params.get(name)
    if not v:
        return bool(def_val)

    # single conversion path: the duplicated definition and the
    # hand-rolled int / 'true' parsing are replaced by to_bool()
    return to_bool(v)

def urlencode(self):
return urlencode(self.params, True)
8 changes: 8 additions & 0 deletions pywb/warcserver/index/test/test_fuzzymatcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,14 @@ def test_fuzzy_find_all_rule(self):

assert list(cdx_iter) == self.get_expected(url=actual_url, filters=filters)

def test_no_fuzzy_disabled(self):
    # with allowFuzzy set to 0 the lookup is expected to yield nothing
    requested = 'http://example.com/?_=123'
    actual = 'http://example.com/'

    query_params = self.get_params(requested, actual)
    query_params['allowFuzzy'] = 0

    results, _errs = self.fuzzy(self.source, query_params)
    assert list(results) == []

def test_no_fuzzy_custom_rule_video_id_diff(self):
url = 'http://youtube.com/get_video_info?a=b&html=true&___abc=123&video_id=ABCD&id=1234'
actual_url = 'http://youtube.com/get_video_info?a=d&html=true&___abc=125&video_id=ABCE&id=1234'
Expand Down
42 changes: 40 additions & 2 deletions tests/test_cdx_server_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def test_prefix_match(self):
suburls += 1
assert suburls > 0

def test_filters(self):
def test_filters_1(self):
"""
filter cdxes by mimetype and filename field, exact match.
"""
Expand All @@ -71,12 +71,50 @@ def test_filters(self):
assert resp.status_code == 200
assert resp.content_type == 'text/x-cdxj'

for l in resp.text.splitlines():
lines = resp.text.splitlines()
assert len(lines) > 0

for l in lines:
cdx = CDXObject(l.encode('utf-8'))
assert cdx['urlkey'] == 'org,iana)/_css/2013.1/screen.css'
assert cdx['timestamp'] == '20140127171239'
assert cdx['mime'] == 'warc/revisit'
assert cdx['filename'] == 'dupes.warc.gz'

def test_filters_2_no_fuzzy_no_match(self):
    """
    two filters with fuzzy matching turned off: expect an empty result
    """
    filters = ('!mime:warc/revisit', 'filename:dupes.warc.gz')
    resp = self.query('http://www.iana.org/_css/2013.1/screen.css',
                      filter=filters,
                      allowFuzzy='false')

    assert resp.status_code == 200
    assert resp.content_type == 'text/x-cdxj'

    assert len(resp.text.splitlines()) == 0

def test_filters_3(self):
    """
    two negated filters (mime and filename): expect exactly one cdx line
    """
    filters = ('!mime:warc/revisit', '!filename:dupes.warc.gz')
    resp = self.query('http://www.iana.org/_css/2013.1/screen.css',
                      filter=filters)

    assert resp.status_code == 200
    assert resp.content_type == 'text/x-cdxj'

    cdx_lines = resp.text.splitlines()
    assert len(cdx_lines) == 1

    for line in cdx_lines:
        cdx = CDXObject(line.encode('utf-8'))
        assert cdx['urlkey'] == 'org,iana)/_css/2013.1/screen.css'
        assert cdx['timestamp'] == '20140126200625'
        assert cdx['mime'] == 'text/css'
        assert cdx['filename'] == 'iana.warc.gz'

def test_limit(self):
resp = self.query('http://www.iana.org/_css/2013.1/screen.css',
limit='1')
Expand Down

0 comments on commit 273b3ee

Please sign in to comment.