warcserver/cdx query: filter improvements (#285)

- pywb.utils.format: add query_to_dict() to convert query string with support for list for certain params - support multiple values for 'filter' cdx server param (fixes #284) - pywb.utils.format: add to_bool() to convert string/int to bool (eg. for query args) - fuzzymatch: add 'allowFuzzy' (default to true) to allow disabling fuzzy matcher - tests: fuzzymatcher: test disabling fuzzy matcher with allowFuzzy=0 - tests: cdx-server api: add multiple filter tests, with and without fuzzy matching
webrecorder · Jan 29, 2018 · 273b3ee · 273b3ee
1 parent cd304cc
commit 273b3ee
Show file tree

Hide file tree

Showing 6 changed files with 98 additions and 21 deletions.
diff --git a/pywb/utils/format.py b/pywb/utils/format.py
@@ -1,8 +1,8 @@
-from six.moves.urllib.parse import quote
+from six.moves.urllib.parse import quote, parse_qsl
 import string
 
 
-#=============================================================================
+# ============================================================================
 class ParamFormatter(string.Formatter):
     def __init__(self, params, name='', prefix='param.'):
         self.params = params
@@ -33,7 +33,7 @@ def get_value(self, key, args, kwargs):
         return value
 
 
-#=============================================================================
+# =============================================================================
 def res_template(template, params, **extra_params):
     formatter = params.get('_formatter')
     if not formatter:
@@ -49,3 +49,37 @@ def res_template(template, params, **extra_params):
     return res
 
 
+# =============================================================================
+def to_bool(val):
+    if not val:
+        return False
+
+    if isinstance(val, str):
+        return val.lower() not in ('0', 'false', 'f', 'off')
+    else:
+        return bool(val)
+
+
+# =============================================================================
+def query_to_dict(query_str, multi=None):
+    pairlist = parse_qsl(query_str)
+    if not multi:
+        return dict(pairlist)
+
+    obj = {}
+    for n, v in pairlist:
+        if n not in multi:
+            obj[n] = v
+            continue
+
+        # make_list
+        if n not in obj:
+            obj[n] = v
+        elif isinstance(obj[n], list):
+            obj[n].append(v)
+        else:
+            obj[n] = [obj[n], v]
+
+    return obj
+
+
diff --git a/pywb/warcserver/basewarcserver.py b/pywb/warcserver/basewarcserver.py
@@ -1,4 +1,5 @@
 from pywb.warcserver.inputrequest import DirectWSGIInputRequest, POSTInputRequest
+from pywb.utils.format import query_to_dict
 
 from werkzeug.routing import Map, Rule
 from werkzeug.exceptions import HTTPException
@@ -7,7 +8,6 @@
 import traceback
 import json
 
-from six.moves.urllib.parse import parse_qsl
 import six
 
 JSON_CT = 'application/json; charset=utf-8'
@@ -60,7 +60,7 @@ def _add_simple_route(self, path, func):
     def get_query_dict(self, environ):
         query_str = environ.get('QUERY_STRING')
         if query_str:
-            return dict(parse_qsl(query_str))
+            return query_to_dict(query_str, multi=['filter'])
         else:
             return {}
 

diff --git a/pywb/warcserver/index/fuzzymatcher.py b/pywb/warcserver/index/fuzzymatcher.py
@@ -1,5 +1,7 @@
 from warcio.utils import to_native_str
+
 from pywb.utils.loaders import load_yaml_config
+from pywb.utils.format import to_bool
 from pywb import DEFAULT_RULES_FILE
 
 import re
@@ -65,7 +67,7 @@ def parse_fuzzy_rule(self, rule):
 
         return FuzzyRule(url_prefix, regex, replace_after, filter_str, match_type, find_all)
 
-    def get_fuzzy_match(self, urlkey, params):
+    def get_fuzzy_match(self, urlkey, url, params):
         filters = set()
         matched_rule = None
 
@@ -93,8 +95,6 @@ def get_fuzzy_match(self, urlkey, params):
         if not matched_rule:
             return None
 
-        url = params['url']
-
         # support matching w/o query if no additional filters
         # don't include trailing '?' if no filters and replace_after '?'
         no_filters = (filters == {'urlkey:'}) and (matched_rule.replace_after == '?')
@@ -161,10 +161,14 @@ def get_fuzzy_iter(self, cdx_iter, index_source, params):
         if found:
             return
 
+        # if fuzzy matching disabled
+        if not to_bool(params.get('allowFuzzy', True)):
+            return
+
         url = params['url']
         urlkey = to_native_str(params['key'], 'utf-8')
 
-        res = self.get_fuzzy_match(urlkey, params)
+        res = self.get_fuzzy_match(urlkey, url, params)
         if not res:
             return
 

diff --git a/pywb/warcserver/index/query.py b/pywb/warcserver/index/query.py
@@ -1,6 +1,7 @@
 from six.moves.urllib.parse import urlencode
 from pywb.warcserver.index.cdxobject import CDXException
 from pywb.utils.canonicalize import calc_search_range
+from pywb.utils.format import to_bool
 
 
 #=================================================================
@@ -128,17 +129,9 @@ def page_size(self):
     def page_count(self):
         return self._get_bool('showNumPages')
 
-    def _get_bool(self, name, def_val=False):
+    def _get_bool(self, name):
         v = self.params.get(name)
-        if v:
-            try:
-                v = int(v)
-            except ValueError as ex:
-                v = (v.lower() == 'true')
-        else:
-            v = def_val
-
-        return bool(v)
+        return to_bool(v)
 
     def urlencode(self):
         return urlencode(self.params, True)
diff --git a/pywb/warcserver/index/test/test_fuzzymatcher.py b/pywb/warcserver/index/test/test_fuzzymatcher.py
@@ -133,6 +133,14 @@ def test_fuzzy_find_all_rule(self):
 
         assert list(cdx_iter) == self.get_expected(url=actual_url, filters=filters)
 
+    def test_no_fuzzy_disabled(self):
+        url = 'http://example.com/?_=123'
+        actual_url = 'http://example.com/'
+        params = self.get_params(url, actual_url)
+        params['allowFuzzy'] = 0
+        cdx_iter, errs = self.fuzzy(self.source, params)
+        assert list(cdx_iter) == []
+
     def test_no_fuzzy_custom_rule_video_id_diff(self):
         url = 'http://youtube.com/get_video_info?a=b&html=true&___abc=123&video_id=ABCD&id=1234'
         actual_url = 'http://youtube.com/get_video_info?a=d&html=true&___abc=125&video_id=ABCE&id=1234'

diff --git a/tests/test_cdx_server_app.py b/tests/test_cdx_server_app.py
@@ -61,7 +61,7 @@ def test_prefix_match(self):
                 suburls += 1
         assert suburls > 0
 
-    def test_filters(self):
+    def test_filters_1(self):
         """
         filter cdxes by mimetype and filename field, exact match.
         """
@@ -71,12 +71,50 @@ def test_filters(self):
         assert resp.status_code == 200
         assert resp.content_type == 'text/x-cdxj'
 
-        for l in resp.text.splitlines():
+        lines = resp.text.splitlines()
+        assert len(lines) > 0
+
+        for l in lines:
             cdx = CDXObject(l.encode('utf-8'))
             assert cdx['urlkey'] == 'org,iana)/_css/2013.1/screen.css'
+            assert cdx['timestamp'] == '20140127171239'
             assert cdx['mime'] == 'warc/revisit'
             assert cdx['filename'] == 'dupes.warc.gz'
 
+    def test_filters_2_no_fuzzy_no_match(self):
+        """
+        two filters, disable fuzzy matching
+        """
+        resp = self.query('http://www.iana.org/_css/2013.1/screen.css',
+                     filter=('!mime:warc/revisit', 'filename:dupes.warc.gz'),
+                     allowFuzzy='false')
+
+        assert resp.status_code == 200
+        assert resp.content_type == 'text/x-cdxj'
+
+        lines = resp.text.splitlines()
+        assert len(lines) == 0
+
+    def test_filters_3(self):
+        """
+        two negated filters: exclude revisit records and the dupes.warc.gz file.
+        """
+        resp = self.query('http://www.iana.org/_css/2013.1/screen.css',
+                     filter=('!mime:warc/revisit', '!filename:dupes.warc.gz'))
+
+        assert resp.status_code == 200
+        assert resp.content_type == 'text/x-cdxj'
+
+        lines = resp.text.splitlines()
+        assert len(lines) == 1
+
+        for l in lines:
+            cdx = CDXObject(l.encode('utf-8'))
+            assert cdx['urlkey'] == 'org,iana)/_css/2013.1/screen.css'
+            assert cdx['timestamp'] == '20140126200625'
+            assert cdx['mime'] == 'text/css'
+            assert cdx['filename'] == 'iana.warc.gz'
+
     def test_limit(self):
         resp = self.query('http://www.iana.org/_css/2013.1/screen.css',
                      limit='1')