Merge pull request #66 from ubclaunchpad/64-id-api

Closes #64, closes #70: New getdocument API, improved search API and queries
ubclaunchpad · Nov 15, 2017 · 68b9da5 · 68b9da5
2 parents c7420bb + 0727d0d
commit 68b9da5
Show file tree

Hide file tree

Showing 21 changed files with 419 additions and 194 deletions.
diff --git a/.gitignore b/.gitignore
@@ -68,4 +68,7 @@ env.bak/
 venv.bak/
 
 # mypy
-.mypy_cache/
+.mypy_cache/
+
+# nltk
+averaged_perceptron_tagger/
diff --git a/.travis.yml b/.travis.yml
@@ -13,6 +13,7 @@ install:
   - git clone https://github.com/ubclaunchpad/sleuth-frontend.git ../sleuth-frontend
   - docker-compose up -d solr
   - pip install -r requirements.txt
+  - python scripts/nltk_setup.py
   - bash scripts/wait.sh "Solr" "curl localhost:8983/solr"
 
 before_script:

diff --git a/README.md b/README.md
@@ -19,11 +19,12 @@ Getting Started: [Docker](https://docs.docker.com/get-started/),
 
 ## Installation
 
+- Clone this repository and the [Sleuth frontend](https://github.com/ubclaunchpad/sleuth-frontend) into the same directory
 - Install Docker
 - Run
 
 ```Shell
-$ docker-compose up
+$ docker-compose up --build
 ```
 
 - Once containers have started you can `exec` into `bash` in your `web` container and configure a Django admin user.
@@ -43,6 +44,7 @@ root@57d91373cdca:/home/sleuth# python3 manage.py createsuperuser
 
 - The base url for your Django instance should be http://localhost:8000. 
 - To access the Django admin interface make sure you have completed the steps listed above and go to http://localhost:8000/admin.
+- To test the backend API, go to http://localhost:8000/api/[ENDPOINT]/?[PARAMS]
 
 ### Accessing the Sleuth Front-end App
 
@@ -66,7 +68,7 @@ $ bash cd sleuth_crawler/scraper && scrapy crawl broad_crawler
 
 At the moment the crawler never really seems to stop, so you will likely have to force it to quit when you have sufficient data entries.
 
-To empty the a core, go to:
+To empty a core, go to:
 ```
 http://localhost:8983/solr/[CORE_NAME_HERE]/update?stream.body=<delete><query>*:*</query></delete>&commit=true
 ```
diff --git a/scripts/nltk_setup.py b/scripts/nltk_setup.py
@@ -0,0 +1,7 @@
+'''
+Download NLTK data
+'''
+
+import nltk
+nltk.download('punkt')
+nltk.download('averaged_perceptron_tagger')
diff --git a/scripts/start-web.sh b/scripts/start-web.sh
@@ -5,5 +5,6 @@ echo "Waiting for Solr to boot"
 bash scripts/wait.sh "Solr" "curl -s solr:8983/solr"
 echo "Waiting for Postgres to boot"
 bash scripts/wait.sh "Postgres" "pg_isready -h db -p 5432"
+python scripts/nltk_setup.py
 python manage.py migrate
 python manage.py runserver 0.0.0.0:8000
diff --git a/sleuth_backend/solr/models.py b/sleuth_backend/solr/models.py
@@ -52,7 +52,7 @@ class GenericPage(SolrDocument):
         "siteName": "",
         "description": "",
         "content": "",
-        "children": []
+        "links": []
     }
 
     def __init__(self, **kwargs):

diff --git a/sleuth_backend/solr/query.py b/sleuth_backend/solr/query.py
@@ -1,20 +1,19 @@
-#import nltk
+'''
+Solr query assembling
+'''
 
-"""
-Solr queries
-"""
+import nltk
 
 class Query(object):
     """
     This object allows component-based building and manipulation of Solr query strings.
     See class for available query manipulations.
 
     Params:
-        query_str (str): the desired query
-        as_phrase (str): should this query be formatted as a phrase (default=True)
-        fields   (dict): the Solr fields to apply this query to (default=None) 
-        proximity (int): proximity for parts of the search phrase (default=None)
-                         only works if as_phrase=True
+        query_str  (str): the desired query
+        as_phrase (bool): should this query be formatted as a phrase (default=True)
+        escape    (bool): should special characters be escaped from the phrase (default=False)
+        sanitize  (bool): should query be stripped of trivial words (default=False)
 
     Example Usage:
         my_query = Query(query_str)
@@ -23,20 +22,20 @@ class Query(object):
         return str(my_query)                # return query string
     """
 
-    def __init__(self, query_str, as_phrase=True, fields=None, proximity=None):
+    def __init__(self, query_str, as_phrase=True, escape=False, sanitize=False):
         """
         Initialize a query
         """
         self.query_str = query_str
-        self._sanitize()
+
+        if escape:
+            self._escape_special_chars()
+
+        if sanitize:
+            self._sanitize()
 
         if as_phrase:
-            self._as_phrase(proximity)
-
-        if fields:
-            if type(fields) is not dict:
-                raise ValueError('Fields must be a dict of field names and boost factors')
-            self._for_fields(fields)
+            self._as_phrase()
 
     def __str__(self):
         """
@@ -68,11 +67,32 @@ def select_require(self, terms):
         """
         for term in terms:
             self.query_str += '+{}'.format(term)
-
-    def _for_fields(self, fields):
+
+    def for_single_field(self, field):
+        '''
+        Apply given field to query
+        '''
+        self.query_str = '{}:{}'.format(field, self.query_str)
+
+    def fuzz(self, factor):
+        '''
+        "Fuzzes" the query by a given factor where 0 <= factor <=2.
+        Acts differently depending on whether the query is a phrase or not.
+        For phrases, this factor determines how far apart the words of a
+        phrase can be found.
+        For terms, this factor determines how many insertions/deletions will
+        still return a match.
+        '''
+        if factor < 0 or factor > 2:
+            raise ValueError('Factor must be between 0 and 2.')
+        self.query_str = '{}~{}'.format(self.query_str, factor)
+
+    def for_fields(self, fields):
         """
         Apply given fields to query
         """
+        if type(fields) is not dict:
+            raise ValueError('Fields must be a dict of field names and boost factors')
         self._for_fields_helper(self.query_str, list(fields.items()))
 
     def _for_fields_helper(self, query_str, fields):
@@ -86,19 +106,39 @@ def _for_fields_helper(self, query_str, fields):
         self.select_or(query)
         self._for_fields_helper(query_str, fields[1:])
 
-    def _as_phrase(self, proximity):
+    def _as_phrase(self):
         """
         Format query as an entire phrase by wrapping it in double quotes.
         """
         self.query_str = '"{}"'.format(self.query_str)
-        if proximity:
-            self.query_str = '{}~{}'.format(self.query_str, proximity)
 
     def _sanitize(self):
-        """
+        '''
         Trim nonessential words such as 'and', 'or', 'for'
-        """
-        # TODO: trim useless words like 'and', 'or', 'for' 
-        # from query if as_phrase is false using NLTK POS tagger
-        self.query_str = ' '.join(self.query_str.split())
+        Parts of Speech types:
+        http://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
+        '''
+        tags_to_keep = [
+            'NN', 'NNS', 'NNP', 'NNPS',       # noun types
+            'VB', 'VBG', 'VBN', 'VBP', 'VBZ', # verb types
+            'JJ', 'JJR', 'JJS',               # adjective types
+            'RB', 'RBR', 'RBS',               # adverbs
+        ]
+        tokens = nltk.word_tokenize(self.query_str)
+        tags = nltk.pos_tag(tokens)
+        words_list = []
+        for tag in tags:
+            if tag[1] in tags_to_keep:
+                words_list.append(tag[0])
+        self.query_str = ' '.join(words_list)
+
+    def _escape_special_chars(self):
+        '''
+        Escape special characters that interfere with Solr's query parser.
+        Ideally only use on queries where as_phrase=False, since special
+        characters in phrases do not upset Solr.
+        '''
+        special_chars = ['!', '(', ')', '{', '}', '[', ']', '^', '"', '~', '*', '?', ':', '|', '&']
+        for c in special_chars:
+            self.query_str = self.query_str.replace(c, '\\'+c)
diff --git a/sleuth_backend/tests/test_models.py b/sleuth_backend/tests/test_models.py
@@ -16,7 +16,7 @@ def create_page(self, t):
                 "siteName": "testsite",
                 "content": "testcontent",
                 "description": "testblurb",
-                "children": []
+                "links": []
             }
             return (args, GenericPage(**args))
         if t == "courseItem":

diff --git a/sleuth_backend/tests/test_query.py b/sleuth_backend/tests/test_query.py
@@ -7,35 +7,34 @@ class TestQuery(TestCase):
     Test the Solr query object
     """
 
-    def test_basic_init(self):
+    def test_init(self):
         """
         Test initializing a Query
         as_phrase and not as_phrase
-        with and without proximity parameter
         """
         query_str = "hello"
         query = Query(query_str)
         self.assertEqual('"hello"', str(query))
 
-        query = Query(query_str, proximity=5)
-        self.assertEqual('"hello"~5', str(query))
-
         query = Query(query_str, as_phrase=False)
         self.assertEqual('hello', str(query))
 
-    def test_init_fields(self):
+        query = Query('wow:wow()', escape=True)
+        self.assertEqual('"wow\:wow\(\)"', str(query))
+
+    def test_for_fields(self):
         """
-        Test initializing a Query with fields applied
+        Test applying fields to a Query
         """
-        query_str = "hello bruno"
         fields = {'id':1, 'name':10}
-        query = Query(query_str, fields=fields)
+        query = Query("hello bruno")
+        query.for_fields(fields)
         self.assertEqual(
             '"hello bruno" OR id:("hello bruno")^1 OR name:("hello bruno")^10',
             str(query)
         )
         not_dict = "not clean"
-        self.failUnlessRaises(ValueError, Query, query_str, fields=not_dict)
+        self.failUnlessRaises(ValueError, query.for_fields, not_dict)
 
     def test_boost_importance(self):
         """
@@ -66,3 +65,27 @@ def test_selects(self):
         terms = ["hack", "wack"]
         query1.select_require(terms)
         self.assertEqual('"hello bruno"+hack+wack', str(query1))
+
+    def test_for_single_field(self):
+        '''
+        Test applying a single field to a query
+        '''
+        query = Query("hello bruno")
+        query.for_single_field('id')
+        self.assertEqual('id:"hello bruno"', str(query))
+
+    def test_fuzz(self):
+        '''
+        Test applying a fuzz factor to a query
+        '''
+        query = Query("hello bruno")
+        query.fuzz(2)
+        self.assertEqual('"hello bruno"~2', str(query))
+        self.failUnlessRaises(ValueError, query.fuzz, 7)
+
+    def test_sanitation(self):
+        '''
+        Test query init with sanitize=True
+        '''
+        query = Query("The quick brown fox jumped over 12 lazy dogs", sanitize=True)
+        print(str(query))