Skip to content

Commit

Permalink
Merge pull request #66 from ubclaunchpad/64-id-api
Browse files Browse the repository at this point in the history
Closes #64, closes #70: New getdocument API, improved search API and queries
  • Loading branch information
bfbachmann committed Nov 15, 2017
2 parents c7420bb + 0727d0d commit 68b9da5
Show file tree
Hide file tree
Showing 21 changed files with 419 additions and 194 deletions.
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -68,4 +68,7 @@ env.bak/
venv.bak/

# mypy
.mypy_cache/
.mypy_cache/

# nltk
averaged_perceptron_tagger/
1 change: 1 addition & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ install:
- git clone https://github.com/ubclaunchpad/sleuth-frontend.git ../sleuth-frontend
- docker-compose up -d solr
- pip install -r requirements.txt
- python scripts/nltk_setup.py
- bash scripts/wait.sh "Solr" "curl localhost:8983/solr"

before_script:
Expand Down
6 changes: 4 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,12 @@ Getting Started: [Docker](https://docs.docker.com/get-started/),

## Installation

- Clone this repository and the [Sleuth frontend](https://github.com/ubclaunchpad/sleuth-frontend) into the same directory
- Install Docker
- Run

```Shell
$ docker-compose up
$ docker-compose up --build
```

- Once containers have started you can `exec` into `bash` in your `web` container and configure a Django admin user.
Expand All @@ -43,6 +44,7 @@ root@57d91373cdca:/home/sleuth# python3 manage.py createsuperuser

- The base url for your Django instance should be http://localhost:8000.
- To access the Django admin interface, make sure you have completed the steps listed above and go to http://localhost:8000/admin.
- To test the backend API, go to http://localhost:8000/api/[ENDPOINT]/?[PARAMS]

### Accessing the Sleuth Front-end App

Expand All @@ -66,7 +68,7 @@ $ bash cd sleuth_crawler/scraper && scrapy crawl broad_crawler

At the moment the crawler never really seems to stop, so you will likely have to force it to quit when you have sufficient data entries.

To empty the a core, go to:
To empty a core, go to:
```
http://localhost:8983/solr/[CORE_NAME_HERE]/update?stream.body=<delete><query>*:*</query></delete>&commit=true
```
7 changes: 7 additions & 0 deletions scripts/nltk_setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
'''
Fetch the NLTK data packages this project depends on.
Run once after installing requirements (e.g. in CI or on container start).
'''

import nltk

# Each entry is the name of an NLTK data package to download.
for package in ('punkt', 'averaged_perceptron_tagger'):
    nltk.download(package)
1 change: 1 addition & 0 deletions scripts/start-web.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,6 @@ echo "Waiting for Solr to boot"
bash scripts/wait.sh "Solr" "curl -s solr:8983/solr"
echo "Waiting for Postgres to boot"
bash scripts/wait.sh "Postgres" "pg_isready -h db -p 5432"
python scripts/nltk_setup.py
python manage.py migrate
python manage.py runserver 0.0.0.0:8000
2 changes: 1 addition & 1 deletion sleuth_backend/solr/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ class GenericPage(SolrDocument):
"siteName": "",
"description": "",
"content": "",
"children": []
"links": []
}

def __init__(self, **kwargs):
Expand Down
94 changes: 67 additions & 27 deletions sleuth_backend/solr/query.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,19 @@
#import nltk
'''
Solr query assembling
'''

"""
Solr queries
"""
import nltk

class Query(object):
"""
This object allows component-based building and manipulation of Solr query strings.
See class for available query manipulations.
Params:
query_str (str): the desired query
as_phrase (str): should this query be formatted as a phrase (default=True)
fields (dict): the Solr fields to apply this query to (default=None)
proximity (int): proximity for parts of the search phrase (default=None)
only works if as_phrase=True
query_str (str): the desired query
as_phrase (bool): should this query be formatted as a phrase (default=True)
escape (bool): should special characters be escaped from the phrase (default=False)
sanitize (bool): should query be stripped of trivial words (default=False)
Example Usage:
my_query = Query(query_str)
Expand All @@ -23,20 +22,20 @@ class Query(object):
return str(my_query) # return query string
"""

def __init__(self, query_str, as_phrase=True, fields=None, proximity=None):
def __init__(self, query_str, as_phrase=True, escape=False, sanitize=False):
"""
Initialize a query
"""
self.query_str = query_str
self._sanitize()

if escape:
self._escape_special_chars()

if sanitize:
self._sanitize()

if as_phrase:
self._as_phrase(proximity)

if fields:
if type(fields) is not dict:
raise ValueError('Fields must be a dict of field names and boost factors')
self._for_fields(fields)
self._as_phrase()

def __str__(self):
"""
Expand Down Expand Up @@ -68,11 +67,32 @@ def select_require(self, terms):
"""
for term in terms:
self.query_str += '+{}'.format(term)

def _for_fields(self, fields):

def for_single_field(self, field):
    '''
    Restrict the current query to one field by prefixing the
    query string with "<field>:".
    '''
    scoped = '{name}:{query}'.format(name=field, query=self.query_str)
    self.query_str = scoped

def fuzz(self, factor):
    '''
    "Fuzzes" the query by a given factor where 0 <= factor <= 2.
    Acts differently depending on whether the query is a phrase or not.
    For phrases, this factor determines how far apart the words of the
    phrase can be found.
    For terms, this factor determines how many insertions/deletions will
    still return a match.

    Params:
        factor (number): fuzz factor, appended to the query as '~factor'
    Raises:
        ValueError: if factor is not between 0 and 2 (inclusive)
    '''
    # Written as a positive range test so that values which compare false
    # against every bound (e.g. NaN) are rejected instead of slipping through.
    if not 0 <= factor <= 2:
        raise ValueError('Factor must be between 0 and 2.')
    self.query_str = '{}~{}'.format(self.query_str, factor)

def for_fields(self, fields):
    """
    Apply given fields to query

    Params:
        fields (dict): field names mapped to boost factors
    Raises:
        ValueError: if fields is not a dict
    """
    # isinstance (rather than an exact type match) also admits dict
    # subclasses such as OrderedDict.
    if not isinstance(fields, dict):
        raise ValueError('Fields must be a dict of field names and boost factors')
    self._for_fields_helper(self.query_str, list(fields.items()))

def _for_fields_helper(self, query_str, fields):
Expand All @@ -86,19 +106,39 @@ def _for_fields_helper(self, query_str, fields):
self.select_or(query)
self._for_fields_helper(query_str, fields[1:])

def _as_phrase(self, proximity):
def _as_phrase(self):
"""
Format query as entire phrase, and optionally set proximity for
words within the phrase.
"""
self.query_str = '"{}"'.format(self.query_str)
if proximity:
self.query_str = '{}~{}'.format(self.query_str, proximity)

def _sanitize(self):
"""
'''
Trim nonessential words such as 'and', 'or', 'for'
"""
# TODO: trim useless words like 'and', 'or', 'for'
# from query if as_phrase is false using NLTK POS tagger
self.query_str = ' '.join(self.query_str.split())
Parts of Speech types:
http://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
'''
tags_to_keep = [
'NN', 'NNS', 'NNP', 'NNPS', # noun types
'VB', 'VBG', 'VBN', 'VBP', 'VBZ', # verb types
'JJ', 'JJR', 'JJS', # adjective types
'RB', 'RBR', 'RBS', # adverbs
]
tokens = nltk.word_tokenize(self.query_str)
tags = nltk.pos_tag(tokens)
words_list = []
for tag in tags:
if tag[1] in tags_to_keep:
words_list.append(tag[0])
self.query_str = ' '.join(words_list)

def _escape_special_chars(self):
    '''
    Escape special characters that interfere with Solr's query parser
    by prefixing each of them with a backslash.
    Ideally only use on queries where as_phrase=False, since special
    characters in phrases do not upset Solr.
    '''
    # The backslash is itself special to the parser, so it is escaped too,
    # along with '+', '-' and '/'. A single translate() pass guarantees
    # that backslashes inserted as escapes are never escaped a second time.
    special_chars = '\\+-!(){}[]^"~*?:|&/'
    escapes = {ord(c): '\\' + c for c in special_chars}
    self.query_str = self.query_str.translate(escapes)
2 changes: 1 addition & 1 deletion sleuth_backend/tests/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def create_page(self, t):
"siteName": "testsite",
"content": "testcontent",
"description": "testblurb",
"children": []
"links": []
}
return (args, GenericPage(**args))
if t == "courseItem":
Expand Down
43 changes: 33 additions & 10 deletions sleuth_backend/tests/test_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,35 +7,34 @@ class TestQuery(TestCase):
Test the Solr query object
"""

def test_basic_init(self):
def test_init(self):
"""
Test initializing a Query
as_phrase and not as_phrase
with and without proximity parameter
"""
query_str = "hello"
query = Query(query_str)
self.assertEqual('"hello"', str(query))

query = Query(query_str, proximity=5)
self.assertEqual('"hello"~5', str(query))

query = Query(query_str, as_phrase=False)
self.assertEqual('hello', str(query))

def test_init_fields(self):
query = Query('wow:wow()', escape=True)
self.assertEqual('"wow\:wow\(\)"', str(query))

def test_for_fields(self):
"""
Test initializing a Query with fields applied
Test applying fields to a Query
"""
query_str = "hello bruno"
fields = {'id':1, 'name':10}
query = Query(query_str, fields=fields)
query = Query("hello bruno")
query.for_fields(fields)
self.assertEqual(
'"hello bruno" OR id:("hello bruno")^1 OR name:("hello bruno")^10',
str(query)
)
not_dict = "not clean"
self.failUnlessRaises(ValueError, Query, query_str, fields=not_dict)
self.failUnlessRaises(ValueError, query.for_fields, not_dict)

def test_boost_importance(self):
"""
Expand Down Expand Up @@ -66,3 +65,27 @@ def test_selects(self):
terms = ["hack", "wack"]
query1.select_require(terms)
self.assertEqual('"hello bruno"+hack+wack', str(query1))

def test_for_single_field(self):
    '''
    A phrase query scoped to one field is rendered as field:"phrase"
    '''
    expected = 'id:"hello bruno"'
    query = Query("hello bruno")
    query.for_single_field('id')
    self.assertEqual(expected, str(query))

def test_fuzz(self):
    '''
    Test applying a fuzz factor to a query
    '''
    query = Query("hello bruno")
    query.fuzz(2)
    self.assertEqual('"hello bruno"~2', str(query))
    # assertRaises replaces the deprecated failUnlessRaises alias,
    # which newer Python versions no longer provide.
    with self.assertRaises(ValueError):
        query.fuzz(7)
    # A rejected factor must leave the query untouched.
    self.assertEqual('"hello bruno"~2', str(query))

def test_sanitation(self):
    '''
    Test query init with sanitize=True
    '''
    query = Query("The quick brown fox jumped over 12 lazy dogs", sanitize=True)
    # The query is a phrase by default, so drop the surrounding quotes
    # before looking at the individual words that survived.
    words = str(query).strip('"').split()
    # Nouns are kept by the sanitizer.
    self.assertIn('fox', words)
    self.assertIn('dogs', words)
    # Determiners, prepositions and numerals are not in the kept tag set.
    # NOTE(review): relies on the NLTK tagger labelling these words as
    # DT / IN / CD respectively - confirm against the installed model.
    self.assertNotIn('The', words)
    self.assertNotIn('over', words)
    self.assertNotIn('12', words)
Loading

0 comments on commit 68b9da5

Please sign in to comment.