Skip to content

Commit

Permalink
NLTK imports, query sanitation
Browse files Browse the repository at this point in the history
  • Loading branch information
bobheadxi committed Nov 11, 2017
1 parent 4f29284 commit a718bee
Show file tree
Hide file tree
Showing 6 changed files with 49 additions and 14 deletions.
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -68,4 +68,7 @@ env.bak/
venv.bak/

# mypy
.mypy_cache/
.mypy_cache/

# nltk
averaged_perceptron_tagger/
7 changes: 7 additions & 0 deletions scripts/nltk_setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
'''
Fetch the NLTK resources 'punkt' and 'averaged_perceptron_tagger'
'''

import nltk

# One download call per resource, issued in the listed order.
for resource in ('punkt', 'averaged_perceptron_tagger'):
    nltk.download(resource)
1 change: 1 addition & 0 deletions scripts/start-web.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,6 @@ echo "Waiting for Solr to boot"
bash scripts/wait.sh "Solr" "curl -s solr:8983/solr"
echo "Waiting for Postgres to boot"
bash scripts/wait.sh "Postgres" "pg_isready -h db -p 5432"
python scripts/nltk_setup.py
python manage.py migrate
python manage.py runserver 0.0.0.0:8000
32 changes: 24 additions & 8 deletions sleuth_backend/solr/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
Solr query assembling
'''

#import nltk
import nltk

class Query(object):
"""
Expand All @@ -13,6 +13,7 @@ class Query(object):
query_str (str): the desired query
as_phrase (bool): should this query be formatted as a phrase (default=True)
escape (bool): should special characters be escaped from the phrase (default=False)
sanitize (bool): should query be stripped of trivial words (default=False)
Example Usage:
my_query = Query(query_str)
Expand All @@ -21,16 +22,18 @@ class Query(object):
return str(my_query) # return query string
"""

def __init__(self, query_str, as_phrase=True, escape=False):
def __init__(self, query_str, as_phrase=True, escape=False, sanitize=False):
"""
Initialize a query
"""
self.query_str = query_str
self._sanitize()

if escape:
self._escape_special_chars()

if sanitize:
self._sanitize()

if as_phrase:
self._as_phrase()

Expand Down Expand Up @@ -111,12 +114,25 @@ def _as_phrase(self):
self.query_str = '"{}"'.format(self.query_str)

def _sanitize(self):
    '''
    Trim nonessential words such as 'and', 'or', 'for'

    The query string is tokenized and tagged with the NLTK
    part-of-speech tagger. Only tokens tagged as a noun, verb,
    adjective or adverb are kept; they are re-joined with single
    spaces and stored back into self.query_str.

    Parts of Speech types:
    http://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
    '''
    tags_to_keep = {
        'NN', 'NNS', 'NNP', 'NNPS',                # noun types
        'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ',   # verb types (VBD = past tense)
        'JJ', 'JJR', 'JJS',                        # adjective types
        'RB', 'RBR', 'RBS',                        # adverbs
    }
    # pos_tag yields (token, tag) pairs in the original token order.
    tagged_words = nltk.pos_tag(nltk.word_tokenize(self.query_str))
    self.query_str = ' '.join(
        word for word, tag in tagged_words if tag in tags_to_keep
    )

def _escape_special_chars(self):
'''
Expand Down
7 changes: 7 additions & 0 deletions sleuth_backend/tests/test_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,3 +82,10 @@ def test_fuzz(self):
query.fuzz(2)
self.assertEqual('"hello bruno"~2', str(query))
self.failUnlessRaises(ValueError, query.fuzz, 7)

def test_sanitation(self):
    '''
    Test query init with sanitize=True
    '''
    query = Query("The quick brown fox jumped over 12 lazy dogs", sanitize=True)
    # Drop the phrase quotes added by the default as_phrase=True
    # so individual words can be checked.
    kept_words = str(query).strip('"').split()
    # Determiners and prepositions are not in the kept tag set.
    self.assertNotIn('The', kept_words)
    self.assertNotIn('over', kept_words)
    # Nouns carry the meaning of the query and must survive.
    self.assertIn('fox', kept_words)
    self.assertIn('dogs', kept_words)
11 changes: 6 additions & 5 deletions sleuth_backend/views/views_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,15 +51,16 @@ def build_search_query(core, query_str, base_kwargs):
if core == "genericPage":
fields = {
'id': 1,
'siteName': 8,
'name': 8,
'siteName': 5,
'description': 5,
'content': 2
'content': 8
}
query = Query(query_str)
query.fuzz(2)
terms_query = Query(query_str, as_phrase=False, escape=True)
terms_query.fuzz(2)
query.boost_importance(5)
terms_query = Query(query_str, as_phrase=False, escape=True, sanitize=True)
terms_query.fuzz(1)
terms_query.for_fields(fields)
query.select_or(terms_query)
kwargs['default_field'] = 'content'
Expand All @@ -74,7 +75,7 @@ def build_search_query(core, query_str, base_kwargs):
}
query = Query(query_str)
query.fuzz(2)
terms_query = Query(query_str, as_phrase=False, escape=True)
terms_query = Query(query_str, as_phrase=False, escape=True, sanitize=True)
terms_query.for_fields(fields)
query.select_or(terms_query)
kwargs['default_field'] = 'name'
Expand Down

0 comments on commit a718bee

Please sign in to comment.