This repository has been archived by the owner on May 13, 2020. It is now read-only.
/
QueryParser.py
244 lines (208 loc) · 7.25 KB
/
QueryParser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Query Parser.
This particular parser recognizes the following syntax:
Start = OrExpr
OrExpr = AndExpr ('OR' AndExpr)*
AndExpr = Term ('AND' NotExpr)*
NotExpr = ['NOT'] Term
Term = '(' OrExpr ')' | ATOM+
The key words (AND, OR, NOT) are recognized in any mixture of case.
An ATOM is either:
+ A sequence of characters not containing whitespace or parentheses or
double quotes, and not equal (ignoring case) to one of the key words
'AND', 'OR', 'NOT'; or
+ A non-empty string enclosed in double quotes. The interior of the
string can contain whitespace, parentheses and key words, but not
quotes.
+ A hyphen followed by one of the two forms above, meaning that it
must not be present.
An unquoted ATOM may also contain globbing characters. Globbing
syntax is defined by the lexicon; for example "foo*" could mean any
word starting with "foo".
When multiple consecutive ATOMs are found at the leaf level, they are
connected by an implied AND operator, and an unquoted leading hyphen
is interpreted as a NOT operator.
Summarizing the default operator rules:
- a sequence of words without operators implies AND, e.g. ``foo bar''
- double-quoted text implies phrase search, e.g. ``"foo bar"''
- words connected by punctuation implies phrase search, e.g. ``foo-bar''
- a leading hyphen implies NOT, e.g. ``foo -bar''
- these can be combined, e.g. ``foo -"foo bar"'' or ``foo -foo-bar''
- * and ? are used for globbing (i.e. prefix search), e.g. ``foo*''
"""
import re
from zope.interface import implements
from Products.ZCTextIndex.IQueryParser import IQueryParser
from Products.ZCTextIndex import ParseTree
# Create unique symbols for token types.
_AND = intern("AND")
_OR = intern("OR")
_NOT = intern("NOT")
_LPAREN = intern("(")
_RPAREN = intern(")")
_ATOM = intern("ATOM")
_EOF = intern("EOF")
# Map keyword string to token type.
_keywords = {
_AND: _AND,
_OR: _OR,
_NOT: _NOT,
_LPAREN: _LPAREN,
_RPAREN: _RPAREN,
}
# Regular expression to tokenize.
_tokenizer_regex = re.compile(r"""
# a paren
[()]
# or an optional hyphen
| -?
# followed by
(?:
# a string inside double quotes (and not containing these)
" [^"]* "
# or a non-empty stretch w/o whitespace, parens or double quotes
| [^()\s"]+
)
""", re.VERBOSE)
class QueryParser:
implements(IQueryParser)
# This class is not thread-safe;
# each thread should have its own instance
def __init__(self, lexicon):
self._lexicon = lexicon
self._ignored = None
# Public API methods
def parseQuery(self, query):
# Lexical analysis.
tokens = _tokenizer_regex.findall(query)
self._tokens = tokens
# classify tokens
self._tokentypes = [_keywords.get(token.upper(), _ATOM)
for token in tokens]
# add _EOF
self._tokens.append(_EOF)
self._tokentypes.append(_EOF)
self._index = 0
# Syntactical analysis.
self._ignored = [] # Ignored words in the query, for parseQueryEx
tree = self._parseOrExpr()
self._require(_EOF)
if tree is None:
raise ParseTree.ParseError(
"Query contains only common words: %s" % repr(query))
return tree
def getIgnored(self):
return self._ignored
def parseQueryEx(self, query):
tree = self.parseQuery(query)
ignored = self.getIgnored()
return tree, ignored
# Recursive descent parser
def _require(self, tokentype):
if not self._check(tokentype):
t = self._tokens[self._index]
msg = "Token %r required, %r found" % (tokentype, t)
raise ParseTree.ParseError, msg
def _check(self, tokentype):
if self._tokentypes[self._index] is tokentype:
self._index += 1
return 1
else:
return 0
def _peek(self, tokentype):
return self._tokentypes[self._index] is tokentype
def _get(self, tokentype):
t = self._tokens[self._index]
self._require(tokentype)
return t
def _parseOrExpr(self):
L = []
L.append(self._parseAndExpr())
while self._check(_OR):
L.append(self._parseAndExpr())
L = filter(None, L)
if not L:
return None # Only stopwords
elif len(L) == 1:
return L[0]
else:
return ParseTree.OrNode(L)
def _parseAndExpr(self):
L = []
t = self._parseTerm()
if t is not None:
L.append(t)
Nots = []
while self._check(_AND):
t = self._parseNotExpr()
if t is None:
continue
if isinstance(t, ParseTree.NotNode):
Nots.append(t)
else:
L.append(t)
if not L:
return None # Only stopwords
L.extend(Nots)
if len(L) == 1:
return L[0]
else:
return ParseTree.AndNode(L)
def _parseNotExpr(self):
if self._check(_NOT):
t = self._parseTerm()
if t is None:
return None # Only stopwords
return ParseTree.NotNode(t)
else:
return self._parseTerm()
def _parseTerm(self):
if self._check(_LPAREN):
tree = self._parseOrExpr()
self._require(_RPAREN)
else:
nodes = []
nodes = [self._parseAtom()]
while self._peek(_ATOM):
nodes.append(self._parseAtom())
nodes = filter(None, nodes)
if not nodes:
return None # Only stopwords
structure = [(isinstance(nodes[i], ParseTree.NotNode), i, nodes[i])
for i in range(len(nodes))]
structure.sort()
nodes = [node for (bit, index, node) in structure]
if isinstance(nodes[0], ParseTree.NotNode):
raise ParseTree.ParseError(
"a term must have at least one positive word")
if len(nodes) == 1:
return nodes[0]
tree = ParseTree.AndNode(nodes)
return tree
def _parseAtom(self):
term = self._get(_ATOM)
words = self._lexicon.parseTerms(term)
if not words:
self._ignored.append(term)
return None
if len(words) > 1:
tree = ParseTree.PhraseNode(words)
elif self._lexicon.isGlob(words[0]):
tree = ParseTree.GlobNode(words[0])
else:
tree = ParseTree.AtomNode(words[0])
if term[0] == "-":
tree = ParseTree.NotNode(tree)
return tree