# The Tokenizer module adds methods to the query language compiler that transforms a query string
# into a stream of tokens, which are more appropriate for parsing a query string.
module ScopedSearch
  module QueryLanguage
    module Tokenizer

      # All keywords that the language supports, mapped to their token symbol.
      KEYWORDS = { 'and' => :and, 'or' => :or, 'not' => :not, 'set?' => :notnull, 'has' => :notnull, 'null?' => :null, 'before' => :lt, 'after' => :gt, 'at' => :eq }.freeze

      # Every operator the language supports, mapped to its token symbol.
      OPERATORS = { '&' => :and, '|' => :or, '&&' => :and, '||' => :or, '-'=> :not, '!' => :not, '~' => :like, '!~' => :unlike,
                    '=' => :eq, '==' => :eq, '!=' => :ne, '<>' => :ne, '>' => :gt, '<' => :lt, '>=' => :gte, '<=' => :lte, '^' => :in, '!^' => :notin }.freeze

      # Tokenizes the string (expected in @str, set by the including class) and
      # returns the result as an array of tokens. Relies on Enumerable#to_a,
      # which in turn calls #each (aliased to #each_token below).
      def tokenize
        @current_char_pos = -1
        to_a
      end

      # Returns the current character of the string.
      def current_char
        @current_char
      end

      # Returns a following character of the string (by default, the next
      # character), without updating the position pointer. Returns nil when
      # the requested position is past the end of the string.
      def peek_char(amount = 1)
        @str[@current_char_pos + amount, 1]
      end

      # Returns the next character of the string, and moves the position
      # pointer one step forward. Returns nil at the end of the string.
      def next_char
        @current_char_pos += 1
        @current_char = @str[@current_char_pos, 1]
      end

      # Tokenizes the string by iterating over the characters, yielding each
      # token to the given block. Whitespace is skipped; parentheses and the
      # comma are yielded directly; operator and keyword characters are handed
      # off to the specialized tokenizer methods.
      def each_token(&block)
        while next_char
          case current_char
          when /^\s?$/; # ignore whitespace
          when '('; yield(:lparen)
          when ')'; yield(:rparen)
          when ','; yield(:comma)
          when /\&|\||=|<|>|\^|!|~|-/; tokenize_operator(&block)
          when '"'; tokenize_quoted_keyword(&block)
          else; tokenize_keyword(&block)
          end
        end
      end

      # Tokenizes an operator that occurs in the OPERATORS hash. Greedily
      # consumes a second character when the two-character combination is a
      # known operator (e.g. '!=', '<=').
      # The .to_s on [peek|next]_char is to prevent a ruby bug when nil
      # values are returned from strings which have forced encoding.
      # https://github.com/wvanbergen/scoped_search/issues/33 for details
      def tokenize_operator(&block)
        operator = current_char
        operator << next_char.to_s if OPERATORS.has_key?(operator + peek_char.to_s)
        yield(OPERATORS[operator])
      end

      # Tokenizes a keyword, and converts it to a Symbol if it is recognized as a
      # reserved language keyword (the KEYWORDS hash); otherwise the raw keyword
      # string is yielded. Consumes characters until a delimiter (whitespace,
      # comparison operator, parenthesis, or comma) is seen.
      def tokenize_keyword(&block)
        keyword = current_char
        keyword << next_char while /[^=~<>\s\&\|\)\(,]/ =~ peek_char
        KEYWORDS.has_key?(keyword.downcase) ? yield(KEYWORDS[keyword.downcase]) : yield(keyword)
      end

      # Tokenizes a keyword that is quoted using double quotes. Allows escaping
      # of double quote characters by backslashes.
      def tokenize_quoted_keyword(&block)
        keyword = ""
        until next_char.nil? || current_char == '"'
          # .to_s guards against a trailing backslash at the very end of the
          # string, where next_char returns nil (same guard as in
          # tokenize_operator); without it, `keyword << nil` raises TypeError.
          keyword << (current_char == "\\" ? next_char.to_s : current_char)
        end
        yield(keyword)
      end

      alias :each :each_token
    end
  end
end