Attempting to fix regexes
camertron committed Mar 29, 2016
1 parent 204cc38 commit a277024
Showing 12 changed files with 170 additions and 59 deletions.
6 changes: 5 additions & 1 deletion lib/twitter_cldr/parsers/parser.rb
@@ -44,7 +44,11 @@ def empty?(token)
def current_token
@tokens[@token_index]
end

def eof?
@token_index >= @tokens.size
end
end

end
end
end
15 changes: 15 additions & 0 deletions lib/twitter_cldr/parsers/unicode_regex/character_class.rb
@@ -50,6 +50,10 @@ def to_set
evaluate(root)
end

def codepoints
codepoints_from(root)
end

def to_s
stringify(root)
end
@@ -58,6 +62,17 @@ def to_s

attr_reader :root

def codepoints_from(node)
case node
when UnaryOperator
codepoints_from(node.child)
when BinaryOperator
codepoints_from(node.left) + codepoints_from(node.right)
else
node.codepoints
end
end

def stringify(node)
case node
when UnaryOperator, BinaryOperator
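
For readers following the diff: a minimal, standalone sketch of the recursive traversal the new codepoints_from method performs. The Struct node classes below are illustrative stand-ins, not TwitterCLDR's real AST nodes.

    # Stand-in node types (illustrative only).
    UnaryOp  = Struct.new(:child)
    BinaryOp = Struct.new(:left, :right)
    Leaf     = Struct.new(:codepoints)

    def codepoints_from(node)
      case node
      when UnaryOp  then codepoints_from(node.child)
      when BinaryOp then codepoints_from(node.left) + codepoints_from(node.right)
      else node.codepoints
      end
    end

    tree = BinaryOp.new(Leaf.new([97, 98]), UnaryOp.new(Leaf.new([99])))
    p codepoints_from(tree)  # => [97, 98, 99]
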
4 changes: 4 additions & 0 deletions lib/twitter_cldr/parsers/unicode_regex/character_range.rb
@@ -33,6 +33,10 @@ def to_set
)
end

def codepoints
to_set.to_full_a
end

def to_s
"#{initial.to_s}-#{final.to_s}"
end
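
As a rough illustration (plain Ruby Range, not TwitterCLDR's RangeSet): codepoints here simply enumerates every codepoint between the range's endpoints, which is what to_full_a does on the underlying set.

    p ('a'.ord..'e'.ord).to_a  # => [97, 98, 99, 100, 101]
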
58 changes: 47 additions & 11 deletions lib/twitter_cldr/parsers/unicode_regex/character_set.rb
@@ -44,19 +44,9 @@ def to_s

def codepoints
code_points = CodePoint.code_points_for_property(
property_name, property_value
*normalized_property
)

%w(General_Category Script).each do |name|
if code_points.empty?
code_points = CodePoint.code_points_for_property(
name, property_value
)
else
break
end
end

if code_points.empty?
raise UnicodeRegexParserError,
"Couldn't find property '#{property_name}' containing "\
@@ -66,6 +56,52 @@ def codepoints
code_points
end

private

def normalized_property
property_value_candidates.each do |property_value|
prop_name, prop_value = normalized_property_name(
property_value, property_name_candidates
)

if prop_name
return [prop_name, prop_value]
end
end

[nil, nil]
end

def normalized_property_name(property_value, property_name_candidates)
property_name_candidates.each do |property_name|
prop_name, prop_value = CodePoint.properties.normalize(
property_name, property_value
)

if prop_name
return [prop_name, prop_value]
end
end

[nil, nil]
end

def property_name_candidates
if property_name
[property_name]
else
[property_value, 'General_Category', 'Script']
end
end

def property_value_candidates
if property_name && property_value
[property_value]
else
[property_value, nil].uniq
end
end

end
end
end
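
A self-contained sketch of the candidate-based lookup above. The ALIASES table and normalize method below are hypothetical stand-ins for the real property database; they only show the order in which name/value candidates are tried.

    # Hypothetical alias table standing in for the Unicode property database.
    ALIASES = {
      ['General_Category', 'Lu'] => ['General_Category', 'Uppercase_Letter'],
      ['Script', 'Latn']         => ['Script', 'Latin']
    }

    def normalize(name, value)
      ALIASES[[name, value]] || [nil, nil]
    end

    def normalized_property(property_name, property_value)
      # Mirrors property_value_candidates / property_name_candidates above.
      value_candidates = property_name && property_value ? [property_value] : [property_value, nil].uniq
      name_candidates  = property_name ? [property_name] : [property_value, 'General_Category', 'Script']

      value_candidates.each do |value|
        name_candidates.each do |name|
          norm_name, norm_value = normalize(name, value)
          return [norm_name, norm_value] if norm_name
        end
      end

      [nil, nil]
    end

    # \p{Lu} arrives with no explicit property name, so 'Lu' is tried as a
    # value of General_Category (and Script) before giving up.
    p normalized_property(nil, 'Lu')  # => ["General_Category", "Uppercase_Letter"]
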
91 changes: 62 additions & 29 deletions lib/twitter_cldr/parsers/unicode_regex_parser.rb
@@ -28,11 +28,15 @@ def parse(tokens, options = {})
private

# Types that are allowed to be used in character ranges.
CHARACTER_CLASS_TOKEN_TYPES = [
RANGED_CHARACTER_CLASS_TOKEN_TYPES = [
:variable, :character_set, :negated_character_set, :unicode_char,
:multichar_string, :string, :escaped_character, :character_range
]

CHARACTER_CLASS_TOKEN_TYPES = RANGED_CHARACTER_CLASS_TOKEN_TYPES + [
:open_bracket, :special_char
]

NEGATED_TOKEN_TYPES = [
:negated_character_set
]
@@ -52,22 +56,14 @@ def make_token(type, value = nil)
})
end

# Identifies regex ranges and makes implicit operators explicit
# Identifies regex ranges
def preprocess(tokens)
result = []
i = 0

while i < tokens.size
# Character class entities side-by-side are treated as unions. So
# are side-by-side character classes. Add a special placeholder token
# to help out the expression parser.
add_union = (valid_character_class_token?(result.last) && tokens[i].type != :close_bracket) ||
(result.last && result.last.type == :close_bracket && tokens[i].type == :open_bracket)

result << make_token(:union) if add_union

is_range = valid_character_class_token?(tokens[i]) &&
valid_character_class_token?(tokens[i + 2]) &&
is_range = valid_ranged_character_class_token?(tokens[i]) &&
valid_ranged_character_class_token?(tokens[i + 2]) &&
tokens[i + 1].type == :dash

if is_range
@@ -120,6 +116,10 @@ def valid_character_class_token?(token)
token && CHARACTER_CLASS_TOKEN_TYPES.include?(token.type)
end

def valid_ranged_character_class_token?(token)
token && RANGED_CHARACTER_CLASS_TOKEN_TYPES.include?(token.type)
end

def unary_operator?(token)
token && UNARY_OPERATORS.include?(token.type)
end
@@ -201,24 +201,9 @@ def character_class
loop do
case current_token.type
when *CharacterClass.closing_types
last_operator = peek(operator_stack)
open_count -= 1

until last_operator.type == CharacterClass.opening_type_for(current_token.type)
operator = operator_stack.pop

node = if unary_operator?(operator)
unary_operator_node(operator.type, operand_stack.pop)
else
binary_operator_node(
operator.type, operand_stack.pop, operand_stack.pop
)
end

operand_stack.push(node)
last_operator = peek(operator_stack)
end
operator_stack.pop
build_until_open(operator_stack, operand_stack)
add_implicit_union(operator_stack, open_count)

when *CharacterClass.opening_types
open_count += 1
@@ -228,6 +213,7 @@
operator_stack.push(current_token)

else
add_implicit_union(operator_stack, open_count)
operand_stack.push(
send(current_token.type, current_token)
)
@@ -240,6 +226,53 @@
CharacterClass.new(operand_stack.pop)
end

def build_until_open(operator_stack, operand_stack)
last_operator = peek(operator_stack)
opening_type = CharacterClass.opening_type_for(current_token.type)

until last_operator.type == opening_type
operator = operator_stack.pop
node = get_operator_node(operator, operand_stack)
operand_stack.push(node)
last_operator = peek(operator_stack)
end

operator_stack.pop
end

def get_operator_node(operator, operand_stack)
if operator.type == :dash && operand_stack.size < 2
get_non_range_dash_node(operator, operand_stack)
else
if unary_operator?(operator)
unary_operator_node(operator.type, operand_stack.pop)
else
binary_operator_node(
operator.type, operand_stack.pop, operand_stack.pop
)
end
end
end

# Most regular expression engines allow character classes
# to contain a literal hyphen character as the first character.
# For example, [-abc] is a legal expression. It denotes a
# character class that contains the letters '-', 'a', 'b',
# and 'c'. For example, '-ba' =~ /[-abc]*/ returns 0 in Ruby.
def get_non_range_dash_node(operator, operand_stack)
binary_operator_node(
:union, operand_stack.pop, string(make_token(:string, '-'))
)
end

def add_implicit_union(operator_stack, open_count)
if n = @tokens[@token_index + 1]
if valid_character_class_token?(n) && open_count > 0
operator_stack.push(make_token(:union))
end
end
end

def peek(array)
array.last
end
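
A quick plain-Ruby check of the behavior the leading-dash handling above is meant to reproduce (standard Regexp, not the parser itself): a dash that cannot form a range is a literal '-' inside a character class.

    p '-ba' =~ /[-abc]*/       # => 0 (the match begins at position 0)
    p '-ba'[/[-abc]*/]         # => "-ba"
    p 'def' =~ /\A[-abc]+\z/   # => nil
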
4 changes: 4 additions & 0 deletions lib/twitter_cldr/shared/properties_database.rb
@@ -79,6 +79,10 @@ def property_values_for(property_name)
path.split(File::SEPARATOR).join if path
end.compact

if name_indicates_value_prefix?(property_name)
values += values.map { |v| v[0] }
end

property_values[property_name] = if values.length == 0
nil
else
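
An illustration of the prefix expansion above, under the assumption that name_indicates_value_prefix? marks properties such as General_Category whose two-letter values also imply a one-letter group (the uniq call here is only for display):

    values = %w(Lu Ll Nd)
    values += values.map { |v| v[0] }
    p values.uniq  # => ["Lu", "Ll", "Nd", "L", "N"]
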
28 changes: 15 additions & 13 deletions lib/twitter_cldr/tokenizers/tokenizer.rb
@@ -49,7 +49,7 @@ def self.union(*tokenizers)
Regexp.compile(
tokenizers.map do |tokenizer|
tokenizer.custom_splitter.source
end.join("|")
end.join("|"), nil, 'u'
)
end

@@ -79,17 +79,19 @@ def tokenize(text)
recognizer.recognizes?(token_text)
end

if recognizer.token_type == :composite
content = token_text.match(recognizer.content)[1]
ret << CompositeToken.new(tokenize(content))
else
cleaned_text = recognizer.clean(token_text)

if (remove_empty_entries && cleaned_text.size > 0) || !remove_empty_entries
ret << Token.new(
value: cleaned_text,
type: recognizer.token_type
)
if recognizer
if recognizer.token_type == :composite
content = token_text.match(recognizer.content)[1]
ret << CompositeToken.new(tokenize(content))
else
cleaned_text = recognizer.clean(token_text)

if (remove_empty_entries && cleaned_text.size > 0) || !remove_empty_entries
ret << Token.new(
value: cleaned_text,
type: recognizer.token_type
)
end
end
end

@@ -112,4 +114,4 @@ def clear_splitter

end
end
end
end
@@ -21,8 +21,8 @@ def tokenizer
recognizers = [
# The variable name can contain letters and digits, but must start with a letter.
TokenRecognizer.new(:variable, /\$\w[\w\d]*/),
TokenRecognizer.new(:character_set, /\[:[\w\s]+:\]|\\p\{[\w=]+\}/), # [:Lu:] or \p{Lu} or \p{Sentence_Break=CF}
TokenRecognizer.new(:negated_character_set, /\[:\^[\w\s]+:\]|\\P\{[\w=]+\}/), #[:^Lu:] or \P{Lu}
TokenRecognizer.new(:character_set, /\[:[\w\s=]+:\]|\\p\{[\w\s=]+\}/), # [:Lu:] or \p{Lu} or \p{Sentence_Break=CF}
TokenRecognizer.new(:negated_character_set, /\[:\^[\w\s=]+:\]|\\P\{[\w\s=]+\}/), #[:^Lu:] or \P{Lu}
TokenRecognizer.new(:unicode_char, /\\u\{?[a-fA-F0-9]{1,6}\}?/),
TokenRecognizer.new(:multichar_string, /\{\w+\}/u),

@@ -38,7 +38,7 @@
TokenRecognizer.new(:open_bracket, /\[/),
TokenRecognizer.new(:close_bracket, /\]/),

TokenRecognizer.new(:string, //) do |val|
TokenRecognizer.new(:string, //u) do |val|
val == " " ? val : val.strip
end
]
6 changes: 6 additions & 0 deletions lib/twitter_cldr/utils/range_set.rb
@@ -175,6 +175,12 @@ def each(&block)
end
end

def size
ranges.inject(0) do |sum, range|
sum + range.size
end
end

private

def includes_numeric?(num)
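
A minimal check of the size summation added above, using plain integer Ranges in place of a RangeSet's internal ranges:

    ranges = [1..3, 10..12]
    p ranges.inject(0) { |sum, range| sum + range.size }  # => 6
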
2 changes: 1 addition & 1 deletion spec/shared/code_point_spec.rb
@@ -64,7 +64,7 @@ def clear
properties = code_point.properties
expect(properties.alphabetic).to be_true
expect(properties.script).to eq(Set.new(%w(Latin)))
expect(properties.general_category).to eq(Set.new(%w(Lu)))
expect(properties.general_category).to eq(Set.new(%w(L Lu)))
end
end

2 changes: 1 addition & 1 deletion spec/shared/properties_database_spec.rb
@@ -89,7 +89,7 @@
it 'returns a property set for the given code point' do
property_set = database.properties_for_code_point(65)
expect(property_set).to be_a(PropertySet)
expect(property_set.general_category).to eq(Set.new(%w(Lu)))
expect(property_set.general_category).to eq(Set.new(%w(L Lu)))
expect(property_set.word_break).to eq(Set.new(%w(ALetter)))
end
end
7 changes: 7 additions & 0 deletions spec/shared/unicode_regex_spec.rb
@@ -210,6 +210,13 @@ def compile(str, symbol_table = nil)
expect(regex).to exactly_match(",")
expect(regex).not_to exactly_match("a")
end

it "should treat a dash that is the first character of a character class as a literal dash instead of a range" do
regex = compile("[-abc]*")
expect(regex).to exactly_match("a-b-c")
expect(regex).to exactly_match("--a")
expect(regex).not_to exactly_match("def")
end
end
end
end
