diff --git a/lib/twitter_cldr/parsers/parser.rb b/lib/twitter_cldr/parsers/parser.rb index c0f20472c..b2b8cad0b 100644 --- a/lib/twitter_cldr/parsers/parser.rb +++ b/lib/twitter_cldr/parsers/parser.rb @@ -44,7 +44,11 @@ def empty?(token) def current_token @tokens[@token_index] end + + def eof? + @token_index >= @tokens.size + end end end -end \ No newline at end of file +end diff --git a/lib/twitter_cldr/parsers/unicode_regex/character_class.rb b/lib/twitter_cldr/parsers/unicode_regex/character_class.rb index a5c575c40..171c4d192 100644 --- a/lib/twitter_cldr/parsers/unicode_regex/character_class.rb +++ b/lib/twitter_cldr/parsers/unicode_regex/character_class.rb @@ -50,6 +50,10 @@ def to_set evaluate(root) end + def codepoints + codepoints_from(root) + end + def to_s stringify(root) end @@ -58,6 +62,17 @@ def to_s attr_reader :root + def codepoints_from(node) + case node + when UnaryOperator + codepoints_from(node.child) + when BinaryOperator + codepoints_from(node.left) + codepoints_from(node.right) + else + node.codepoints + end + end + def stringify(node) case node when UnaryOperator, BinaryOperator diff --git a/lib/twitter_cldr/parsers/unicode_regex/character_range.rb b/lib/twitter_cldr/parsers/unicode_regex/character_range.rb index cc26d23a5..a637cf02a 100644 --- a/lib/twitter_cldr/parsers/unicode_regex/character_range.rb +++ b/lib/twitter_cldr/parsers/unicode_regex/character_range.rb @@ -33,6 +33,10 @@ def to_set ) end + def codepoints + to_set.to_full_a + end + def to_s "#{initial.to_s}-#{final.to_s}" end diff --git a/lib/twitter_cldr/parsers/unicode_regex/character_set.rb b/lib/twitter_cldr/parsers/unicode_regex/character_set.rb index e40a8b392..acd3d2c0e 100755 --- a/lib/twitter_cldr/parsers/unicode_regex/character_set.rb +++ b/lib/twitter_cldr/parsers/unicode_regex/character_set.rb @@ -44,19 +44,9 @@ def to_s def codepoints code_points = CodePoint.code_points_for_property( - property_name, property_value + *normalized_property ) - %w(General_Category 
Script).each do |name| - if code_points.empty? - code_points = CodePoint.code_points_for_property( - name, property_value - ) - else - break - end - end - if code_points.empty? raise UnicodeRegexParserError, "Couldn't find property '#{property_name}' containing "\ @@ -66,6 +56,52 @@ def codepoints code_points end + private + + def normalized_property + property_value_candidates.each do |property_value| + prop_name, prop_value = normalized_property_name( + property_value, property_name_candidates + ) + + if prop_name + return [prop_name, prop_value] + end + end + + [nil, nil] + end + + def normalized_property_name(property_value, property_name_candidates) + property_name_candidates.each do |property_name| + prop_name, prop_value = CodePoint.properties.normalize( + property_name, property_value + ) + + if prop_name + return [prop_name, prop_value] + end + end + + [nil, nil] + end + + def property_name_candidates + if property_name + [property_name] + else + [property_value, 'General_Category', 'Script'] + end + end + + def property_value_candidates + if property_name && property_value + [property_value] + else + [property_value, nil].uniq + end + end + end end end diff --git a/lib/twitter_cldr/parsers/unicode_regex_parser.rb b/lib/twitter_cldr/parsers/unicode_regex_parser.rb index f93e54845..ad8325bca 100644 --- a/lib/twitter_cldr/parsers/unicode_regex_parser.rb +++ b/lib/twitter_cldr/parsers/unicode_regex_parser.rb @@ -28,11 +28,15 @@ def parse(tokens, options = {}) private # Types that are allowed to be used in character ranges. 
- CHARACTER_CLASS_TOKEN_TYPES = [ + RANGED_CHARACTER_CLASS_TOKEN_TYPES = [ :variable, :character_set, :negated_character_set, :unicode_char, :multichar_string, :string, :escaped_character, :character_range ] + CHARACTER_CLASS_TOKEN_TYPES = RANGED_CHARACTER_CLASS_TOKEN_TYPES + [ + :open_bracket, :special_char + ] + NEGATED_TOKEN_TYPES = [ :negated_character_set ] @@ -52,22 +56,14 @@ def make_token(type, value = nil) }) end - # Identifies regex ranges and makes implicit operators explicit + # Identifies regex ranges def preprocess(tokens) result = [] i = 0 while i < tokens.size - # Character class entities side-by-side are treated as unions. So - # are side-by-side character classes. Add a special placeholder token - # to help out the expression parser. - add_union = (valid_character_class_token?(result.last) && tokens[i].type != :close_bracket) || - (result.last && result.last.type == :close_bracket && tokens[i].type == :open_bracket) - - result << make_token(:union) if add_union - - is_range = valid_character_class_token?(tokens[i]) && - valid_character_class_token?(tokens[i + 2]) && + is_range = valid_ranged_character_class_token?(tokens[i]) && + valid_ranged_character_class_token?(tokens[i + 2]) && tokens[i + 1].type == :dash if is_range @@ -120,6 +116,10 @@ def valid_character_class_token?(token) token && CHARACTER_CLASS_TOKEN_TYPES.include?(token.type) end + def valid_ranged_character_class_token?(token) + token && RANGED_CHARACTER_CLASS_TOKEN_TYPES.include?(token.type) + end + def unary_operator?(token) token && UNARY_OPERATORS.include?(token.type) end @@ -201,24 +201,9 @@ def character_class loop do case current_token.type when *CharacterClass.closing_types - last_operator = peek(operator_stack) open_count -= 1 - - until last_operator.type == CharacterClass.opening_type_for(current_token.type) - operator = operator_stack.pop - - node = if unary_operator?(operator) - unary_operator_node(operator.type, operand_stack.pop) - else - binary_operator_node( - 
operator.type, operand_stack.pop, operand_stack.pop - ) - end - - operand_stack.push(node) - last_operator = peek(operator_stack) - end - operator_stack.pop + build_until_open(operator_stack, operand_stack) + add_implicit_union(operator_stack, open_count) when *CharacterClass.opening_types open_count += 1 @@ -228,6 +213,7 @@ def character_class operator_stack.push(current_token) else + add_implicit_union(operator_stack, open_count) operand_stack.push( send(current_token.type, current_token) ) @@ -240,6 +226,53 @@ def character_class CharacterClass.new(operand_stack.pop) end + def build_until_open(operator_stack, operand_stack) + last_operator = peek(operator_stack) + opening_type = CharacterClass.opening_type_for(current_token.type) + + until last_operator.type == opening_type + operator = operator_stack.pop + node = get_operator_node(operator, operand_stack) + operand_stack.push(node) + last_operator = peek(operator_stack) + end + + operator_stack.pop + end + + def get_operator_node(operator, operand_stack) + if operator.type == :dash && operand_stack.size < 2 + get_non_range_dash_node(operator, operand_stack) + else + if unary_operator?(operator) + unary_operator_node(operator.type, operand_stack.pop) + else + binary_operator_node( + operator.type, operand_stack.pop, operand_stack.pop + ) + end + end + end + + # Most regular expression engines allow character classes + # to contain a literal hyphen character as the first character. + # For example, [-abc] is a legal expression. It denotes a + # character class that contains the letters '-', 'a', 'b', + # and 'c'. For example, /[-abc]*/ =~ '-ba' returns 0 in Ruby. 
+ def get_non_range_dash_node(operator, operand_stack) + binary_operator_node( + :union, operand_stack.pop, string(make_token(:string, '-')) + ) + end + + def add_implicit_union(operator_stack, open_count) + if n = @tokens[@token_index + 1] + if valid_character_class_token?(n) && open_count > 0 + operator_stack.push(make_token(:union)) + end + end + end + def peek(array) array.last end diff --git a/lib/twitter_cldr/shared/properties_database.rb b/lib/twitter_cldr/shared/properties_database.rb index 81425eeef..4d1e50f46 100755 --- a/lib/twitter_cldr/shared/properties_database.rb +++ b/lib/twitter_cldr/shared/properties_database.rb @@ -79,6 +79,10 @@ def property_values_for(property_name) path.split(File::SEPARATOR).join if path end.compact + if name_indicates_value_prefix?(property_name) + values += values.map { |v| v[0] } + end + property_values[property_name] = if values.length == 0 nil else diff --git a/lib/twitter_cldr/tokenizers/tokenizer.rb b/lib/twitter_cldr/tokenizers/tokenizer.rb index 07312bc99..551ad6266 100644 --- a/lib/twitter_cldr/tokenizers/tokenizer.rb +++ b/lib/twitter_cldr/tokenizers/tokenizer.rb @@ -49,7 +49,7 @@ def self.union(*tokenizers) Regexp.compile( tokenizers.map do |tokenizer| tokenizer.custom_splitter.source - end.join("|") + end.join("|"), nil, 'u' ) end @@ -79,17 +79,19 @@ def tokenize(text) recognizer.recognizes?(token_text) end - if recognizer.token_type == :composite - content = token_text.match(recognizer.content)[1] - ret << CompositeToken.new(tokenize(content)) - else - cleaned_text = recognizer.clean(token_text) - - if (remove_empty_entries && cleaned_text.size > 0) || !remove_empty_entries - ret << Token.new( - value: cleaned_text, - type: recognizer.token_type - ) + if recognizer + if recognizer.token_type == :composite + content = token_text.match(recognizer.content)[1] + ret << CompositeToken.new(tokenize(content)) + else + cleaned_text = recognizer.clean(token_text) + + if (remove_empty_entries && cleaned_text.size > 0) || 
!remove_empty_entries + ret << Token.new( + value: cleaned_text, + type: recognizer.token_type + ) + end end end @@ -112,4 +114,4 @@ def clear_splitter end end -end \ No newline at end of file +end diff --git a/lib/twitter_cldr/tokenizers/unicode_regex/unicode_regex_tokenizer.rb b/lib/twitter_cldr/tokenizers/unicode_regex/unicode_regex_tokenizer.rb index 3ce0379b4..bfdacb6a0 100644 --- a/lib/twitter_cldr/tokenizers/unicode_regex/unicode_regex_tokenizer.rb +++ b/lib/twitter_cldr/tokenizers/unicode_regex/unicode_regex_tokenizer.rb @@ -21,8 +21,8 @@ def tokenizer recognizers = [ # The variable name can contain letters and digits, but must start with a letter. TokenRecognizer.new(:variable, /\$\w[\w\d]*/), - TokenRecognizer.new(:character_set, /\[:[\w\s]+:\]|\\p\{[\w=]+\}/), # [:Lu:] or \p{Lu} or \p{Sentence_Break=CF} - TokenRecognizer.new(:negated_character_set, /\[:\^[\w\s]+:\]|\\P\{[\w=]+\}/), #[:^Lu:] or \P{Lu} + TokenRecognizer.new(:character_set, /\[:[\w\s=]+:\]|\\p\{[\w\s=]+\}/), # [:Lu:] or \p{Lu} or \p{Sentence_Break=CF} + TokenRecognizer.new(:negated_character_set, /\[:\^[\w\s=]+:\]|\\P\{[\w\s=]+\}/), #[:^Lu:] or \P{Lu} TokenRecognizer.new(:unicode_char, /\\u\{?[a-fA-F0-9]{1,6}\}?/), TokenRecognizer.new(:multichar_string, /\{\w+\}/u), @@ -38,7 +38,7 @@ def tokenizer TokenRecognizer.new(:open_bracket, /\[/), TokenRecognizer.new(:close_bracket, /\]/), - TokenRecognizer.new(:string, //) do |val| + TokenRecognizer.new(:string, //u) do |val| val == " " ? 
val : val.strip end ] diff --git a/lib/twitter_cldr/utils/range_set.rb b/lib/twitter_cldr/utils/range_set.rb index 865a9971e..cd8fe78bf 100644 --- a/lib/twitter_cldr/utils/range_set.rb +++ b/lib/twitter_cldr/utils/range_set.rb @@ -175,6 +175,12 @@ def each(&block) end end + def size + ranges.inject(0) do |sum, range| + sum + range.size + end + end + private def includes_numeric?(num) diff --git a/spec/shared/code_point_spec.rb b/spec/shared/code_point_spec.rb index b0ebb1650..e24150064 100644 --- a/spec/shared/code_point_spec.rb +++ b/spec/shared/code_point_spec.rb @@ -64,7 +64,7 @@ def clear properties = code_point.properties expect(properties.alphabetic).to be_true expect(properties.script).to eq(Set.new(%w(Latin))) - expect(properties.general_category).to eq(Set.new(%w(Lu))) + expect(properties.general_category).to eq(Set.new(%w(L Lu))) end end diff --git a/spec/shared/properties_database_spec.rb b/spec/shared/properties_database_spec.rb index d620b8c0f..fa86759b9 100644 --- a/spec/shared/properties_database_spec.rb +++ b/spec/shared/properties_database_spec.rb @@ -89,7 +89,7 @@ it 'returns a property set for the given code point' do property_set = database.properties_for_code_point(65) expect(property_set).to be_a(PropertySet) - expect(property_set.general_category).to eq(Set.new(%w(Lu))) + expect(property_set.general_category).to eq(Set.new(%w(L Lu))) expect(property_set.word_break).to eq(Set.new(%w(ALetter))) end end diff --git a/spec/shared/unicode_regex_spec.rb b/spec/shared/unicode_regex_spec.rb index f84890cf6..d2c7fb6a2 100644 --- a/spec/shared/unicode_regex_spec.rb +++ b/spec/shared/unicode_regex_spec.rb @@ -210,6 +210,13 @@ def compile(str, symbol_table = nil) expect(regex).to exactly_match(",") expect(regex).not_to exactly_match("a") end + + it "should treat a dash that is the first character of a character class as a literal dash instead of a range" do + regex = compile("[-abc]*") + expect(regex).to exactly_match("a-b-c") + expect(regex).to 
exactly_match("--a") + expect(regex).not_to exactly_match("def") + end end end end