Attempting to fix regexes
camertron committed Mar 29, 2016
1 parent 204cc38 commit a277024
Showing 12 changed files with 170 additions and 59 deletions.
6 changes: 5 additions & 1 deletion lib/twitter_cldr/parsers/parser.rb
@@ -44,7 +44,11 @@ def empty?(token)
def current_token
@tokens[@token_index]
end

def eof?
@token_index >= @tokens.size
end
end

end
end
end
15 changes: 15 additions & 0 deletions lib/twitter_cldr/parsers/unicode_regex/character_class.rb
@@ -50,6 +50,10 @@ def to_set
evaluate(root)
end

def codepoints
codepoints_from(root)
end

def to_s
stringify(root)
end
@@ -58,6 +62,17 @@ def to_s

attr_reader :root

def codepoints_from(node)
case node
when UnaryOperator
codepoints_from(node.child)
when BinaryOperator
codepoints_from(node.left) + codepoints_from(node.right)
else
node.codepoints
end
end

def stringify(node)
case node
when UnaryOperator, BinaryOperator
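
For readers following the diff: a minimal, standalone sketch of the recursive traversal the new codepoints_from method performs. The Struct node classes below are illustrative stand-ins, not TwitterCLDR's real AST nodes.

    # Stand-in node types (illustrative only).
    UnaryOp  = Struct.new(:child)
    BinaryOp = Struct.new(:left, :right)
    Leaf     = Struct.new(:codepoints)

    def codepoints_from(node)
      case node
      when UnaryOp  then codepoints_from(node.child)
      when BinaryOp then codepoints_from(node.left) + codepoints_from(node.right)
      else node.codepoints
      end
    end

    tree = BinaryOp.new(Leaf.new([97, 98]), UnaryOp.new(Leaf.new([99])))
    p codepoints_from(tree)  # => [97, 98, 99]
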
4 changes: 4 additions & 0 deletions lib/twitter_cldr/parsers/unicode_regex/character_range.rb
@@ -33,6 +33,10 @@ def to_set
)
end

def codepoints
to_set.to_full_a
end

def to_s
"#{initial.to_s}-#{final.to_s}"
end
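
As a rough illustration (plain Ruby Range, not TwitterCLDR's RangeSet): codepoints here simply enumerates every codepoint between the range's endpoints, which is what to_full_a does on the underlying set.

    p ('a'.ord..'e'.ord).to_a  # => [97, 98, 99, 100, 101]
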
58 changes: 47 additions & 11 deletions lib/twitter_cldr/parsers/unicode_regex/character_set.rb
@@ -44,19 +44,9 @@ def to_s

def codepoints
code_points = CodePoint.code_points_for_property(
property_name, property_value
*normalized_property
)

%w(General_Category Script).each do |name|
if code_points.empty?
code_points = CodePoint.code_points_for_property(
name, property_value
)
else
break
end
end

if code_points.empty?
raise UnicodeRegexParserError,
"Couldn't find property '#{property_name}' containing "\
@@ -66,6 +56,52 @@ def codepoints
code_points
end

private

def normalized_property
property_value_candidates.each do |property_value|
prop_name, prop_value = normalized_property_name(
property_value, property_name_candidates
)

if prop_name
return [prop_name, prop_value]
end
end

[nil, nil]
end

def normalized_property_name(property_value, property_name_candidates)
property_name_candidates.each do |property_name|
prop_name, prop_value = CodePoint.properties.normalize(
property_name, property_value
)

if prop_name
return [prop_name, prop_value]
end
end

[nil, nil]
end

def property_name_candidates
if property_name
[property_name]
else
[property_value, 'General_Category', 'Script']
end
end

def property_value_candidates
if property_name && property_value
[property_value]
else
[property_value, nil].uniq
end
end

end
end
end
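
A self-contained sketch of the candidate-based lookup above. The ALIASES table and normalize method below are hypothetical stand-ins for the real property database; they only show the order in which name/value candidates are tried.

    # Hypothetical alias table standing in for the Unicode property database.
    ALIASES = {
      ['General_Category', 'Lu'] => ['General_Category', 'Uppercase_Letter'],
      ['Script', 'Latn']         => ['Script', 'Latin']
    }

    def normalize(name, value)
      ALIASES[[name, value]] || [nil, nil]
    end

    def normalized_property(property_name, property_value)
      # Mirrors property_value_candidates / property_name_candidates above.
      value_candidates = property_name && property_value ? [property_value] : [property_value, nil].uniq
      name_candidates  = property_name ? [property_name] : [property_value, 'General_Category', 'Script']

      value_candidates.each do |value|
        name_candidates.each do |name|
          norm_name, norm_value = normalize(name, value)
          return [norm_name, norm_value] if norm_name
        end
      end

      [nil, nil]
    end

    # \p{Lu} arrives with no explicit property name, so 'Lu' is tried as a
    # value of General_Category (and Script) before giving up.
    p normalized_property(nil, 'Lu')  # => ["General_Category", "Uppercase_Letter"]
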
91 changes: 62 additions & 29 deletions lib/twitter_cldr/parsers/unicode_regex_parser.rb
@@ -28,11 +28,15 @@ def parse(tokens, options = {})
private

# Types that are allowed to be used in character ranges.
CHARACTER_CLASS_TOKEN_TYPES = [
RANGED_CHARACTER_CLASS_TOKEN_TYPES = [
:variable, :character_set, :negated_character_set, :unicode_char,
:multichar_string, :string, :escaped_character, :character_range
]

CHARACTER_CLASS_TOKEN_TYPES = RANGED_CHARACTER_CLASS_TOKEN_TYPES + [
:open_bracket, :special_char
]

NEGATED_TOKEN_TYPES = [
:negated_character_set
]
@@ -52,22 +56,14 @@ def make_token(type, value = nil)
})
end

# Identifies regex ranges and makes implicit operators explicit
# Identifies regex ranges
def preprocess(tokens)
result = []
i = 0

while i < tokens.size
# Character class entities side-by-side are treated as unions. So
# are side-by-side character classes. Add a special placeholder token
# to help out the expression parser.
add_union = (valid_character_class_token?(result.last) && tokens[i].type != :close_bracket) ||
(result.last && result.last.type == :close_bracket && tokens[i].type == :open_bracket)

result << make_token(:union) if add_union

is_range = valid_character_class_token?(tokens[i]) &&
valid_character_class_token?(tokens[i + 2]) &&
is_range = valid_ranged_character_class_token?(tokens[i]) &&
valid_ranged_character_class_token?(tokens[i + 2]) &&
tokens[i + 1].type == :dash

if is_range
@@ -120,6 +116,10 @@ def valid_character_class_token?(token)
token && CHARACTER_CLASS_TOKEN_TYPES.include?(token.type)
end

def valid_ranged_character_class_token?(token)
token && RANGED_CHARACTER_CLASS_TOKEN_TYPES.include?(token.type)
end

def unary_operator?(token)
token && UNARY_OPERATORS.include?(token.type)
end
@@ -201,24 +201,9 @@ def character_class
loop do
case current_token.type
when *CharacterClass.closing_types
last_operator = peek(operator_stack)
open_count -= 1

until last_operator.type == CharacterClass.opening_type_for(current_token.type)
operator = operator_stack.pop

node = if unary_operator?(operator)
unary_operator_node(operator.type, operand_stack.pop)
else
binary_operator_node(
operator.type, operand_stack.pop, operand_stack.pop
)
end

operand_stack.push(node)
last_operator = peek(operator_stack)
end
operator_stack.pop
build_until_open(operator_stack, operand_stack)
add_implicit_union(operator_stack, open_count)

when *CharacterClass.opening_types
open_count += 1
@@ -228,6 +213,7 @@
operator_stack.push(current_token)

else
add_implicit_union(operator_stack, open_count)
operand_stack.push(
send(current_token.type, current_token)
)
@@ -240,6 +226,53 @@
CharacterClass.new(operand_stack.pop)
end

def build_until_open(operator_stack, operand_stack)
last_operator = peek(operator_stack)
opening_type = CharacterClass.opening_type_for(current_token.type)

until last_operator.type == opening_type
operator = operator_stack.pop
node = get_operator_node(operator, operand_stack)
operand_stack.push(node)
last_operator = peek(operator_stack)
end

operator_stack.pop
end

def get_operator_node(operator, operand_stack)
if operator.type == :dash && operand_stack.size < 2
get_non_range_dash_node(operator, operand_stack)
else
if unary_operator?(operator)
unary_operator_node(operator.type, operand_stack.pop)
else
binary_operator_node(
operator.type, operand_stack.pop, operand_stack.pop
)
end
end
end

# Most regular expression engines allow character classes
# to contain a literal hyphen character as the first character.
# For example, [-abc] is a legal expression. It denotes a
# character class that contains the letters '-', 'a', 'b',
# and 'c'. For example, '-ba' =~ /[-abc]*/ returns 0 in Ruby.
def get_non_range_dash_node(operator, operand_stack)
binary_operator_node(
:union, operand_stack.pop, string(make_token(:string, '-'))
)
end

def add_implicit_union(operator_stack, open_count)
if n = @tokens[@token_index + 1]
if valid_character_class_token?(n) && open_count > 0
operator_stack.push(make_token(:union))
end
end
end

def peek(array)
array.last
end
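
A quick plain-Ruby check of the behavior the leading-dash handling above is meant to reproduce (standard Regexp, not the parser itself): a dash that cannot form a range is a literal '-' inside a character class.

    p '-ba' =~ /[-abc]*/       # => 0 (the match begins at position 0)
    p '-ba'[/[-abc]*/]         # => "-ba"
    p 'def' =~ /\A[-abc]+\z/   # => nil
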
4 changes: 4 additions & 0 deletions lib/twitter_cldr/shared/properties_database.rb
@@ -79,6 +79,10 @@ def property_values_for(property_name)
path.split(File::SEPARATOR).join if path
end.compact

if name_indicates_value_prefix?(property_name)
values += values.map { |v| v[0] }
end

property_values[property_name] = if values.length == 0
nil
else
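
An illustration of the prefix expansion above, under the assumption that name_indicates_value_prefix? marks properties such as General_Category whose two-letter values also imply a one-letter group (the uniq call here is only for display):

    values = %w(Lu Ll Nd)
    values += values.map { |v| v[0] }
    p values.uniq  # => ["Lu", "Ll", "Nd", "L", "N"]
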
28 changes: 15 additions & 13 deletions lib/twitter_cldr/tokenizers/tokenizer.rb
@@ -49,7 +49,7 @@ def self.union(*tokenizers)
Regexp.compile(
tokenizers.map do |tokenizer|
tokenizer.custom_splitter.source
end.join("|")
end.join("|"), nil, 'u'
)
end

@@ -79,17 +79,19 @@ def tokenize(text)
recognizer.recognizes?(token_text)
end

if recognizer.token_type == :composite
content = token_text.match(recognizer.content)[1]
ret << CompositeToken.new(tokenize(content))
else
cleaned_text = recognizer.clean(token_text)

if (remove_empty_entries && cleaned_text.size > 0) || !remove_empty_entries
ret << Token.new(
value: cleaned_text,
type: recognizer.token_type
)
if recognizer
if recognizer.token_type == :composite
content = token_text.match(recognizer.content)[1]
ret << CompositeToken.new(tokenize(content))
else
cleaned_text = recognizer.clean(token_text)

if (remove_empty_entries && cleaned_text.size > 0) || !remove_empty_entries
ret << Token.new(
value: cleaned_text,
type: recognizer.token_type
)
end
end
end

@@ -112,4 +114,4 @@ def clear_splitter

end
end
end
end
@@ -21,8 +21,8 @@ def tokenizer
recognizers = [
# The variable name can contain letters and digits, but must start with a letter.
TokenRecognizer.new(:variable, /\$\w[\w\d]*/),
TokenRecognizer.new(:character_set, /\[:[\w\s]+:\]|\\p\{[\w=]+\}/), # [:Lu:] or \p{Lu} or \p{Sentence_Break=CF}
TokenRecognizer.new(:negated_character_set, /\[:\^[\w\s]+:\]|\\P\{[\w=]+\}/), #[:^Lu:] or \P{Lu}
TokenRecognizer.new(:character_set, /\[:[\w\s=]+:\]|\\p\{[\w\s=]+\}/), # [:Lu:] or \p{Lu} or \p{Sentence_Break=CF}
TokenRecognizer.new(:negated_character_set, /\[:\^[\w\s=]+:\]|\\P\{[\w\s=]+\}/), #[:^Lu:] or \P{Lu}
TokenRecognizer.new(:unicode_char, /\\u\{?[a-fA-F0-9]{1,6}\}?/),
TokenRecognizer.new(:multichar_string, /\{\w+\}/u),

@@ -38,7 +38,7 @@
TokenRecognizer.new(:open_bracket, /\[/),
TokenRecognizer.new(:close_bracket, /\]/),

TokenRecognizer.new(:string, //) do |val|
TokenRecognizer.new(:string, //u) do |val|
val == " " ? val : val.strip
end
]
6 changes: 6 additions & 0 deletions lib/twitter_cldr/utils/range_set.rb
@@ -175,6 +175,12 @@ def each(&block)
end
end

def size
ranges.inject(0) do |sum, range|
sum + range.size
end
end

private

def includes_numeric?(num)
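
A minimal check of the size summation added above, using plain integer Ranges in place of a RangeSet's internal ranges:

    ranges = [1..3, 10..12]
    p ranges.inject(0) { |sum, range| sum + range.size }  # => 6
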
2 changes: 1 addition & 1 deletion spec/shared/code_point_spec.rb
@@ -64,7 +64,7 @@ def clear
properties = code_point.properties
expect(properties.alphabetic).to be_true
expect(properties.script).to eq(Set.new(%w(Latin)))
expect(properties.general_category).to eq(Set.new(%w(Lu)))
expect(properties.general_category).to eq(Set.new(%w(L Lu)))
end
end

2 changes: 1 addition & 1 deletion spec/shared/properties_database_spec.rb
@@ -89,7 +89,7 @@
it 'returns a property set for the given code point' do
property_set = database.properties_for_code_point(65)
expect(property_set).to be_a(PropertySet)
expect(property_set.general_category).to eq(Set.new(%w(Lu)))
expect(property_set.general_category).to eq(Set.new(%w(L Lu)))
expect(property_set.word_break).to eq(Set.new(%w(ALetter)))
end
end
7 changes: 7 additions & 0 deletions spec/shared/unicode_regex_spec.rb
@@ -210,6 +210,13 @@ def compile(str, symbol_table = nil)
expect(regex).to exactly_match(",")
expect(regex).not_to exactly_match("a")
end

it "should treat a dash that is the first character of a character class as a literal dash instead of a range" do
regex = compile("[-abc]*")
expect(regex).to exactly_match("a-b-c")
expect(regex).to exactly_match("--a")
expect(regex).not_to exactly_match("def")
end
end
end
end
