
Commit

Handle multibyte chars in Tokenizer (thanks to nene for the fix)
svent committed Dec 8, 2013
1 parent dae363a commit 1d28c11
Showing 2 changed files with 32 additions and 1 deletion.
13 changes: 12 additions & 1 deletion lib/rkelly/tokenizer.rb
@@ -56,6 +56,17 @@ class Tokenizer

SINGLE_CHARS_THAT_IMPLY_DIVISION = [')', ']', '}']

# Determine the method to use to measure String length in bytes,
# because StringScanner#pos can only be set in bytes.
#
# - In Ruby 1.8, String#length always returns the string length
# in bytes.
#
# - In Ruby 1.9+, String#length returns the string length in
# characters, so we need to use String#bytesize instead.
#
BYTESIZE_METHOD = "".respond_to?(:bytesize) ? :bytesize : :length

def initialize(&block)
@lexemes = []

@@ -136,7 +147,7 @@ def raw_tokens(string)

longest_token.line = line_number
line_number += longest_token.value.scan(/\n/).length
- scanner.pos += longest_token.value.length
+ scanner.pos += longest_token.value.send(BYTESIZE_METHOD)
tokens << longest_token
end
tokens
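The reason for the change above: StringScanner#pos is a byte offset, while String#length counts characters on Ruby 1.9+, so advancing the scanner by character count falls short whenever a token contains multibyte characters. A minimal sketch of the difference (plain Ruby, not part of this commit; the string values are made up for illustration):

# -*- coding: utf-8 -*-
require 'strscan'

token  = "'öäüõ'"                # a string literal like the one in the new test
source = "#{token};"

token.length                     # => 6  characters (Ruby 1.9+)
token.bytesize                   # => 10 bytes (each umlaut is 2 bytes in UTF-8)

scanner = StringScanner.new(source)
scanner.pos += token.length      # advances only 6 bytes, stopping mid-character
scanner.rest == ";"              # => false

scanner.pos = 0
scanner.pos += token.bytesize    # advances the full 10 bytes
scanner.rest == ";"              # => true

On Rubies old enough to lack String#bytesize, String#length already counts bytes, which is what the respond_to?(:bytesize) fallback in BYTESIZE_METHOD relies on.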
20 changes: 20 additions & 0 deletions test/test_tokenizer.rb
@@ -152,6 +152,26 @@ def test_comment_assign
], tokens)
end

def test_unicode_string
tokens = @tokenizer.tokenize("foo = 'öäüõ';")
assert_tokens([
[:IDENT, 'foo'],
['=', '='],
[:STRING, "'öäüõ'"],
[';', ';'],
], tokens)
end

def test_unicode_regex
tokens = @tokenizer.tokenize("foo = /öäüõ/;")
assert_tokens([
[:IDENT, 'foo'],
['=', '='],
[:REGEXP, "/öäüõ/"],
[';', ';'],
], tokens)
end

def assert_tokens(expected, actual)
assert_equal(expected, actual.select { |x| x[0] != :S })
end
