
Commit

Handle multibyte chars in Tokenizer (thanks to nene for the fix)
svent committed Dec 8, 2013
1 parent dae363a commit 1d28c11
Showing 2 changed files with 32 additions and 1 deletion.
13 changes: 12 additions & 1 deletion lib/rkelly/tokenizer.rb
@@ -56,6 +56,17 @@ class Tokenizer

SINGLE_CHARS_THAT_IMPLY_DIVISION = [')', ']', '}']

# Determine the method to use to measure String length in bytes,
# because StringScanner#pos can only be set in bytes.
#
# - In Ruby 1.8, String#length always returns the string length
# in bytes.
#
# - In Ruby 1.9+, String#length returns the string length in
# characters, so we need to use String#bytesize instead.
#
BYTESIZE_METHOD = "".respond_to?(:bytesize) ? :bytesize : :length

def initialize(&block)
@lexemes = []

@@ -136,7 +147,7 @@ def raw_tokens(string)

longest_token.line = line_number
line_number += longest_token.value.scan(/\n/).length
- scanner.pos += longest_token.value.length
+ scanner.pos += longest_token.value.send(BYTESIZE_METHOD)
tokens << longest_token
end
tokens
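The reason for the change above: StringScanner#pos is a byte offset, while String#length counts characters on Ruby 1.9+, so advancing the scanner by character count falls short whenever a token contains multibyte characters. A minimal sketch of the difference (plain Ruby, not part of this commit; the string values are made up for illustration):

# -*- coding: utf-8 -*-
require 'strscan'

token  = "'öäüõ'"                # a string literal like the one in the new test
source = "#{token};"

token.length                     # => 6  characters (Ruby 1.9+)
token.bytesize                   # => 10 bytes (each umlaut is 2 bytes in UTF-8)

scanner = StringScanner.new(source)
scanner.pos += token.length      # advances only 6 bytes, stopping mid-character
scanner.rest == ";"              # => false

scanner.pos = 0
scanner.pos += token.bytesize    # advances the full 10 bytes
scanner.rest == ";"              # => true

On Rubies old enough to lack String#bytesize, String#length already counts bytes, which is what the respond_to?(:bytesize) fallback in BYTESIZE_METHOD relies on.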
20 changes: 20 additions & 0 deletions test/test_tokenizer.rb
@@ -152,6 +152,26 @@ def test_comment_assign
], tokens)
end

def test_unicode_string
tokens = @tokenizer.tokenize("foo = 'öäüõ';")
assert_tokens([
[:IDENT, 'foo'],
['=', '='],
[:STRING, "'öäüõ'"],
[';', ';'],
], tokens)
end

def test_unicode_regex
tokens = @tokenizer.tokenize("foo = /öäüõ/;")
assert_tokens([
[:IDENT, 'foo'],
['=', '='],
[:REGEXP, "/öäüõ/"],
[';', ';'],
], tokens)
end

def assert_tokens(expected, actual)
assert_equal(expected, actual.select { |x| x[0] != :S })
end
