From e9fa3e9125830c33214087af7ac35484400c207a Mon Sep 17 00:00:00 2001
From: Zachary Rankin
Date: Wed, 25 Jan 2017 15:35:06 -0800
Subject: [PATCH] Added support for special unicode characters

---
Notes (text between the three-dash marker and the diffstat is not part
of the commit message):

* src/Tokenizer.php, rule table shown in the getToken() hunk:
  - The punctuation character class additionally accepts = : ' and ".
  - A new "/^\X/u" rule produces 'special-char' tokens. It is inserted
    after the "/^\w+/" rule and before the "/^./" 'unknown' fallback.
    \X is the PCRE escape for an extended grapheme cluster; the u
    modifier makes pattern and subject be treated as UTF-8.
  - NOTE(review): the code that walks this table is outside the hunk;
    presumably the first matching pattern wins, which is why the
    position of the new rule matters -- confirm.
  - NOTE(review): "/^\w+/" still has no u modifier, so non-ASCII
    letters presumably fall through to 'special-char' rather than
    'word' -- confirm that this is intended.
  - The html-entity, word and "/^./" entries are removed and re-added
    with the same text in this copy; presumably a whitespace-only
    realignment -- confirm against the repository.
* tests/lexer-test.php: adds the case 'should tokenize special chars'.
  For the input "“x = −4”" it expects the two curly quotes and the
  non-ASCII minus sign as 'special-char', "=" as 'punctuation', the
  spaces as 'whitespace', and "x" / "4" as 'word'.
* NOTE(review): this copy of the patch had lost its line breaks and
  indentation; both were restored by hand. Check the hunk whitespace
  against the target files before applying.

 src/Tokenizer.php    | 17 +++++++++--------
 tests/lexer-test.php | 18 ++++++++++++++++++
 2 files changed, 27 insertions(+), 8 deletions(-)

diff --git a/src/Tokenizer.php b/src/Tokenizer.php
index 5cabdd2..447266a 100644
--- a/src/Tokenizer.php
+++ b/src/Tokenizer.php
@@ -48,14 +48,15 @@ public function getToken($string)
                     return $matched;
                 }
             ],
-            "/^&[^\s;]*;/" => ['type' => 'html-entity', 'value_fn' => $returnMatches],
-            "/^[><\+\-!@#$%^&*();]/" => ['type' => 'punctuation', 'value_fn' => $returnMatches],
-            "/^\w+/" => ['type' => 'word', 'value_fn' => $returnMatches],
-            "/^./" => [
-                'type' => 'unknown',
-                'value_fn' => function($matches, $string) {
-                    return $string[0];
-                }
+            "/^&[^\s;]*;/" => ['type' => 'html-entity', 'value_fn' => $returnMatches],
+            "/^[><\+\-!@#$%^&*();=:'\"]/" => ['type' => 'punctuation', 'value_fn' => $returnMatches],
+            "/^\w+/" => ['type' => 'word', 'value_fn' => $returnMatches],
+            "/^\X/u" => ['type' => 'special-char', 'value_fn' => $returnMatches],
+            "/^./" => [
+                'type' => 'unknown',
+                'value_fn' => function($matches, $string) {
+                    return $string[0];
+                }
             ],
         ];
 
diff --git a/tests/lexer-test.php b/tests/lexer-test.php
index b46581f..94a7843 100644
--- a/tests/lexer-test.php
+++ b/tests/lexer-test.php
@@ -200,3 +200,21 @@
         ]
     );
 });
+
+Test::create('should tokenize special chars', function(Test $test) {
+    $tokenizer = new Tokenizer;
+    $tokens = $tokenizer->tokenize("“x = −4”");
+    $test->equals(
+        $tokens,
+        [
+            ['type' => 'special-char', 'value' => '“'],
+            ['type' => 'word', 'value' => 'x'],
+            ['type' => 'whitespace', 'value' => ' '],
+            ['type' => 'punctuation', 'value' => '='],
+            ['type' => 'whitespace', 'value' => ' '],
+            ['type' => 'special-char', 'value' => '−'],
+            ['type' => 'word', 'value' => '4'],
+            ['type' => 'special-char', 'value' => '”'],
+        ]
+    );
+});