Skip to content

Commit

Permalink
Added support for special unicode characters
Browse files Browse the repository at this point in the history
  • Loading branch information
zacharymarshal committed Jan 25, 2017
1 parent 250a77d commit e9fa3e9
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 8 deletions.
17 changes: 9 additions & 8 deletions src/Tokenizer.php
Original file line number Diff line number Diff line change
Expand Up @@ -48,14 +48,15 @@ public function getToken($string)
return $matched;
}
],
"/^&[^\s;]*;/" => ['type' => 'html-entity', 'value_fn' => $returnMatches],
"/^[><\+\-!@#$%^&*();]/" => ['type' => 'punctuation', 'value_fn' => $returnMatches],
"/^\w+/" => ['type' => 'word', 'value_fn' => $returnMatches],
"/^./" => [
'type' => 'unknown',
'value_fn' => function($matches, $string) {
return $string[0];
}
"/^&[^\s;]*;/" => ['type' => 'html-entity', 'value_fn' => $returnMatches],
"/^[><\+\-!@#$%^&*();=:'\"]/" => ['type' => 'punctuation', 'value_fn' => $returnMatches],
"/^\w+/" => ['type' => 'word', 'value_fn' => $returnMatches],
"/^\X/u" => ['type' => 'special-char', 'value_fn' => $returnMatches],
"/^./" => [
'type' => 'unknown',
'value_fn' => function($matches, $string) {
return $string[0];
}
],
];

Expand Down
18 changes: 18 additions & 0 deletions tests/lexer-test.php
Original file line number Diff line number Diff line change
Expand Up @@ -200,3 +200,21 @@
]
);
});

// Checks the token stream produced for input that mixes plain ASCII with
// non-ASCII characters: the curly quotes and the '−' sign are each expected
// to surface as their own 'special-char' token, while the ASCII parts keep
// their word / whitespace / punctuation types.
Test::create('should tokenize special chars', function(Test $test) {
    // Tokenize first, then describe what the result should look like.
    $actual = (new Tokenizer)->tokenize("“x = −4”");

    $expected = [
        ['type' => 'special-char', 'value' => '“'],
        ['type' => 'word', 'value' => 'x'],
        ['type' => 'whitespace', 'value' => ' '],
        ['type' => 'punctuation', 'value' => '='],
        ['type' => 'whitespace', 'value' => ' '],
        ['type' => 'special-char', 'value' => '−'],
        ['type' => 'word', 'value' => '4'],
        ['type' => 'special-char', 'value' => '”'],
    ];

    $test->equals($actual, $expected);
});

0 comments on commit e9fa3e9

Please sign in to comment.