/
Tokenizer.php
200 lines (169 loc) · 6.14 KB
/
Tokenizer.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
<?php
/*
* This file is part of the Symfony package.
*
* (c) Fabien Potencier <fabien@symfony.com>
*
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*/
namespace Symfony\Component\CssSelector;
/**
 * Tokenizer lexes a CSS Selector to tokens.
 *
 * This component is a port of the Python lxml library,
 * which is copyright Infrae and distributed under the BSD license.
 *
 * @author Fabien Potencier <fabien@symfony.com>
 */
class Tokenizer
{
    /**
     * Takes a CSS selector and returns an array holding the Tokens
     * it contains.
     *
     * When mbstring overloads the string functions (bit 2 of
     * mbstring.func_overload), the internal encoding is temporarily switched
     * to ASCII so that strlen()/substr()/strpos() operate on bytes. The
     * previous encoding is restored before returning, including when lexing
     * fails with an exception.
     *
     * @param string $s The selector to lex.
     *
     * @return array Token[]
     *
     * @throws ParseException When the selector cannot be lexed
     */
    public function tokenize($s)
    {
        $mbEncoding = null;
        if (function_exists('mb_internal_encoding') && ((int) ini_get('mbstring.func_overload')) & 2) {
            $mbEncoding = mb_internal_encoding();
            mb_internal_encoding('ASCII');
        }

        try {
            $tokens = $this->lex($s);
        } catch (\Exception $e) {
            // Do not leak the temporary ASCII encoding to the caller.
            if (null !== $mbEncoding) {
                mb_internal_encoding($mbEncoding);
            }

            throw $e;
        }

        if (null !== $mbEncoding) {
            mb_internal_encoding($mbEncoding);
        }

        return $tokens;
    }

    /**
     * Performs the actual lexing of a selector.
     *
     * CSS comments are removed first, so every token position refers to an
     * offset in the comment-stripped selector.
     *
     * @param string $s The selector to lex.
     *
     * @return array Token[]
     *
     * @throws ParseException When a string is not closed or an unexpected
     *                        character is met
     */
    private function lex($s)
    {
        $tokens = array();
        $pos = 0;
        $s = preg_replace('#/\*.*?\*/#s', '', $s);
        $length = strlen($s);

        while (true) {
            // Skip whitespace, remembering where it started: a run of
            // whitespace in front of '.', '#' or '[' is significant.
            if (preg_match('#\s+#A', $s, $match, 0, $pos)) {
                $precedingWhitespacePos = $pos;
                $pos += strlen($match[0]);
            } else {
                $precedingWhitespacePos = 0;
            }

            if ($pos >= $length) {
                return $tokens;
            }

            // "an+b" style expressions (e.g. "2n+1", "-n+3"); a bare "n" is
            // left to the regular symbol lexing below.
            if (preg_match('#[+-]?\d*n(?:[+-]\d+)?#A', $s, $match, 0, $pos) && 'n' !== $match[0]) {
                // The pattern is anchored at $pos, so $match[0] is exactly
                // the substring starting there.
                $tokens[] = new Token('Symbol', $match[0], $pos);
                $pos += strlen($match[0]);

                continue;
            }

            $c = $s[$pos];
            $c2 = substr($s, $pos, 2);

            // Two-character operators take precedence over one-character ones.
            if (in_array($c2, array('~=', '|=', '^=', '$=', '*=', '::', '!='), true)) {
                $tokens[] = new Token('Token', $c2, $pos);
                $pos += 2;

                continue;
            }

            if (in_array($c, array('>', '+', '~', ',', '.', '*', '=', '[', ']', '(', ')', '|', ':', '#'), true)) {
                if (in_array($c, array('.', '#', '['), true) && $precedingWhitespacePos > 0) {
                    $tokens[] = new Token('Token', ' ', $precedingWhitespacePos);
                }
                $tokens[] = new Token('Token', $c, $pos);
                ++$pos;

                continue;
            }

            if ('"' === $c || "'" === $c) {
                // Quoted string
                $oldPos = $pos;
                list($sym, $pos) = $this->tokenizeEscapedString($s, $pos);
                $tokens[] = new Token('String', $sym, $oldPos);

                continue;
            }

            $oldPos = $pos;
            list($sym, $pos) = $this->tokenizeSymbol($s, $pos);
            $tokens[] = new Token('Symbol', $sym, $oldPos);
        }
    }

    /**
     * Tokenizes a quoted string (i.e. 'A string quoted with \' characters'),
     * and returns an array holding the unquoted string contained by $s and
     * the new position from which tokenizing should take over.
     *
     * @param string $s   The selector string containing the quoted string.
     * @param int    $pos The position of the opening quote in $s.
     *
     * @return array The unescaped string content and the position right
     *               after the closing quote
     *
     * @throws ParseException When expected closing is not found
     */
    private function tokenizeEscapedString($s, $pos)
    {
        $quote = $s[$pos];
        $start = $pos + 1;
        $pos = $start;

        while (true) {
            $next = strpos($s, $quote, $pos);
            if (false === $next) {
                throw new ParseException(sprintf('Expected closing %s for string in: %s', $quote, substr($s, $start)));
            }

            // A quote is escaped only when preceded by an odd number of
            // backslashes; an even run is a sequence of escaped backslashes
            // and the quote really closes the string. Scanning from $next
            // also copes with an empty string, where nothing precedes it.
            $backslashes = 0;
            for ($i = $next - 1; $i >= $start && '\\' === $s[$i]; --$i) {
                ++$backslashes;
            }

            if (1 === $backslashes % 2) {
                // next quote character is escaped
                $pos = $next + 1;

                continue;
            }

            $result = substr($s, $start, $next - $start);
            if (false !== strpos($result, '\\')) {
                $result = $this->unescapeStringLiteral($result);
            }

            return array($result, $next + 1);
        }
    }

    /**
     * Unescapes a string literal and returns the unescaped string.
     *
     * Two kinds of escapes are handled:
     *  - a backslash followed by 1 to 6 hexadecimal digits and one optional
     *    whitespace (or CRLF) is replaced by the UTF-8 encoding of that code
     *    point; zero, surrogates and values above U+10FFFF become U+FFFD;
     *  - a backslash followed by any other character is replaced by that
     *    character.
     *
     * @param string $literal The string literal to unescape.
     *
     * @return string
     */
    private function unescapeStringLiteral($literal)
    {
        $pattern = '#\\\\(?:([A-Fa-f0-9]{1,6})(?:\r\n|\s)?|([^A-Fa-f0-9]))#';

        return preg_replace_callback($pattern, function ($matches) {
            // Second alternative matched: simple escape, keep the character.
            if (isset($matches[2])) {
                return $matches[2];
            }

            // The digits are hexadecimal, not decimal.
            $codePoint = hexdec($matches[1]);

            if (0 === $codePoint || $codePoint > 0x10FFFF || ($codePoint >= 0xD800 && $codePoint <= 0xDFFF)) {
                // U+FFFD REPLACEMENT CHARACTER
                return "\xEF\xBF\xBD";
            }

            if ($codePoint < 0x80) {
                return chr($codePoint);
            }

            if ($codePoint < 0x800) {
                return chr(0xC0 | ($codePoint >> 6))
                    .chr(0x80 | ($codePoint & 0x3F));
            }

            if ($codePoint < 0x10000) {
                return chr(0xE0 | ($codePoint >> 12))
                    .chr(0x80 | (($codePoint >> 6) & 0x3F))
                    .chr(0x80 | ($codePoint & 0x3F));
            }

            return chr(0xF0 | ($codePoint >> 18))
                .chr(0x80 | (($codePoint >> 12) & 0x3F))
                .chr(0x80 | (($codePoint >> 6) & 0x3F))
                .chr(0x80 | ($codePoint & 0x3F));
        }, $literal);
    }

    /**
     * Lexes selector $s and returns an array holding the name of the symbol
     * contained in it and the new position from which tokenizing should take
     * over.
     *
     * A symbol is the longest run of word characters and dashes starting
     * at $pos.
     *
     * @param string $s   The selector string.
     * @param int    $pos The position in $s at which the symbol starts.
     *
     * @return array The symbol and the position right after it
     *
     * @throws ParseException When Unexpected symbol is found
     */
    private function tokenizeSymbol($s, $pos)
    {
        $start = $pos;

        if (!preg_match('#[^\w\-]#', $s, $match, PREG_OFFSET_CAPTURE, $pos)) {
            // Goes to end of s
            return array(substr($s, $start), strlen($s));
        }

        $matchStart = $match[0][1];

        // The very first character cannot be part of a symbol.
        if ($matchStart === $pos) {
            throw new ParseException(sprintf('Unexpected symbol: %s at %s', $s[$pos], $pos));
        }

        return array(substr($s, $start, $matchStart - $start), $matchStart);
    }
}