10 | 10 | import click
11 | 11 |
12 | 12 |
| 13 | +class Lexer:
| 14 | +    def __init__(self, text: str):
| 15 | +        self.text: str = text
| 16 | +        self.idx: int = 0
| 17 | +
| 18 | +    def has_input(self) -> bool:
| 19 | +        return self.idx < len(self.text)
| 20 | +
| 21 | +    def read_char(self) -> str:
| 22 | +        c = self.peek_char()
| 23 | +        self.idx += 1
| 24 | +        return c
| 25 | +
| 26 | +    def peek_char(self) -> str:
| 27 | +        if not self.has_input():
| 28 | +            raise ParseError("unexpected EOF while reading token")
| 29 | +        return self.text[self.idx]
| 30 | +
| 31 | +    def read_one(self) -> str:
| 32 | +        while (c := self.read_char()).isspace():
| 33 | +            pass
| 34 | +        if c == '"':
| 35 | +            return self.read_string()
| 36 | +        if c == "-" and self.peek_char() == "-":
| 37 | +            self.read_comment()
| 38 | +            return self.read_one()
| 39 | +        tok = c
| 40 | +        while self.has_input() and not (c := self.read_char()).isspace():
| 41 | +            tok += c
| 42 | +        return tok
| 43 | +
| 44 | +    def read_string(self) -> str:
| 45 | +        buf = ""
| 46 | +        while self.has_input() and (c := self.read_char()) != '"':
| 47 | +            buf += c
| 48 | +        return '"' + buf + '"'
| 49 | +
| 50 | +    def read_comment(self) -> None:
| 51 | +        while self.has_input() and (c := self.read_char()) != "\n":
| 52 | +            pass
| 53 | +
| 54 | +
13 | 55 | def tokenize(x: str) -> list[str]:
14 | | -    # TODO: Make this a proper tokenizer that handles strings with blankspace.
15 | | -    stripped = re.sub(r" *--[^\n]*", "", x).strip()
16 | | -    return re.split(r"[\s\n]+", stripped)
| 56 | +    lexer = Lexer(x)
| 57 | +    tokens = []
| 58 | +    while lexer.has_input():
| 59 | +        tokens.append(lexer.read_one())
| 60 | +    return tokens
17 | 61 |
18 | 62 |
19 | 63 | @dataclass(frozen=True)
@@ -234,7 +278,6 @@ def test_ignore_line_comment(self) -> None:
234 | 278 |     def test_tokenize_string(self) -> None:
235 | 279 |         self.assertEqual(tokenize('"hello"'), ['"hello"'])
236 | 280 |
237 | | -    @unittest.skip("TODO(max): Support spaces in strings")
238 | 281 |     def test_tokenize_string_with_spaces(self) -> None:
239 | 282 |         self.assertEqual(tokenize('"hello world"'), ['"hello world"'])
240 | 283 |
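For context, a small usage sketch (not part of the commit) of how the new Lexer-based tokenize behaves; it assumes the module's existing ParseError exception, which Lexer raises on unexpected end of input:

# Plain tokens are still split on whitespace.
print(tokenize("1 + 2"))            # ['1', '+', '2']
# Strings are read as a single token, so spaces inside them now survive.
print(tokenize('"hello world"'))    # ['"hello world"']
# Line comments introduced by -- are consumed up to the newline.
print(tokenize("-- a comment\n1"))  # ['1']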