
Commit 082e303

Add simple custom lexer
Replaces the regular expression hack that didn't support real tokenization. Now we can do string literals with spaces in them!
1 parent bb8cf2b commit 082e303

1 file changed: 47 additions (+), 4 deletions (−)

scrapscript.py (file mode 100644 → 100755)

Lines changed: 47 additions & 4 deletions
@@ -10,10 +10,54 @@
 import click
 
 
+class Lexer:
+    def __init__(self, text: str):
+        self.text: str = text
+        self.idx: int = 0
+
+    def has_input(self) -> bool:
+        return self.idx < len(self.text)
+
+    def read_char(self) -> str:
+        c = self.peek_char()
+        self.idx += 1
+        return c
+
+    def peek_char(self) -> str:
+        if not self.has_input():
+            raise ParseError("unexpected EOF while reading token")
+        return self.text[self.idx]
+
+    def read_one(self) -> str:
+        while (c := self.read_char()).isspace():
+            pass
+        if c == '"':
+            return self.read_string()
+        if c == "-" and self.peek_char() == "-":
+            self.read_comment()
+            return self.read_one()
+        tok = c
+        while self.has_input() and not (c := self.read_char()).isspace():
+            tok += c
+        return tok
+
+    def read_string(self) -> str:
+        buf = ""
+        while self.has_input() and (c := self.read_char()) != '"':
+            buf += c
+        return '"' + buf + '"'
+
+    def read_comment(self) -> None:
+        while self.has_input() and (c := self.read_char()) != "\n":
+            pass
+
+
 def tokenize(x: str) -> list[str]:
-    # TODO: Make this a proper tokenizer that handles strings with blankspace.
-    stripped = re.sub(r" *--[^\n]*", "", x).strip()
-    return re.split(r"[\s\n]+", stripped)
+    lexer = Lexer(x)
+    tokens = []
+    while lexer.has_input():
+        tokens.append(lexer.read_one())
+    return tokens
 
 
 @dataclass(frozen=True)
@@ -234,7 +278,6 @@ def test_ignore_line_comment(self) -> None:
     def test_tokenize_string(self) -> None:
         self.assertEqual(tokenize('"hello"'), ['"hello"'])
 
-    @unittest.skip("TODO(max): Support spaces in strings")
     def test_tokenize_string_with_spaces(self) -> None:
         self.assertEqual(tokenize('"hello world"'), ['"hello world"'])
 
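For illustration only (not part of the commit): a minimal sketch of what the new lexer-based tokenize returns, assuming the Lexer, tokenize, and ParseError definitions from scrapscript.py above are in scope. The inputs are made up; the expected outputs follow from the code in this diff.

# Minimal usage sketch -- illustrative, not part of the commit.
# Assumes Lexer, tokenize, and ParseError from scrapscript.py are in scope.
print(tokenize("1 + 2"))
# ['1', '+', '2']

print(tokenize('"hello world"'))
# ['"hello world"']  -- string literals may now contain spaces (the un-skipped test above)

print(tokenize('1 + 2 -- add the numbers\n3'))
# ['1', '+', '2', '3']  -- "--" line comments are consumed by read_comment

One caveat worth noting: because read_one reads past each token looking for the next one, an input that ends in a comment or in more than one trailing whitespace character will hit the ParseError in peek_char, which is why the examples above end directly after the last token.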
0 commit comments
