
Commit 082e303

Add simple custom lexer
Replaces the regular expression hack that didn't support real tokenization. Now we can do string literals with spaces in them!
1 parent bb8cf2b commit 082e303

1 file changed: 47 additions (+), 4 deletions (−)

scrapscript.py (file mode 100644 → 100755)

Lines changed: 47 additions & 4 deletions
@@ -10,10 +10,54 @@
 import click
 
 
+class Lexer:
+    def __init__(self, text: str):
+        self.text: str = text
+        self.idx: int = 0
+
+    def has_input(self) -> bool:
+        return self.idx < len(self.text)
+
+    def read_char(self) -> str:
+        c = self.peek_char()
+        self.idx += 1
+        return c
+
+    def peek_char(self) -> str:
+        if not self.has_input():
+            raise ParseError("unexpected EOF while reading token")
+        return self.text[self.idx]
+
+    def read_one(self) -> str:
+        while (c := self.read_char()).isspace():
+            pass
+        if c == '"':
+            return self.read_string()
+        if c == "-" and self.peek_char() == "-":
+            self.read_comment()
+            return self.read_one()
+        tok = c
+        while self.has_input() and not (c := self.read_char()).isspace():
+            tok += c
+        return tok
+
+    def read_string(self) -> str:
+        buf = ""
+        while self.has_input() and (c := self.read_char()) != '"':
+            buf += c
+        return '"' + buf + '"'
+
+    def read_comment(self) -> None:
+        while self.has_input() and (c := self.read_char()) != "\n":
+            pass
+
+
 def tokenize(x: str) -> list[str]:
-    # TODO: Make this a proper tokenizer that handles strings with blankspace.
-    stripped = re.sub(r" *--[^\n]*", "", x).strip()
-    return re.split(r"[\s\n]+", stripped)
+    lexer = Lexer(x)
+    tokens = []
+    while lexer.has_input():
+        tokens.append(lexer.read_one())
+    return tokens
 
 
 @dataclass(frozen=True)
@@ -234,7 +278,6 @@ def test_ignore_line_comment(self) -> None:
     def test_tokenize_string(self) -> None:
         self.assertEqual(tokenize('"hello"'), ['"hello"'])
 
-    @unittest.skip("TODO(max): Support spaces in strings")
     def test_tokenize_string_with_spaces(self) -> None:
         self.assertEqual(tokenize('"hello world"'), ['"hello world"'])
 
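For illustration only (not part of the commit): a minimal sketch of what the new lexer-based tokenize returns, assuming the Lexer, tokenize, and ParseError definitions from scrapscript.py above are in scope. The inputs are made up; the expected outputs follow from the code in this diff.

# Minimal usage sketch -- illustrative, not part of the commit.
# Assumes Lexer, tokenize, and ParseError from scrapscript.py are in scope.
print(tokenize("1 + 2"))
# ['1', '+', '2']

print(tokenize('"hello world"'))
# ['"hello world"']  -- string literals may now contain spaces (the un-skipped test above)

print(tokenize('1 + 2 -- add the numbers\n3'))
# ['1', '+', '2', '3']  -- "--" line comments are consumed by read_comment

One caveat worth noting: because read_one reads past each token looking for the next one, an input that ends in a comment or in more than one trailing whitespace character will hit the ParseError in peek_char, which is why the examples above end directly after the last token.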
0 commit comments
