Skip to content

Commit

Permalink
Add simple custom lexer
Browse files Browse the repository at this point in the history
Replaces the regular expression hack that didn't support real
tokenization. Now we can do string literals with spaces in them!
  • Loading branch information
tekknolagi committed Nov 17, 2023
1 parent bb8cf2b commit 082e303
Showing 1 changed file with 47 additions and 4 deletions.
51 changes: 47 additions & 4 deletions scrapscript.py 100644 → 100755
Expand Up @@ -10,10 +10,54 @@
import click


class Lexer:
    """Hand-written character lexer that splits source text into string tokens.

    Tokens are separated by whitespace. A double-quoted string literal is a
    single token (quotes included, inner whitespace preserved). A ``--`` at
    the start of a token begins a comment that runs to the end of the line.

    Invariant: outside of a ``read_one`` call, ``idx`` never rests on
    whitespace or on a comment. Consequently ``has_input()`` is true only when
    at least one real token remains, so the usual driver loop
    ``while lexer.has_input(): lexer.read_one()`` is safe on input that ends
    with whitespace or a comment.
    """

    def __init__(self, text: str):
        self.text: str = text
        self.idx: int = 0
        # Establish the invariant for input that starts with blanks/comments
        # (or consists of nothing else).
        self._skip_trivia()

    def has_input(self) -> bool:
        """Return True while there are unread characters."""
        return self.idx < len(self.text)

    def read_char(self) -> str:
        """Consume and return the next character.

        Raises:
            ParseError: if the input is exhausted.
        """
        c = self.peek_char()
        self.idx += 1
        return c

    def peek_char(self) -> str:
        """Return the next character without consuming it.

        Raises:
            ParseError: if the input is exhausted.
        """
        if not self.has_input():
            raise ParseError("unexpected EOF while reading token")
        return self.text[self.idx]

    def read_one(self) -> str:
        """Consume and return the next token.

        Raises:
            ParseError: if no token remains, or a string literal is not
                terminated.
        """
        self._skip_trivia()
        # read_char raises ParseError here when nothing but trivia was left.
        c = self.read_char()
        if c == '"':
            tok = self.read_string()
        else:
            chars = [c]
            # The whitespace character that ends the token is consumed too.
            while self.has_input() and not (c := self.read_char()).isspace():
                chars.append(c)
            tok = "".join(chars)
        # Restore the invariant so has_input() does not report trailing
        # whitespace or a trailing comment as a pending token.
        self._skip_trivia()
        return tok

    def read_string(self) -> str:
        """Read the rest of a string literal whose opening quote was consumed.

        Returns:
            The literal including both surrounding double quotes.

        Raises:
            ParseError: if the input ends before the closing quote.
        """
        buf = []
        while self.has_input():
            if (c := self.read_char()) == '"':
                return '"' + "".join(buf) + '"'
            buf.append(c)
        raise ParseError("unexpected EOF while reading string")

    def read_comment(self) -> None:
        """Discard characters up to and including the next newline (or EOF)."""
        while self.has_input() and self.read_char() != "\n":
            pass

    def _skip_trivia(self) -> None:
        """Advance past any run of whitespace and ``--`` line comments."""
        while self.has_input():
            if self.text[self.idx].isspace():
                self.idx += 1
            elif self.text.startswith("--", self.idx):
                self.read_comment()
            else:
                return


def tokenize(x: str) -> list[str]:
    """Split source text ``x`` into a flat list of token strings.

    Tokenization is delegated to ``Lexer`` so that quoted string literals
    containing whitespace stay a single token; the earlier regex-based
    split could not do that.
    """
    lexer = Lexer(x)
    tokens = []
    while lexer.has_input():
        tokens.append(lexer.read_one())
    return tokens


@dataclass(frozen=True)
Expand Down Expand Up @@ -234,7 +278,6 @@ def test_ignore_line_comment(self) -> None:
def test_tokenize_string(self) -> None:
    # A quoted literal without blanks must come back as one token, with
    # its surrounding double quotes kept.
    expected = ['"hello"']
    self.assertEqual(tokenize('"hello"'), expected)

# NOTE(review): this skip marker looks stale if tokenize() is now backed by
# the character-level Lexer — confirm and drop the decorator.
@unittest.skip("TODO(max): Support spaces in strings")
def test_tokenize_string_with_spaces(self) -> None:
    # Whitespace inside a quoted literal must not split it into two tokens.
    expected = ['"hello world"']
    self.assertEqual(tokenize('"hello world"'), expected)

Expand Down

0 comments on commit 082e303

Please sign in to comment.