
Commit 0082989

[self-hosted] tokenizer error for ascii control codes
1 parent 45ab752 commit 0082989

File tree

1 file changed: +69 -24 lines changed


src-self-hosted/tokenizer.zig

Lines changed: 69 additions & 24 deletions
@@ -141,6 +141,7 @@ pub const Tokenizer = struct {
     buffer: []const u8,
     index: usize,
     actual_file_end: usize,
+    pending_invalid_token: ?Token,
 
     pub const Location = struct {
         line: usize,
@@ -179,24 +180,18 @@ pub const Tokenizer = struct {
     }
 
     pub fn init(buffer: []const u8) -> Tokenizer {
-        if (buffer.len == 0 or buffer[buffer.len - 1] == '\n') {
-            return Tokenizer {
-                .buffer = buffer,
-                .index = 0,
-                .actual_file_end = buffer.len,
-            };
-        } else {
+        var source_len = buffer.len;
+        while (source_len > 0) : (source_len -= 1) {
+            if (buffer[source_len - 1] == '\n') break;
             // last line is incomplete, so skip it, and give an error when we get there.
-            var source_len = buffer.len;
-            while (source_len > 0) : (source_len -= 1) {
-                if (buffer[source_len - 1] == '\n') break;
-            }
-            return Tokenizer {
-                .buffer = buffer[0..source_len],
-                .index = 0,
-                .actual_file_end = buffer.len,
-            };
         }
+
+        return Tokenizer {
+            .buffer = buffer[0..source_len],
+            .index = 0,
+            .actual_file_end = buffer.len,
+            .pending_invalid_token = null,
+        };
     }
 
     const State = enum {
@@ -223,6 +218,10 @@ pub const Tokenizer = struct {
     };
 
     pub fn next(self: &Tokenizer) -> Token {
+        if (self.pending_invalid_token) |token| {
+            self.pending_invalid_token = null;
+            return token;
+        }
         var state = State.Start;
         var result = Token {
             .id = Token.Id.Eof,
@@ -368,7 +367,7 @@ pub const Tokenizer = struct {
                         break;
                     },
                     '\n' => break, // Look for this error later.
-                    else => {},
+                    else => self.checkLiteralCharacter(),
                 },
 
                 State.StringLiteralBackslash => switch (c) {
@@ -455,7 +454,7 @@ pub const Tokenizer = struct {
                             .end = undefined,
                         };
                     },
-                    else => {},
+                    else => self.checkLiteralCharacter(),
                 },
                 State.Zero => switch (c) {
                     'b', 'o', 'x' => {
@@ -513,23 +512,46 @@ pub const Tokenizer = struct {
             }
         }
         result.end = self.index;
-        if (result.id == Token.Id.Eof and self.actual_file_end != self.buffer.len) {
-            // instead of an Eof, give an error token
-            result.id = Token.Id.NoEolAtEof;
-            result.end = self.actual_file_end;
+        if (result.id == Token.Id.Eof) {
+            if (self.pending_invalid_token) |token| {
+                self.pending_invalid_token = null;
+                return token;
+            }
+            if (self.actual_file_end != self.buffer.len) {
+                // instead of an Eof, give an error token
+                result.id = Token.Id.NoEolAtEof;
+                result.end = self.actual_file_end;
+            }
         }
         return result;
     }
 
     pub fn getTokenSlice(self: &const Tokenizer, token: &const Token) -> []const u8 {
         return self.buffer[token.start..token.end];
     }
+
+    fn checkLiteralCharacter(self: &Tokenizer) {
+        if (self.pending_invalid_token != null) return;
+        const c0 = self.buffer[self.index];
+        if (c0 < 0x20 or c0 == 0x7f) {
+            // ascii control codes are never allowed
+            // (note that \n was checked before we got here)
+            self.pending_invalid_token = Token {
+                .id = Token.Id.Invalid,
+                .start = self.index,
+                .end = self.index + 1,
+            };
+            return;
+        }
+    }
 };
 
 
 
 test "tokenizer" {
     // source must end with eol
+    testTokenize("", []Token.Id {
+    }, true);
     testTokenize("no newline", []Token.Id {
     }, false);
     testTokenize("test\n", []Token.Id {
@@ -538,6 +560,29 @@ test "tokenizer" {
     testTokenize("test\nno newline", []Token.Id {
         Token.Id.Keyword_test,
     }, false);
+
+    // invalid token characters
+    testTokenize("#\n", []Token.Id {
+        Token.Id.Invalid,
+    }, true);
+    testTokenize("`\n", []Token.Id {
+        Token.Id.Invalid,
+    }, true);
+
+    // invalid literal/comment characters
+    testTokenize("\"\x00\"\n", []Token.Id {
+        Token.Id { .StringLiteral = Token.StrLitKind.Normal },
+        Token.Id.Invalid,
+    }, true);
+    testTokenize("//\x00\n", []Token.Id {
+        Token.Id.Invalid,
+    }, true);
+    testTokenize("//\x1f\n", []Token.Id {
+        Token.Id.Invalid,
+    }, true);
+    testTokenize("//\x7f\n", []Token.Id {
+        Token.Id.Invalid,
+    }, true);
 }
 
 fn testTokenize(source: []const u8, expected_tokens: []const Token.Id, expected_eol_at_eof: bool) {
@@ -546,8 +591,8 @@ fn testTokenize(source: []const u8, expected_tokens: []const Token.Id, expected_
         const token = tokenizer.next();
         std.debug.assert(@TagType(Token.Id)(token.id) == @TagType(Token.Id)(expected_token_id));
         switch (expected_token_id) {
-            Token.Id.StringLiteral => |kind| {
-                @panic("TODO: how do i test this?");
+            Token.Id.StringLiteral => |expected_kind| {
+                std.debug.assert(expected_kind == switch (token.id) { Token.Id.StringLiteral => |kind| kind, else => unreachable });
             },
             else => {},
         }
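
For reference, a minimal sketch of how a caller observes the new behavior, written against the init/next API and the same era's Zig syntax shown in the diff. The import path and the start/end assertion are assumptions based on this diff, not part of the commit:

    const std = @import("std");
    // Assumed import path; adjust to wherever tokenizer.zig lives in the build.
    const tokenizer = @import("tokenizer.zig");
    const Tokenizer = tokenizer.Tokenizer;
    const Token = tokenizer.Token;

    test "control code in a line comment yields an Invalid token" {
        // \x1f is an ascii control code; before this commit the comment
        // tokenized cleanly and the stream went straight to Eof.
        var tok = Tokenizer.init("//\x1f\n");
        const token = tok.next();
        // checkLiteralCharacter() stashes a pending Invalid token while the
        // comment is being scanned; next() drains it before reporting Eof.
        std.debug.assert(@TagType(Token.Id)(token.id) == @TagType(Token.Id)(Token.Id.Invalid));
        // The token spans exactly the offending byte (index 2 of the source).
        std.debug.assert(token.start == 2 and token.end == 3);
    }

The pending_invalid_token field is what makes this work mid-token: the tokenizer finishes scanning the current string literal or comment without cutting it short, then hands back the stashed Invalid token on a later call to next(), so token boundaries stay intact (compare the "\"\x00\"\n" test case, which expects a StringLiteral followed by an Invalid).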
