@@ -141,6 +141,7 @@ pub const Tokenizer = struct {
141141 buffer : []const u8 ,
142142 index : usize ,
143143 actual_file_end : usize ,
144+ pending_invalid_token : ? Token ,
144145
145146 pub const Location = struct {
146147 line : usize ,
@@ -179,24 +180,18 @@ pub const Tokenizer = struct {
179180 }
180181
// Creates a tokenizer over `buffer`.
//
// Only complete, newline-terminated lines are tokenized: the stored buffer is
// cut off right after the last '\n'. An empty buffer, or one that already ends
// in '\n', is kept whole. The untruncated length is remembered in
// `actual_file_end` so that `next` can report the missing end-of-line once it
// reaches the end of the stored buffer.
pub fn init(buffer: []const u8) -> Tokenizer {
    // Walk backwards over the trailing incomplete line, if there is one.
    var complete_len = buffer.len;
    while (complete_len != 0 and buffer[complete_len - 1] != '\n') {
        complete_len -= 1;
    }

    return Tokenizer {
        .buffer = buffer[0..complete_len],
        .index = 0,
        .actual_file_end = buffer.len,
        // No invalid literal/comment byte has been seen yet.
        .pending_invalid_token = null,
    };
}
201196
202197 const State = enum {
@@ -223,6 +218,10 @@ pub const Tokenizer = struct {
223218 };
224219
225220 pub fn next (self : & Tokenizer ) - > Token {
221+ if (self .pending_invalid_token ) | token | {
222+ self .pending_invalid_token = null ;
223+ return token ;
224+ }
226225 var state = State .Start ;
227226 var result = Token {
228227 .id = Token .Id .Eof ,
@@ -368,7 +367,7 @@ pub const Tokenizer = struct {
368367 break ;
369368 },
370369 '\n ' = > break , // Look for this error later.
371- else = > {} ,
370+ else = > self . checkLiteralCharacter () ,
372371 },
373372
374373 State .StringLiteralBackslash = > switch (c ) {
@@ -455,7 +454,7 @@ pub const Tokenizer = struct {
455454 .end = undefined ,
456455 };
457456 },
458- else = > {} ,
457+ else = > self . checkLiteralCharacter () ,
459458 },
460459 State .Zero = > switch (c ) {
461460 'b' , 'o' , 'x' = > {
@@ -513,23 +512,46 @@ pub const Tokenizer = struct {
513512 }
514513 }
515514 result .end = self .index ;
516- if (result .id == Token .Id .Eof and self .actual_file_end != self .buffer .len ) {
517- // instead of an Eof, give an error token
518- result .id = Token .Id .NoEolAtEof ;
519- result .end = self .actual_file_end ;
515+ if (result .id == Token .Id .Eof ) {
516+ if (self .pending_invalid_token ) | token | {
517+ self .pending_invalid_token = null ;
518+ return token ;
519+ }
520+ if (self .actual_file_end != self .buffer .len ) {
521+ // instead of an Eof, give an error token
522+ result .id = Token .Id .NoEolAtEof ;
523+ result .end = self .actual_file_end ;
524+ }
520525 }
521526 return result ;
522527 }
523528
// Returns the bytes of the tokenizer's buffer covered by `token`, from
// `token.start` up to but not including `token.end`.
//
// NOTE(review): a NoEolAtEof token ends at `actual_file_end`, which can be
// larger than the length of the stored buffer - confirm that callers never
// slice such a token.
pub fn getTokenSlice(self: &const Tokenizer, token: &const Token) -> []const u8 {
    const first = token.start;
    const past_last = token.end;
    return self.buffer[first..past_last];
}
532+
// Inspects the byte at the current index and, if it is an ASCII control code,
// records an Invalid token covering that single byte in
// `pending_invalid_token`. Only the first offending byte is remembered: while
// an invalid token is already pending, this does nothing.
fn checkLiteralCharacter(self: &Tokenizer) {
    if (self.pending_invalid_token != null) return;

    const byte = self.buffer[self.index];
    // ascii control codes are never allowed
    // (note that \n was checked before we got here)
    const is_control_code = byte < 0x20 or byte == 0x7f;
    if (!is_control_code) return;

    self.pending_invalid_token = Token {
        .id = Token.Id.Invalid,
        .start = self.index,
        .end = self.index + 1,
    };
}
527547};
528548
529549
530550
531551test "tokenizer" {
532552 // source must end with eol
553+ testTokenize ("" , []Token.Id {
554+ }, true );
533555 testTokenize ("no newline" , []Token.Id {
534556 }, false );
535557 testTokenize ("test\n " , []Token.Id {
@@ -538,6 +560,29 @@ test "tokenizer" {
538560 testTokenize ("test\n no newline" , []Token.Id {
539561 Token .Id .Keyword_test ,
540562 }, false );
563+
564+ // invalid token characters
565+ testTokenize ("#\n " , []Token.Id {
566+ Token .Id .Invalid ,
567+ }, true );
568+ testTokenize ("`\n " , []Token.Id {
569+ Token .Id .Invalid ,
570+ }, true );
571+
572+ // invalid literal/comment characters
573+ testTokenize ("\" \x00 \" \n " , []Token.Id {
574+ Token.Id { .StringLiteral = Token .StrLitKind .Normal },
575+ Token .Id .Invalid ,
576+ }, true );
577+ testTokenize ("//\x00 \n " , []Token.Id {
578+ Token .Id .Invalid ,
579+ }, true );
580+ testTokenize ("//\x1f \n " , []Token.Id {
581+ Token .Id .Invalid ,
582+ }, true );
583+ testTokenize ("//\x7f \n " , []Token.Id {
584+ Token .Id .Invalid ,
585+ }, true );
541586}
542587
543588fn testTokenize (source : []const u8 , expected_tokens : []const Token.Id , expected_eol_at_eof : bool ) {
@@ -546,8 +591,8 @@ fn testTokenize(source: []const u8, expected_tokens: []const Token.Id, expected_
546591 const token = tokenizer .next ();
547592 std .debug .assert (@TagType (Token .Id )(token .id ) == @TagType (Token .Id )(expected_token_id ));
548593 switch (expected_token_id ) {
549- Token .Id .StringLiteral = > | kind | {
550- @panic ( "TODO: how do i test this?" );
594+ Token .Id .StringLiteral = > | expected_kind | {
595+ std . debug . assert ( expected_kind == switch ( token . id ) { Token . Id . StringLiteral = > | kind | kind , else = > unreachable } );
551596 },
552597 else = > {},
553598 }
0 commit comments