Skip to content

Commit

Permalink
refactor: move parsing html comments to the scanner
Browse files Browse the repository at this point in the history
  • Loading branch information
amaanq committed Jan 31, 2024
1 parent 84c57ee commit 4f279cc
Show file tree
Hide file tree
Showing 2 changed files with 62 additions and 19 deletions.
35 changes: 16 additions & 19 deletions grammar.js
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,16 @@ module.exports = grammar({
$._automatic_semicolon,
$._template_chars,
$._ternary_qmark,
$.html_comment,
'||',
// We use the escape sequence to tell the scanner whether we're currently inside a string or template string, in which
// case it should NOT parse HTML comments.
$.escape_sequence,
],

extras: $ => [
$.comment,
$.html_comment,
/[\s\p{Zs}\uFEFF\u2028\u2029\u2060\u200B]/,
],

Expand Down Expand Up @@ -953,25 +959,16 @@ module.exports = grammar({
)),

// http://stackoverflow.com/questions/13014947/regex-to-match-a-c-style-multiline-comment/36328890#36328890
comment: _ => token(choice(
seq('//', /.*/),
seq(
'/*',
/[^*]*\*+([^/*][^*]*\*+)*/,
'/',
),
// https://tc39.es/ecma262/#sec-html-like-comments
seq('<!--', /.*/),
// This allows code to exist before this token on the same line.
//
// Technically, --> is supposed to have nothing before it on the same line
// except for comments and whitespace, but that is difficult to express,
// and in general tree sitter grammars tend to prefer to be overly
// permissive anyway.
//
// This approach does not appear to cause problems in practice.
seq('-->', /.*/),
)),
comment: $ => choice(
token(choice(
seq('//', /.*/),
seq(
'/*',
/[^*]*\*+([^/*][^*]*\*+)*/,
'/',
),
)),
),

template_string: $ => seq(
'`',
Expand Down
46 changes: 46 additions & 0 deletions src/scanner.c
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@ enum TokenType {
AUTOMATIC_SEMICOLON,
TEMPLATE_CHARS,
TERNARY_QMARK,
HTML_COMMENT,
LOGICAL_OR,
ESCAPE_SEQUENCE,
};

void *tree_sitter_javascript_external_scanner_create() { return NULL; }
Expand Down Expand Up @@ -169,6 +172,45 @@ static bool scan_ternary_qmark(TSLexer *lexer) {
return false;
}


/**
 * Scan a single-line HTML-like comment: `<!-- ...` or `--> ...`.
 *
 * Despite its name, this function recognizes both the opening marker (`<!--`)
 * and the closing marker (`-->`). Leading whitespace (anything `iswspace`
 * accepts, plus U+2028 and U+2029) is skipped first, so it is not part of the
 * token. After the marker, everything up to (but not including) the next line
 * terminator or the end of input is consumed as the comment body.
 *
 * @param lexer  The lexer to read from. On success its `result_symbol` is set
 *               to HTML_COMMENT and the token end is marked after the last
 *               consumed character.
 * @return true if an HTML-like comment was scanned, false if the next
 *         non-whitespace characters are not `<!--` or `-->`.
 */
static bool scan_closing_comment(TSLexer *lexer) {
    while (iswspace(lexer->lookahead) || lexer->lookahead == 0x2028 || lexer->lookahead == 0x2029) {
        skip(lexer);
    }

    // Pick the marker to match from its first character; anything else cannot
    // start an HTML-like comment.
    const char *marker;
    if (lexer->lookahead == '<') {
        marker = "<!--";
    } else if (lexer->lookahead == '-') {
        marker = "-->";
    } else {
        return false;
    }

    // Walk the marker up to its NUL terminator instead of hard-coding its
    // length, so the loop bound can never drift from the string.
    for (const char *expected = marker; *expected != '\0'; expected++) {
        if (lexer->lookahead != *expected) {
            return false;
        }
        advance(lexer);
    }

    // Consume the rest of the line. All four ECMAScript line terminators
    // (LF, CR, U+2028, U+2029) end the comment; CR is included so that CRLF
    // and CR-only files do not pull the terminator (or following lines) into
    // the comment. End of input is detected with eof() rather than by
    // comparing lookahead to 0, because NUL may be a real character in the
    // file.
    while (!lexer->eof(lexer) && lexer->lookahead != '\n' && lexer->lookahead != '\r' &&
           lexer->lookahead != 0x2028 && lexer->lookahead != 0x2029) {
        advance(lexer);
    }

    lexer->result_symbol = HTML_COMMENT;
    lexer->mark_end(lexer);

    return true;
}

bool tree_sitter_javascript_external_scanner_scan(void *payload, TSLexer *lexer,
const bool *valid_symbols) {
if (valid_symbols[TEMPLATE_CHARS]) {
Expand All @@ -184,5 +226,9 @@ bool tree_sitter_javascript_external_scanner_scan(void *payload, TSLexer *lexer,
return scan_ternary_qmark(lexer);
}

if (valid_symbols[HTML_COMMENT] && !valid_symbols[LOGICAL_OR] && !valid_symbols[ESCAPE_SEQUENCE]) {
return scan_closing_comment(lexer);
}

return false;
}

0 comments on commit 4f279cc

Please sign in to comment.