External lexer for extras
comment
#884
-
I'm trying out different features of tree-sitter, and having troubles with module.exports = grammar({
name: "simple_4",
externals: $ => [$.COMMENT],
extras: $ => [$.COMMENT, /[\s\n]/],
rules: {
main: $ => repeat($.stmt),
stmt: $ => /[0-9]/
}
});
That corresponds to a language that consists of single-digit integer literals, with optional comments in the form of `{...}`:
bool tree_sitter_simple_4_external_scanner_scan(
void* payload,
TSLexer* lexer,
const bool* valid_symbols) {
printf(
"Lex at [%c]\n",
(lexer->lookahead == '\n' ? 'n' : lexer->lookahead));
if (valid_symbols[COMMENT] && lexer->lookahead == '{') {
printf(" At comment start\n");
while (lexer->lookahead != '}' && lexer->lookahead != '\0') {
lexer->advance(lexer, false);
}
lexer->advance(lexer, false);
lexer->mark_end(lexer);
lexer->result_symbol = COMMENT;
return true;
}
return false;
}
Full source code:
#include <stdio.h>
#include <tree_sitter/parser.h>
/*
 * Token kinds emitted by the external scanner.
 *
 * The enumerator names are arbitrary; what matters is that their order
 * matches the order of the grammar's `externals` array, because
 * `valid_symbols` is indexed by these values.
 */
enum Tok { COMMENT = 0 };
/*
 * Creates the external scanner's payload.
 *
 * Returns NULL: no payload object is allocated for this scanner.
 *
 * Declared with an explicit `(void)` parameter list so the definition is a
 * proper prototype; an empty `()` leaves the parameters unspecified in C
 * before C23.
 */
void* tree_sitter_simple_4_external_scanner_create(void) {
    return NULL;
}
/*
 * External scanner entry point for the `simple_4` grammar.
 *
 * First prints the current lookahead character as a debugging aid (a
 * newline is printed as 'n'). Then, if COMMENT is a valid symbol and the
 * lookahead is '{', consumes characters up to and including the next '}'
 * (the loop also stops on a '\0' lookahead) and reports a COMMENT token.
 *
 * payload       - not used by this function.
 * lexer         - lexer whose lookahead is inspected and advanced.
 * valid_symbols - indexed by `enum Tok`; a comment is attempted only when
 *                 valid_symbols[COMMENT] is set.
 *
 * Returns true with lexer->result_symbol set to COMMENT when a comment was
 * consumed, false otherwise.
 *
 * NOTE: leading whitespace is not skipped here, so a '{' that follows a
 * space is never seen as the lookahead and the comment is not recognised.
 */
bool tree_sitter_simple_4_external_scanner_scan(
    void* payload,
    TSLexer* lexer,
    const bool* valid_symbols) {
    /* Debug trace: show a newline as 'n' so the log stays on one line. */
    int shown = lexer->lookahead;
    if (shown == '\n') {
        shown = 'n';
    }
    printf("Lex at [%c]\n", shown);

    /* Guard: only proceed when a comment may start here. */
    if (!valid_symbols[COMMENT] || lexer->lookahead != '{') {
        return false;
    }

    printf(" At comment start\n");

    /* Consume the comment body up to the closing brace (or a NUL). */
    while (!(lexer->lookahead == '}' || lexer->lookahead == '\0')) {
        lexer->advance(lexer, false);
    }

    /* Step past the terminator so it becomes part of the token. */
    lexer->advance(lexer, false);
    lexer->mark_end(lexer);
    lexer->result_symbol = COMMENT;
    return true;
}
/*
 * Serializes the scanner state.
 *
 * Nothing is written to `buffer`, and 0 is returned.
 *
 * payload - not used.
 * buffer  - not used.
 */
unsigned tree_sitter_simple_4_external_scanner_serialize(
    void* payload,
    char* buffer) {
    (void)payload;
    (void)buffer;
    return 0u;
}
/*
 * Restores the scanner state from `buffer`.
 *
 * This is a no-op: none of the arguments are read.
 *
 * payload - not used.
 * buffer  - not used.
 * length  - not used.
 */
void tree_sitter_simple_4_external_scanner_deserialize(
    void* payload,
    const char* buffer,
    unsigned length) {
    (void)payload;
    (void)buffer;
    (void)length;
}
void tree_sitter_simple_4_external_scanner_destroy(void* payload) {
}
Parser works as expected for inputs like […]. To be more specific - I added […]. I'm not specifically sure, but I might've misunderstood something wrt. […].
$ tree-sitter --version
tree-sitter 0.17.3 |
Beta Was this translation helpful? Give feedback.
Replies: 5 comments 9 replies
-
Not an expert at all here; I just started exploring Tree-Sitter myself, but I have some quick suggestions:
(again, just quick reactions, I haven't looked at the overall logic in any detail at all, but hope that helps.) |
Beta Was this translation helpful? Give feedback.
-
I updated grammar to use the uppercase
(main [0, 0] - [1, 0]
(stmt [0, 0] - [0, 1])
(COMMENT [0, 1] - [0, 7]))
The AST is correct as far as I'm concerned, and the comment is correctly recognized. And here is the output for
(main [0, 0] - [1, 0]
(stmt [0, 0] - [0, 1])
(ERROR [0, 2] - [0, 8]
(ERROR [0, 2] - [0, 8])))
Also including the output of […]:
new_parse
process version:0, version_count:1, state:1, row:1, col:0
lex_external state:1, row:1, column:0
Lex at [1]
lex_internal state:0, row:1, column:0
consume character:'1'
lexed_lookahead sym:stmt, size:1
shift state:2
process version:0, version_count:1, state:2, row:1, col:1
lex_external state:1, row:1, column:1
Lex at [ ]
lex_internal state:0, row:1, column:1
skip character:' '
lex_external state:1, row:1, column:1
Lex at [ ]
lex_internal state:0, row:1, column:1
skip character:' '
skip_unrecognized_character
consume character:'{'
lex_external state:1, row:1, column:3
Lex at [t]
lex_internal state:0, row:1, column:3
consume character:'t'
lex_external state:1, row:1, column:4
Lex at [e]
lex_internal state:0, row:1, column:4
consume character:'e'
lex_external state:1, row:1, column:5
Lex at [s]
lex_internal state:0, row:1, column:5
consume character:'s'
lex_external state:1, row:1, column:6
Lex at [t]
lex_internal state:0, row:1, column:6
consume character:'t'
lex_external state:1, row:1, column:7
Lex at [}]
lex_internal state:0, row:1, column:7
consume character:'}'
lex_external state:1, row:1, column:8
Lex at [n]
lex_internal state:0, row:1, column:8
skip character:10
lexed_lookahead sym:ERROR, size:7, character:'{'
detect_error
resume version:0
process version:0, version_count:1, state:0, row:1, col:1
lex_external state:1, row:1, column:1
Lex at [ ]
lex_internal state:0, row:1, column:1
skip character:' '
skip_unrecognized_character
consume character:'{'
lex_external state:1, row:1, column:3
Lex at [t]
lex_internal state:0, row:1, column:3
consume character:'t'
lex_external state:1, row:1, column:4
Lex at [e]
lex_internal state:0, row:1, column:4
consume character:'e'
lex_external state:1, row:1, column:5
Lex at [s]
lex_internal state:0, row:1, column:5
consume character:'s'
lex_external state:1, row:1, column:6
Lex at [t]
lex_internal state:0, row:1, column:6
consume character:'t'
lex_external state:1, row:1, column:7
Lex at [}]
lex_internal state:0, row:1, column:7
consume character:'}'
lex_external state:1, row:1, column:8
Lex at [n]
lex_internal state:0, row:1, column:8
skip character:10
lexed_lookahead sym:ERROR, size:7, character:'{'
skip_token symbol:ERROR
process version:0, version_count:1, state:0, row:1, col:8
lex_external state:1, row:1, column:8
Lex at [n]
lex_internal state:0, row:1, column:8
skip character:10
lexed_lookahead sym:end, size:1
recover_to_previous state:2, depth:2
recover_eof
process version:1, version_count:2, state:2, row:1, col:8
lex_external state:1, row:1, column:8
Lex at [n]
lex_internal state:0, row:1, column:8
skip character:10
lexed_lookahead sym:end, size:1
reduce sym:main, child_count:1
accept
select_smaller_error symbol:main, over_symbol:ERROR
done |
Beta Was this translation helpful? Give feedback.
-
The names in the token enum don’t matter; the order corresponds to the ordering of the externals array. I think your external scanner just needs to handle white space. White space is not automatically skipped, because sometimes it plays a role in scanners’ token selection logic. |
Beta Was this translation helpful? Give feedback.
-
@maxbrunsfeld yes, that solved the main issue, but now […]. Updated grammar:
module.exports = grammar({
name: "simple_4",
externals: $ => [$.COMMENT, $.SPACE],
extras: $ => [$.COMMENT, $.SPACE, /\n/],
rules: {
main: $ => repeat($.stmt),
stmt: $ => /[0-9]/
}
}); And lexer code bool tree_sitter_simple_4_external_scanner_scan(
void* payload,
TSLexer* lexer,
const bool* valid_symbols) {
if (valid_symbols[COMMENT] && lexer->lookahead == '{') {
while (lexer->lookahead != '}' && lexer->lookahead != '\0') {
lexer->advance(lexer, false);
}
lexer->advance(lexer, false);
lexer->mark_end(lexer);
lexer->result_symbol = COMMENT;
return true;
} else if (valid_symbols[SPACE] && lexer->lookahead == ' ') {
lexer->advance(lexer, true);
lexer->mark_end(lexer);
lexer->result_symbol = SPACE;
return true;
} else {
return false;
}
} For (main [0, 0] - [1, 0]
(stmt [0, 0] - [0, 1])
(SPACE [0, 2] - [0, 2])
(COMMENT [0, 2] - [0, 8])) In this particular case |
Beta Was this translation helpful? Give feedback.
-
For the sake of completeness - full main scanner function with edits: bool tree_sitter_simple_4_external_scanner_scan(
void* payload,
TSLexer* lexer,
const bool* valid_symbols) {
+ while (lexer->lookahead == ' ') {
+ lexer->advance(lexer, true);
+ }
if (valid_symbols[COMMENT] && lexer->lookahead == '{') {
while (lexer->lookahead != '}' && lexer->lookahead != '\0') {
lexer->advance(lexer, false);
}
lexer->advance(lexer, false);
lexer->mark_end(lexer);
lexer->result_symbol = COMMENT;
return true;
}
return false;
} Original grammar and code left unchanged |
Beta Was this translation helpful? Give feedback.
The names in the token enum don’t matter; the order corresponds to the ordering of the externals array.
I think your external scanner just needs to handle white space. White space is not automatically skipped, because sometimes it plays a role in scanners’ token selection logic.