-
Notifications
You must be signed in to change notification settings - Fork 19
feat(parser)!: support custom input encoding #136
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -244,18 +244,22 @@ public Optional<Tree> parse(String source, Tree oldTree) throws IllegalStateExce | |
* | ||
* @return An optional {@linkplain Tree} which is empty if parsing was halted. | ||
* @throws IllegalStateException If the parser does not have a language assigned. | ||
* @throws IllegalArgumentException If given a custom {@link InputEncoding}. | ||
*/ | ||
public Optional<Tree> parse(String source, InputEncoding encoding, @Nullable Tree oldTree) | ||
throws IllegalStateException { | ||
if (language == null) { | ||
throw new IllegalStateException("The parser has no language assigned"); | ||
} | ||
if (encoding.encoding() == TSInputEncodingCustom()) { | ||
throw new IllegalArgumentException("Custom encoding is not supported when parsing strings"); | ||
} | ||
|
||
try (var alloc = Arena.ofShared()) { | ||
var bytes = source.getBytes(encoding.charset()); | ||
var string = alloc.allocateFrom(C_CHAR, bytes); | ||
var old = oldTree == null ? MemorySegment.NULL : oldTree.segment(); | ||
var tree = ts_parser_parse_string_encoding(self, old, string, bytes.length, encoding.ordinal()); | ||
var tree = ts_parser_parse_string_encoding(self, old, string, bytes.length, encoding.encoding()); | ||
if (tree.equals(MemorySegment.NULL)) return Optional.empty(); | ||
return Optional.of(new Tree(tree, language, source, encoding.charset())); | ||
} | ||
|
@@ -304,7 +308,20 @@ public Optional<Tree> parse( | |
|
||
var input = TSInput.allocate(arena); | ||
TSInput.payload(input, MemorySegment.NULL); | ||
TSInput.encoding(input, encoding.ordinal()); | ||
TSInput.encoding(input, encoding.encoding()); | ||
var encoder = encoding.charset().newEncoder(); | ||
var decode = encoding.encoding() != TSInputEncodingCustom() | ||
? MemorySegment.NULL | ||
: DecodeFunction.allocate( | ||
(string, length, code_point) -> { | ||
if (length == 0) return 0; | ||
var buffer = string.asSlice(0, length).asByteBuffer(); | ||
var decoded = encoding.charset().decode(buffer); | ||
code_point.set(C_INT, 0, decoded.charAt(0)); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
|
||
return (int)encoder.maxBytesPerChar(); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
|
||
}, | ||
arena); | ||
TSInput.decode(input, decode); | ||
// NOTE: can't use _ because of palantir/palantir-java-format#934 | ||
var read = TSInput.read.allocate( | ||
(payload, index, point, bytes) -> { | ||
|
@@ -365,6 +382,7 @@ public String toString() { | |
* | ||
* @since 0.25.0 | ||
*/ | ||
@SuppressWarnings("ClassCanBeRecord") | ||
public static final class State { | ||
private final @Unsigned int currentByteOffset; | ||
private final boolean hasError; | ||
|
@@ -398,6 +416,7 @@ public String toString() { | |
* @since 0.25.0 | ||
*/ | ||
@NullMarked | ||
@SuppressWarnings("ClassCanBeRecord") | ||
public static final class Options { | ||
private final Predicate<State> progressCallback; | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -101,9 +101,9 @@ void parseLogger() { | |
assertEquals("LEX - done", messages.getLast()); | ||
} | ||
|
||
@SuppressWarnings("unused") | ||
@Test | ||
@DisplayName("parse(callback)") | ||
@SuppressWarnings("unused") | ||
void parseCallback() { | ||
var source = "class Foo {}"; | ||
// NOTE: can't use _ because of palantir/palantir-java-format#934 | ||
|
@@ -154,6 +154,24 @@ void parseCancellation() { | |
} | ||
} | ||
|
||
@Test | ||
@DisplayName("parse(custom)") | ||
void parseCustom() { | ||
parser.setLanguage(language); | ||
var encoding = InputEncoding.valueOf(StandardCharsets.UTF_32); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Related to https://github.com/tree-sitter/java-tree-sitter/pull/136/files#r2341779397: this test only works because UTF-32 uses a fixed number of bytes per code point. I am also not completely sure that it works properly, since for these surrogate pairs probably only the high surrogate is actually read. |
||
var source = "var value = \"\uD83C\uDF00\uD83C\uDFEF\";"; | ||
ParseCallback callback = (offset, p) -> offset == 0 ? source : null; | ||
try (var tree = parser.parse(callback, encoding).orElseThrow()) { | ||
var rootNode = tree.getRootNode(); | ||
|
||
assertEquals(68, rootNode.getEndByte()); | ||
assertFalse(rootNode.isError()); | ||
assertEquals( | ||
"(program (local_variable_declaration type: (type_identifier) declarator: (variable_declarator name: (identifier) value: (string_literal (string_fragment)))))", | ||
rootNode.toSexp()); | ||
} | ||
} | ||
|
||
@Test | ||
@DisplayName("parse(options)") | ||
void parseOptions() { | ||
|
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Calling `decode(buffer)` here will decode all bytes, even though just the first code point is needed. This is probably rather inefficient.
Also, as mentioned by the documentation, `Charset#decode` uses `CodingErrorAction.REPLACE`, which might not be desired? (On the other hand, I am not sure how the Java FFM behaves when an upcall throws an exception; maybe it exits the JVM, see `Linker#upcallStub`.)