Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 38 additions & 19 deletions src/main/java/io/github/treesitter/jtreesitter/InputEncoding.java
Original file line number Diff line number Diff line change
@@ -1,46 +1,62 @@
package io.github.treesitter.jtreesitter;

import static io.github.treesitter.jtreesitter.internal.TreeSitter.*;

import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import org.jspecify.annotations.NonNull;

/** The encoding of source code. */
public enum InputEncoding {
@SuppressWarnings("ClassCanBeRecord")
public class InputEncoding {
private final @NonNull Charset charset;

private final int encoding;

/**
 * Creates an encoding that pairs a character set with an integer encoding value.
 *
 * @param charset the character set stored in this instance
 * @param encoding the integer encoding value stored in this instance
 */
private InputEncoding(@NonNull Charset charset, int encoding) {
this.charset = charset;
this.encoding = encoding;
}

/**
 * Returns the character set this encoding was constructed with.
 *
 * @return the stored {@link Charset}
 */
Charset charset() {
return charset;
}

/**
 * Returns the integer encoding value this instance was constructed with.
 *
 * @return the stored encoding value
 */
int encoding() {
return encoding;
}

/** UTF-8 encoding. */
UTF_8(StandardCharsets.UTF_8),
public static final InputEncoding UTF_8 = new InputEncoding(StandardCharsets.UTF_8, TSInputEncodingUTF8());

/**
* UTF-16 little endian encoding.
*
* @since 0.25.0
*/
UTF_16LE(StandardCharsets.UTF_16LE),
public static final InputEncoding UTF_16LE = new InputEncoding(StandardCharsets.UTF_16LE, TSInputEncodingUTF16LE());

/**
* UTF-16 big endian encoding.
*
* @since 0.25.0
*/
UTF_16BE(StandardCharsets.UTF_16BE);

private final @NonNull Charset charset;

InputEncoding(@NonNull Charset charset) {
this.charset = charset;
}

Charset charset() {
return charset;
}
public static final InputEncoding UTF_16BE = new InputEncoding(StandardCharsets.UTF_16BE, TSInputEncodingUTF16BE());

private static final boolean IS_BIG_ENDIAN = ByteOrder.nativeOrder().equals(ByteOrder.BIG_ENDIAN);

/**
* Convert a standard {@linkplain Charset} to an {@linkplain InputEncoding}.
*
* @param charset one of {@link StandardCharsets#UTF_8}, {@link StandardCharsets#UTF_16BE},
* {@link StandardCharsets#UTF_16LE}, or {@link StandardCharsets#UTF_16} (native byte order).
* @throws IllegalArgumentException If the character set is invalid.
* @since 0.25.1
* @implNote The following encodings are handled by the Tree-sitter library:
* {@link StandardCharsets#UTF_8}, {@link StandardCharsets#UTF_16BE},
* {@link StandardCharsets#UTF_16LE}, and {@link StandardCharsets#UTF_16} (native byte order).
* Every other {@link Charset} will use its {@link Charset#decode(ByteBuffer) decoder}
* ({@link StandardCharsets#UTF_32} is converted to the native byte order).
*
* @since 0.26.0
*/
@SuppressWarnings("SameParameterValue")
public static @NonNull InputEncoding valueOf(@NonNull Charset charset) throws IllegalArgumentException {
Expand All @@ -50,6 +66,9 @@ Charset charset() {
if (charset.equals(StandardCharsets.UTF_16)) {
return IS_BIG_ENDIAN ? InputEncoding.UTF_16BE : InputEncoding.UTF_16LE;
}
throw new IllegalArgumentException("Invalid character set: %s".formatted(charset));
if (charset.equals(StandardCharsets.UTF_32)) {
charset = IS_BIG_ENDIAN ? StandardCharsets.UTF_32BE : StandardCharsets.UTF_32LE;
}
return new InputEncoding(charset, TSInputEncodingCustom());
}
}
23 changes: 21 additions & 2 deletions src/main/java/io/github/treesitter/jtreesitter/Parser.java
Original file line number Diff line number Diff line change
Expand Up @@ -244,18 +244,22 @@ public Optional<Tree> parse(String source, Tree oldTree) throws IllegalStateExce
*
* @return An optional {@linkplain Tree} which is empty if parsing was halted.
* @throws IllegalStateException If the parser does not have a language assigned.
* @throws IllegalArgumentException If given a custom {@link InputEncoding}.
*/
public Optional<Tree> parse(String source, InputEncoding encoding, @Nullable Tree oldTree)
throws IllegalStateException {
if (language == null) {
throw new IllegalStateException("The parser has no language assigned");
}
if (encoding.encoding() == TSInputEncodingCustom()) {
throw new IllegalArgumentException("Custom encoding is not supported when parsing strings");
}

try (var alloc = Arena.ofShared()) {
var bytes = source.getBytes(encoding.charset());
var string = alloc.allocateFrom(C_CHAR, bytes);
var old = oldTree == null ? MemorySegment.NULL : oldTree.segment();
var tree = ts_parser_parse_string_encoding(self, old, string, bytes.length, encoding.ordinal());
var tree = ts_parser_parse_string_encoding(self, old, string, bytes.length, encoding.encoding());
if (tree.equals(MemorySegment.NULL)) return Optional.empty();
return Optional.of(new Tree(tree, language, source, encoding.charset()));
}
Expand Down Expand Up @@ -304,7 +308,20 @@ public Optional<Tree> parse(

var input = TSInput.allocate(arena);
TSInput.payload(input, MemorySegment.NULL);
TSInput.encoding(input, encoding.ordinal());
TSInput.encoding(input, encoding.encoding());
var encoder = encoding.charset().newEncoder();
var decode = encoding.encoding() != TSInputEncodingCustom()
? MemorySegment.NULL
: DecodeFunction.allocate(
(string, length, code_point) -> {
if (length == 0) return 0;
var buffer = string.asSlice(0, length).asByteBuffer();
var decoded = encoding.charset().decode(buffer);
Copy link
Contributor

@Marcono1234 Marcono1234 Sep 11, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Calling decode(buffer) here will decode all bytes, even though just the first code point is needed. This is probably rather inefficient.

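Also, as the documentation notes, Charset#decode uses CodingErrorAction.REPLACE, so malformed or unmappable input is silently replaced instead of being reported, which might not be desired. (On the other hand, throwing is not a safe alternative here: according to the Linker#upcallStub documentation, the JVM terminates abruptly if the upcall target throws an exception, so any decoding error would have to be caught and handled inside the callback.)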

code_point.set(C_INT, 0, decoded.charAt(0));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

charAt(0) retrieves only a single 16-bit Java char. For supplementary code points (>= U+10000), that is only the high surrogate; something like Character.codePointAt(decoded, 0) is needed to get the full code point.

return (int)encoder.maxBytesPerChar();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maxBytesPerChar() only works if the charset consumes a fixed number of bytes per code point. It would not work for a charset like UTF-8 (if tree-sitter did not already have built-in support for it), where a code point occupies a variable number of bytes.

},
arena);
TSInput.decode(input, decode);
// NOTE: can't use _ because of palantir/palantir-java-format#934
var read = TSInput.read.allocate(
(payload, index, point, bytes) -> {
Expand Down Expand Up @@ -365,6 +382,7 @@ public String toString() {
*
* @since 0.25.0
*/
@SuppressWarnings("ClassCanBeRecord")
public static final class State {
private final @Unsigned int currentByteOffset;
private final boolean hasError;
Expand Down Expand Up @@ -398,6 +416,7 @@ public String toString() {
* @since 0.25.0
*/
@NullMarked
@SuppressWarnings("ClassCanBeRecord")
public static final class Options {
private final Predicate<State> progressCallback;

Expand Down
20 changes: 19 additions & 1 deletion src/test/java/io/github/treesitter/jtreesitter/ParserTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -101,9 +101,9 @@ void parseLogger() {
assertEquals("LEX - done", messages.getLast());
}

@SuppressWarnings("unused")
@Test
@DisplayName("parse(callback)")
@SuppressWarnings("unused")
void parseCallback() {
var source = "class Foo {}";
// NOTE: can't use _ because of palantir/palantir-java-format#934
Expand Down Expand Up @@ -154,6 +154,24 @@ void parseCancellation() {
}
}

@Test
@DisplayName("parse(custom)")
void parseCustom() {
parser.setLanguage(language);
var encoding = InputEncoding.valueOf(StandardCharsets.UTF_32);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Related to https://github.com/tree-sitter/java-tree-sitter/pull/136/files#r2341779397, this test only works because UTF-32 uses a fixed number of bytes per code point.

I am also not completely sure that this test works properly, since for these surrogate pairs probably only the high surrogate is actually read.

var source = "var value = \"\uD83C\uDF00\uD83C\uDFEF\";";
ParseCallback callback = (offset, p) -> offset == 0 ? source : null;
try (var tree = parser.parse(callback, encoding).orElseThrow()) {
var rootNode = tree.getRootNode();

assertEquals(68, rootNode.getEndByte());
assertFalse(rootNode.isError());
assertEquals(
"(program (local_variable_declaration type: (type_identifier) declarator: (variable_declarator name: (identifier) value: (string_literal (string_fragment)))))",
rootNode.toSexp());
}
}

@Test
@DisplayName("parse(options)")
void parseOptions() {
Expand Down
Loading