diff --git a/src/main/java/io/github/treesitter/jtreesitter/InputEncoding.java b/src/main/java/io/github/treesitter/jtreesitter/InputEncoding.java index df88380..81709af 100644 --- a/src/main/java/io/github/treesitter/jtreesitter/InputEncoding.java +++ b/src/main/java/io/github/treesitter/jtreesitter/InputEncoding.java @@ -1,46 +1,62 @@ package io.github.treesitter.jtreesitter; +import static io.github.treesitter.jtreesitter.internal.TreeSitter.*; + +import java.nio.ByteBuffer; import java.nio.ByteOrder; import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; import org.jspecify.annotations.NonNull; /** The encoding of source code. */ -public enum InputEncoding { +@SuppressWarnings("ClassCanBeRecord") +public class InputEncoding { + private final @NonNull Charset charset; + + private final int encoding; + + private InputEncoding(@NonNull Charset charset, int encoding) { + this.charset = charset; + this.encoding = encoding; + } + + Charset charset() { + return charset; + } + + int encoding() { + return encoding; + } + /** UTF-8 encoding. */ - UTF_8(StandardCharsets.UTF_8), + public static final InputEncoding UTF_8 = new InputEncoding(StandardCharsets.UTF_8, TSInputEncodingUTF8()); + /** * UTF-16 little endian encoding. * * @since 0.25.0 */ - UTF_16LE(StandardCharsets.UTF_16LE), + public static final InputEncoding UTF_16LE = new InputEncoding(StandardCharsets.UTF_16LE, TSInputEncodingUTF16LE()); + /** * UTF-16 big endian encoding. * * @since 0.25.0 */ - UTF_16BE(StandardCharsets.UTF_16BE); - - private final @NonNull Charset charset; - - InputEncoding(@NonNull Charset charset) { - this.charset = charset; - } - - Charset charset() { - return charset; - } + public static final InputEncoding UTF_16BE = new InputEncoding(StandardCharsets.UTF_16BE, TSInputEncodingUTF16BE()); private static final boolean IS_BIG_ENDIAN = ByteOrder.nativeOrder().equals(ByteOrder.BIG_ENDIAN); /** * Convert a standard {@linkplain Charset} to an {@linkplain InputEncoding}. 
* - * @param charset one of {@link StandardCharsets#UTF_8}, {@link StandardCharsets#UTF_16BE}, - * {@link StandardCharsets#UTF_16LE}, or {@link StandardCharsets#UTF_16} (native byte order). - * @throws IllegalArgumentException If the character set is invalid. - * @since 0.25.1 + * @implNote The following encodings are handled by the Tree-sitter library: + * {@link StandardCharsets#UTF_8}, {@link StandardCharsets#UTF_16BE}, + * {@link StandardCharsets#UTF_16LE}, and {@link StandardCharsets#UTF_16} (native byte order). + * Every other {@link Charset} will use its {@link Charset#decode(ByteBuffer) decoder} + * ({@link StandardCharsets#UTF_32} is converted to the native byte order). + * + * @since 0.26.0 */ @SuppressWarnings("SameParameterValue") public static @NonNull InputEncoding valueOf(@NonNull Charset charset) throws IllegalArgumentException { @@ -50,6 +66,9 @@ Charset charset() { if (charset.equals(StandardCharsets.UTF_16)) { return IS_BIG_ENDIAN ? InputEncoding.UTF_16BE : InputEncoding.UTF_16LE; } - throw new IllegalArgumentException("Invalid character set: %s".formatted(charset)); + if (charset.equals(StandardCharsets.UTF_32)) { + charset = IS_BIG_ENDIAN ? StandardCharsets.UTF_32BE : StandardCharsets.UTF_32LE; + } + return new InputEncoding(charset, TSInputEncodingCustom()); } } diff --git a/src/main/java/io/github/treesitter/jtreesitter/Parser.java b/src/main/java/io/github/treesitter/jtreesitter/Parser.java index 5b03d45..fd255a2 100644 --- a/src/main/java/io/github/treesitter/jtreesitter/Parser.java +++ b/src/main/java/io/github/treesitter/jtreesitter/Parser.java @@ -244,18 +244,22 @@ public Optional parse(String source, Tree oldTree) throws IllegalStateExce * * @return An optional {@linkplain Tree} which is empty if parsing was halted. * @throws IllegalStateException If the parser does not have a language assigned. + * @throws IllegalArgumentException If given a custom {@link InputEncoding}. 
*/ public Optional parse(String source, InputEncoding encoding, @Nullable Tree oldTree) throws IllegalStateException { if (language == null) { throw new IllegalStateException("The parser has no language assigned"); } + if (encoding.encoding() == TSInputEncodingCustom()) { + throw new IllegalArgumentException("Custom encoding is not supported when parsing strings"); + } try (var alloc = Arena.ofShared()) { var bytes = source.getBytes(encoding.charset()); var string = alloc.allocateFrom(C_CHAR, bytes); var old = oldTree == null ? MemorySegment.NULL : oldTree.segment(); - var tree = ts_parser_parse_string_encoding(self, old, string, bytes.length, encoding.ordinal()); + var tree = ts_parser_parse_string_encoding(self, old, string, bytes.length, encoding.encoding()); if (tree.equals(MemorySegment.NULL)) return Optional.empty(); return Optional.of(new Tree(tree, language, source, encoding.charset())); } @@ -304,7 +308,20 @@ public Optional parse( var input = TSInput.allocate(arena); TSInput.payload(input, MemorySegment.NULL); - TSInput.encoding(input, encoding.ordinal()); + TSInput.encoding(input, encoding.encoding()); + var encoder = encoding.charset().newEncoder(); + var decode = encoding.encoding() != TSInputEncodingCustom() + ? 
MemorySegment.NULL + : DecodeFunction.allocate( + (string, length, code_point) -> { + if (length == 0) return 0; + var buffer = string.asSlice(0, length).asByteBuffer(); + var decoded = encoding.charset().decode(buffer); + code_point.set(C_INT, 0, Character.codePointAt(decoded, 0)); /* whole code point, not just the first UTF-16 unit */ + return (int)encoder.maxBytesPerChar(); + }, + arena); + TSInput.decode(input, decode); // NOTE: can't use _ because of palantir/palantir-java-format#934 var read = TSInput.read.allocate( (payload, index, point, bytes) -> { @@ -365,6 +382,7 @@ public String toString() { * * @since 0.25.0 */ + @SuppressWarnings("ClassCanBeRecord") public static final class State { private final @Unsigned int currentByteOffset; private final boolean hasError; @@ -398,6 +416,7 @@ public String toString() { * @since 0.25.0 */ @NullMarked + @SuppressWarnings("ClassCanBeRecord") public static final class Options { private final Predicate progressCallback; diff --git a/src/test/java/io/github/treesitter/jtreesitter/ParserTest.java b/src/test/java/io/github/treesitter/jtreesitter/ParserTest.java index 9941a49..16e786c 100644 --- a/src/test/java/io/github/treesitter/jtreesitter/ParserTest.java +++ b/src/test/java/io/github/treesitter/jtreesitter/ParserTest.java @@ -101,9 +101,9 @@ void parseLogger() { assertEquals("LEX - done", messages.getLast()); } - @SuppressWarnings("unused") @Test @DisplayName("parse(callback)") + @SuppressWarnings("unused") void parseCallback() { var source = "class Foo {}"; // NOTE: can't use _ because of palantir/palantir-java-format#934 @@ -154,6 +154,24 @@ void parseCancellation() { } } + @Test + @DisplayName("parse(custom)") + void parseCustom() { + parser.setLanguage(language); + var encoding = InputEncoding.valueOf(StandardCharsets.UTF_32); + var source = "var value = \"\uD83C\uDF00\uD83C\uDFEF\";"; + ParseCallback callback = (offset, p) -> offset == 0 ? 
source : null; + try (var tree = parser.parse(callback, encoding).orElseThrow()) { + var rootNode = tree.getRootNode(); + + assertEquals(68, rootNode.getEndByte()); + assertFalse(rootNode.isError()); + assertEquals( + "(program (local_variable_declaration type: (type_identifier) declarator: (variable_declarator name: (identifier) value: (string_literal (string_fragment)))))", + rootNode.toSexp()); + } + } + @Test @DisplayName("parse(options)") void parseOptions() {