Skip to content

Commit

Permalink
Java: Handle supplementary characters during tokenization
Browse files Browse the repository at this point in the history
Thanks to Teemu Kanstrén for the bug report
  • Loading branch information
hatapitk committed Nov 14, 2016
1 parent 7adad4c commit 90212c5
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 2 deletions.
15 changes: 13 additions & 2 deletions libvoikko/java/src/main/java/org/puimula/libvoikko/Voikko.java
Expand Up @@ -316,15 +316,26 @@ public synchronized List<Token> tokens(String text) {
// Splits the text at every "special" char, emits that char as its own
// UNKNOWN token, and hands the stretches in between to tokensNonNull.
// NOTE(review): this is a diff rendering whose +/- markers appear to have been
// lost, so a replaced line and its replacement show up back to back below.
requireValidHandle();
List<Token> allTokens = new ArrayList<Token>();
// Index where the not-yet-tokenized stretch of text begins.
int lastStart = 0;
// NOTE(review): appears to be the pre-commit loop header (splits at NUL chars
// only), superseded by the line that follows it -- confirm against the diff.
for (int i = text.indexOf('\0'); i != -1; i = text.indexOf('\0', i + 1)) {
// Visits each index reported by indexOfSpecialUnknown, resuming the search
// one position past the previous hit.
for (int i = indexOfSpecialUnknown(text, 0); i != -1; i = indexOfSpecialUnknown(text, i + 1)) {
// Tokenize the ordinary text between the previous special char and this one.
allTokens.addAll(tokensNonNull(text.substring(lastStart, i)));
// NOTE(review): appears to be the pre-commit line (always emitted a literal
// NUL token), superseded by the line that follows it -- confirm against the diff.
allTokens.add(new Token(TokenType.UNKNOWN, "\0"));
// Emit the special char itself as a one-char UNKNOWN token.
allTokens.add(new Token(TokenType.UNKNOWN, Character.toString(text.charAt(i))));
lastStart = i + 1;
}
// Tokenize whatever follows the last special char (the whole text if none was found).
allTokens.addAll(tokensNonNull(text.substring(lastStart)));
return allTokens;
}

/**
 * Locates the next char that is either NUL or a UTF-16 surrogate code unit.
 *
 * @param text      the string to scan
 * @param startFrom index of the first char to examine
 * @return the index of the first char at or after {@code startFrom} whose
 *         value is 0 or lies in the range 0xD800..0xDFFF (inclusive), or
 *         {@code -1} when there is no such char
 */
private int indexOfSpecialUnknown(String text, int startFrom) {
    int pos = startFrom;
    while (pos < text.length()) {
        char unit = text.charAt(pos);
        boolean isNul = unit == 0;
        // 0xD800..0xDFFF covers both the high and the low surrogate halves.
        boolean isSurrogate = unit >= 0xD800 && unit <= 0xDFFF;
        if (isNul || isSurrogate) {
            return pos;
        }
        pos++;
    }
    return -1;
}

private List<Token> tokensNonNull(String text) {
Libvoikko lib = getLib();
List<Token> result = new ArrayList<Token>();
Expand Down
12 changes: 12 additions & 0 deletions libvoikko/java/src/test/java/org/puimula/libvoikko/VoikkoTest.java
Expand Up @@ -555,4 +555,16 @@ public void nullCharIsUnknownToken() {
}
assertEquals(0, voikko.tokens("").size());
}

@Test
public void supplementaryCharactersAreUnknownTokens() {
    // The two halves below form the single code point U+100078, which a Java
    // string stores as a UTF-16 high + low surrogate pair. Each half is
    // expected to come back as its own UNKNOWN token.
    String highSurrogate = "\uDBC0";
    String lowSurrogate = "\uDC78";

    List<Token> tokens = voikko.tokens(highSurrogate + lowSurrogate);
    assertEquals(2, tokens.size());

    Token first = tokens.get(0);
    assertEquals(TokenType.UNKNOWN, first.getType());
    assertEquals(highSurrogate, first.getText());

    Token second = tokens.get(1);
    assertEquals(TokenType.UNKNOWN, second.getType());
    assertEquals(lowSurrogate, second.getText());
}

}

0 comments on commit 90212c5

Please sign in to comment.