Skip to content

Fix entity offset double-conversion bug #26

Merged
merged 6 commits into from Feb 15, 2012
View
79 src/com/twitter/Extractor.java
@@ -275,43 +275,84 @@ public String extractReplyScreenname(String text) {
*
* In UTF-16 based indices, Unicode supplementary characters are counted as two characters.
*
+ * This method requires that the list of entities be in ascending order by start index.
+ *
* @param text original text
* @param entities entities with Unicode based indices
*/
public void modifyIndicesFromUnicodeToUTF16(String text, List<Entity> entities) {
- shiftIndices(text, entities, +1);
+ IndexConverter convert = new IndexConverter(text);
+
+ for (Entity entity : entities) {
+ entity.start = convert.codePointsToCodeUnits(entity.start);
+ entity.end = convert.codePointsToCodeUnits(entity.end);
+ }
}
/*
* Modify UTF-16-based indices of the entities to Unicode-based indices.
*
* In Unicode-based indices, Unicode supplementary characters are counted as single characters.
*
+ * This method requires that the list of entities be in ascending order by start index.
+ *
* @param text original text
* @param entities entities with UTF-16 based indices
*/
public void modifyIndicesFromUTF16ToToUnicode(String text, List<Entity> entities) {
- shiftIndices(text, entities, -1);
+ IndexConverter convert = new IndexConverter(text);
+
+ for (Entity entity : entities) {
+ entity.start = convert.codeUnitsToCodePoints(entity.start);
+ entity.end = convert.codeUnitsToCodePoints(entity.end);
+ }
}
- /*
- * Shift Entity's indices by {@code diff} for every Unicode supplementary character
- * which appears before the entity.
- *
- * @param text original text
- * @param entities extracted entities
- * @param the amount to shift the entity's indices.
+ /**
+ * An efficient converter of indices between code points and code units.
*/
- protected void shiftIndices(String text, List<Entity> entities, int diff) {
- for (int i = 0; i < text.length() - 1; i++) {
- if (Character.isSupplementaryCodePoint(text.codePointAt(i))) {
- for (Entity entity: entities) {
- if (entity.start > i) {
- entity.start += diff;
- entity.end += diff;
- }
- }
+ private static final class IndexConverter {
+ protected final String text;
+
+ // Keep track of a single corresponding pair of code unit and code point
+ // offsets so that we can re-use counting work if the next requested
+ // entity is near the most recent entity.
+ protected int codePointIndex = 0;
+ protected int charIndex = 0;
+
+ IndexConverter(String text) {
+ this.text = text;
+ }
+
+ /**
+ * @param charIndex Index into the string measured in code units.
+ * @return The code point index that corresponds to the specified character index.
+ */
+ int codeUnitsToCodePoints(int charIndex) {
+ if (charIndex < this.charIndex) {
+ this.codePointIndex -= text.codePointCount(charIndex, this.charIndex);
+ } else {
+ this.codePointIndex += text.codePointCount(this.charIndex, charIndex);
}
+ this.charIndex = charIndex;
+
+ // Make sure that charIndex never points to the second code unit of a
+ // surrogate pair.
+ if (charIndex > 0 && Character.isSupplementaryCodePoint(text.codePointAt(charIndex - 1))) {
+ this.charIndex -= 1;
+ }
+ return this.codePointIndex;
+ }
+
+ /**
+ * @param codePointIndex Index into the string measured in code points.
+ * @return the code unit index that corresponds to the specified code point index.
+ */
+ int codePointsToCodeUnits(int codePointIndex) {
+ // Note that offsetByCodePoints accepts negative indices.
+ this.charIndex = text.offsetByCodePoints(this.charIndex, codePointIndex - this.codePointIndex);
+ this.codePointIndex = codePointIndex;
+ return this.charIndex;
}
}
-}
+}
2 test-data/twitter-text-conformance
@@ -1 +1 @@
-Subproject commit cb9e64901bc2fc560dbba6ae39f0c0b5633ab192
+Subproject commit 923c63f0198134486459d71e1384536ecda7b4f2
View
109 tests/com/twitter/ExtractorTest.java
@@ -2,6 +2,9 @@
package com.twitter;
import java.util.*;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
import junit.framework.TestCase;
import junit.framework.TestSuite;
import junit.framework.Test;
@@ -10,14 +13,108 @@
protected Extractor extractor;
public static Test suite() {
- Class<?>[] testClasses = { ReplyTest.class, MentionTest.class, HashtagTest.class, URLTest.class };
+ Class<?>[] testClasses = { OffsetConversionTest.class, ReplyTest.class,
+ MentionTest.class, HashtagTest.class, URLTest.class };
return new TestSuite(testClasses);
}
public void setUp() throws Exception {
extractor = new Extractor();
}
+ public static class OffsetConversionTest extends ExtractorTest {
+
+ public void testConvertIndices() {
+ assertOffsetConversionOk("abc", "abc");
+ assertOffsetConversionOk("\ud83d\ude02abc", "abc");
+ assertOffsetConversionOk("\ud83d\ude02abc\ud83d\ude02", "abc");
+ assertOffsetConversionOk("\ud83d\ude02abc\ud838\ude02abc", "abc");
+ assertOffsetConversionOk("\ud83d\ude02abc\ud838\ude02abc\ud83d\ude02",
+ "abc");
+ assertOffsetConversionOk("\ud83d\ude02\ud83d\ude02abc", "abc");
+ assertOffsetConversionOk("\ud83d\ude02\ud83d\ude02\ud83d\ude02abc",
+ "abc");
+
+ assertOffsetConversionOk
+ ("\ud83d\ude02\ud83d\ude02\ud83d\ude02abc\ud83d\ude02", "abc");
+
+ // Several surrogate pairs following the entity
+ assertOffsetConversionOk
+ ("\ud83d\ude02\ud83d\ude02\ud83d\ude02abc\ud83d\ude02\ud83d" +
+ "\ude02\ud83d\ude02", "abc");
+
+ // Several surrogate pairs surrounding multiple entities
+ assertOffsetConversionOk
+ ("\ud83d\ude02\ud83d\ude02\ud83d\ude02\ud83d\ude02abc\ud83d" +
+ "\ude02\ud83d\ude02\ud83d\ude02\ud83d\ude02abc\ud83d" +
+ "\ude02\ud83d\ude02\ud83d\ude02\ud83d\ude02", "abc");
+
+ // unpaired low surrogate (at start)
+ assertOffsetConversionOk
+ ("\ude02\ud83d\ude02\ud83d\ude02\ud83d\ude02abc\ud83d" +
+ "\ude02\ud83d\ude02\ud83d\ude02\ud83d\ude02abc\ud83d" +
+ "\ude02\ud83d\ude02\ud83d\ude02\ud83d\ude02", "abc");
+
+ // unpaired low surrogate (at end)
+ assertOffsetConversionOk
+ ("\ud83d\ude02\ud83d\ude02\ud83d\ude02\ud83d\ude02abc\ud83d" +
+ "\ude02\ud83d\ude02\ud83d\ude02\ud83d\ude02abc\ud83d" +
+ "\ude02\ud83d\ude02\ud83d\ude02\ude02", "abc");
+
+ // unpaired low and high surrogates (at end)
+ assertOffsetConversionOk
+ ("\ud83d\ude02\ud83d\ude02\ud83d\ude02\ud83d\ude02abc\ud83d" +
+ "\ude02\ud83d\ude02\ud83d\ude02\ud83d\ude02abc\ud83d" +
+ "\ude02\ud83d\ude02\ud83d\ud83d\ude02\ude02", "abc");
+
+ assertOffsetConversionOk("\ud83dabc\ud83d", "abc");
+
+ assertOffsetConversionOk("\ude02abc\ude02", "abc");
+
+ assertOffsetConversionOk("\ude02\ude02abc\ude02\ude02", "abc");
+
+ assertOffsetConversionOk("abcabc", "abc");
+
+ assertOffsetConversionOk("abc\ud83d\ude02abc", "abc");
+
+ assertOffsetConversionOk("aa", "a");
+
+ assertOffsetConversionOk("\ud83d\ude02a\ud83d\ude02a\ud83d\ude02", "a");
+ }
+
+ private void assertOffsetConversionOk(String testData, String patStr) {
+ // Build an entity at the location of patStr
+ final Pattern pat = Pattern.compile(patStr);
+ final Matcher matcher = pat.matcher(testData);
+
+ List<Extractor.Entity> entities = new ArrayList<Extractor.Entity>();
+ List<Integer> codePointOffsets = new ArrayList<Integer>();
+ List<Integer> charOffsets = new ArrayList<Integer>();
+ while (matcher.find()) {
+ final int charOffset = matcher.start();
+ charOffsets.add(charOffset);
+ codePointOffsets.add(testData.codePointCount(0, charOffset));
+ entities.add(new Extractor.Entity(matcher, "unused", 0, 0));
+ }
+
+ extractor.modifyIndicesFromUTF16ToToUnicode(testData, entities);
+
+ for (int i = 0; i < entities.size(); i++) {
+ assertEquals(codePointOffsets.get(i), entities.get(i).getStart());
+ }
+
+ extractor.modifyIndicesFromUnicodeToUTF16(testData, entities);
+
+ for (int i = 0; i < entities.size(); i++) {
+ // This assertion could fail if the entity location is in the middle
+ // of a surrogate pair, since there is no equivalent code point
+ // offset to that location. It would be pathological for an entity to
+ // start at that point, so we can just let the test fail in that case.
+ assertEquals(charOffsets.get(i), entities.get(i).getStart());
+ }
+ }
+ }
+
/**
* Tests for the extractReplyScreenname method
*/
@@ -92,11 +189,11 @@ public void testMentionWithSupplementaryCharacters() {
// count U+10400 as 2 characters (as in UTF-16)
extractor.modifyIndicesFromUnicodeToUTF16(text, extracted);
- assertEquals(extracted.size(), 2);
- assertEquals(extracted.get(0).start, 3);
- assertEquals(extracted.get(0).end, 11);
- assertEquals(extracted.get(1).start, 15);
- assertEquals(extracted.get(1).end, 23);
+ assertEquals(2, extracted.size());
+ assertEquals(3, extracted.get(0).start);
+ assertEquals(11, extracted.get(0).end);
+ assertEquals(15, extracted.get(1).start);
+ assertEquals(23, extracted.get(1).end);
}
}
Something went wrong with that request. Please try again.