Permalink
Browse files

Merge branch 'master' of github.com:twitter/twitter-text-java into punct_before_url

Conflicts:
	tests/com/twitter/RegexTest.java
  • Loading branch information...
2 parents f9faa27 + 16f2d7e commit dbeb6917d275939aea19cc0f3040ec6f4ec1987a keita committed Mar 27, 2012
View
@@ -3,7 +3,7 @@
<groupId>com.twitter</groupId>
<artifactId>twitter-text</artifactId>
- <version>1.4.9</version>
+ <version>1.4.10</version>
<build>
<directory>out</directory>
@@ -375,43 +375,84 @@ public boolean isExtractURLWithoutProtocol() {
*
* In UTF-16 based indices, Unicode supplementary characters are counted as two characters.
*
+ * This method requires that the list of entities be in ascending order by start index.
+ *
* @param text original text
* @param entities entities with Unicode based indices
*/
public void modifyIndicesFromUnicodeToUTF16(String text, List<Entity> entities) {
- shiftIndices(text, entities, +1);
+ IndexConverter convert = new IndexConverter(text);
+
+ for (Entity entity : entities) {
+ entity.start = convert.codePointsToCodeUnits(entity.start);
+ entity.end = convert.codePointsToCodeUnits(entity.end);
+ }
}
/*
* Modify UTF-16-based indices of the entities to Unicode-based indices.
*
* In Unicode-based indices, Unicode supplementary characters are counted as single characters.
*
+ * This method requires that the list of entities be in ascending order by start index.
+ *
* @param text original text
* @param entities entities with UTF-16 based indices
*/
public void modifyIndicesFromUTF16ToToUnicode(String text, List<Entity> entities) {
- shiftIndices(text, entities, -1);
+ IndexConverter convert = new IndexConverter(text);
+
+ for (Entity entity : entities) {
+ entity.start = convert.codeUnitsToCodePoints(entity.start);
+ entity.end = convert.codeUnitsToCodePoints(entity.end);
+ }
}
- /*
- * Shift Entity's indices by {@code diff} for every Unicode supplementary character
- * which appears before the entity.
- *
- * @param text original text
- * @param entities extracted entities
- * @param the amount to shift the entity's indices.
+ /**
+ * An efficient converter of indices between code points and code units.
*/
- protected void shiftIndices(String text, List<Entity> entities, int diff) {
- for (int i = 0; i < text.length() - 1; i++) {
- if (Character.isSupplementaryCodePoint(text.codePointAt(i))) {
- for (Entity entity: entities) {
- if (entity.start > i) {
- entity.start += diff;
- entity.end += diff;
- }
- }
+ private static final class IndexConverter {
+ protected final String text;
+
+ // Cache one matching (code unit offset, code point offset) pair from the
+ // most recent conversion so the next conversion only has to count the
+ // characters between the cached position and the newly requested one.
+ protected int codePointIndex = 0;
+ protected int charIndex = 0;
+
+ IndexConverter(String text) {
+ this.text = text;
+ }
+
+ /**
+ * Converts a code unit index into a code point index. Only the code points
+ * between the cached position and {@code charIndex} are counted (forwards or
+ * backwards), and the cached position is then updated for the next call.
+ *
+ * NOTE(review): when {@code charIndex} falls on the second code unit of a
+ * surrogate pair, the cached charIndex is moved back by one but the cached
+ * codePointIndex is left as computed; confirm the cached pair stays
+ * consistent for subsequent calls in that case.
+ *
+ * @param charIndex Index into the string measured in code units.
+ * @return The code point index that corresponds to the specified character index.
+ * @throws IndexOutOfBoundsException if {@code charIndex} lies outside the text
+ *         (propagated from {@link String#codePointCount(int, int)}).
+ */
+ int codeUnitsToCodePoints(int charIndex) {
+ if (charIndex < this.charIndex) {
+ this.codePointIndex -= text.codePointCount(charIndex, this.charIndex);
+ } else {
+ this.codePointIndex += text.codePointCount(this.charIndex, charIndex);
+ }
+ this.charIndex = charIndex;
+
+ // If charIndex is the second code unit of a surrogate pair, back the
+ // cached charIndex up by one so it sits on the first unit of that pair.
+ if (charIndex > 0 && Character.isSupplementaryCodePoint(text.codePointAt(charIndex - 1))) {
+ this.charIndex -= 1;
}
+ return this.codePointIndex;
+ }
+
+ /**
+ * Converts a code point index into a code unit index by stepping from the
+ * cached position by the difference between the requested and the cached
+ * code point index, then caches the new position for the next call.
+ *
+ * @param codePointIndex Index into the string measured in code points.
+ * @return the code unit index that corresponds to the specified code point index.
+ * @throws IndexOutOfBoundsException if the requested position lies outside the
+ *         text (propagated from {@link String#offsetByCodePoints(int, int)}).
+ */
+ int codePointsToCodeUnits(int codePointIndex) {
+ // offsetByCodePoints accepts a negative code point offset, so this also works when moving backwards.
+ this.charIndex = text.offsetByCodePoints(this.charIndex, codePointIndex - this.codePointIndex);
+ this.codePointIndex = codePointIndex;
+ return this.charIndex;
}
}
}
View
@@ -26,11 +26,22 @@
"\\u1e00-\\u1eff"; // Latin Extended Additional (mostly for Vietnamese)
private static final String HASHTAG_ALPHA_CHARS = "a-z" + LATIN_ACCENTS_CHARS +
"\\u0400-\\u04ff\\u0500-\\u0527" + // Cyrillic
- "\\u2de0–\\u2dff\\ua640–\\ua69f" + // Cyrillic Extended A/B
+ "\\u2de0-\\u2dff\\ua640-\\ua69f" + // Cyrillic Extended A/B
+ "\\u0591-\\u05bf\\u05c1-\\u05c2\\u05c4-\\u05c5\\u05c7" +
+ "\\u05d0-\\u05ea\\u05f0-\\u05f4" + // Hebrew
+ "\\ufb1d-\\ufb28\\ufb2a-\\ufb36\\ufb38-\\ufb3c\\ufb3e\\ufb40-\\ufb41" +
+ "\\ufb43-\\ufb44\\ufb46-\\ufb4f" + // Hebrew Pres. Forms
+ "\\u0610-\\u061a\\u0620-\\u065f\\u066e-\\u06d3\\u06d5-\\u06dc" +
+ "\\u06de-\\u06e8\\u06ea-\\u06ef\\u06fa-\\u06fc\\u06ff" + // Arabic
+ "\\u0750-\\u077f\\u08a0\\u08a2-\\u08ac\\u08e4-\\u08fe" + // Arabic Supplement and Extended A
+ "\\ufb50-\\ufbb1\\ufbd3-\\ufd3d\\ufd50-\\ufd8f\\ufd92-\\ufdc7\\ufdf0-\\ufdfb" + // Pres. Forms A
+ "\\ufe70-\\ufe74\\ufe76-\\ufefc" + // Pres. Forms B
+ "\\u200c" + // Zero-Width Non-Joiner
+ "\\u0e01-\\u0e3a\\u0e40-\\u0e4e" + // Thai
"\\u1100-\\u11ff\\u3130-\\u3185\\uA960-\\uA97F\\uAC00-\\uD7AF\\uD7B0-\\uD7FF" + // Hangul (Korean)
"\\p{InHiragana}\\p{InKatakana}" + // Japanese Hiragana and Katakana
"\\p{InCJKUnifiedIdeographs}" + // Japanese Kanji / Chinese Han
- "\\u3005\\u303b" + // Kanji/Han iteration marks
+ "\\u3003\\u3005\\u303b" + // Kanji/Han iteration marks
"\\uff21-\\uff3a\\uff41-\\uff5a" + // full width Alphabet
"\\uff66-\\uff9f" + // half width Katakana
"\\uffa1-\\uffdc"; // half width Hangul (Korean)
@@ -48,7 +59,7 @@
private static final String URL_VALID_UNICODE_CHARS = "[.[^\\p{Punct}\\s\\p{Z}\\p{InGeneralPunctuation}]]";
private static final String URL_VALID_GTLD =
- "(?:(?:aero|asia|biz|cat|com|coop|edu|gov|info|int|jobs|mil|mobi|museum|name|net|org|pro|tel|travel|xxx)(?=\\P{Alpha}|$))";
+ "(?:(?:aero|asia|biz|cat|com|coop|edu|gov|info|int|jobs|mil|mobi|museum|name|net|org|pro|tel|travel|xxx)(?=\\P{Alnum}|$))";
private static final String URL_VALID_CCTLD =
"(?:(?:ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|" +
"bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|" +
@@ -57,7 +68,7 @@
"lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|" +
"nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|" +
"sl|sm|sn|so|sr|ss|st|su|sv|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|" +
- "va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw)(?=\\P{Alpha}|$))";
+ "va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw)(?=\\P{Alnum}|$))";
private static final String URL_PUNYCODE = "(?:xn--[0-9a-z]+)";
private static final String URL_VALID_DOMAIN =
@@ -2,6 +2,11 @@
package com.twitter;
import java.util.*;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import com.twitter.Extractor.Entity.Type;
+
import junit.framework.TestCase;
import junit.framework.TestSuite;
import junit.framework.Test;
@@ -10,14 +15,108 @@
protected Extractor extractor;
public static Test suite() {
- Class<?>[] testClasses = { ReplyTest.class, MentionTest.class, HashtagTest.class, URLTest.class };
+ Class<?>[] testClasses = { OffsetConversionTest.class, ReplyTest.class,
+ MentionTest.class, HashtagTest.class, URLTest.class };
return new TestSuite(testClasses);
}
public void setUp() throws Exception {
extractor = new Extractor();
}
+ public static class OffsetConversionTest extends ExtractorTest {
+
+ public void testConvertIndices() {
+ assertOffsetConversionOk("abc", "abc");
+ assertOffsetConversionOk("\ud83d\ude02abc", "abc");
+ assertOffsetConversionOk("\ud83d\ude02abc\ud83d\ude02", "abc");
+ assertOffsetConversionOk("\ud83d\ude02abc\ud838\ude02abc", "abc");
+ assertOffsetConversionOk("\ud83d\ude02abc\ud838\ude02abc\ud83d\ude02",
+ "abc");
+ assertOffsetConversionOk("\ud83d\ude02\ud83d\ude02abc", "abc");
+ assertOffsetConversionOk("\ud83d\ude02\ud83d\ude02\ud83d\ude02abc",
+ "abc");
+
+ assertOffsetConversionOk
+ ("\ud83d\ude02\ud83d\ude02\ud83d\ude02abc\ud83d\ude02", "abc");
+
+ // Several surrogate pairs following the entity
+ assertOffsetConversionOk
+ ("\ud83d\ude02\ud83d\ude02\ud83d\ude02abc\ud83d\ude02\ud83d" +
+ "\ude02\ud83d\ude02", "abc");
+
+ // Several surrogate pairs surrounding multiple entities
+ assertOffsetConversionOk
+ ("\ud83d\ude02\ud83d\ude02\ud83d\ude02\ud83d\ude02abc\ud83d" +
+ "\ude02\ud83d\ude02\ud83d\ude02\ud83d\ude02abc\ud83d" +
+ "\ude02\ud83d\ude02\ud83d\ude02\ud83d\ude02", "abc");
+
+ // unpaired low surrogate (at start)
+ assertOffsetConversionOk
+ ("\ude02\ud83d\ude02\ud83d\ude02\ud83d\ude02abc\ud83d" +
+ "\ude02\ud83d\ude02\ud83d\ude02\ud83d\ude02abc\ud83d" +
+ "\ude02\ud83d\ude02\ud83d\ude02\ud83d\ude02", "abc");
+
+ // unpaired low surrogate (at end)
+ assertOffsetConversionOk
+ ("\ud83d\ude02\ud83d\ude02\ud83d\ude02\ud83d\ude02abc\ud83d" +
+ "\ude02\ud83d\ude02\ud83d\ude02\ud83d\ude02abc\ud83d" +
+ "\ude02\ud83d\ude02\ud83d\ude02\ude02", "abc");
+
+ // unpaired low and high surrogates (at end)
+ assertOffsetConversionOk
+ ("\ud83d\ude02\ud83d\ude02\ud83d\ude02\ud83d\ude02abc\ud83d" +
+ "\ude02\ud83d\ude02\ud83d\ude02\ud83d\ude02abc\ud83d" +
+ "\ude02\ud83d\ude02\ud83d\ud83d\ude02\ude02", "abc");
+
+ assertOffsetConversionOk("\ud83dabc\ud83d", "abc");
+
+ assertOffsetConversionOk("\ude02abc\ude02", "abc");
+
+ assertOffsetConversionOk("\ude02\ude02abc\ude02\ude02", "abc");
+
+ assertOffsetConversionOk("abcabc", "abc");
+
+ assertOffsetConversionOk("abc\ud83d\ude02abc", "abc");
+
+ assertOffsetConversionOk("aa", "a");
+
+ assertOffsetConversionOk("\ud83d\ude02a\ud83d\ude02a\ud83d\ude02", "a");
+ }
+
+ private void assertOffsetConversionOk(String testData, String patStr) {
+ // Build an entity at the location of patStr
+ final Pattern pat = Pattern.compile(patStr);
+ final Matcher matcher = pat.matcher(testData);
+
+ List<Extractor.Entity> entities = new ArrayList<Extractor.Entity>();
+ List<Integer> codePointOffsets = new ArrayList<Integer>();
+ List<Integer> charOffsets = new ArrayList<Integer>();
+ while (matcher.find()) {
+ final int charOffset = matcher.start();
+ charOffsets.add(charOffset);
+ codePointOffsets.add(testData.codePointCount(0, charOffset));
+ entities.add(new Extractor.Entity(matcher, Type.HASHTAG, 0, 0));
+ }
+
+ extractor.modifyIndicesFromUTF16ToToUnicode(testData, entities);
+
+ for (int i = 0; i < entities.size(); i++) {
+ assertEquals(codePointOffsets.get(i), entities.get(i).getStart());
+ }
+
+ extractor.modifyIndicesFromUnicodeToUTF16(testData, entities);
+
+ for (int i = 0; i < entities.size(); i++) {
+ // This assertion could fail if the entity location is in the middle
+ // of a surrogate pair, since there is no equivalent code point
+ // offset to that location. It would be pathological for an entity to
+ // start at that point, so we can just let the test fail in that case.
+ assertEquals(charOffsets.get(i), entities.get(i).getStart());
+ }
+ }
+ }
+
/**
* Tests for the extractReplyScreenname method
*/
@@ -92,11 +191,11 @@ public void testMentionWithSupplementaryCharacters() {
// count U+10400 as 2 characters (as in UTF-16)
extractor.modifyIndicesFromUnicodeToUTF16(text, extracted);
- assertEquals(extracted.size(), 2);
- assertEquals(extracted.get(0).start, 3);
- assertEquals(extracted.get(0).end, 11);
- assertEquals(extracted.get(1).start, 15);
- assertEquals(extracted.get(1).end, 23);
+ assertEquals(2, extracted.size());
+ assertEquals(3, extracted.get(0).start);
+ assertEquals(11, extracted.get(0).end);
+ assertEquals(15, extracted.get(1).start);
+ assertEquals(23, extracted.get(1).end);
}
}
@@ -13,6 +13,16 @@ public void testAutoLinkHashtags() {
assertCaptureCount(3, Regex.VALID_HASHTAG, "#Ċaoiṁín");
assertCaptureCount(3, Regex.VALID_HASHTAG, "#Caoiṁín");
assertCaptureCount(3, Regex.VALID_HASHTAG, "#caf\u00E9");
+ assertCaptureCount(3, Regex.VALID_HASHTAG, "#\u05e2\u05d1\u05e8\u05d9\u05ea"); // "#Hebrew"
+ assertCaptureCount(3, Regex.VALID_HASHTAG, "#\u05d0\u05b2\u05e9\u05b6\u05c1\u05e8"); // with marks
+ assertCaptureCount(3, Regex.VALID_HASHTAG, "#\u05e2\u05b7\u05dc\u05be\u05d9\u05b0\u05d3\u05b5\u05d9"); // with maqaf 05be
+ assertCaptureCount(3, Regex.VALID_HASHTAG, "#\u05d5\u05db\u05d5\u05f3"); // with geresh 05f3
+ assertCaptureCount(3, Regex.VALID_HASHTAG, "#\u05de\u05f4\u05db"); // with gershayim 05f4
+ assertCaptureCount(3, Regex.VALID_HASHTAG, "#\u0627\u0644\u0639\u0631\u0628\u064a\u0629"); // "#Arabic"
+ assertCaptureCount(3, Regex.VALID_HASHTAG, "#\u062d\u0627\u0644\u064a\u0627\u064b"); // with mark
+ assertCaptureCount(3, Regex.VALID_HASHTAG, "#\u064a\u0640\ufbb1\u0640\u064e\u0671"); // with pres. form
+ assertCaptureCount(3, Regex.VALID_HASHTAG, "#ประเทศไทย");
+ assertCaptureCount(3, Regex.VALID_HASHTAG, "#ฟรี"); // with mark
assertCaptureCount(3, Regex.VALID_HASHTAG, "#日本語ハッシュタグ");
assertCaptureCount(3, Regex.VALID_HASHTAG, "#日本語ハッシュタグ");

0 comments on commit dbeb691

Please sign in to comment.