Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

Merge branch 'master' of github.com:twitter/twitter-text-java into no…

…_number_after_tld
  • Loading branch information...
commit 9564b4a8e5fa46957ca7bd27e9de69bf18c44e01 2 parents 26e66ef + de7a3c4
@keitaf keitaf authored
View
2  pom.xml
@@ -3,7 +3,7 @@
<groupId>com.twitter</groupId>
<artifactId>twitter-text</artifactId>
- <version>1.4.8</version>
+ <version>1.4.10</version>
<build>
<directory>out</directory>
View
31 src/com/twitter/Autolink.java
@@ -19,11 +19,11 @@
/** Default CSS class for auto-linked hashtag URLs */
public static final String DEFAULT_HASHTAG_CLASS = "hashtag";
/** Default href for username links (the username without the @ will be appended) */
- public static final String DEFAULT_USERNAME_URL_BASE = "http://twitter.com/";
+ public static final String DEFAULT_USERNAME_URL_BASE = "https://twitter.com/";
/** Default href for list links (the username/list without the @ will be appended) */
- public static final String DEFAULT_LIST_URL_BASE = "http://twitter.com/";
+ public static final String DEFAULT_LIST_URL_BASE = "https://twitter.com/";
/** Default href for hashtag links (the hashtag without the # will be appended) */
- public static final String DEFAULT_HASHTAG_URL_BASE = "http://twitter.com/#!/search?q=%23";
+ public static final String DEFAULT_HASHTAG_URL_BASE = "https://twitter.com/#!/search?q=%23";
/** HTML attribute to add when noFollow is true (default) */
public static final String NO_FOLLOW_HTML_ATTRIBUTE = " rel=\"nofollow\"";
@@ -35,6 +35,7 @@
protected String listUrlBase;
protected String hashtagUrlBase;
protected boolean noFollow = true;
+ protected boolean usernameIncludeSymbol = false;
public Autolink() {
urlClass = DEFAULT_URL_CLASS;
@@ -104,6 +105,13 @@ public String autoLinkUsernamesAndLists(String text) {
// Outside of a tag, do real work with this chunk
matcher = Regex.AUTO_LINK_USERNAMES_OR_LISTS.matcher(chunk);
while (matcher.find()) {
+ String at = matcher.group(Regex.AUTO_LINK_USERNAME_OR_LISTS_GROUP_AT);
+ String username = matcher.group(Regex.AUTO_LINK_USERNAME_OR_LISTS_GROUP_USERNAME);
+ if (usernameIncludeSymbol) {
+ username = at + username;
+ at = "";
+ }
+
if (matcher.group(Regex.AUTO_LINK_USERNAME_OR_LISTS_GROUP_LIST) == null ||
matcher.group(Regex.AUTO_LINK_USERNAME_OR_LISTS_GROUP_LIST).isEmpty()) {
@@ -111,14 +119,14 @@ public String autoLinkUsernamesAndLists(String text) {
if (!Regex.SCREEN_NAME_MATCH_END.matcher(chunk.substring(matcher.end())).find()) {
StringBuilder rb = new StringBuilder(capacity);
rb.append(matcher.group(Regex.AUTO_LINK_USERNAME_OR_LISTS_GROUP_BEFORE))
- .append(matcher.group(Regex.AUTO_LINK_USERNAME_OR_LISTS_GROUP_AT))
+ .append(at)
.append("<a class=\"").append(urlClass).append(" ").append(usernameClass)
.append("\" href=\"").append(usernameUrlBase)
.append(matcher.group(Regex.AUTO_LINK_USERNAME_OR_LISTS_GROUP_USERNAME))
.append("\"");
if (noFollow) rb.append(NO_FOLLOW_HTML_ATTRIBUTE);
rb.append(">")
- .append(matcher.group(Regex.AUTO_LINK_USERNAME_OR_LISTS_GROUP_USERNAME))
+ .append(username)
.append("</a>");
matcher.appendReplacement(sb, rb.toString());
} else {
@@ -129,14 +137,14 @@ public String autoLinkUsernamesAndLists(String text) {
// Username and list
StringBuilder rb = new StringBuilder(capacity);
rb.append(matcher.group(Regex.AUTO_LINK_USERNAME_OR_LISTS_GROUP_BEFORE))
- .append(matcher.group(Regex.AUTO_LINK_USERNAME_OR_LISTS_GROUP_AT))
+ .append(at)
.append("<a class=\"").append(urlClass).append(" ").append(listClass)
.append("\" href=\"").append(listUrlBase)
.append(matcher.group(Regex.AUTO_LINK_USERNAME_OR_LISTS_GROUP_USERNAME))
.append(matcher.group(Regex.AUTO_LINK_USERNAME_OR_LISTS_GROUP_LIST))
.append("\"");
if (noFollow) rb.append(NO_FOLLOW_HTML_ATTRIBUTE);
- rb.append(">").append(matcher.group(Regex.AUTO_LINK_USERNAME_OR_LISTS_GROUP_USERNAME))
+ rb.append(">").append(username)
.append(matcher.group(Regex.AUTO_LINK_USERNAME_OR_LISTS_GROUP_LIST))
.append("</a>");
matcher.appendReplacement(sb, rb.toString());
@@ -370,6 +378,15 @@ public void setNoFollow(boolean noFollow) {
this.noFollow = noFollow;
}
+ /**
+ * Sets whether the at mark '@' is placed inside the generated link text
+ * for usernames and lists (false by default).
+ *
+ * @param usernameIncludeSymbol true to include the '@' in the link text,
+ * false to leave it in front of the link
+ */
+ public void setUsernameIncludeSymbol(boolean usernameIncludeSymbol) {
+ this.usernameIncludeSymbol = usernameIncludeSymbol;
+ }
+
// The default String split is horribly inefficient
protected static Iterable<String> split(final String s, final String d) {
List<String> strings = new ArrayList<String>();
View
96 src/com/twitter/Extractor.java
@@ -9,8 +9,8 @@
*/
public class Extractor {
public static class Entity {
- protected Integer start = null;
- protected Integer end = null;
+ protected int start;
+ protected int end;
protected String value = null;
protected String type = null;
@@ -47,8 +47,8 @@ public boolean equals(Object obj) {
Entity other = (Entity)obj;
if (this.type.equals(other.type) &&
- this.start.equals(other.start) &&
- this.end.equals(other.end) &&
+ this.start == other.start &&
+ this.end == other.end &&
this.value.equals(other.value)) {
return true;
} else {
@@ -269,4 +269,90 @@ public String extractReplyScreenname(String text) {
}
return extracted;
}
-}
+
+ /**
+ * Modify Unicode-based indices of the entities to UTF-16 based indices.
+ * The start and end of every entity in the list are overwritten in place.
+ *
+ * In UTF-16 based indices, Unicode supplementary characters are counted as two characters.
+ *
+ * This method requires that the list of entities be in ascending order by start index.
+ *
+ * @param text original text
+ * @param entities entities with Unicode based indices
+ */
+ public void modifyIndicesFromUnicodeToUTF16(String text, List<Entity> entities) {
+ IndexConverter convert = new IndexConverter(text);
+
+ for (Entity entity : entities) {
+ entity.start = convert.codePointsToCodeUnits(entity.start);
+ entity.end = convert.codePointsToCodeUnits(entity.end);
+ }
+ }
+
+ /**
+ * Modify UTF-16-based indices of the entities to Unicode-based indices.
+ * The start and end of every entity in the list are overwritten in place.
+ *
+ * In Unicode-based indices, Unicode supplementary characters are counted as single characters.
+ *
+ * This method requires that the list of entities be in ascending order by start index.
+ *
+ * NOTE(review): the doubled "To" in the method name looks like a typo; callers
+ * use this spelling, so a rename would need a compatibility alias.
+ *
+ * @param text original text
+ * @param entities entities with UTF-16 based indices
+ */
+ public void modifyIndicesFromUTF16ToToUnicode(String text, List<Entity> entities) {
+ IndexConverter convert = new IndexConverter(text);
+
+ for (Entity entity : entities) {
+ entity.start = convert.codeUnitsToCodePoints(entity.start);
+ entity.end = convert.codeUnitsToCodePoints(entity.end);
+ }
+ }
+
+ /**
+ * An efficient converter of indices between code points and code units.
+ *
+ * Each instance is bound to one text and remembers the last converted
+ * position, so a conversion only scans the distance from that position.
+ */
+ private static final class IndexConverter {
+ protected final String text;
+
+ // Keep track of a single corresponding pair of code unit and code point
+ // offsets so that we can re-use counting work if the next requested
+ // entity is near the most recent entity.
+ protected int codePointIndex = 0;
+ protected int charIndex = 0;
+
+ IndexConverter(String text) {
+ this.text = text;
+ }
+
+ /**
+ * Converts a code unit index to a code point index by counting the code
+ * points between the cached position and the requested one, in either
+ * direction, and then caches the new position.
+ *
+ * NOTE(review): when charIndex falls between the two halves of a surrogate
+ * pair while moving forward, codePointIndex appears to have already counted
+ * the leading half as one code point at the moment this.charIndex is rewound
+ * to the start of the pair, which would leave the cached pair off by one for
+ * the next call - confirm.
+ *
+ * @param charIndex Index into the string measured in code units.
+ * @return The code point index that corresponds to the specified character index.
+ */
+ int codeUnitsToCodePoints(int charIndex) {
+ if (charIndex < this.charIndex) {
+ this.codePointIndex -= text.codePointCount(charIndex, this.charIndex);
+ } else {
+ this.codePointIndex += text.codePointCount(this.charIndex, charIndex);
+ }
+ this.charIndex = charIndex;
+
+ // Make sure that the cached charIndex never points to the second code unit
+ // of a surrogate pair; only the cached field is moved back, not the return value.
+ if (charIndex > 0 && Character.isSupplementaryCodePoint(text.codePointAt(charIndex - 1))) {
+ this.charIndex -= 1;
+ }
+ return this.codePointIndex;
+ }
+
+ /**
+ * Converts a code point index to a code unit index by stepping from the
+ * cached position by the difference in code points, and then caches the
+ * new position.
+ *
+ * @param codePointIndex Index into the string measured in code points.
+ * @return the code unit index that corresponds to the specified code point index.
+ */
+ int codePointsToCodeUnits(int codePointIndex) {
+ // Note that offsetByCodePoints accepts negative offsets, so stepping backwards works.
+ this.charIndex = text.offsetByCodePoints(this.charIndex, codePointIndex - this.codePointIndex);
+ this.codePointIndex = codePointIndex;
+ return this.charIndex;
+ }
+ }
+}
View
23 src/com/twitter/Regex.java
@@ -4,10 +4,25 @@
import java.util.regex.*;
public class Regex {
- private static String LATIN_ACCENTS_CHARS = "\\u00c0-\\u00d6\\u00d8-\\u00f6\\u00f8-\\u00ff\\u015f";
+ private static String LATIN_ACCENTS_CHARS = "\\u00c0-\\u00d6\\u00d8-\\u00f6\\u00f8-\\u00ff" + // Latin-1
+ "\\u0100-\\u024f" + // Latin Extended A and B
+ "\\u0253\\u0254\\u0256\\u0257\\u0259\\u025b\\u0263\\u0268\\u026f\\u0272\\u0289\\u028b" + // IPA Extensions
+ "\\u02bb" + // Hawaiian
+ "\\u1e00-\\u1eff"; // Latin Extended Additional (mostly for Vietnamese)
private static final String HASHTAG_ALPHA_CHARS = "a-z" + LATIN_ACCENTS_CHARS +
"\\u0400-\\u04ff\\u0500-\\u0527" + // Cyrillic
- "\\u2de0–\\u2dff\\ua640–\\ua69f" + // Cyrillic Extended A/B
+ "\\u2de0-\\u2dff\\ua640-\\ua69f" + // Cyrillic Extended A/B
+ "\\u0591-\\u05bf\\u05c1-\\u05c2\\u05c4-\\u05c5\\u05c7" +
+ "\\u05d0-\\u05ea\\u05f0-\\u05f4" + // Hebrew
+ "\\ufb1d-\\ufb28\\ufb2a-\\ufb36\\ufb38-\\ufb3c\\ufb3e\\ufb40-\\ufb41" +
+ "\\ufb43-\\ufb44\\ufb46-\\ufb4f" + // Hebrew Pres. Forms
+ "\\u0610-\\u061a\\u0620-\\u065f\\u066e-\\u06d3\\u06d5-\\u06dc" +
+ "\\u06de-\\u06e8\\u06ea-\\u06ef\\u06fa-\\u06fc\\u06ff" + // Arabic
+ "\\u0750-\\u077f\\u08a0\\u08a2-\\u08ac\\u08e4-\\u08fe" + // Arabic Supplement and Extended A
+ "\\ufb50-\\ufbb1\\ufbd3-\\ufd3d\\ufd50-\\ufd8f\\ufd92-\\ufdc7\\ufdf0-\\ufdfb" + // Pres. Forms A
+ "\\ufe70-\\ufe74\\ufe76-\\ufefc" + // Pres. Forms B
+ "\\u200c" + // Zero-Width Non-Joiner
+ "\\u0e01-\\u0e3a\\u0e40-\\u0e4e" + // Thai
"\\u1100-\\u11ff\\u3130-\\u3185\\uA960-\\uA97F\\uAC00-\\uD7AF\\uD7B0-\\uD7FF" + // Hangul (Korean)
"\\p{InHiragana}\\p{InKatakana}" + // Japanese Hiragana and Katakana
"\\p{InCJKUnifiedIdeographs}" + // Japanese Kanji / Chinese Han
@@ -113,7 +128,7 @@
public static final int AUTO_LINK_HASHTAGS_GROUP_TAG = 3;
public static final Pattern HASHTAG_MATCH_END = Pattern.compile("^(?:[##]|://)");
- public static final Pattern AUTO_LINK_USERNAMES_OR_LISTS = Pattern.compile("([^a-z0-9_]|^|RT:?)(" + AT_SIGNS + "+)([a-z0-9_]{1,20})(/[a-z][a-z0-9_\\-]{0,24})?", Pattern.CASE_INSENSITIVE);
+ public static final Pattern AUTO_LINK_USERNAMES_OR_LISTS = Pattern.compile("([^a-z0-9_!#$%&*" + AT_SIGNS_CHARS + "]|^|RT:?)(" + AT_SIGNS + "+)([a-z0-9_]{1,20})(/[a-z][a-z0-9_\\-]{0,24})?", Pattern.CASE_INSENSITIVE);
public static final int AUTO_LINK_USERNAME_OR_LISTS_GROUP_BEFORE = 1;
public static final int AUTO_LINK_USERNAME_OR_LISTS_GROUP_AT = 2;
public static final int AUTO_LINK_USERNAME_OR_LISTS_GROUP_USERNAME = 3;
@@ -131,7 +146,7 @@
public static final Pattern VALID_TCO_URL = Pattern.compile("^https?:\\/\\/t\\.co\\/[a-z0-9]+", Pattern.CASE_INSENSITIVE);
- public static final Pattern EXTRACT_MENTIONS = Pattern.compile("(^|[^a-z0-9_])" + AT_SIGNS + "([a-z0-9_]{1,20})", Pattern.CASE_INSENSITIVE);
+ public static final Pattern EXTRACT_MENTIONS = Pattern.compile("(^|[^a-z0-9_!#$%&*" + AT_SIGNS_CHARS + "])" + AT_SIGNS + "([a-z0-9_]{1,20})", Pattern.CASE_INSENSITIVE);
public static final int EXTRACT_MENTIONS_GROUP_BEFORE = 1;
public static final int EXTRACT_MENTIONS_GROUP_USERNAME = 2;
2  test-data/twitter-text-conformance
@@ -1 +1 @@
-Subproject commit af6224c5d7164db9a69949665d2a7abf8bbb6c10
+Subproject commit 17b101d33d6a1d052e0a42f8f67c8762bb9258d9
View
13 tests/com/twitter/AutolinkTest.java
@@ -14,14 +14,14 @@ public void setUp() {
public void testNoFollowByDefault() {
String tweet = "This has a #hashtag";
- String expected = "This has a <a href=\"http://twitter.com/#!/search?q=%23hashtag\" title=\"#hashtag\" class=\"tweet-url hashtag\" rel=\"nofollow\">#hashtag</a>";
+ String expected = "This has a <a href=\"https://twitter.com/#!/search?q=%23hashtag\" title=\"#hashtag\" class=\"tweet-url hashtag\" rel=\"nofollow\">#hashtag</a>";
assertAutolink(expected, linker.autoLinkHashtags(tweet));
}
public void testNoFollowDisabled() {
linker.setNoFollow(false);
String tweet = "This has a #hashtag";
- String expected = "This has a <a href=\"http://twitter.com/#!/search?q=%23hashtag\" title=\"#hashtag\" class=\"tweet-url hashtag\">#hashtag</a>";
+ String expected = "This has a <a href=\"https://twitter.com/#!/search?q=%23hashtag\" title=\"#hashtag\" class=\"tweet-url hashtag\">#hashtag</a>";
assertAutolink(expected, linker.autoLinkHashtags(tweet));
}
@@ -51,7 +51,14 @@ public void testURLWithoutProtocol() {
public void testWithAngleBrackets() {
linker.setNoFollow(false);
String tweet = "(Debugging) <3 #idol2011";
- String expected = "(Debugging) &lt;3 <a href=\"http://twitter.com/#!/search?q=%23idol2011\" title=\"#idol2011\" class=\"tweet-url hashtag\">#idol2011</a>";
+ String expected = "(Debugging) &lt;3 <a href=\"https://twitter.com/#!/search?q=%23idol2011\" title=\"#idol2011\" class=\"tweet-url hashtag\">#idol2011</a>";
+ assertAutolink(expected, linker.autoLink(tweet));
+ }
+
+ public void testUsernameIncludeSymbol() { // With the option on, the '@' must appear inside the link text for a mention and for a list.
+ linker.setUsernameIncludeSymbol(true);
+ String tweet = "Testing @mention and @mention/list";
+ String expected = "Testing <a class=\"tweet-url username\" href=\"https://twitter.com/mention\" rel=\"nofollow\">@mention</a> and <a class=\"tweet-url list-slug\" href=\"https://twitter.com/mention/list\" rel=\"nofollow\">@mention/list</a>";
assertAutolink(expected, linker.autoLink(tweet));
}
View
192 tests/com/twitter/ExtractorTest.java
@@ -2,6 +2,9 @@
package com.twitter;
import java.util.*;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
import junit.framework.TestCase;
import junit.framework.TestSuite;
import junit.framework.Test;
@@ -10,7 +13,8 @@
protected Extractor extractor;
public static Test suite() {
- Class<?>[] testClasses = { ReplyTest.class, MentionTest.class, HashtagTest.class, URLTest.class };
+ Class<?>[] testClasses = { OffsetConversionTest.class, ReplyTest.class,
+ MentionTest.class, HashtagTest.class, URLTest.class };
return new TestSuite(testClasses);
}
@@ -18,6 +22,99 @@ public void setUp() throws Exception {
extractor = new Extractor();
}
+ public static class OffsetConversionTest extends ExtractorTest { // Round-trips entity start offsets between UTF-16 and code point indices.
+
+ public void testConvertIndices() { // Covers plain text, paired surrogates and unpaired surrogates around the entity.
+ assertOffsetConversionOk("abc", "abc");
+ assertOffsetConversionOk("\ud83d\ude02abc", "abc");
+ assertOffsetConversionOk("\ud83d\ude02abc\ud83d\ude02", "abc");
+ assertOffsetConversionOk("\ud83d\ude02abc\ud838\ude02abc", "abc");
+ assertOffsetConversionOk("\ud83d\ude02abc\ud838\ude02abc\ud83d\ude02",
+ "abc");
+ assertOffsetConversionOk("\ud83d\ude02\ud83d\ude02abc", "abc");
+ assertOffsetConversionOk("\ud83d\ude02\ud83d\ude02\ud83d\ude02abc",
+ "abc");
+
+ assertOffsetConversionOk
+ ("\ud83d\ude02\ud83d\ude02\ud83d\ude02abc\ud83d\ude02", "abc");
+
+ // Several surrogate pairs following the entity
+ assertOffsetConversionOk
+ ("\ud83d\ude02\ud83d\ude02\ud83d\ude02abc\ud83d\ude02\ud83d" +
+ "\ude02\ud83d\ude02", "abc");
+
+ // Several surrogate pairs surrounding multiple entities
+ assertOffsetConversionOk
+ ("\ud83d\ude02\ud83d\ude02\ud83d\ude02\ud83d\ude02abc\ud83d" +
+ "\ude02\ud83d\ude02\ud83d\ude02\ud83d\ude02abc\ud83d" +
+ "\ude02\ud83d\ude02\ud83d\ude02\ud83d\ude02", "abc");
+
+ // unpaired low surrogate (at start)
+ assertOffsetConversionOk
+ ("\ude02\ud83d\ude02\ud83d\ude02\ud83d\ude02abc\ud83d" +
+ "\ude02\ud83d\ude02\ud83d\ude02\ud83d\ude02abc\ud83d" +
+ "\ude02\ud83d\ude02\ud83d\ude02\ud83d\ude02", "abc");
+
+ // unpaired low surrogate (at end)
+ assertOffsetConversionOk
+ ("\ud83d\ude02\ud83d\ude02\ud83d\ude02\ud83d\ude02abc\ud83d" +
+ "\ude02\ud83d\ude02\ud83d\ude02\ud83d\ude02abc\ud83d" +
+ "\ude02\ud83d\ude02\ud83d\ude02\ude02", "abc");
+
+ // unpaired low and high surrogates (at end)
+ assertOffsetConversionOk
+ ("\ud83d\ude02\ud83d\ude02\ud83d\ude02\ud83d\ude02abc\ud83d" +
+ "\ude02\ud83d\ude02\ud83d\ude02\ud83d\ude02abc\ud83d" +
+ "\ude02\ud83d\ude02\ud83d\ud83d\ude02\ude02", "abc");
+
+ assertOffsetConversionOk("\ud83dabc\ud83d", "abc");
+
+ assertOffsetConversionOk("\ude02abc\ude02", "abc");
+
+ assertOffsetConversionOk("\ude02\ude02abc\ude02\ude02", "abc");
+
+ assertOffsetConversionOk("abcabc", "abc");
+
+ assertOffsetConversionOk("abc\ud83d\ude02abc", "abc");
+
+ assertOffsetConversionOk("aa", "a");
+
+ assertOffsetConversionOk("\ud83d\ude02a\ud83d\ude02a\ud83d\ude02", "a");
+ }
+
+ private void assertOffsetConversionOk(String testData, String patStr) { // patStr is compiled as a regex and matched against testData.
+ // Build one entity per occurrence of patStr, recording both the code unit
+ // offset and the expected code point offset of each match start.
+ final Pattern pat = Pattern.compile(patStr);
+ final Matcher matcher = pat.matcher(testData);
+
+ List<Extractor.Entity> entities = new ArrayList<Extractor.Entity>();
+ List<Integer> codePointOffsets = new ArrayList<Integer>();
+ List<Integer> charOffsets = new ArrayList<Integer>();
+ while (matcher.find()) {
+ final int charOffset = matcher.start();
+ charOffsets.add(charOffset);
+ codePointOffsets.add(testData.codePointCount(0, charOffset));
+ entities.add(new Extractor.Entity(matcher, "unused", 0, 0));
+ }
+
+ extractor.modifyIndicesFromUTF16ToToUnicode(testData, entities);
+
+ for (int i = 0; i < entities.size(); i++) {
+ assertEquals(codePointOffsets.get(i), entities.get(i).getStart());
+ }
+
+ extractor.modifyIndicesFromUnicodeToUTF16(testData, entities);
+
+ for (int i = 0; i < entities.size(); i++) {
+ // This assertion could fail if the entity location is in the middle
+ // of a surrogate pair, since there is no equivalent code point
+ // offset to that location. It would be pathological for an entity to
+ // start at that point, so we can just let the test fail in that case.
+ assertEquals(charOffsets.get(i), entities.get(i).getStart());
+ }
+ }
+ }
+
/**
* Tests for the extractReplyScreenname method
*/
@@ -67,6 +164,37 @@ public void testMentionWithIndices() {
assertEquals(extracted.get(2).getStart().intValue(), 28);
assertEquals(extracted.get(2).getEnd().intValue(), 34);
}
+
+ public void testMentionWithSupplementaryCharacters() { // Mention indices must survive a UTF-16 -> code point -> UTF-16 round trip.
+ // put the supplementary character U+10400 in front of each " @mention"
+ String text = String.format("%c @mention %c @mention", 0x00010400, 0x00010400);
+
+ // extracted indices count U+10400 as 2 characters (as in UTF-16)
+ List<Extractor.Entity> extracted = extractor.extractMentionedScreennamesWithIndices(text);
+ assertEquals(extracted.size(), 2);
+ assertEquals(extracted.get(0).value, "mention");
+ assertEquals(extracted.get(0).start, 3);
+ assertEquals(extracted.get(0).end, 11);
+ assertEquals(extracted.get(1).value, "mention");
+ assertEquals(extracted.get(1).start, 15);
+ assertEquals(extracted.get(1).end, 23);
+
+ // after conversion, count U+10400 as a single character
+ extractor.modifyIndicesFromUTF16ToToUnicode(text, extracted);
+ assertEquals(extracted.size(), 2);
+ assertEquals(extracted.get(0).start, 2);
+ assertEquals(extracted.get(0).end, 10);
+ assertEquals(extracted.get(1).start, 13);
+ assertEquals(extracted.get(1).end, 21);
+
+ // converting back, count U+10400 as 2 characters again (as in UTF-16)
+ extractor.modifyIndicesFromUnicodeToUTF16(text, extracted);
+ assertEquals(2, extracted.size());
+ assertEquals(3, extracted.get(0).start);
+ assertEquals(11, extracted.get(0).end);
+ assertEquals(15, extracted.get(1).start);
+ assertEquals(23, extracted.get(1).end);
+ }
}
/**
@@ -103,6 +231,37 @@ public void testHashtagWithIndices() {
assertEquals(extracted.get(2).getStart().intValue(), 28);
assertEquals(extracted.get(2).getEnd().intValue(), 34);
}
+
+ public void testHashtagWithSupplementaryCharacters() { // Hashtag indices must survive a UTF-16 -> code point -> UTF-16 round trip.
+ // put the supplementary character U+10400 in front of each " #hashtag"
+ String text = String.format("%c #hashtag %c #hashtag", 0x00010400, 0x00010400);
+
+ // extracted indices count U+10400 as 2 characters (as in UTF-16)
+ List<Extractor.Entity> extracted = extractor.extractHashtagsWithIndices(text);
+ assertEquals(extracted.size(), 2);
+ assertEquals(extracted.get(0).value, "hashtag");
+ assertEquals(extracted.get(0).start, 3);
+ assertEquals(extracted.get(0).end, 11);
+ assertEquals(extracted.get(1).value, "hashtag");
+ assertEquals(extracted.get(1).start, 15);
+ assertEquals(extracted.get(1).end, 23);
+
+ // after conversion, count U+10400 as a single character
+ extractor.modifyIndicesFromUTF16ToToUnicode(text, extracted);
+ assertEquals(extracted.size(), 2);
+ assertEquals(extracted.get(0).start, 2);
+ assertEquals(extracted.get(0).end, 10);
+ assertEquals(extracted.get(1).start, 13);
+ assertEquals(extracted.get(1).end, 21);
+
+ // converting back, count U+10400 as 2 characters again (as in UTF-16)
+ extractor.modifyIndicesFromUnicodeToUTF16(text, extracted);
+ assertEquals(extracted.size(), 2);
+ assertEquals(extracted.get(0).start, 3);
+ assertEquals(extracted.get(0).end, 11);
+ assertEquals(extracted.get(1).start, 15);
+ assertEquals(extracted.get(1).end, 23);
+ }
}
/**
@@ -156,6 +315,37 @@ public void testUrlWithPunctuation() {
assertEquals(url, extractor.extractURLs(url).get(0));
}
}
+
+ public void testUrlnWithSupplementaryCharacters() { // NOTE(review): "Urln" in the name looks like a typo for "Url" - confirm before renaming.
+ // put the supplementary character U+10400 in front of each " http://twitter.com"
+ String text = String.format("%c http://twitter.com %c http://twitter.com", 0x00010400, 0x00010400);
+
+ // extracted indices count U+10400 as 2 characters (as in UTF-16)
+ List<Extractor.Entity> extracted = extractor.extractURLsWithIndices(text);
+ assertEquals(extracted.size(), 2);
+ assertEquals(extracted.get(0).value, "http://twitter.com");
+ assertEquals(extracted.get(0).start, 3);
+ assertEquals(extracted.get(0).end, 21);
+ assertEquals(extracted.get(1).value, "http://twitter.com");
+ assertEquals(extracted.get(1).start, 25);
+ assertEquals(extracted.get(1).end, 43);
+
+ // after conversion, count U+10400 as a single character
+ extractor.modifyIndicesFromUTF16ToToUnicode(text, extracted);
+ assertEquals(extracted.size(), 2);
+ assertEquals(extracted.get(0).start, 2);
+ assertEquals(extracted.get(0).end, 20);
+ assertEquals(extracted.get(1).start, 23);
+ assertEquals(extracted.get(1).end, 41);
+
+ // converting back, count U+10400 as 2 characters again (as in UTF-16)
+ extractor.modifyIndicesFromUnicodeToUTF16(text, extracted);
+ assertEquals(extracted.size(), 2);
+ assertEquals(extracted.get(0).start, 3);
+ assertEquals(extracted.get(0).end, 21);
+ assertEquals(extracted.get(1).start, 25);
+ assertEquals(extracted.get(1).end, 43);
+ }
}
/**
View
22 tests/com/twitter/RegexTest.java
@@ -7,7 +7,22 @@
public class RegexTest extends TestCase {
public void testAutoLinkHashtags() {
assertCaptureCount(3, Regex.AUTO_LINK_HASHTAGS, "#hashtag");
+ assertCaptureCount(3, Regex.AUTO_LINK_HASHTAGS, "#Azərbaycanca");
+ assertCaptureCount(3, Regex.AUTO_LINK_HASHTAGS, "#mûǁae");
+ assertCaptureCount(3, Regex.AUTO_LINK_HASHTAGS, "#Čeština");
+ assertCaptureCount(3, Regex.AUTO_LINK_HASHTAGS, "#Ċaoiṁín");
+ assertCaptureCount(3, Regex.AUTO_LINK_HASHTAGS, "#Caoiṁín");
assertCaptureCount(3, Regex.AUTO_LINK_HASHTAGS, "#caf\u00E9");
+ assertCaptureCount(3, Regex.AUTO_LINK_HASHTAGS, "#\u05e2\u05d1\u05e8\u05d9\u05ea"); // "#Hebrew"
+ assertCaptureCount(3, Regex.AUTO_LINK_HASHTAGS, "#\u05d0\u05b2\u05e9\u05b6\u05c1\u05e8"); // with marks
+ assertCaptureCount(3, Regex.AUTO_LINK_HASHTAGS, "#\u05e2\u05b7\u05dc\u05be\u05d9\u05b0\u05d3\u05b5\u05d9"); // with maqaf 05be
+ assertCaptureCount(3, Regex.AUTO_LINK_HASHTAGS, "#\u05d5\u05db\u05d5\u05f3"); // with geresh 05f3
+ assertCaptureCount(3, Regex.AUTO_LINK_HASHTAGS, "#\u05de\u05f4\u05db"); // with gershayim 05f4
+ assertCaptureCount(3, Regex.AUTO_LINK_HASHTAGS, "#\u0627\u0644\u0639\u0631\u0628\u064a\u0629"); // "#Arabic"
+ assertCaptureCount(3, Regex.AUTO_LINK_HASHTAGS, "#\u062d\u0627\u0644\u064a\u0627\u064b"); // with mark
+ assertCaptureCount(3, Regex.AUTO_LINK_HASHTAGS, "#\u064a\u0640\ufbb1\u0640\u064e\u0671"); // with pres. form
+ assertCaptureCount(3, Regex.AUTO_LINK_HASHTAGS, "#ประเทศไทย");
+ assertCaptureCount(3, Regex.AUTO_LINK_HASHTAGS, "#ฟรี"); // with mark
assertCaptureCount(3, Regex.AUTO_LINK_HASHTAGS, "#日本語ハッシュタグ");
assertCaptureCount(3, Regex.AUTO_LINK_HASHTAGS, "#日本語ハッシュタグ");
@@ -87,6 +102,13 @@ public void testExtractMentions() {
assertCaptureCount(2, Regex.EXTRACT_MENTIONS, "sample @user mention");
}
+ public void testInvalidMentions() { // EXTRACT_MENTIONS must not match an '@' that directly follows one of these symbols.
+ char[] invalid_chars = new char[]{'!', '@', '#', '$', '%', '&', '*'};
+ for (char c : invalid_chars) {
+ assertFalse("Failed to ignore a mention preceded by " + c, Regex.EXTRACT_MENTIONS.matcher("f" + c + "@kn").find());
+ }
+ }
+
public void testExtractReply() {
assertCaptureCount(1, Regex.EXTRACT_REPLY, "@user reply");
assertCaptureCount(1, Regex.EXTRACT_REPLY, " @user reply");
Please sign in to comment.
Something went wrong with that request. Please try again.