Permalink
Browse files

Merge branch 'master' of github.com:twitter/twitter-text-java into no_number_after_tld
  • Loading branch information...
2 parents 26e66ef + de7a3c4 commit 9564b4a8e5fa46957ca7bd27e9de69bf18c44e01 @keitaf keitaf committed Mar 26, 2012
View
@@ -3,7 +3,7 @@
<groupId>com.twitter</groupId>
<artifactId>twitter-text</artifactId>
- <version>1.4.8</version>
+ <version>1.4.10</version>
<build>
<directory>out</directory>
@@ -19,11 +19,11 @@
/** Default CSS class for auto-linked hashtag URLs */
public static final String DEFAULT_HASHTAG_CLASS = "hashtag";
/** Default href for username links (the username without the @ will be appended) */
- public static final String DEFAULT_USERNAME_URL_BASE = "http://twitter.com/";
+ public static final String DEFAULT_USERNAME_URL_BASE = "https://twitter.com/";
/** Default href for list links (the username/list without the @ will be appended) */
- public static final String DEFAULT_LIST_URL_BASE = "http://twitter.com/";
+ public static final String DEFAULT_LIST_URL_BASE = "https://twitter.com/";
/** Default href for hashtag links (the hashtag without the # will be appended) */
- public static final String DEFAULT_HASHTAG_URL_BASE = "http://twitter.com/#!/search?q=%23";
+ public static final String DEFAULT_HASHTAG_URL_BASE = "https://twitter.com/#!/search?q=%23";
/** HTML attribute to add when noFollow is true (default) */
public static final String NO_FOLLOW_HTML_ATTRIBUTE = " rel=\"nofollow\"";
@@ -35,6 +35,7 @@
protected String listUrlBase;
protected String hashtagUrlBase;
protected boolean noFollow = true;
+ protected boolean usernameIncludeSymbol = false;
public Autolink() {
urlClass = DEFAULT_URL_CLASS;
@@ -104,21 +105,28 @@ public String autoLinkUsernamesAndLists(String text) {
// Outside of a tag, do real work with this chunk
matcher = Regex.AUTO_LINK_USERNAMES_OR_LISTS.matcher(chunk);
while (matcher.find()) {
+ String at = matcher.group(Regex.AUTO_LINK_USERNAME_OR_LISTS_GROUP_AT);
+ String username = matcher.group(Regex.AUTO_LINK_USERNAME_OR_LISTS_GROUP_USERNAME);
+ if (usernameIncludeSymbol) {
+ username = at + username;
+ at = "";
+ }
+
if (matcher.group(Regex.AUTO_LINK_USERNAME_OR_LISTS_GROUP_LIST) == null ||
matcher.group(Regex.AUTO_LINK_USERNAME_OR_LISTS_GROUP_LIST).isEmpty()) {
// Username only
if (!Regex.SCREEN_NAME_MATCH_END.matcher(chunk.substring(matcher.end())).find()) {
StringBuilder rb = new StringBuilder(capacity);
rb.append(matcher.group(Regex.AUTO_LINK_USERNAME_OR_LISTS_GROUP_BEFORE))
- .append(matcher.group(Regex.AUTO_LINK_USERNAME_OR_LISTS_GROUP_AT))
+ .append(at)
.append("<a class=\"").append(urlClass).append(" ").append(usernameClass)
.append("\" href=\"").append(usernameUrlBase)
.append(matcher.group(Regex.AUTO_LINK_USERNAME_OR_LISTS_GROUP_USERNAME))
.append("\"");
if (noFollow) rb.append(NO_FOLLOW_HTML_ATTRIBUTE);
rb.append(">")
- .append(matcher.group(Regex.AUTO_LINK_USERNAME_OR_LISTS_GROUP_USERNAME))
+ .append(username)
.append("</a>");
matcher.appendReplacement(sb, rb.toString());
} else {
@@ -129,14 +137,14 @@ public String autoLinkUsernamesAndLists(String text) {
// Username and list
StringBuilder rb = new StringBuilder(capacity);
rb.append(matcher.group(Regex.AUTO_LINK_USERNAME_OR_LISTS_GROUP_BEFORE))
- .append(matcher.group(Regex.AUTO_LINK_USERNAME_OR_LISTS_GROUP_AT))
+ .append(at)
.append("<a class=\"").append(urlClass).append(" ").append(listClass)
.append("\" href=\"").append(listUrlBase)
.append(matcher.group(Regex.AUTO_LINK_USERNAME_OR_LISTS_GROUP_USERNAME))
.append(matcher.group(Regex.AUTO_LINK_USERNAME_OR_LISTS_GROUP_LIST))
.append("\"");
if (noFollow) rb.append(NO_FOLLOW_HTML_ATTRIBUTE);
- rb.append(">").append(matcher.group(Regex.AUTO_LINK_USERNAME_OR_LISTS_GROUP_USERNAME))
+ rb.append(">").append(username)
.append(matcher.group(Regex.AUTO_LINK_USERNAME_OR_LISTS_GROUP_LIST))
.append("</a>");
matcher.appendReplacement(sb, rb.toString());
@@ -370,6 +378,15 @@ public void setNoFollow(boolean noFollow) {
this.noFollow = noFollow;
}
+ /**
+ * Sets whether the at mark '@' is included inside the link text (false by default)
+ *
+ * @param usernameIncludeSymbol true to put the '@' inside the link text, false to leave it before the link
+ */
+ public void setUsernameIncludeSymbol(boolean usernameIncludeSymbol) {
+ this.usernameIncludeSymbol = usernameIncludeSymbol;
+ }
+
// The default String split is horribly inefficient
protected static Iterable<String> split(final String s, final String d) {
List<String> strings = new ArrayList<String>();
@@ -9,8 +9,8 @@
*/
public class Extractor {
public static class Entity {
- protected Integer start = null;
- protected Integer end = null;
+ protected int start;
+ protected int end;
protected String value = null;
protected String type = null;
@@ -47,8 +47,8 @@ public boolean equals(Object obj) {
Entity other = (Entity)obj;
if (this.type.equals(other.type) &&
- this.start.equals(other.start) &&
- this.end.equals(other.end) &&
+ this.start == other.start &&
+ this.end == other.end &&
this.value.equals(other.value)) {
return true;
} else {
@@ -269,4 +269,90 @@ public String extractReplyScreenname(String text) {
}
return extracted;
}
-}
+
+ /**
+ * Modify Unicode-based indices of the entities to UTF-16 based indices.
+ *
+ * In UTF-16 based indices, Unicode supplementary characters are counted as two characters.
+ *
+ * This method requires that the list of entities be in ascending order by start index.
+ *
+ * @param text original text
+ * @param entities entities with Unicode based indices
+ */
+ public void modifyIndicesFromUnicodeToUTF16(String text, List<Entity> entities) {
+ IndexConverter convert = new IndexConverter(text);
+
+ for (Entity entity : entities) {
+ entity.start = convert.codePointsToCodeUnits(entity.start);
+ entity.end = convert.codePointsToCodeUnits(entity.end);
+ }
+ }
+
+ /**
+ * Modify UTF-16-based indices of the entities to Unicode-based indices.
+ *
+ * In Unicode-based indices, Unicode supplementary characters are counted as single characters.
+ *
+ * This method requires that the list of entities be in ascending order by start index.
+ *
+ * @param text original text
+ * @param entities entities with UTF-16 based indices
+ */
+ public void modifyIndicesFromUTF16ToToUnicode(String text, List<Entity> entities) {
+ IndexConverter convert = new IndexConverter(text);
+
+ for (Entity entity : entities) {
+ entity.start = convert.codeUnitsToCodePoints(entity.start);
+ entity.end = convert.codeUnitsToCodePoints(entity.end);
+ }
+ }
+
+ /**
+ * An efficient converter of indices between code points and code units.
+ */
+ private static final class IndexConverter {
+ protected final String text;
+
+ // Keep track of a single corresponding pair of code unit and code point
+ // offsets so that we can re-use counting work if the next requested
+ // entity is near the most recent entity.
+ protected int codePointIndex = 0;
+ protected int charIndex = 0;
+
+ IndexConverter(String text) {
+ this.text = text;
+ }
+
+ /**
+ * @param charIndex Index into the string measured in code units.
+ * @return The code point index that corresponds to the specified character index.
+ */
+ int codeUnitsToCodePoints(int charIndex) {
+ if (charIndex < this.charIndex) {
+ this.codePointIndex -= text.codePointCount(charIndex, this.charIndex);
+ } else {
+ this.codePointIndex += text.codePointCount(this.charIndex, charIndex);
+ }
+ this.charIndex = charIndex;
+
+ // Make sure that charIndex never points to the second code unit of a
+ // surrogate pair.
+ if (charIndex > 0 && Character.isSupplementaryCodePoint(text.codePointAt(charIndex - 1))) {
+ this.charIndex -= 1;
+ }
+ return this.codePointIndex;
+ }
+
+ /**
+ * @param codePointIndex Index into the string measured in code points.
+ * @return the code unit index that corresponds to the specified code point index.
+ */
+ int codePointsToCodeUnits(int codePointIndex) {
+ // Note that offsetByCodePoints accepts negative indices.
+ this.charIndex = text.offsetByCodePoints(this.charIndex, codePointIndex - this.codePointIndex);
+ this.codePointIndex = codePointIndex;
+ return this.charIndex;
+ }
+ }
+}
View
@@ -4,10 +4,25 @@
import java.util.regex.*;
public class Regex {
- private static String LATIN_ACCENTS_CHARS = "\\u00c0-\\u00d6\\u00d8-\\u00f6\\u00f8-\\u00ff\\u015f";
+ private static String LATIN_ACCENTS_CHARS = "\\u00c0-\\u00d6\\u00d8-\\u00f6\\u00f8-\\u00ff" + // Latin-1
+ "\\u0100-\\u024f" + // Latin Extended A and B
+ "\\u0253\\u0254\\u0256\\u0257\\u0259\\u025b\\u0263\\u0268\\u026f\\u0272\\u0289\\u028b" + // IPA Extensions
+ "\\u02bb" + // Hawaiian
+ "\\u1e00-\\u1eff"; // Latin Extended Additional (mostly for Vietnamese)
private static final String HASHTAG_ALPHA_CHARS = "a-z" + LATIN_ACCENTS_CHARS +
"\\u0400-\\u04ff\\u0500-\\u0527" + // Cyrillic
- "\\u2de0–\\u2dff\\ua640–\\ua69f" + // Cyrillic Extended A/B
+ "\\u2de0-\\u2dff\\ua640-\\ua69f" + // Cyrillic Extended A/B
+ "\\u0591-\\u05bf\\u05c1-\\u05c2\\u05c4-\\u05c5\\u05c7" +
+ "\\u05d0-\\u05ea\\u05f0-\\u05f4" + // Hebrew
+ "\\ufb1d-\\ufb28\\ufb2a-\\ufb36\\ufb38-\\ufb3c\\ufb3e\\ufb40-\\ufb41" +
+ "\\ufb43-\\ufb44\\ufb46-\\ufb4f" + // Hebrew Pres. Forms
+ "\\u0610-\\u061a\\u0620-\\u065f\\u066e-\\u06d3\\u06d5-\\u06dc" +
+ "\\u06de-\\u06e8\\u06ea-\\u06ef\\u06fa-\\u06fc\\u06ff" + // Arabic
+ "\\u0750-\\u077f\\u08a0\\u08a2-\\u08ac\\u08e4-\\u08fe" + // Arabic Supplement and Extended A
+ "\\ufb50-\\ufbb1\\ufbd3-\\ufd3d\\ufd50-\\ufd8f\\ufd92-\\ufdc7\\ufdf0-\\ufdfb" + // Pres. Forms A
+ "\\ufe70-\\ufe74\\ufe76-\\ufefc" + // Pres. Forms B
+ "\\u200c" + // Zero-Width Non-Joiner
+ "\\u0e01-\\u0e3a\\u0e40-\\u0e4e" + // Thai
"\\u1100-\\u11ff\\u3130-\\u3185\\uA960-\\uA97F\\uAC00-\\uD7AF\\uD7B0-\\uD7FF" + // Hangul (Korean)
"\\p{InHiragana}\\p{InKatakana}" + // Japanese Hiragana and Katakana
"\\p{InCJKUnifiedIdeographs}" + // Japanese Kanji / Chinese Han
@@ -113,7 +128,7 @@
public static final int AUTO_LINK_HASHTAGS_GROUP_TAG = 3;
public static final Pattern HASHTAG_MATCH_END = Pattern.compile("^(?:[##]|://)");
- public static final Pattern AUTO_LINK_USERNAMES_OR_LISTS = Pattern.compile("([^a-z0-9_]|^|RT:?)(" + AT_SIGNS + "+)([a-z0-9_]{1,20})(/[a-z][a-z0-9_\\-]{0,24})?", Pattern.CASE_INSENSITIVE);
+ public static final Pattern AUTO_LINK_USERNAMES_OR_LISTS = Pattern.compile("([^a-z0-9_!#$%&*" + AT_SIGNS_CHARS + "]|^|RT:?)(" + AT_SIGNS + "+)([a-z0-9_]{1,20})(/[a-z][a-z0-9_\\-]{0,24})?", Pattern.CASE_INSENSITIVE);
public static final int AUTO_LINK_USERNAME_OR_LISTS_GROUP_BEFORE = 1;
public static final int AUTO_LINK_USERNAME_OR_LISTS_GROUP_AT = 2;
public static final int AUTO_LINK_USERNAME_OR_LISTS_GROUP_USERNAME = 3;
@@ -131,7 +146,7 @@
public static final Pattern VALID_TCO_URL = Pattern.compile("^https?:\\/\\/t\\.co\\/[a-z0-9]+", Pattern.CASE_INSENSITIVE);
- public static final Pattern EXTRACT_MENTIONS = Pattern.compile("(^|[^a-z0-9_])" + AT_SIGNS + "([a-z0-9_]{1,20})", Pattern.CASE_INSENSITIVE);
+ public static final Pattern EXTRACT_MENTIONS = Pattern.compile("(^|[^a-z0-9_!#$%&*" + AT_SIGNS_CHARS + "])" + AT_SIGNS + "([a-z0-9_]{1,20})", Pattern.CASE_INSENSITIVE);
public static final int EXTRACT_MENTIONS_GROUP_BEFORE = 1;
public static final int EXTRACT_MENTIONS_GROUP_USERNAME = 2;
@@ -14,14 +14,14 @@ public void setUp() {
public void testNoFollowByDefault() {
String tweet = "This has a #hashtag";
- String expected = "This has a <a href=\"http://twitter.com/#!/search?q=%23hashtag\" title=\"#hashtag\" class=\"tweet-url hashtag\" rel=\"nofollow\">#hashtag</a>";
+ String expected = "This has a <a href=\"https://twitter.com/#!/search?q=%23hashtag\" title=\"#hashtag\" class=\"tweet-url hashtag\" rel=\"nofollow\">#hashtag</a>";
assertAutolink(expected, linker.autoLinkHashtags(tweet));
}
public void testNoFollowDisabled() {
linker.setNoFollow(false);
String tweet = "This has a #hashtag";
- String expected = "This has a <a href=\"http://twitter.com/#!/search?q=%23hashtag\" title=\"#hashtag\" class=\"tweet-url hashtag\">#hashtag</a>";
+ String expected = "This has a <a href=\"https://twitter.com/#!/search?q=%23hashtag\" title=\"#hashtag\" class=\"tweet-url hashtag\">#hashtag</a>";
assertAutolink(expected, linker.autoLinkHashtags(tweet));
}
@@ -51,7 +51,14 @@ public void testURLWithoutProtocol() {
public void testWithAngleBrackets() {
linker.setNoFollow(false);
String tweet = "(Debugging) <3 #idol2011";
- String expected = "(Debugging) &lt;3 <a href=\"http://twitter.com/#!/search?q=%23idol2011\" title=\"#idol2011\" class=\"tweet-url hashtag\">#idol2011</a>";
+ String expected = "(Debugging) &lt;3 <a href=\"https://twitter.com/#!/search?q=%23idol2011\" title=\"#idol2011\" class=\"tweet-url hashtag\">#idol2011</a>";
+ assertAutolink(expected, linker.autoLink(tweet));
+ }
+
+ public void testUsernameIncludeSymbol() {
+ linker.setUsernameIncludeSymbol(true);
+ String tweet = "Testing @mention and @mention/list";
+ String expected = "Testing <a class=\"tweet-url username\" href=\"https://twitter.com/mention\" rel=\"nofollow\">@mention</a> and <a class=\"tweet-url list-slug\" href=\"https://twitter.com/mention/list\" rel=\"nofollow\">@mention/list</a>";
assertAutolink(expected, linker.autoLink(tweet));
}
Oops, something went wrong.

0 comments on commit 9564b4a

Please sign in to comment.