Skip to content
This repository has been archived by the owner on Sep 18, 2021. It is now read-only.

Commit

Permalink
Refactor Extractor.Entity. Implement simple escapeHTML() in Autolink.
Browse files Browse the repository at this point in the history
  • Loading branch information
keita committed Jan 12, 2012
1 parent a6878bd commit e78d821
Show file tree
Hide file tree
Showing 4 changed files with 63 additions and 54 deletions.
5 changes: 0 additions & 5 deletions pom.xml
Expand Up @@ -68,11 +68,6 @@
<version>1.3</version>
</dependency>

<dependency>
<groupId>commons-lang</groupId>
<artifactId>commons-lang</artifactId>
<version>2.3</version>
</dependency>

</dependencies>
</project>
43 changes: 22 additions & 21 deletions src/com/twitter/Autolink.java
Expand Up @@ -4,9 +4,6 @@

import java.util.ArrayList;
import java.util.List;
import java.util.regex.MatchResult;

import org.apache.commons.lang.StringEscapeUtils;

/**
* A class for adding HTML links to hashtag, username and list references in Tweet text.
Expand Down Expand Up @@ -40,6 +37,21 @@ public class Autolink {

private Extractor extractor = new Extractor();

/**
 * Returns a copy of {@code text} in which the characters {@code &}, {@code >},
 * {@code <}, {@code "} and {@code '} are replaced by the entities
 * {@code &amp;amp;}, {@code &amp;gt;}, {@code &amp;lt;}, {@code &amp;quot;} and
 * {@code &amp;#39;}; every other character is copied through unchanged.
 *
 * @param text the raw text to escape; must not be {@code null}
 * @return the escaped text, backed by a freshly created {@link StringBuilder}
 */
private static CharSequence escapeHTML(String text) {
  final int length = text.length();
  // Same initial capacity as before: room for moderate growth without resizing.
  StringBuilder escaped = new StringBuilder(length * 2);
  for (int i = 0; i < length; i++) {
    char ch = text.charAt(i);
    if (ch == '&') {
      escaped.append("&amp;");
    } else if (ch == '>') {
      escaped.append("&gt;");
    } else if (ch == '<') {
      escaped.append("&lt;");
    } else if (ch == '"') {
      escaped.append("&quot;");
    } else if (ch == '\'') {
      escaped.append("&#39;");
    } else {
      escaped.append(ch);
    }
  }
  return escaped;
}

public Autolink() {
urlClass = DEFAULT_URL_CLASS;
listClass = DEFAULT_LIST_CLASS;
Expand Down Expand Up @@ -79,18 +91,7 @@ private String autoLinkEntities(String text, List<Entity> entities) {
StringBuilder replaceStr = new StringBuilder(text.length());
switch(entity.type) {
case URL:
String url = entity.getValue();
MatchResult matcher = entity.getMatchResult();
String query_string = matcher.group(Regex.VALID_URL_GROUP_QUERY_STRING);
if (query_string != null && matcher.start(Regex.VALID_URL_GROUP_QUERY_STRING) < entity.end) {
// Doing a replace isn't safe as the query string might match something else in the URL
int us = matcher.start(Regex.VALID_URL_GROUP_URL);
int qs = matcher.start(Regex.VALID_URL_GROUP_QUERY_STRING);
int qe = matcher.end(Regex.VALID_URL_GROUP_QUERY_STRING);
String replacement = StringEscapeUtils.escapeHtml(query_string);
url = url.substring(0, qs - us) + replacement + url.substring(qe - us);
}

CharSequence url = escapeHTML(entity.getValue());
replaceStr.append("<a href=\"").append(url).append("\"");
if (noFollow){
replaceStr.append(NO_FOLLOW_HTML_ATTRIBUTE);
Expand All @@ -108,20 +109,20 @@ private String autoLinkEntities(String text, List<Entity> entities) {
replaceStr.append(NO_FOLLOW_HTML_ATTRIBUTE);
}
replaceStr.append(">")
.append(entity.getMatchResult().group(Regex.VALID_HASHTAG_GROUP_HASH))
.append(text.subSequence(entity.getStart(), entity.getStart() + 1))
.append(entity.getValue()).append("</a>");
break;
case MENTION:
CharSequence at = text.subSequence(entity.getStart(), entity.getStart() + 1);
replaceStr
.append(entity.getMatchResult().group(Regex.VALID_MENTION_OR_LIST_GROUP_AT))
.append(at)
.append("<a class=\"").append(urlClass).append(" ");
String mention = entity.getValue();
String list = entity.getMatchResult().group(Regex.VALID_MENTION_OR_LIST_GROUP_LIST);
if (list != null) {
if (entity.listSlug != null) {
// this is list
replaceStr.append(listClass).append("\" href=\"").append(listUrlBase);
mention += list;
nextIndex += list.length();
mention += entity.listSlug;
nextIndex += entity.listSlug.length();
} else {
// this is @mention
replaceStr.append(usernameClass).append("\" href=\"").append(usernameUrlBase);
Expand Down
65 changes: 39 additions & 26 deletions src/com/twitter/Extractor.java
Expand Up @@ -8,32 +8,36 @@
* A class to extract usernames, lists, hashtags and URLs from Tweet text.
*/
public class Extractor {
public enum EntityType {
URL, HASHTAG, MENTION, LIST
};

public static class Entity {
protected int start;
protected int end;
protected String value = null;
protected EntityType type = null;
protected MatchResult matchResult = null;
/**
 * The kind of entity found in the text: a URL, a hashtag or a mention.
 */
public enum Type {
URL, HASHTAG, MENTION
}

protected final int start;
protected final int end;
protected final String value;
protected final String listSlug;
protected final Type type;

public Entity(int start, int end, String value, EntityType type, MatchResult matchResult) {
public Entity(int start, int end, String value, String listSlug, Type type) {
this.start = start;
this.end = end;
this.value = value;
this.listSlug = listSlug;
this.type = type;
this.matchResult = matchResult;
}

public Entity(Matcher matcher, EntityType type, int groupNumber) {
/**
 * Creates an entity that has no list slug.
 * Delegates to the five-argument constructor, passing {@code null} as the list slug.
 *
 * @param start start index of the entity
 * @param end end index of the entity
 * @param value the entity's value
 * @param type the kind of entity
 */
public Entity(int start, int end, String value, Type type) {
this(start, end, value, null, type);
}

/**
 * Creates an entity from one capture group of {@code matcher}, using a start
 * offset of -1 so the recorded start index lands one character before the group.
 *
 * @param matcher matcher to read the group from
 * @param type the kind of entity
 * @param groupNumber index of the capture group holding the entity value
 */
public Entity(Matcher matcher, Type type, int groupNumber) {
// Offset -1 on start index to include @, # symbols for mentions and hashtags
this(matcher, type, groupNumber, -1);
}

public Entity(Matcher matcher, EntityType type, int groupNumber, int startOffset) {
this(matcher.start(groupNumber) + startOffset, matcher.end(groupNumber), matcher.group(groupNumber), type, matcher.toMatchResult());
/**
 * Creates an entity from one capture group of {@code matcher}.
 * The start index is {@code matcher.start(groupNumber) + startOffset}, the end index is
 * {@code matcher.end(groupNumber)} and the value is {@code matcher.group(groupNumber)}.
 *
 * @param matcher matcher to read the group from
 * @param type the kind of entity
 * @param groupNumber index of the capture group holding the entity value
 * @param startOffset amount added to the group's start index
 */
public Entity(Matcher matcher, Type type, int groupNumber, int startOffset) {
this(matcher.start(groupNumber) + startOffset, matcher.end(groupNumber), matcher.group(groupNumber), type);
}

public boolean equals(Object obj) {
Expand All @@ -42,7 +46,6 @@ public boolean equals(Object obj) {
}

if (!(obj instanceof Entity)) {
System.out.println("incorrect type");
return false;
}

Expand Down Expand Up @@ -74,12 +77,12 @@ public String getValue() {
return value;
}

public EntityType getType() {
return type;
/**
 * Returns the list slug stored on this entity.
 *
 * @return the list slug; may be {@code null}, since the constructors without a
 *         list-slug argument store {@code null}
 */
public String getListSlug() {
return listSlug;
}

public MatchResult getMatchResult() {
return matchResult;
/**
 * Returns the kind of this entity.
 *
 * @return the {@code Type} given at construction
 */
public Type getType() {
return type;
}
}

Expand Down Expand Up @@ -166,7 +169,15 @@ public List<Entity> extractMentionedScreennamesWithIndices(String text) {
while (matcher.find()) {
String after = text.substring(matcher.end());
if (! Regex.INVALID_MENTION_MATCH_END.matcher(after).find()) {
extracted.add(new Entity(matcher, EntityType.MENTION, Regex.VALID_MENTION_OR_LIST_GROUP_USERNAME));
if (matcher.group(Regex.VALID_MENTION_OR_LIST_GROUP_LIST) == null) {
extracted.add(new Entity(matcher, Entity.Type.MENTION, Regex.VALID_MENTION_OR_LIST_GROUP_USERNAME));
} else {
extracted.add(new Entity(matcher.start(Regex.VALID_MENTION_OR_LIST_GROUP_USERNAME) - 1,
matcher.end(Regex.VALID_MENTION_OR_LIST_GROUP_USERNAME),
matcher.group(Regex.VALID_MENTION_OR_LIST_GROUP_USERNAME),
matcher.group(Regex.VALID_MENTION_OR_LIST_GROUP_LIST),
Entity.Type.MENTION));
}
}
}
return extracted;
Expand Down Expand Up @@ -222,7 +233,8 @@ public List<String> extractURLs(String text) {
* @return List of URLs referenced.
*/
public List<Entity> extractURLsWithIndices(String text) {
if (text == null || text.isEmpty() || text.indexOf('.') == -1) {
if (text == null || text.isEmpty()
|| (extractURLWithoutProtocol ? text.indexOf('.') : text.indexOf(':')) == -1) {
return Collections.emptyList();
}

Expand All @@ -239,16 +251,17 @@ public List<Entity> extractURLsWithIndices(String text) {
continue;
}
}
Entity entity = new Entity(matcher, EntityType.URL, Regex.VALID_URL_GROUP_URL, 0);
String url = matcher.group(Regex.VALID_URL_GROUP_URL);
int start = matcher.start(Regex.VALID_URL_GROUP_URL);
int end = matcher.end(Regex.VALID_URL_GROUP_URL);
Matcher tco_matcher = Regex.VALID_TCO_URL.matcher(url);
if (tco_matcher.find()) {
// In the case of t.co URLs, don't allow additional path characters.
entity.value = tco_matcher.group();
entity.end = entity.start + entity.value.length();
url = tco_matcher.group();
end = start + url.length();
}

urls.add(entity);
urls.add(new Entity(start, end, url, Entity.Type.URL));
}

return urls;
Expand Down Expand Up @@ -302,7 +315,7 @@ public List<Entity> extractHashtagsWithIndices(String text) {
while (matcher.find()) {
String after = text.substring(matcher.end());
if (!Regex.INVALID_HASHTAG_MATCH_END.matcher(after).find()) {
extracted.add(new Entity(matcher, EntityType.HASHTAG, Regex.VALID_HASHTAG_GROUP_TAG));
extracted.add(new Entity(matcher, Entity.Type.HASHTAG, Regex.VALID_HASHTAG_GROUP_TAG));
}
}
return extracted;
Expand Down
4 changes: 2 additions & 2 deletions tests/com/twitter/ConformanceTest.java
Expand Up @@ -10,7 +10,7 @@
import junit.framework.TestCase;
import org.ho.yaml.Yaml;

import com.twitter.Extractor.EntityType;
import com.twitter.Extractor.Entity;
import com.twitter.ExtractorTest.HashtagTest;

public class ConformanceTest extends TestCase {
Expand Down Expand Up @@ -63,7 +63,7 @@ public void testHashtagsWithIndicesExtractor() throws Exception {
List<Extractor.Entity> expected = new ArrayList<Extractor.Entity>();
for (Map<String, Object> configEntry : expectedConfig) {
List<Integer> indices = (List<Integer>)configEntry.get("indices");
expected.add(new Extractor.Entity(indices.get(0), indices.get(1), configEntry.get("hashtag").toString(), EntityType.HASHTAG, null));
expected.add(new Extractor.Entity(indices.get(0), indices.get(1), configEntry.get("hashtag").toString(), Entity.Type.HASHTAG));
}

assertEquals((String)testCase.get(KEY_DESCRIPTION),
Expand Down

0 comments on commit e78d821

Please sign in to comment.