Permalink
Browse files

Refactored plain-text URLs detection implementation.

This makes processing faster (measured about 2 times faster on many
real-world examples) and detection more advanced (the previous algorithm
only detected URLs separated from the rest of the text by a space character).
  • Loading branch information...
luccioman committed Jun 27, 2017
1 parent 8da3174 commit 9b1bb2545e4d40f542e91049a8e3799f2593d8c3
@@ -362,9 +362,11 @@ public void scrapeText(final char[] newtext0, final String insideTag) {
}
}
/** A regular expression pattern matching the "://" character sequence */
private final static Pattern dpssp = Pattern.compile("://");
/** A regular expression pattern matching one of the "smb://", "ftp://", "http://" or "https://" URL prefixes */
private final static Pattern protp = Pattern.compile("smb://|ftp://|http://|https://");
/** A regular expression pattern matching any whitespace character */
private final static Pattern WHITESPACE_PATTERN = Pattern.compile("\\s");
/**
* Try to detect and parse absolute URLs in text, then update the urls collection and fire anchorAdded event on listeners. Any parameter can be null.
* @param text the text to parse
@@ -375,23 +377,33 @@ public static void findAbsoluteURLs(final String text, final Collection<AnchorUR
if(text == null) {
return;
}
int schemePosition, spacePosition, offset = 0;
int schemePosition, offset = 0;
boolean hasWhiteSpace;
String urlString;
AnchorURL url;
final Matcher urlSchemeMatcher = protp.matcher(text);
final Matcher whiteSpaceMatcher = WHITESPACE_PATTERN.matcher(text);
while (offset < text.length()) {
schemePosition = find(text, dpssp, offset);
if (schemePosition == Integer.MAX_VALUE) {
if(!urlSchemeMatcher.find(offset)) {
break;
}
offset = Math.max(0, schemePosition - 5);
schemePosition = find(text, protp, offset);
if (schemePosition == Integer.MAX_VALUE) {
break;
schemePosition = urlSchemeMatcher.start();
hasWhiteSpace = whiteSpaceMatcher.find(urlSchemeMatcher.end());
urlString = text.substring(schemePosition, hasWhiteSpace ? whiteSpaceMatcher.start() : text.length());
if (urlString.endsWith(".")) {
urlString = urlString.substring(0, urlString.length() - 1); // remove the '.' that was appended above
}
spacePosition = text.indexOf(" ", schemePosition + 1);
urlString = text.substring(schemePosition, spacePosition < 0 ? text.length() : spacePosition);
if (urlString.endsWith(".")) urlString = urlString.substring(0, urlString.length() - 1); // remove the '.' that was appended above
offset = schemePosition + 6;
/* URLs can contain brackets: some of them are even reserved characters in the URI syntax (see https://tools.ietf.org/html/rfc3986#section-2.2).
* But an unpaired bracket is, in most cases, not part of the URL: it is rather used to wrap the URL in the text. */
urlString = removeUnpairedBrackets(urlString, '(', ')');
urlString = removeUnpairedBrackets(urlString, '{', '}');
urlString = removeUnpairedBrackets(urlString, '[', ']');
offset = schemePosition + urlString.length();
try {
url = new AnchorURL(urlString);
if(urls != null) {
@@ -406,13 +418,59 @@ public static void findAbsoluteURLs(final String text, final Collection<AnchorUR
}
}
/**
 * Search a text for the first match of a pattern, starting at a given offset.
 *
 * @param s
 *            the text to search in
 * @param m
 *            the compiled pattern to look for
 * @param start
 *            the index in the text where the search begins (from 0 up to the text length)
 * @return the index in the text of the beginning of the first match,
 *         or Integer.MAX_VALUE when the pattern is not found
 */
private static final int find(final String s, final Pattern m, final int start) {
    final Matcher matcher = m.matcher(s);
    /*
     * Limit the search to the end of the text with a region instead of
     * copying the tail with subSequence(). With the default (anchoring,
     * non transparent) bounds the region start behaves like the start of the
     * input, and match positions are reported relative to the whole text.
     */
    matcher.region(start, s.length());
    return matcher.find() ? matcher.start() : Integer.MAX_VALUE;
}
/**
 * Check that the brackets of a string are balanced and, when they are not,
 * cut the string just before the first bracket that breaks the balance.
 * <ul>
 * <li>A closing mark with no pending opening mark : the result ends just before that closing mark.</li>
 * <li>Opening marks still pending at the end of the string : the result ends just before
 * the outermost opening mark of the unfinished group.</li>
 * </ul>
 *
 * @param str
 *            the string to analyze
 * @param openingMark
 *            the opening bracket character (example : '{')
 * @param closingMark
 *            the closing bracket character (example : '}')
 * @return the original string when all brackets are paired (or when it is null), otherwise a truncated copy
 */
protected static String removeUnpairedBrackets(final String str, final char openingMark,
        final char closingMark) {
    if (str == null) {
        return null;
    }
    int nesting = 0;
    /* Position of the opening mark that started the bracket group currently open, -1 when none is open */
    int outermostOpeningPos = -1;
    for (int pos = 0; pos < str.length(); pos++) {
        final char current = str.charAt(pos);
        if (current == openingMark) {
            if (nesting == 0) {
                outermostOpeningPos = pos;
            }
            nesting++;
        } else if (current == closingMark) {
            if (nesting == 0) {
                /* Unpaired closing mark : keep only what precedes it */
                return str.substring(0, pos);
            }
            nesting--;
            if (nesting == 0) {
                /* The whole group is now closed */
                outermostOpeningPos = -1;
            }
        }
    }
    if (nesting > 0) {
        /* One or more opening marks were never closed : drop the whole unfinished group */
        return str.substring(0, outermostOpeningPos);
    }
    return str;
}
/**
* @param relativePath relative path to this document base URL
@@ -25,17 +25,21 @@
import java.io.StringReader;
import java.io.Writer;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Collection;
import java.util.Date;
import java.util.List;
import java.util.Set;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.document.VocabularyScraper;
import net.yacy.kelondro.util.FileUtils;
import org.junit.Assert;
import org.junit.Test;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.document.VocabularyScraper;
import net.yacy.kelondro.util.FileUtils;
/**
* Unit tests for ContentScrapper class.
* @author luc
@@ -158,5 +162,154 @@ public void testGetStartDates() throws MalformedURLException, IOException {
}
scraper.close();
}
/**
 * Test absolute URLs detection in plain text : URLs separated by various
 * white space characters, URLs wrapped in or containing brackets, texts
 * without any valid URL, and empty or null text.
 *
 * @throws MalformedURLException should not happen
 */
@Test
public void testFindAbsoluteURLs() throws MalformedURLException {
    final String[] urlStrings = { "http://yacy.net", "http://forum.yacy.de", "https://en.wikipedia.org" };
    final List<AnchorURL> urls = new ArrayList<>();
    for (String urlString : urlStrings) {
        urls.add(new AnchorURL(urlString));
    }

    /* Test with various white space separators */
    String[] separators = { " ", "\n", "\t", "\r" };
    for (String separator : separators) {
        StringBuilder text = new StringBuilder();
        for (String urlString : urlStrings) {
            if (text.length() > 0) {
                text.append(separator);
            }
            text.append(urlString);
        }
        Collection<AnchorURL> detectedURLs = new ArrayList<>();
        ContentScraper.findAbsoluteURLs(text.toString(), detectedURLs, null);
        Assert.assertEquals(urls.size(), detectedURLs.size());
        Assert.assertTrue(urls.containsAll(detectedURLs));
    }

    /* URLs surrounded with parenthesis */
    String[] texts = { "(http://yacy.net)", "YaCy home page (http://yacy.net)",
            "Nested parentheses (YaCy home page (http://yacy.net))",
            "Text in parenthesis (example : http://yacy.net)", "A markdown link [YaCy home page](http://yacy.net)",
            "A markdown [example](http://yacy.net \"YaCy home page\") inline link" };
    for (String text : texts) {
        Collection<AnchorURL> detectedURLs = new ArrayList<>();
        ContentScraper.findAbsoluteURLs(text, detectedURLs, null);
        Assert.assertEquals(1, detectedURLs.size());
        Assert.assertEquals(new AnchorURL("http://yacy.net"), detectedURLs.iterator().next());
    }

    /* URLs surrounded with square brackets
     * (a URL whose host is a bracketed IPV6 address is tested separately below) */
    String[] squareBracketsTexts = { "[http://yacy.net]", "YaCy home page [http://yacy.net]",
            "Nested brackets [YaCy home page [http://yacy.net]]",
            "A mediawiki external link with different label [http://yacy.net YaCy home page]" };
    for (String text : squareBracketsTexts) {
        Collection<AnchorURL> detectedURLs = new ArrayList<>();
        ContentScraper.findAbsoluteURLs(text, detectedURLs, null);
        Assert.assertEquals(1, detectedURLs.size());
        Assert.assertEquals(new AnchorURL("http://yacy.net"), detectedURLs.iterator().next());
    }

    /* URLs surrounded with curly brackets */
    String[] curlyBracketsTexts = { "{http://yacy.net}", "YaCy home page {http://yacy.net}",
            "Nested brackets {YaCy home page {http://yacy.net}}",
            "Text in brackets {example : http://yacy.net}" };
    for (String text : curlyBracketsTexts) {
        Collection<AnchorURL> detectedURLs = new ArrayList<>();
        ContentScraper.findAbsoluteURLs(text, detectedURLs, null);
        Assert.assertEquals(1, detectedURLs.size());
        Assert.assertEquals(new AnchorURL("http://yacy.net"), detectedURLs.iterator().next());
    }

    /* URL with parenthesis */
    String text = "Example: https://en.wikipedia.org/wiki/Firefox_(disambiguation)";
    Collection<AnchorURL> detectedURLs = new ArrayList<>();
    ContentScraper.findAbsoluteURLs(text, detectedURLs, null);
    Assert.assertEquals(1, detectedURLs.size());
    Assert.assertEquals(new AnchorURL("https://en.wikipedia.org/wiki/Firefox_(disambiguation)"), detectedURLs.iterator().next());

    /* IPV6 host */
    text = "URL with IPV6 host : http://[abcd:ef01:2345:6789:abcd:ef01:2345:6789]";
    detectedURLs = new ArrayList<>();
    ContentScraper.findAbsoluteURLs(text, detectedURLs, null);
    Assert.assertEquals(1, detectedURLs.size());
    Assert.assertEquals(new AnchorURL("http://[abcd:ef01:2345:6789:abcd:ef01:2345:6789]"), detectedURLs.iterator().next());

    /* Text containing only the '://' pattern */
    detectedURLs = new ArrayList<>();
    ContentScraper.findAbsoluteURLs("An absolute URL should contain the '://' pattern", detectedURLs, null);
    Assert.assertEquals(0, detectedURLs.size());

    /* Text containing only the 'http://' and 'https://' patterns */
    detectedURLs = new ArrayList<>();
    ContentScraper.findAbsoluteURLs("An absolute HTTP URL should start with 'http://' or 'https://'", detectedURLs, null);
    Assert.assertEquals(0, detectedURLs.size());

    /* Text containing a malformed URL */
    detectedURLs = new ArrayList<>();
    ContentScraper.findAbsoluteURLs("The URL https://example.com:demo is malformed", detectedURLs, null);
    Assert.assertEquals(0, detectedURLs.size());

    /* Empty text */
    detectedURLs = new ArrayList<>();
    ContentScraper.findAbsoluteURLs("", detectedURLs, null);
    Assert.assertEquals(0, detectedURLs.size());

    /* Null text : really pass null (not an empty string) so that the null guard is exercised */
    detectedURLs = new ArrayList<>();
    ContentScraper.findAbsoluteURLs((String) null, detectedURLs, null);
    Assert.assertEquals(0, detectedURLs.size());
}
/**
 * Test unpaired brackets cleaning, using curly brackets as opening and closing marks
 */
@Test
public void testRemoveUnpairedBrackets() {
    /* Null String */
    Assert.assertNull(ContentScraper.removeUnpairedBrackets(null, '{', '}'));

    /* Each sample holds the string to clean followed by the expected result */
    final String[][] samples = {
            /* Empty string */
            { "", "" },
            /* No bracket at all */
            { "abc", "abc" },
            /* Missing one or more opening mark */
            { "}", "" },
            { "abc}", "abc" },
            { "abc}def", "abc" },
            { "abc}}", "abc" },
            { "abc}def}", "abc" },
            { "{abc}}", "{abc}" },
            { "abc}{def}}", "abc" },
            { "abc}{def}", "abc" },
            { "{abc}def}", "{abc}def" },
            { "{abc}def}hij}", "{abc}def" },
            { "{{abc}{def}}}", "{{abc}{def}}" },
            /* Missing both opening and closing */
            { "abc}de{f", "abc" },
            /* Missing one or more closing mark */
            { "{", "" },
            { "{abc", "" },
            { "abc{def", "abc" },
            { "abc{{", "abc" },
            { "abc{def{", "abc" },
            { "{{abc}", "" },
            { "{abc{def}", "" },
            { "{{abc}{def}}{", "{{abc}{def}}" },
            /* Correctly paired marks */
            { "abc{}", "abc{}" },
            { "{abc}", "{abc}" },
            { "{abc}{def}", "{abc}{def}" },
            { "{{abc}{def}}", "{{abc}{def}}" } };

    for (final String[] sample : samples) {
        final String input = sample[0];
        final String expected = sample[1];
        Assert.assertEquals(expected, ContentScraper.removeUnpairedBrackets(input, '{', '}'));
    }
}
}

0 comments on commit 9b1bb25

Please sign in to comment.