Permalink
Browse files

Improved accuracy of URLs search filters : protocol, tld, host, file ext

  • Loading branch information...
luccioman committed Dec 1, 2017
1 parent d1c7dfd commit 0a120787e3a6ece55d38418d5adb35093c358102
@@ -912,7 +912,7 @@ public String getAuthority() {
}
/**
* @return the host part of this URL, Punycode encoded for Internationalized Domain Names
* @return the host part of this URL, Punycode encoded for Internationalized Domain Names. Can be null, for example for file URLs such as "file:///path/file.ext"
*/
public String getHost() {
return this.host;
@@ -926,6 +926,9 @@ public String getOrganization() {
return orga;
}
/**
* @return the top-level domain name part of this url host name, or the empty string.
*/
public String getTLD() {
if (this.host == null) return "";
int p = this.host.lastIndexOf('.');
@@ -46,6 +46,7 @@
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.RegExp;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrQuery.SortClause;
import org.apache.solr.common.params.DisMaxParams;
@@ -55,6 +56,7 @@
import net.yacy.cora.document.analysis.Classification.ContentDomain;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.federate.solr.Ranking;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.geo.GeoLocation;
@@ -217,7 +219,7 @@ public QueryParams(
}
this.urlMask_isCatchall = this.urlMaskString.equals(catchall_pattern.toString());
if (this.urlMask_isCatchall) {
final String filter = QueryParams.buildURLFilter(modifier, tld);
final String filter = QueryParams.buildApproximateURLFilter(modifier, tld);
if (!QueryParams.catchall_pattern.toString().equals(filter)) {
this.urlMaskString = filter;
this.urlMaskAutomaton = Automata.makeString(filter);
@@ -277,6 +279,13 @@ public QueryParams(
}
/**
* Generate a URL filter from the query modifier and optional tld, usable as a
* first approximation for filtering, and compatible with the yacy/search
* API.<br/>
* For truly accurate filtering, checking constraints against parsed URLs in
* MultiProtocolURL instances is easier and more reliable than building a complex regular
* expression that must be compatible with both the JDK {@link Pattern} and the Lucene {@link RegExp}.
*
* @param modifier
* query modifier with optional protocol, sitehost and filetype
* constraints. The modifier parameter itself must not be null.
@@ -285,7 +294,7 @@ public QueryParams(
* @return an URL filter regular expression from the provided modifier and tld
* constraints, matching anything when there are no constraints at all.
*/
protected static String buildURLFilter(final QueryModifier modifier, final String tld) {
protected static String buildApproximateURLFilter(final QueryModifier modifier, final String tld) {
final String protocolfilter = modifier.protocol == null ? ".*" : modifier.protocol;
final String defaulthostprefix = "www";
final String hostfilter;
@@ -416,6 +425,61 @@ public static String anonymizedQueryHashes(final HandleSet hashes) {
sb.append("]");
return sb.toString();
}
/**
 * Check a parsed URL against the URL-related search constraints: protocol,
 * host name, top-level domain and file extension. All comparisons ignore case.
 * Constraints are checked in that order and the first one that fails is
 * reported.
 *
 * @param modifier
 *            the query modifier with optional constraints on protocol, host
 *            name or file extension. May be null, in which case only the
 *            tld constraint is checked.
 * @param tld
 *            an optional top-level domain name to filter on. May be null.
 * @param url
 *            the url to check
 * @return the constraint that did not match ("url" when url is null,
 *         "protocol", "sitehost", "tld", or "filetype"), or the empty string
 *         when the url matches
 */
public static String matchesURL(final QueryModifier modifier, final String tld, final MultiProtocolURL url) {
    if (url == null) {
        return "url";
    }
    if (modifier != null) {
        if (modifier.protocol != null && !modifier.protocol.equalsIgnoreCase(url.getProtocol())) {
            return "protocol";
        }
        if (modifier.sitehost != null) {
            /*
             * consider to search for hosts with 'www'-prefix, if not already part of the
             * host name
             */
            final String wwwPrefix = "www.";
            final String host;
            final String hostWithWwwPrefix;
            // host names are case-insensitive, so detect the prefix regardless of case
            if (modifier.sitehost.regionMatches(true, 0, wwwPrefix, 0, wwwPrefix.length())) {
                hostWithWwwPrefix = modifier.sitehost;
                host = modifier.sitehost.substring(wwwPrefix.length());
            } else {
                hostWithWwwPrefix = wwwPrefix + modifier.sitehost;
                host = modifier.sitehost;
            }
            // the url host can be null (e.g. file URLs): equalsIgnoreCase(null) is
            // false, so such URLs are rejected without a NullPointerException
            final String urlHost = url.getHost();
            if (!host.equalsIgnoreCase(urlHost) && !hostWithWwwPrefix.equalsIgnoreCase(urlHost)) {
                return "sitehost";
            }
        }
    }
    // the tld constraint does not depend on the modifier, so it is checked even
    // when no modifier is provided
    if (tld != null && !tld.equalsIgnoreCase(url.getTLD())) {
        return "tld";
    }
    if (modifier != null && modifier.filetype != null
            && !modifier.filetype.equalsIgnoreCase(MultiProtocolURL.getFileExtension(url.getFileName()))) {
        return "filetype";
    }
    return "";
}
/**
* check if the given text matches with the query
@@ -963,6 +963,16 @@ public void addNodes(
try {
pollloop: for (URIMetadataNode iEntry: nodeList) {
// check url related eventual constraints (protocol, tld, sitehost, and filetype)
final String matchingResult = QueryParams.matchesURL(this.query.modifier, this.query.tld, iEntry.url());
if (!matchingResult.isEmpty()) {
if (log.isFine()) {
log.fine("dropped Node: " + matchingResult);
}
updateCountsOnSolrEntryToEvict(iEntry, facets, local, !incrementNavigators);
continue pollloop;
}
if ( !this.query.urlMask_isCatchall ) {
// check url mask
if (!iEntry.matches(this.query.urlMaskPattern)) {
@@ -1019,13 +1029,6 @@ public void addNodes(
updateCountsOnSolrEntryToEvict(iEntry, facets, local, !incrementNavigators);
continue pollloop;
}
} else {
// filter out all domains that do not match with the site constraint
if (iEntry.url().getHost().indexOf(this.query.modifier.sitehost) < 0) {
if (log.isFine()) log.fine("dropped Node: sitehost");
updateCountsOnSolrEntryToEvict(iEntry, facets, local, !incrementNavigators);
continue pollloop;
}
}
if (this.query.modifier.language != null) {
@@ -1393,6 +1396,16 @@ public URIMetadataNode pullOneFilteredFromRWI(final boolean skipDoubleDom) {
// returns from the current RWI list the best URL entry and removes this entry from the list
URIMetadataNode page;
mainloop: while ((page = pullOneRWI(skipDoubleDom)) != null) {
// check url related eventual constraints (protocol, tld, sitehost, and filetype)
final String matchingResult = QueryParams.matchesURL(this.query.modifier, this.query.tld, page.url());
if (!matchingResult.isEmpty()) {
if (log.isFine()) {
log.fine("dropped RWI: no match on " + matchingResult);
}
decrementCounts(page.word());
continue;
}
if (!this.query.urlMask_isCatchall && !page.matches(this.query.urlMaskPattern)) {
if (log.isFine()) log.fine("dropped RWI: no match with urlMask");
@@ -1427,14 +1440,6 @@ public URIMetadataNode pullOneFilteredFromRWI(final boolean skipDoubleDom) {
}
// filter query modifiers variables (these are host, filetype, protocol, language, author, collection, dates_in_content(on,from,to,timezone) )
// while ( protocol, host, filetype ) currently maybe incorporated in (this.query.urlMaskPattern) queryparam
// check modifier constraint filetype (using fileextension)
if (this.query.modifier.filetype != null && !this.query.modifier.filetype.equals(ext)) {
if (log.isFine()) log.fine("dropped RWI: file type constraint = " + this.query.modifier.filetype);
decrementCounts(page.word());
continue;
}
/* check again modifier constraint (language) with the language in the full metadata,
* that may differ from the one in the reverse word reference which is already checked in addRWIs()*/
@@ -1480,12 +1485,12 @@ public URIMetadataNode pullOneFilteredFromRWI(final boolean skipDoubleDom) {
// content control
if (Switchboard.getSwitchboard().getConfigBool("contentcontrol.enabled", false)) {
FilterEngine f = ContentControlFilterUpdateThread.getNetworkFilter();
if (f != null && !f.isListed(page.url(), null)) {
FilterEngine f = ContentControlFilterUpdateThread.getNetworkFilter();
if (f != null && !f.isListed(page.url(), null)) {
if (log.isFine()) log.fine("dropped RWI: url is blacklisted in contentcontrol");
decrementCounts(page.word());
decrementCounts(page.word());
continue;
}
}
}
final String pageurl = page.url().toNormalform(true);
Oops, something went wrong.

0 comments on commit 0a12078

Please sign in to comment.