Permalink
Browse files

Prefer fine URL match over approximate URL mask regex on final filtering

Also prevent adding a redundant and CPU costly Solr url mask filter
query when possible
  • Loading branch information...
luccioman committed Dec 1, 2017
1 parent 0a12078 commit c9d80b5b770935bfb8aba2c8eaf39e785883e43e
Showing with 19 additions and 6 deletions.
  1. +14 −3 source/net/yacy/search/query/QueryParams.java
  2. +5 −3 source/net/yacy/search/query/SearchEvent.java
@@ -120,12 +120,17 @@ public String toString() {
private final QueryGoal queryGoal;
public int itemsPerPage;
public int offset;
/** The URL mask pattern compiled from the urlMaskString.
* Null when the urlMaskString is not user-provided but generated from the query modifiers */
public Pattern urlMaskPattern;
public Automaton urlMaskAutomaton;
public String urlMaskString;
public final Pattern prefer;
public final String tld, inlink;
/** true when the urlMaskString is just a catch-all pattern such as ".*" */
boolean urlMask_isCatchall;
public final Classification.ContentDomain contentdom;
public final String targetlang;
@@ -224,7 +229,9 @@ public QueryParams(
this.urlMaskString = filter;
this.urlMaskAutomaton = Automata.makeString(filter);
this.urlMask_isCatchall = false;
this.urlMaskPattern = Pattern.compile(filter);
/* We deliberately leave urlMaskPattern null here:
* the final URL match check will be done with the more accurate matchesURL function */
this.urlMaskPattern = null;
}
}
this.tld = tld;
@@ -427,6 +434,10 @@ public static String anonymizedQueryHashes(final HandleSet hashes) {
}
/**
* Check whether the given URL matches the modifier and top-level domain
* constraints, if any. Should be preferred as more accurate than the url mask pattern generated with
* {@link #buildApproximateURLFilter(QueryModifier, String)}.
*
* @param modifier
* the query modifier with optional constraints on protocol, host
* name or file extension
@@ -727,8 +738,8 @@ private SolrQuery getBasicParams(final boolean getFacets, final List<String> fqs
fqs.add(CollectionSchema.outboundlinks_urlstub_sxt.getSolrFieldName() + ":\"" + this.inlink + '\"');
}
if (!this.urlMask_isCatchall) {
// add a filter query on urls
if (!this.urlMask_isCatchall && this.urlMaskPattern != null) {
// add a filter query on urls only if the mask is user-provided and not generated from other modifiers
fqs.add(CollectionSchema.sku.getSolrFieldName() + ":/" + this.urlMaskString + "/");
}
@@ -973,8 +973,8 @@ public void addNodes(
continue pollloop;
}
if ( !this.query.urlMask_isCatchall ) {
// check url mask
if ( !this.query.urlMask_isCatchall && this.query.urlMaskPattern != null) {
// check url mask, only when not redundant with query modifier and tld constraints
if (!iEntry.matches(this.query.urlMaskPattern)) {
if (log.isFine()) log.fine("dropped Node: url mask does not match");
updateCountsOnSolrEntryToEvict(iEntry, facets, local, !incrementNavigators);
@@ -1407,7 +1407,9 @@ public URIMetadataNode pullOneFilteredFromRWI(final boolean skipDoubleDom) {
continue;
}
if (!this.query.urlMask_isCatchall && !page.matches(this.query.urlMaskPattern)) {
if (!this.query.urlMask_isCatchall && this.query.urlMaskPattern != null
&& !page.matches(this.query.urlMaskPattern)) {
// check url mask, only when not redundant with query modifier and tld constraints
if (log.isFine()) log.fine("dropped RWI: no match with urlMask");
decrementCounts(page.word());
continue;

0 comments on commit c9d80b5

Please sign in to comment.