Permalink
Browse files

Added optional search parameter/setting to control content domain filter

This makes it possible to choose, in the configuration or per search request,
whether or not to extend results beyond the strict content domain filter
(image, video, audio or application).

Related graphical controls to be added to user interface.
  • Loading branch information...
luccioman committed Dec 23, 2017
1 parent f52217c commit e6907fdab37bb05d9766b2ea67a9d25f45e7f42c
@@ -843,6 +843,17 @@ search.audio = false
search.video = false
search.app = false
# Strict content domain filtering : when false, results can be extended to documents including links to documents
# of contentdom type, without being themselves of that type.
# Examples :
# - contentdom search param == image, strictContentDom == true
# - jpeg image : acceptable result
# - html page embedding images : rejected
# - contentdom search param == image, strictContentDom == false
# - jpeg image : acceptable result
# - html page embedding images : acceptable result
search.strictContentDom = false
# number of search results per page displayed by default
search.items = 10
@@ -116,6 +116,7 @@ public static serverObjects respond(final RequestHeader header, final serverObje
final int maxdist= post.getInt("maxdist", Integer.MAX_VALUE);
final String prefer = post.get("prefer", "");
final String contentdom = post.get("contentdom", "all");
final boolean strictContentDom = post.getBoolean("strictContentDom");
final String filter = post.get("filter", ".*"); // a filter on the url
final int timezoneOffset = post.getInt("timezoneOffset", 0);
QueryModifier modifier = new QueryModifier(timezoneOffset);
@@ -255,6 +256,7 @@ public static serverObjects respond(final RequestHeader header, final serverObje
0.0d,
new String[0]
);
theQuery.setStrictContentDom(strictContentDom);
Network.log.info("INIT HASH SEARCH (abstracts only): " + QueryParams.anonymizedQueryHashes(theQuery.getQueryGoal().getIncludeHashes()) + " - " + theQuery.itemsPerPage() + " links");
final long timer = System.currentTimeMillis();
@@ -319,6 +321,7 @@ public static serverObjects respond(final RequestHeader header, final serverObje
0.0d,
new String[0]
);
theQuery.setStrictContentDom(strictContentDom);
Network.log.info("INIT HASH SEARCH (query-" + abstracts + "): " + QueryParams.anonymizedQueryHashes(theQuery.getQueryGoal().getIncludeHashes()) + " - " + theQuery.itemsPerPage() + " links");
EventChannel.channels(EventChannel.REMOTESEARCH).addMessage(new RSSMessage("Remote Search Request from " + ((remoteSeed == null) ? "unknown" : remoteSeed.getName()), QueryParams.anonymizedQueryHashes(theQuery.getQueryGoal().getIncludeHashes()), ""));
if (sb.getConfigBool(SwitchboardConstants.DECORATION_AUDIO, false)) Audio.Soundclip.remotesearch.play(-10.0f);
@@ -290,6 +290,11 @@ public static serverObjects respond(
// find search domain
final Classification.ContentDomain contentdom = post == null || !post.containsKey("contentdom") ? ContentDomain.ALL : ContentDomain.contentdomParser(post.get("contentdom", "all"));
// Strict/extended content domain constraint : configured setting may be overridden by request param
final boolean strictContentDom = !Boolean.FALSE.toString().equalsIgnoreCase(post.get("strictContentDom",
sb.getConfig(SwitchboardConstants.SEARCH_STRICT_CONTENT_DOM,
String.valueOf(SwitchboardConstants.SEARCH_STRICT_CONTENT_DOM_DEFAULT))));
// check the search tracker
TreeSet<Long> trackerHandles = sb.localSearchTracker.get(client);
@@ -692,6 +697,7 @@ public static serverObjects respond(
header.get(HeaderFramework.USER_AGENT, ""),
lat, lon, rad,
sb.getConfigArray("search.navigation", ""));
theQuery.setStrictContentDom(strictContentDom);
theQuery.setStandardFacetsMaxCount(sb.getConfigInt(SwitchboardConstants.SEARCH_NAVIGATION_MAXCOUNT,
QueryParams.FACETS_STANDARD_MAXCOUNT_DEFAULT));
theQuery.setDateFacetMaxCount(sb.getConfigInt(SwitchboardConstants.SEARCH_NAVIGATION_DATES_MAXCOUNT,
@@ -605,7 +605,7 @@ private static void processImage(final Switchboard sb, final serverObjects prop,
final SearchEvent theSearch, final String target_special_pattern, long timeout, boolean fullViewingRights, final boolean noreferrer) {
prop.put("content", theSearch.query.contentdom.getCode() + 1); // switch on specific content
try {
SearchEvent.ImageResult image = theSearch.oneImageResult(item, timeout);
SearchEvent.ImageResult image = theSearch.oneImageResult(item, timeout, theSearch.query.isStrictContentDom());
final String imageUrlstring = image.imageUrl.toNormalform(true);
final String imageUrlExt = MultiProtocolURL.getFileExtension(image.imageUrl.getFileName());
final String target = sb.getConfig(imageUrlstring.matches(target_special_pattern) ? SwitchboardConstants.SEARCH_TARGET_SPECIAL : SwitchboardConstants.SEARCH_TARGET_DEFAULT, "_self");
@@ -488,6 +488,7 @@ protected static int primarySearch(
final String excludehashes,
final String language,
final ContentDomain contentdom,
final boolean strictContentDom,
final int count,
final long time,
final int maxDistance,
@@ -533,6 +534,7 @@ protected static int primarySearch(
"",
language,
contentdom,
strictContentDom,
count,
time,
maxDistance,
@@ -600,6 +602,7 @@ protected static int secondarySearch(
final String wordhashes,
final String urlhashes,
final ContentDomain contentdom,
final boolean strictContentDom,
final int count,
final long time,
final int maxDistance,
@@ -624,6 +627,7 @@ protected static int secondarySearch(
urlhashes,
"",
contentdom,
strictContentDom,
count,
time,
maxDistance,
@@ -889,6 +893,7 @@ public SearchResult(
final String urlhashes,
final String language,
final ContentDomain contentdom,
final boolean strictContentDom,
final int count,
final long time,
final int maxDistance,
@@ -941,6 +946,9 @@ public SearchResult(
//parts.put("sitehost", UTF8.StringBody(event.query.modifier.sitehost));
parts.put("author", UTF8.StringBody(event.query.modifier.author));
parts.put("contentdom", UTF8.StringBody(contentdom == null ? ContentDomain.ALL.toString() : contentdom.toString()));
if(strictContentDom) {
parts.put("strictContentDom", UTF8.StringBody("true"));
}
parts.put("maxdist", UTF8.StringBody(Integer.toString(maxDistance)));
parts.put("profile", UTF8.StringBody(crypt.simpleEncode(event.query.ranking.toExternalString())));
parts.put("constraint", UTF8.StringBody((event.query.constraint == null) ? "" : event.query.constraint.exportB64()));
@@ -61,6 +61,7 @@
final private SearchEvent event;
final private String wordhashes, excludehashes;
final private ContentDomain contentdom;
final private boolean strictContentDom;
final private int partitions;
final private SecondarySearchSuperviser secondarySearchSuperviser;
final private Blacklist blacklist;
@@ -78,6 +79,7 @@ public RemoteSearch(
final String excludehashes,
final String language,
final ContentDomain contentdom,
final boolean strictContentDom,
final int count,
final long time,
final int maxDistance,
@@ -91,6 +93,7 @@ public RemoteSearch(
this.excludehashes = excludehashes;
this.language = language;
this.contentdom = contentdom;
this.strictContentDom = strictContentDom;
this.partitions = partitions;
this.secondarySearchSuperviser = secondarySearchSuperviser;
this.blacklist = blacklist;
@@ -114,6 +117,7 @@ public void run() {
this.excludehashes,
this.language,
this.contentdom,
this.strictContentDom,
this.count,
this.time,
this.maxDistance,
@@ -264,7 +268,8 @@ public static void primaryRemoteSearches(
// start solr searches
final int targets = dhtPeers.size() + robinsonPeers.size();
if (!sb.getConfigBool(SwitchboardConstants.DEBUG_SEARCH_REMOTE_SOLR_OFF, false)) {
final SolrQuery solrQuery = event.query.solrQuery(event.getQuery().contentdom, useFacets, event.excludeintext_image);
final SolrQuery solrQuery = event.query.solrQuery(event.getQuery().contentdom,
event.query.isStrictContentDom(), useFacets, event.excludeintext_image);
for (Seed s: robinsonPeers) {
if (MemoryControl.shortStatus()
|| Memory.load() > sb.getConfigFloat(SwitchboardConstants.REMOTESEARCH_MAXLOAD_SOLR,
@@ -292,6 +297,7 @@ public static void primaryRemoteSearches(
QueryParams.hashSet2hashString(event.query.getQueryGoal().getExcludeHashes()),
event.query.targetlang == null ? "" : event.query.targetlang,
event.query.contentdom == null ? ContentDomain.ALL : event.query.contentdom,
event.query.isStrictContentDom(),
count,
time,
event.query.maxDistance,
@@ -336,6 +342,7 @@ public void run() {
QueryParams.hashSet2hashString(wordhashes),
urlhashes,
ContentDomain.ALL,
false,
20,
time,
999,
@@ -560,6 +560,16 @@
public static final String SEARCH_VERIFY = "search.verify";
public static final String SEARCH_VERIFY_DELETE = "search.verify.delete";
/**
* Key of the setting controlling whether content domain filtering is strict :
* when false, results can be extended to documents including links to documents
* of contentdom type, without being themselves of that type.
*/
public static final String SEARCH_STRICT_CONTENT_DOM = "search.strictContentDom";
/** Default setting value controlling whether content domain filtering is strict. */
public static final boolean SEARCH_STRICT_CONTENT_DOM_DEFAULT = false;
/** Key of the setting controlling whether search results resorting by browser JavaScript is enabled */
public static final String SEARCH_JS_RESORT = "search.jsresort";
@@ -368,77 +368,94 @@ public StringBuilder collectionTextQuery() {
/**
* Generate a Solr filter query to receive valid image results.
*
* This filters error-urls out and includes urls with mime image/* as well
* as urls with links to images.
* This filters error-urls out and includes urls with mime image/*, as well
* as urls with links to images when strict is false.
* We use the mime (image/*) only to find images as the parser assigned the
* best mime to index documents. This applies also to parsed file systems.
* This ensures that no text urls with image-fileextension is returned
* (as some large internet sites like to use such urls)
*
* @param strict when true, do not include non-image urls with links to images
* @return Solr filter query for image urls
*/
public List<String> collectionImageFilterQuery() {
public List<String> collectionImageFilterQuery(final boolean strict) {
final ArrayList<String> fqs = new ArrayList<>();
// add filter to prevent that results come from failed urls
fqs.add(CollectionSchema.httpstatus_i.getSolrFieldName() + ":" + HttpStatus.SC_OK);
fqs.add(
CollectionSchema.content_type.getSolrFieldName() + ":(image/*) OR " +
CollectionSchema.images_urlstub_sxt.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM);
StringBuilder filter = new StringBuilder(CollectionSchema.content_type.getSolrFieldName()).append(":(image/*)");
if (!strict) {
filter.append(" OR ").append(CollectionSchema.images_urlstub_sxt.getSolrFieldName())
.append(AbstractSolrConnector.CATCHALL_DTERM);
}
fqs.add(filter.toString());
return fqs;
}
/**
* Generate Solr filter queries to receive valid video content results.
* Generate Solr filter queries to receive valid audio content results.
*
* This filters out documents with bad HTTP status and includes documents with MIME type matching the prefix video/* as well
* docuemnts with links to video content.
* This filters out documents with bad HTTP status and includes documents with MIME type matching the prefix audio/* as well
* documents with links to audio content when strict is false.
*
* @return Solr filter queries for video content URLs
* @param strict when true, do not include non-audio urls with links to audio
* @return Solr filter queries for audio content URLs
*/
public List<String> collectionAudioFilterQuery() {
public List<String> collectionAudioFilterQuery(final boolean strict) {
final ArrayList<String> fqs = new ArrayList<>();
// add filter to prevent that results come from failed urls
fqs.add(CollectionSchema.httpstatus_i.getSolrFieldName() + ":" + HttpStatus.SC_OK);
fqs.add(CollectionSchema.content_type.getSolrFieldName() + ":(audio/*) OR "
+ CollectionSchema.audiolinkscount_i.getSolrFieldName() + ":[1 TO *]");
StringBuilder filter = new StringBuilder(CollectionSchema.content_type.getSolrFieldName()).append(":(audio/*)");
if (!strict) {
filter.append(" OR ").append(CollectionSchema.audiolinkscount_i.getSolrFieldName()).append(":[1 TO *]");
}
fqs.add(filter.toString());
return fqs;
}
/**
* Generate Solr filter queries to receive valid video content results.
*
* This filters out documents with bad HTTP status and includes documents with MIME type matching the prefix video/* as well
* docuemnts with links to video content.
* documents with links to video content when strict is false.
*
* @param strict when true, do not include non-video urls with links to video
* @return Solr filter queries for video content URLs
*/
public List<String> collectionVideoFilterQuery() {
public List<String> collectionVideoFilterQuery(final boolean strict) {
final ArrayList<String> fqs = new ArrayList<>();
// add filter to prevent that results come from failed urls
fqs.add(CollectionSchema.httpstatus_i.getSolrFieldName() + ":" + HttpStatus.SC_OK);
fqs.add(CollectionSchema.content_type.getSolrFieldName() + ":(video/*) OR "
+ CollectionSchema.videolinkscount_i.getSolrFieldName() + ":[1 TO *]");
StringBuilder filter = new StringBuilder(CollectionSchema.content_type.getSolrFieldName()).append(":(video/*)");
if (!strict) {
filter.append(" OR ").append(CollectionSchema.videolinkscount_i.getSolrFieldName()).append(":[1 TO *]");
}
fqs.add(filter.toString());
return fqs;
}
/**
* Generate Solr filter queries to receive valid application specific content results.
*
* This filters out documents with bad HTTP status and includes documents with MIME type matching the prefix application/* as well
* docuemnts with links to application specific content.
documents with links to application specific content when strict is false.
*
* @param strict when true, do not include non-application urls with links to application specific content
* @return Solr filter queries for application specific content URLs
*/
public List<String> collectionApplicationFilterQuery() {
public List<String> collectionApplicationFilterQuery(final boolean strict) {
final ArrayList<String> fqs = new ArrayList<>();
// add filter to prevent that results come from failed urls
fqs.add(CollectionSchema.httpstatus_i.getSolrFieldName() + ":" + HttpStatus.SC_OK);
fqs.add(CollectionSchema.content_type.getSolrFieldName() + ":(application/*) OR "
+ CollectionSchema.applinkscount_i.getSolrFieldName() + ":[1 TO *]");
StringBuilder filter = new StringBuilder(CollectionSchema.content_type.getSolrFieldName())
.append(":(application/*)");
if (!strict) {
filter.append(" OR ").append(CollectionSchema.applinkscount_i.getSolrFieldName()).append(":[1 TO *]");
}
fqs.add(filter.toString());
return fqs;
}
Oops, something went wrong.

0 comments on commit e6907fd

Please sign in to comment.