Skip to content

Commit

Permalink
enhanced postprocessing status report
Browse files Browse the repository at this point in the history
  • Loading branch information
Orbiter committed Jul 16, 2014
1 parent b5fc2b6 commit 8514bff
Show file tree
Hide file tree
Showing 3 changed files with 180 additions and 168 deletions.
2 changes: 1 addition & 1 deletion htroot/api/status_p.java
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ public static serverObjects respond(@SuppressWarnings("unused") final RequestHea

prop.put("postprocessingCollectionRemainingCount", collectionRemainingCount);
prop.put("postprocessingWebgraphRemainingCount", webgraphRemainingCount);
prop.put("postprocessingRunning_activity", collectionRemainingCount == CollectionConfiguration.postprocessingCollection1Count && webgraphRemainingCount == CollectionConfiguration.postprocessingWebgraphCount ? "citation computation" : collectionRemainingCount == CollectionConfiguration.postprocessingCollection1Count ? "webgraph" : "collection");
prop.put("postprocessingRunning_activity", CollectionConfiguration.postprocessingActivity);
prop.put("postprocessingSpeed", speed);
prop.put("postprocessingElapsedTime", timeSinceStart);
prop.put("postprocessingRemainingTime", remainingTime);
Expand Down
160 changes: 0 additions & 160 deletions source/net/yacy/cora/federate/solr/SchemaConfiguration.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,27 +23,17 @@
import java.io.File;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.SolrInputField;

import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.storage.Configuration;
import net.yacy.cora.storage.HandleSet;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.search.index.Segment;
import net.yacy.search.index.Segment.ReferenceReport;
import net.yacy.search.index.Segment.ReferenceReportCache;
import net.yacy.search.schema.CollectionSchema;

public class SchemaConfiguration extends Configuration implements Serializable {

Expand Down Expand Up @@ -107,156 +97,6 @@ public SolrDocument toSolrDocument(final SolrInputDocument doc, Set<String> omit
return sd;
}

/**
 * Updates the {@code http_unique_b} flag of the given document by looking at
 * the same URL under the opposite scheme (http versus https).
 * <p>
 * Nothing happens when the schema does not contain {@code http_unique_b} or
 * when the URL is neither HTTP nor HTTPS. Otherwise the counterpart document
 * is fetched by id from the default connector and handed, together with
 * {@code sid}, to {@code set_unique_flag}.
 *
 * @param segment the segment whose fulltext default connector is queried
 * @param sid     the document whose flag may be rewritten
 * @param url     the URL of {@code sid}
 */
public void postprocessing_http_unique(Segment segment, SolrInputDocument sid, DigestURL url) {
    if (!this.contains(CollectionSchema.http_unique_b)) {
        return;
    }
    if (!(url.isHTTPS() || url.isHTTP())) {
        return;
    }
    try {
        // build the same address with the scheme swapped
        final String swappedScheme = url.isHTTP() ? "https://" : "http://";
        final DigestURL counterpart = new DigestURL(swappedScheme + url.urlstub(true, true));
        final SolrDocument counterpartDoc = segment.fulltext().getDefaultConnector().getDocumentById(
                ASCII.String(counterpart.hash()),
                CollectionSchema.http_unique_b.getSolrFieldName());
        set_unique_flag(CollectionSchema.http_unique_b, sid, counterpartDoc);
    } catch (final IOException e) {
        // a failed lookup is ignored; the flag in sid is left as it was
    }
}

/**
 * Updates the {@code www_unique_b} flag of the given document by looking at
 * the same URL with the leading {@code "www."} toggled: it is removed when
 * the URL stub starts with it and prepended otherwise.
 * <p>
 * Nothing happens when the schema does not contain {@code www_unique_b}.
 * The counterpart document is fetched by id from the default connector and
 * handed, together with {@code sid}, to {@code set_unique_flag}.
 *
 * @param segment the segment whose fulltext default connector is queried
 * @param sid     the document whose flag may be rewritten
 * @param url     the URL of {@code sid}
 */
public void postprocessing_www_unique(Segment segment, SolrInputDocument sid, DigestURL url) {
    if (!this.contains(CollectionSchema.www_unique_b)) {
        return;
    }
    final String stub = url.urlstub(true, true);
    try {
        final String protocol = url.getProtocol();
        final String remainder;
        if (stub.startsWith("www.")) {
            remainder = "://" + stub.substring(4);
        } else {
            remainder = "://www." + stub;
        }
        final DigestURL counterpart = new DigestURL(protocol + remainder);
        final SolrDocument counterpartDoc = segment.fulltext().getDefaultConnector().getDocumentById(
                ASCII.String(counterpart.hash()),
                CollectionSchema.www_unique_b.getSolrFieldName());
        set_unique_flag(CollectionSchema.www_unique_b, sid, counterpartDoc);
    } catch (final IOException e) {
        // a failed lookup is ignored; the flag in sid is left as it was
    }
}

/**
 * Flips the boolean {@code field} in {@code sid} whenever it currently has
 * the same value as the field in the counterpart document {@code d}.
 * <p>
 * A missing field value, or a {@code null} counterpart document, counts as
 * {@code false}. When both values are equal, {@code sid} receives the
 * negation of its current value; when they differ, {@code sid} is left
 * untouched.
 *
 * @param field the boolean schema field to compare and possibly rewrite
 * @param sid   the document that may be modified
 * @param d     the counterpart document, may be {@code null}
 */
private void set_unique_flag(CollectionSchema field, SolrInputDocument sid, SolrDocument d) {
    final String fieldName = field.getSolrFieldName();

    final Object ownValue = sid.getFieldValue(fieldName);
    final boolean ownFlag = ownValue == null ? false : ((Boolean) ownValue).booleanValue();

    Object otherValue = null;
    if (d != null) {
        otherValue = d.getFieldValue(fieldName);
    }
    final boolean otherFlag = otherValue == null ? false : ((Boolean) otherValue).booleanValue();

    if (ownFlag != otherFlag) {
        return;
    }
    sid.setField(fieldName, !ownFlag);
}

/**
 * Computes the duplicate-content flags of a document.
 * <p>
 * Two independent checks are made, each only when the schema contains all
 * fields it needs:
 * <ol>
 * <li>For the exact and the fuzzy content signature, other documents of the
 *     same host carrying the same signature are queried. The unique flag is
 *     set to {@code true} when none exist, or when none of the returned
 *     documents has an id that is already in {@code uniqueURLs}. The
 *     copy-count field is set to the number of found documents plus one.</li>
 * <li>For title and description, and only if the document is not marked
 *     noindex, has status 200 (or no status) and has no foreign canonical
 *     link, the unique flag is set to whether no other matching document
 *     of the same host has the same signature.</li>
 * </ol>
 * Finally the hash of {@code url} is added to {@code uniqueURLs}.
 * <p>
 * If an index lookup fails with an {@link IOException}, the fields for that
 * check are left unchanged.
 *
 * @param segment    the segment whose fulltext default connector is queried
 * @param uniqueURLs url hashes recorded by earlier calls; the hash of
 *                   {@code url} is added to it
 * @param sid        the document whose unique/copy-count fields are written
 * @param url        the URL of {@code sid}
 */
public void postprocessing_doublecontent(Segment segment, Set<String> uniqueURLs, SolrInputDocument sid, DigestURL url) {
    // FIND OUT IF THIS IS A DOUBLE DOCUMENT
    String urlhash = ASCII.String(url.hash());
    String hostid = url.hosthash();
    uniquecheck: for (CollectionSchema[] checkfields: new CollectionSchema[][]{
            {CollectionSchema.exact_signature_l, CollectionSchema.exact_signature_unique_b, CollectionSchema.exact_signature_copycount_i},
            {CollectionSchema.fuzzy_signature_l, CollectionSchema.fuzzy_signature_unique_b, CollectionSchema.fuzzy_signature_copycount_i}}) {
        CollectionSchema signaturefield = checkfields[0];
        CollectionSchema uniquefield = checkfields[1];
        CollectionSchema countfield = checkfields[2];
        if (this.contains(signaturefield) && this.contains(uniquefield) && this.contains(countfield)) {
            // lookup the document with the same signature
            // getFieldValue (unlike getField(..).getValue()) does not fail when
            // the document carries no such field; the null check below covers it
            Long signature = (Long) sid.getFieldValue(signaturefield.getSolrFieldName());
            if (signature == null) continue uniquecheck;
            try {
                SolrDocumentList docs = segment.fulltext().getDefaultConnector().getDocumentListByQuery("-" + CollectionSchema.id.getSolrFieldName() + ":\"" + urlhash + "\" AND " + CollectionSchema.host_id_s.getSolrFieldName() + ":\"" + hostid + "\" AND " + signaturefield.getSolrFieldName() + ":\"" + signature.toString() + "\"", null, 0, 100, CollectionSchema.id.getSolrFieldName());
                if (docs.getNumFound() == 0) {
                    sid.setField(uniquefield.getSolrFieldName(), true);
                    sid.setField(countfield.getSolrFieldName(), 1);
                } else {
                    // this document is the first appearance unless one of the
                    // other documents with the same signature was seen before;
                    // every returned document must be inspected, so the loop
                    // stops only once a previously seen one is found
                    boolean firstappearance = true;
                    for (SolrDocument d: docs) {
                        if (uniqueURLs.contains(d.getFieldValue(CollectionSchema.id.getSolrFieldName()))) {
                            firstappearance = false;
                            break;
                        }
                    }
                    sid.setField(uniquefield.getSolrFieldName(), firstappearance);
                    sid.setField(countfield.getSolrFieldName(), docs.getNumFound() + 1); // the current url was excluded from search but is included in count
                }
            } catch (final IOException e) {
                // best effort: on a failed lookup the fields are not written
            }
        }
    }

    // CHECK IF TITLE AND DESCRIPTION IS UNIQUE (this is by default not switched on)
    // in case that the document has no status code 200, has a noindex attribute
    // or a canonical tag which does not point to the document itself,
    // then the unique-field is not written at all!
    Integer robots_i = this.contains(CollectionSchema.robots_i) ? (Integer) sid.getFieldValue(CollectionSchema.robots_i.getSolrFieldName()) : null;
    Integer httpstatus_i = this.contains(CollectionSchema.httpstatus_i) ? (Integer) sid.getFieldValue(CollectionSchema.httpstatus_i.getSolrFieldName()) : null;
    String canonical_s = this.contains(CollectionSchema.canonical_s) ? (String) sid.getFieldValue(CollectionSchema.canonical_s.getSolrFieldName()) : null;
    Boolean canonical_equal_sku_b = this.contains(CollectionSchema.canonical_equal_sku_b) ? (Boolean) sid.getFieldValue(CollectionSchema.canonical_equal_sku_b.getSolrFieldName()) : null;
    if (segment.fulltext().getDefaultConfiguration().contains(CollectionSchema.host_id_s) &&
        (robots_i == null || (robots_i.intValue() & (1 << 9)) == 0 /*noindex in http X-ROBOTS*/ && (robots_i.intValue() & (1 << 3)) == 0 /*noindex in html metas*/ ) &&
        (canonical_s == null || canonical_s.length() == 0 || (canonical_equal_sku_b != null && canonical_equal_sku_b.booleanValue()) || url.toNormalform(true).equals(canonical_s)) &&
        (httpstatus_i == null || httpstatus_i.intValue() == 200)) {
        uniquecheck: for (CollectionSchema[] checkfields: new CollectionSchema[][] {
                {CollectionSchema.title, CollectionSchema.title_exact_signature_l, CollectionSchema.title_unique_b},
                {CollectionSchema.description_txt, CollectionSchema.description_exact_signature_l, CollectionSchema.description_unique_b}}) {
            CollectionSchema checkfield = checkfields[0];
            CollectionSchema signaturefield = checkfields[1];
            CollectionSchema uniquefield = checkfields[2];
            if (this.contains(checkfield) && this.contains(signaturefield) && this.contains(uniquefield)) {
                // lookup in the index within the same hosts for the same title or description
                //String checkstring = checkfield == CollectionSchema.title ? document.dc_title() : document.dc_description();
                Long signature = (Long) sid.getFieldValue(signaturefield.getSolrFieldName());
                if (signature == null) {
                    continue uniquecheck;
                }
                try {
                    long doccount = segment.fulltext().getDefaultConnector().getCountByQuery(
                            CollectionSchema.host_id_s.getSolrFieldName() + ":\"" + hostid + "\" AND " +
                            "-" + CollectionSchema.robots_i.getSolrFieldName() + ":8 AND " + // bit 3
                            "-" + CollectionSchema.robots_i.getSolrFieldName() + ":24 AND " + // bit 3 + 4
                            "-" + CollectionSchema.robots_i.getSolrFieldName() + ":512 AND " + // bit 9
                            "-" + CollectionSchema.robots_i.getSolrFieldName() + ":1536 AND " + // bit 9 + 10
                            "(-" + CollectionSchema.canonical_equal_sku_b.getSolrFieldName() + ":[* TO *] OR " + CollectionSchema.canonical_equal_sku_b.getSolrFieldName() + ":true ) AND " +
                            CollectionSchema.httpstatus_i.getSolrFieldName() + ":200 AND " +
                            "-" + CollectionSchema.id.getSolrFieldName() + ":\"" + urlhash + "\" AND " +
                            signaturefield.getSolrFieldName() + ":\"" + signature.toString() + "\"");
                    sid.setField(uniquefield.getSolrFieldName(), doccount == 0);
                } catch (final IOException e) {
                    // best effort: on a failed lookup the field is not written
                }
            }
        }
    }
    uniqueURLs.add(urlhash);
}

/**
 * Refreshes the reference-count fields and the host-extent field of a
 * document from its reference report.
 * <p>
 * The method does nothing and returns {@code false} when the schema contains
 * none of the four {@code references_*} fields or when {@code sid} is
 * {@code null}. Otherwise each field that is part of the schema is rewritten
 * if its stored value is missing or differs from the freshly computed one.
 * <p>
 * The host extent is taken from {@code hostExtentCount}; if that map is
 * {@code null}, {@link Integer#MAX_VALUE} is used. If the map has no entry
 * for the host of {@code url}, the host-extent field is left untouched.
 *
 * @param rrCache         source of the reference report for {@code url}
 * @param sid             the document to update, may be {@code null}
 * @param url             the URL of {@code sid}
 * @param hostExtentCount host hash to extent mapping, may be {@code null}
 * @return {@code true} if at least one field of {@code sid} was written;
 *         {@code false} otherwise, including when reading the reference
 *         report fails with an {@link IOException}
 */
public boolean postprocessing_references(final ReferenceReportCache rrCache, final SolrInputDocument sid, final DigestURL url, final Map<String, Long> hostExtentCount) {
    if (!(this.contains(CollectionSchema.references_i) ||
          this.contains(CollectionSchema.references_internal_i) ||
          this.contains(CollectionSchema.references_external_i) || this.contains(CollectionSchema.references_exthosts_i))) return false;
    // without a document there is nothing that could be changed
    if (sid == null) return false;
    Integer all_old = (Integer) sid.getFieldValue(CollectionSchema.references_i.getSolrFieldName());
    Integer internal_old = (Integer) sid.getFieldValue(CollectionSchema.references_internal_i.getSolrFieldName());
    Integer external_old = (Integer) sid.getFieldValue(CollectionSchema.references_external_i.getSolrFieldName());
    Integer exthosts_old = (Integer) sid.getFieldValue(CollectionSchema.references_exthosts_i.getSolrFieldName());
    Integer hostextc_old = (Integer) sid.getFieldValue(CollectionSchema.host_extent_i.getSolrFieldName());
    try {
        ReferenceReport rr = rrCache.getReferenceReport(ASCII.String(url.hash()), false);

        boolean change = false;
        int all = rr.getExternalCount() + rr.getInternalCount();
        if (this.contains(CollectionSchema.references_i) &&
            (all_old == null || all_old.intValue() != all)) {
            sid.setField(CollectionSchema.references_i.getSolrFieldName(), all);
            change = true;
        }
        if (this.contains(CollectionSchema.references_internal_i) &&
            (internal_old == null || internal_old.intValue() != rr.getInternalCount())) {
            sid.setField(CollectionSchema.references_internal_i.getSolrFieldName(), rr.getInternalCount());
            change = true;
        }
        if (this.contains(CollectionSchema.references_external_i) &&
            (external_old == null || external_old.intValue() != rr.getExternalCount())) {
            sid.setField(CollectionSchema.references_external_i.getSolrFieldName(), rr.getExternalCount());
            change = true;
        }
        if (this.contains(CollectionSchema.references_exthosts_i) &&
            (exthosts_old == null || exthosts_old.intValue() != rr.getExternalHostIDs().size())) {
            sid.setField(CollectionSchema.references_exthosts_i.getSolrFieldName(), rr.getExternalHostIDs().size());
            change = true;
        }
        // Both branches are of type Long on purpose: mixing int and Long in a
        // conditional expression would unbox the map value and fail with a
        // NullPointerException when the host is not present in the map.
        final Long hostExtent = hostExtentCount == null ? Long.valueOf(Integer.MAX_VALUE) : hostExtentCount.get(url.hosthash());
        if (hostExtent != null &&
            this.contains(CollectionSchema.host_extent_i) &&
            (hostextc_old == null || hostextc_old.intValue() != hostExtent.longValue())) {
            sid.setField(CollectionSchema.host_extent_i.getSolrFieldName(), hostExtent.intValue());
            change = true;
        }
        return change;
    } catch (final IOException e) {
        // best effort: without a reference report no change is reported
    }
    return false;
}

/**
 * Tells whether this configuration contains the given schema field,
 * identified by its Solr field name.
 *
 * @param field the schema field to look for
 * @return the result of {@code contains} for the field's Solr field name
 */
public boolean contains(SchemaDeclaration field) {
    final String solrFieldName = field.getSolrFieldName();
    return this.contains(solrFieldName);
}
Expand Down
Loading

0 comments on commit 8514bff

Please sign in to comment.