Permalink
Browse files

Patch last_modified date with internal FirstSeenTime() if no date provided

This makes sure that updated documents are indexed with the last-modified
date provided in the current crawl.
(Always patching the modification date with the first-seen time would bear the risk of missing
actual updates.)
  • Loading branch information...
reger24 committed Aug 5, 2017
1 parent d1b23af commit 275d65fffe0338145bec8c272f50e3cc6c5e7925
Showing with 12 additions and 5 deletions.
  1. +12 −5 source/net/yacy/search/schema/CollectionConfiguration.java
@@ -534,11 +534,18 @@ public SolrVector yacy2solr(
add(doc, CollectionSchema.author, author);
}
if (allAttr || contains(CollectionSchema.last_modified)) {
Date lastModified = responseHeader == null ? new Date() : responseHeader.lastModified();
if (lastModified == null) lastModified = new Date();
if (document.getLastModified().before(lastModified)) lastModified = document.getLastModified();
long firstSeen = segment.getFirstSeenTime(digestURL.hash());
if (firstSeen > 0 && firstSeen < lastModified.getTime()) lastModified = new Date(firstSeen); // patch the date if we have seen the document earlier
Date lastModified = responseHeader == null ? document.getLastModified() : responseHeader.lastModified();
if (lastModified == null) {
long firstSeen = segment.getFirstSeenTime(digestURL.hash());
if (firstSeen > 0) {
lastModified = new Date(firstSeen); // patch the date if we have seen the document earlier
} else {
lastModified = new Date();
}
}
if (document.getLastModified().before(lastModified)) {
lastModified = document.getLastModified();
}
add(doc, CollectionSchema.last_modified, lastModified);
}
if (allAttr || contains(CollectionSchema.dates_in_content_dts) || contains(CollectionSchema.dates_in_content_count_i)) {

0 comments on commit 275d65f

Please sign in to comment.