Skip to content

Commit

Permalink
added parsing of 'date', 'dc:date', 'dc.date' and 'last-modified' in
Browse files Browse the repository at this point in the history
HTML meta fields to get a correct (or at least better) date timestamp. The
HTTP Last-Modified header is mostly unusable because most CMSs set it to the
current date.
  • Loading branch information
Orbiter committed Sep 10, 2013
1 parent 9cc8468 commit 35ab2ce
Show file tree
Hide file tree
Showing 29 changed files with 120 additions and 33 deletions.
2 changes: 1 addition & 1 deletion htroot/gsa/searchresult.java
Expand Up @@ -116,7 +116,7 @@ public static serverObjects respond(final RequestHeader header, serverObjects po
post.put("defType", "edismax");
post.put(CommonParams.Q, solrQ.toString());
post.put(CommonParams.ROWS, post.remove("num"));
post.put(CommonParams.ROWS, Math.min(post.getInt(CommonParams.ROWS, 10), (authenticated) ? 5000 : 100));
post.put(CommonParams.ROWS, Math.min(post.getInt(CommonParams.ROWS, 10), (authenticated) ? 100000000 : 100));

// set ranking
if (post.containsKey("sort")) {
Expand Down
14 changes: 12 additions & 2 deletions source/net/yacy/document/Document.java
Expand Up @@ -94,6 +94,7 @@ public class Document {
private final double lon, lat;
private final Object parserObject; // the source object that was used to create the Document
private final Map<String, Set<String>> generic_facets; // a map from vocabulary names to the set of tags for that vocabulary which apply for this document
private final Date date;

public Document(final DigestURI location, final String mimeType, final String charset,
final Object parserObject,
Expand All @@ -107,7 +108,8 @@ public Document(final DigestURI location, final String mimeType, final String ch
final Map<DigestURI, Properties> anchors,
final Map<DigestURI, String> rss,
final Map<DigestURI, ImageEntry> images,
final boolean indexingDenied) {
final boolean indexingDenied,
final Date date) {
this.source = location;
this.mimeType = (mimeType == null) ? "application/octet-stream" : mimeType;
this.charset = charset;
Expand Down Expand Up @@ -143,6 +145,7 @@ public Document(final DigestURI location, final String mimeType, final String ch
this.indexingDenied = indexingDenied;
this.text = text == null ? "" : text;
this.generic_facets = new HashMap<String, Set<String>>();
this.date = date == null ? new Date() : date;
}

public Object getParserObject() {
Expand Down Expand Up @@ -451,6 +454,10 @@ public Map<String, String> getEmaillinks() {
return this.emaillinks;
}

/**
 * Returns the date associated with this document.
 * The value is the date that was passed to the constructor; if the
 * constructor received null, it is the time at which this Document
 * object was created.
 * NOTE: the internal Date instance is returned directly (no copy is made).
 *
 * @return the document date, never null
 */
public Date getDate() {
return this.date;
}

public double lon() {
return this.lon;
}
Expand Down Expand Up @@ -783,6 +790,7 @@ public static Document mergeDocuments(final DigestURI location, final String glo
final Map<DigestURI, String> rss = new HashMap<DigestURI, String>();
final Map<DigestURI, ImageEntry> images = new HashMap<DigestURI, ImageEntry>();
double lon = 0.0d, lat = 0.0d;
Date date = new Date();

for (final Document doc: docs) {

Expand Down Expand Up @@ -821,6 +829,7 @@ public static Document mergeDocuments(final DigestURI location, final String glo
rss.putAll(doc.getRSS());
ContentScraper.addAllImages(images, doc.getImages());
if (doc.lon() != 0.0 && doc.lat() != 0.0) { lon = doc.lon(); lat = doc.lat(); }
if (doc.date.before(date)) date = doc.date;
}

// clean up parser data
Expand Down Expand Up @@ -852,7 +861,8 @@ public static Document mergeDocuments(final DigestURI location, final String glo
anchors,
rss,
images,
false);
false,
date);
}

public static Map<DigestURI, String> getHyperlinks(final Document[] documents) {
Expand Down
5 changes: 4 additions & 1 deletion source/net/yacy/document/content/DCEntry.java
Expand Up @@ -100,7 +100,9 @@ public DCEntry(
*/
public Date getDate() {
String d = this.get("docdatetime");
if (d == null) d = this.get("date");
if (d == null) d = this.get("dc:date");
if (d == null) d = this.get("last-modified");
if (d == null) return null;
if (d.isEmpty()) return null;
try {
Expand Down Expand Up @@ -286,7 +288,8 @@ public Document document() {
null,
null,
null,
false);
false,
getDate());
}

public void writeXML(OutputStreamWriter os) throws IOException {
Expand Down
7 changes: 5 additions & 2 deletions source/net/yacy/document/parser/audioTagParser.java
Expand Up @@ -30,6 +30,7 @@
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
Expand Down Expand Up @@ -171,7 +172,8 @@ public Document[] parse(final DigestURI location, final String mimeType,
null,
null,
null,
false)
false,
new Date())
};
return docs;
} catch (final Exception e) {
Expand All @@ -193,7 +195,8 @@ public Document[] parse(final DigestURI location, final String mimeType,
null,
null,
null,
false
false,
new Date()
)};
} finally {
try {
Expand Down
4 changes: 3 additions & 1 deletion source/net/yacy/document/parser/csvParser.java
Expand Up @@ -30,6 +30,7 @@
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

import net.yacy.document.AbstractParser;
Expand Down Expand Up @@ -77,7 +78,8 @@ public Document[] parse(DigestURI location, String mimeType, String charset, Inp
null,
null,
null,
false)};
false,
new Date())};
}

private static String concatRow(String[] columns) {
Expand Down
4 changes: 3 additions & 1 deletion source/net/yacy/document/parser/docParser.java
Expand Up @@ -28,6 +28,7 @@
package net.yacy.document.parser;

import java.io.InputStream;
import java.util.Date;

import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
Expand Down Expand Up @@ -103,7 +104,8 @@ public Document[] parse(final DigestURI location, final String mimeType,
null,
null,
null,
false)};
false,
new Date())};

return docs;
}
Expand Down
4 changes: 3 additions & 1 deletion source/net/yacy/document/parser/genericParser.java
Expand Up @@ -25,6 +25,7 @@
package net.yacy.document.parser;

import java.io.InputStream;
import java.util.Date;

import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.AbstractParser;
Expand Down Expand Up @@ -65,7 +66,8 @@ public Document[] parse(final DigestURI location, final String mimeType,
null,
null,
null,
false)};
false,
new Date())};
return docs;
}
}
25 changes: 25 additions & 0 deletions source/net/yacy/document/parser/html/ContentScraper.java
Expand Up @@ -31,7 +31,9 @@
import java.io.Writer;
import java.net.MalformedURLException;
import java.nio.charset.Charset;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
Expand All @@ -45,6 +47,7 @@

import javax.swing.event.EventListenerList;

import net.yacy.cora.date.ISO8601Formatter;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.sorting.ClusteredScoreMap;
import net.yacy.cora.storage.SizeLimitedMap;
Expand Down Expand Up @@ -848,6 +851,28 @@ public String getRefreshPath() {
if (s.toLowerCase().startsWith("url=")) return s.substring(4).trim();
return EMPTY_STRING;
}

/**
 * Derives a document date from the scraped meta fields.
 * The fields are probed in a fixed order of preference; the first one
 * that is present and can be parsed by the ISO 8601 formatter wins.
 * A field that is present but cannot be parsed is skipped and the next
 * field is tried.
 *
 * @return the parsed date of the first usable meta field, or the current
 *         time if no field is present or parsable
 */
public Date getDate() {
    // probing order:
    //   <meta name="date" content="YYYY-MM-DD..." />
    //   <meta name="DC.date" content="YYYY-MM-DD" />
    //   <meta name="DC:date" content="YYYY-MM-DD" />
    //   <meta http-equiv="last-modified" content="YYYY-MM-DD" />
    final String[] metaKeys = {"date", "dc.date", "dc:date", "last-modified"};

    for (final String metaKey : metaKeys) {
        final String value = this.metas.get(metaKey);
        if (value == null) {
            continue;
        }
        try {
            return ISO8601Formatter.FORMATTER.parse(value);
        } catch (final ParseException ignored) {
            // content is not a parsable date: try the next meta field
        }
    }

    // no usable meta field found: fall back to the current time
    return new Date();
}

// parse location
// <meta NAME="ICBM" CONTENT="38.90551492, 1.454004505" />
Expand Down
3 changes: 2 additions & 1 deletion source/net/yacy/document/parser/htmlParser.java
Expand Up @@ -141,7 +141,8 @@ private static Document transformScraper(final DigestURI location, final String
scraper.getAnchors(),
scraper.getRSS(),
scraper.getImages(),
scraper.indexingDenied());
scraper.indexingDenied(),
scraper.getDate());
ppd.setFavicon(scraper.getFavicon());

return ppd;
Expand Down
Expand Up @@ -35,6 +35,7 @@
import java.io.InputStream;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
Expand Down Expand Up @@ -221,7 +222,8 @@ public Document[] parse(
anchors, // anchors
null,
images,
false)}; // images
false,
new Date())}; // images
}

@Override
Expand Down
4 changes: 3 additions & 1 deletion source/net/yacy/document/parser/mmParser.java
Expand Up @@ -27,6 +27,7 @@
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

import javax.xml.parsers.ParserConfigurationException;
Expand Down Expand Up @@ -116,7 +117,8 @@ public Document[] parse(final DigestURI location, final String mimeType,
null,
null,
null,
false)};
false,
new Date())};
}

private class FreeMindHandler extends DefaultHandler {
Expand Down
5 changes: 4 additions & 1 deletion source/net/yacy/document/parser/odtParser.java
Expand Up @@ -30,6 +30,7 @@
import java.io.File;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Date;
import java.util.Enumeration;
import java.util.HashSet;
import java.util.List;
Expand Down Expand Up @@ -197,7 +198,9 @@ private Document[] parse(final DigestURI location, final String mimeType, @Suppr
null,
null,
null,
false)};
false,
new Date()
)};
return docs;
} catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
Expand Down
4 changes: 3 additions & 1 deletion source/net/yacy/document/parser/ooxmlParser.java
Expand Up @@ -30,6 +30,7 @@
import java.io.File;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Date;
import java.util.Enumeration;
import java.util.HashSet;
import java.util.List;
Expand Down Expand Up @@ -182,7 +183,8 @@ private Document[] parse(final DigestURI location, final String mimeType, @Suppr
null,
null,
null,
false)};
false,
new Date())};
return docs;
} catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
Expand Down
8 changes: 5 additions & 3 deletions source/net/yacy/document/parser/pdfParser.java
Expand Up @@ -32,6 +32,7 @@
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.Date;

import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.exceptions.CryptographyException;
Expand Down Expand Up @@ -125,17 +126,17 @@ public Document[] parse(final DigestURI location, final String mimeType, final S
// extracting some metadata
PDDocumentInformation info = pdfDoc.getDocumentInformation();
String docTitle = null, docSubject = null, docAuthor = null, docPublisher = null, docKeywordStr = null;
Date docDate = new Date();
if (info != null) {
docTitle = info.getTitle();
docSubject = info.getSubject();
docAuthor = info.getAuthor();
docPublisher = info.getProducer();
if (docPublisher == null || docPublisher.isEmpty()) docPublisher = info.getCreator();
docKeywordStr = info.getKeywords();
try {if (info.getModificationDate() != null) docDate = info.getModificationDate().getTime();} catch (IOException e) {}
// unused:
// info.getTrapped());
// info.getCreationDate());
// info.getModificationDate();
}
info = null;

Expand Down Expand Up @@ -218,7 +219,8 @@ public void run() {
null,
null,
null,
false)};
false,
docDate)};
}

@SuppressWarnings("static-access")
Expand Down
4 changes: 3 additions & 1 deletion source/net/yacy/document/parser/pptParser.java
Expand Up @@ -29,6 +29,7 @@

import java.io.BufferedInputStream;
import java.io.InputStream;
import java.util.Date;

import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.AbstractParser;
Expand Down Expand Up @@ -99,7 +100,8 @@ public Document[] parse(final DigestURI location, final String mimeType,
null,
null,
null,
false)};
false,
new Date())};
return docs;
} catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
Expand Down
4 changes: 3 additions & 1 deletion source/net/yacy/document/parser/psParser.java
Expand Up @@ -34,6 +34,7 @@
import java.io.FileWriter;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Date;

import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
Expand Down Expand Up @@ -115,7 +116,8 @@ private Document[] parse(final DigestURI location, final String mimeType, @Suppr
null, // anchors
null, // rss
null, // images
false)}; // indexingdenied
false, // indexingdenied
new Date())};

return docs;
} catch (final Exception e) {
Expand Down
3 changes: 2 additions & 1 deletion source/net/yacy/document/parser/rdfParser.java
Expand Up @@ -27,6 +27,7 @@

import java.io.InputStream;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

import net.yacy.document.AbstractParser;
Expand Down Expand Up @@ -59,7 +60,7 @@ public Document[] parse(final DigestURI url, final String mimeType,

String all = "rdfdatasource";
doc = new Document(url, mimeType, charset, null, null, null, singleList(""), "",
"", null, new ArrayList<String>(0), 0, 0, all, null, null, null, false);
"", null, new ArrayList<String>(0), 0, 0, all, null, null, null, false, new Date());

docs.add(doc);

Expand Down

0 comments on commit 35ab2ce

Please sign in to comment.