Skip to content

Commit

Permalink
alpha version of surrogate reading and indexing.
Browse files Browse the repository at this point in the history
see the example file for an explanation.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5815 6c8d7289-2bf4-0310-a012-ef5d649a1542
  • Loading branch information
orbiter committed Apr 16, 2009
1 parent 870066a commit 9050a3c
Show file tree
Hide file tree
Showing 3 changed files with 94 additions and 5 deletions.
28 changes: 28 additions & 0 deletions examples/surrogate_dublin_core.xml
@@ -0,0 +1,28 @@
<?xml version="1.0" encoding="utf-8"?>
<!-- YaCy surrogate file using Dublin Core notation -->
<!--
This is a surrogate file, i.e. an intermediate document description
file for index generation. Once YaCy has been started, you can copy a file
like this (or this very file) into DATA/SURROGATE/in; the indexing
process will then read the file, store its content in the search index and
move the file to DATA/SURROGATE/out.
Using surrogate files and the surrogate file format you can easily create your
own data harvesting sources for the YaCy indexer. Just write a file generator
that generates files like this. The xml schema is very similar to that
described in
http://dublincore.org/documents/dc-xml-guidelines/
using the Dublin Core metadata element set.
-->

<surrogates
xmlns:dc="http://purl.org/dc/elements/1.1/">

<record>
<dc:title><![CDATA[Alan Smithee]]></dc:title>
<dc:identifier>http://de.wikipedia.org/wiki/Alan_Smithee</dc:identifier>
<dc:description><![CDATA[Der als Filmregisseur oft genannte '''Alan Smithee''' ist ein Anagramm von „The Alias Men“.]]></dc:description>
<dc:language>de</dc:language>
<dc:date>2009-03-02T11:12:36Z</dc:date> <!-- date is in ISO 8601 -->
</record>

</surrogates>
38 changes: 35 additions & 3 deletions source/de/anomic/crawler/Surrogate.java
Expand Up @@ -41,9 +41,31 @@ public class Surrogate extends HashMap<String, String> {
public Surrogate() {
super();
}

/*
DC according to rfc 5013
* dc_title
* dc_creator
* dc_subject
* dc_description
* dc_publisher
dc_contributor
dc_date
dc_type
* dc_format
* dc_identifier
* dc_source
dc_language
dc_relation
dc_coverage
dc_rights
*/

public Date date() {
String d = this.get("date");
String d = this.get("dateISO8601");
if (d == null) d = this.get("docdatetime");
if (d == null) d = this.get("dc:date");
if (d == null) return null;
try {
return DateFormatter.parseISO8601(d);
Expand All @@ -54,6 +76,7 @@ public Date date() {
}
public yacyURL url() {
String u = this.get("url");
if (u == null) u = this.get("dc:identifier");
if (u == null) return null;
try {
return new yacyURL(u, null);
Expand All @@ -64,19 +87,28 @@ public yacyURL url() {
}
/**
 * Returns the language stored for this surrogate.
 * The "language" entry is consulted first, then the Dublin Core
 * "dc:language" entry.
 *
 * @return the stored language value, or "en" if neither entry is present
 */
public String language() {
    String lang = this.get("language");
    if (lang == null) {
        lang = this.get("dc:language");
    }
    return (lang != null) ? lang : "en";
}
/**
 * Returns the title stored for this surrogate.
 * The "title" entry is consulted first, then the Dublin Core "dc:title"
 * entry; the value found is passed through stripCDATA.
 *
 * @return the processed title, or an empty string if no title is available
 */
public String title() {
    String t = this.get("title");
    // no early return here: the Dublin Core fallback and the
    // empty-string default below must stay reachable
    if (t == null) t = this.get("dc:title");
    t = stripCDATA(t);
    if (t == null) return "";
    return t;
}
/**
 * Returns the body text stored for this surrogate.
 * The "body" entry is consulted first, then the Dublin Core
 * "dc:description" entry; the value found is passed through stripCDATA.
 *
 * @return the processed body text, or an empty string if none is available
 */
public String body() {
    String t = this.get("body");
    // the fallback lookup must be assigned; a bare this.get(...) call
    // would throw the Dublin Core description away
    if (t == null) t = this.get("dc:description");
    t = stripCDATA(t);
    if (t == null) return "";
    return t;
}
/**
 * Returns the categories stored for this surrogate.
 * The "categories" entry is consulted first, then the Dublin Core
 * "dc:subject" entry; the value found is passed through stripCDATA and
 * split at every ';'.
 *
 * @return the category strings, or an empty array if none are available
 */
public String[] categories() {
    String t = this.get("categories");
    // the fallback lookup must be assigned; a bare this.get(...) call
    // would throw the Dublin Core subject away
    if (t == null) t = this.get("dc:subject");
    t = stripCDATA(t);
    if (t == null) return new String[]{};
    return t.split(";");
}
private String stripCDATA(String s) {
Expand Down
33 changes: 31 additions & 2 deletions source/de/anomic/xml/SurrogateReader.java
Expand Up @@ -89,19 +89,23 @@ public void run() {
}

public void startElement(final String uri, final String name, final String tag, final Attributes atts) throws SAXException {
if ("document".equals(tag)) {
if ("record".equals(tag) || "document".equals(tag)) {
this.surrogate = new Surrogate();
} else if ("element".equals(tag)) {
this.elementName = atts.getValue("name");
} else if ("value".equals(tag)) {
this.buffer.setLength(0);
this.parsingValue = true;
} else if (tag.startsWith("dc:")) {
// parse dublin core attribute
this.elementName = tag;
this.parsingValue = true;
}
}

public void endElement(final String uri, final String name, final String tag) {
if (tag == null) return;
if ("document".equals(tag)) {
if ("record".equals(tag) || "document".equals(tag)) {
//System.out.println("A Title: " + this.surrogate.title());
try {
this.surrogates.put(this.surrogate);
Expand All @@ -124,6 +128,13 @@ public void endElement(final String uri, final String name, final String tag) {
}
this.buffer.setLength(0);
this.parsingValue = false;
} else if (tag.startsWith("dc:")) {
final String value = buffer.toString().trim();
if (this.elementName != null) {
this.surrogate.put(this.elementName, value);
}
this.buffer.setLength(0);
this.parsingValue = false;
}
}

Expand Down Expand Up @@ -177,6 +188,24 @@ public static void main(String[] args) {
}
/*
Example surrogate
<?xml version="1.0" encoding="utf-8"?>
<!-- YaCy surrogate file using Dublin Core notation -->
<!-- see http://dublincore.org/documents/dc-xml-guidelines/ -->
<surrogates
xmlns:dc="http://purl.org/dc/elements/1.1/">
<record>
<dc:title><![CDATA[Alan Smithee]]></dc:title>
<dc:identifier>http://de.wikipedia.org/wiki/Alan_Smithee</dc:identifier>
<dc:description><![CDATA[Der als Filmregisseur oft genannte '''Alan Smithee''' ist ein Anagramm von „The Alias Men“.]]></dc:description>
<dc:language>de</dc:language>
<dc:date>2009-03-02T11:12:36Z</dc:date> <!-- date is in ISO 8601 -->
</record>
</surrogates>
or
<?xml version="1.0" encoding="utf-8"?>
<documents>
Expand Down

0 comments on commit 9050a3c

Please sign in to comment.