Permalink
Browse files

added yacy grid flatjson surrogate parser

  • Loading branch information...
Orbiter committed Apr 25, 2017
1 parent b1da926 commit 973d74712f684fc04590c6f5839ac0cda9811f93
Showing with 45 additions and 4 deletions.
  1. +26 −4 source/net/yacy/search/Switchboard.java
  2. +19 −0 source/net/yacy/search/schema/CollectionSchema.java
@@ -2022,7 +2022,9 @@ public boolean processSurrogate(final String s) {
log.warn("IO Error processing warc file " + infile);
}
return moved;
} else if (s.endsWith(".flatjson")) {
} else if (s.endsWith(".jsonlist") || s.endsWith(".flatjson")) {
// parse a file that can be generated with yacy_grid_parser
// see https://github.com/yacy/yacy_grid_parser/blob/master/README.md
try {
InputStream is = new BufferedInputStream(new FileInputStream(infile));
BufferedReader br = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8));
@@ -2035,9 +2037,27 @@ public boolean processSurrogate(final String s) {
for (String key: json.keySet()) {
Object o = json.get(key);
if (o instanceof JSONArray) {
// todo: add array
// transform this into a list
JSONArray a = (JSONArray) o;
List<Object> list = new ArrayList<>();
for (int i = 0; i < a.length(); i++) list.add(a.get(i));
CollectionSchema schema = CollectionSchema.valueOf(key);
schema.add(surrogate, list);
} else {
surrogate.put(key, new SolrInputField(o.toString()));
// patch yacy grid altered schema (yacy grid does not have IDs any more, but they can be re-computed here)
if (key.equals("url_s")) {
DigestURL durl = new DigestURL(o.toString());
String id = ASCII.String(durl.hash());
surrogate.setField(CollectionSchema.sku.getSolrFieldName(), durl.toNormalform(true));
surrogate.setField(CollectionSchema.id.getSolrFieldName(), id);
surrogate.setField(CollectionSchema.host_id_s.getSolrFieldName(), id.substring(6));
} else if (key.equals("referrer_url_s")) {
DigestURL durl = new DigestURL(o.toString());
String id = ASCII.String(durl.hash());
surrogate.setField(CollectionSchema.referrer_id_s.getSolrFieldName(), id);
} else {
surrogate.setField(key, o.toString());
}
}
}
Switchboard.this.index.putDocument(surrogate);
@@ -2219,7 +2239,9 @@ public boolean surrogateProcess() {
|| surrogate.endsWith(".xml.gz")
|| surrogate.endsWith(".xml.zip")
|| surrogate.endsWith(".warc")
|| surrogate.endsWith(".warc.gz") ) {
|| surrogate.endsWith(".warc.gz")
|| surrogate.endsWith(".jsonlist")
|| surrogate.endsWith(".flatjson") ) {
// read the surrogate file and store entry in index
if ( processSurrogate(surrogate) ) {
return true;
@@ -26,6 +26,8 @@
import net.yacy.cora.federate.solr.SchemaDeclaration;
import net.yacy.cora.federate.solr.SolrType;
import org.apache.poi.ss.formula.atp.DateParser;
import org.apache.poi.ss.formula.eval.EvaluationException;
import org.apache.solr.common.SolrInputDocument;
public enum CollectionSchema implements SchemaDeclaration {
@@ -424,6 +426,8 @@ public final void add(final SolrInputDocument doc, final List<?> value) {
doc.setField(this.getSolrFieldName(), new Integer[0]);
} else if (this.type == SolrType.string || this.type == SolrType.text_general) {
doc.setField(this.getSolrFieldName(), new String[0]);
} else if (this.type == SolrType.date) {
doc.setField(this.getSolrFieldName(), new Date[0]);
} else {
assert false : "ADD(1): type is " + this.type.name();
doc.setField(this.getSolrFieldName(), new Object[0]);
@@ -436,6 +440,21 @@ public final void add(final SolrInputDocument doc, final List<?> value) {
} else if (this.type == SolrType.string || this.type == SolrType.text_general) {
assert (value.iterator().next() instanceof String);
doc.setField(this.getSolrFieldName(), value.toArray(new String[value.size()]));
} else if (this.type == SolrType.date) {
assert (value.iterator().next() instanceof String) || (value.iterator().next() instanceof Date);
if (value.iterator().next() instanceof String) {
Date[] da = new Date[value.size()];
for (int i = 0; i < value.size(); i++) {
try {
da[i] = DateParser.parseDate((String) value.get(i)).getTime();
} catch (EvaluationException e) {
da[i] = null;
}
}
doc.setField(this.getSolrFieldName(), da);
} else {
doc.setField(this.getSolrFieldName(), value.toArray(new Date[value.size()]));
}
} else {
assert false : "ADD(2): type is " + this.type.name();
doc.setField(this.getSolrFieldName(), value.toArray(new Object[value.size()]));

0 comments on commit 973d747

Please sign in to comment.