From c00efc27175303281a06896b82086d9675dc68dc Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Wed, 9 May 2012 16:46:45 +0200 Subject: [PATCH] made the solr connection more generic --- htroot/IndexFederated_p.java | 15 +- htroot/api/schema_p.java | 16 +- .../federated/solr/SolrConnector.java | 17 - .../services/federated/solr/SolrField.java | 32 ++ .../federated/solr/SolrRetryConnector.java | 22 -- .../federated/solr/SolrShardingConnector.java | 33 +- .../federated/solr/SolrSingleConnector.java | 19 +- .../services/federated/solr/SolrType.java | 47 +++ source/net/yacy/search/Switchboard.java | 17 +- source/net/yacy/search/index/SolrField.java | 173 +++++++++ .../solr => search/index}/SolrScheme.java | 348 +++++------------- .../net/yacy/search/query/SnippetProcess.java | 2 +- 12 files changed, 377 insertions(+), 364 deletions(-) create mode 100644 source/net/yacy/cora/services/federated/solr/SolrField.java create mode 100644 source/net/yacy/cora/services/federated/solr/SolrType.java create mode 100644 source/net/yacy/search/index/SolrField.java rename source/net/yacy/{cora/services/federated/solr => search/index}/SolrScheme.java (50%) diff --git a/htroot/IndexFederated_p.java b/htroot/IndexFederated_p.java index 2b7c09ca51..2d35e98553 100644 --- a/htroot/IndexFederated_p.java +++ b/htroot/IndexFederated_p.java @@ -32,7 +32,6 @@ import net.yacy.cora.document.UTF8; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.services.federated.solr.SolrConnector; -import net.yacy.cora.services.federated.solr.SolrScheme; import net.yacy.cora.services.federated.solr.SolrShardingConnector; import net.yacy.cora.services.federated.solr.SolrShardingSelection; import net.yacy.cora.services.federated.solr.SolrSingleConnector; @@ -40,6 +39,8 @@ import net.yacy.kelondro.logging.Log; import net.yacy.search.Switchboard; import net.yacy.search.index.Segments; +import net.yacy.search.index.SolrField; +import net.yacy.search.index.SolrScheme; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; @@ -92,7 +93,7 @@ public static serverObjects respond(final RequestHeader header, final serverObje // switch on final boolean usesolr = sb.getConfigBool("federated.service.solr.indexing.enabled", false) & solrurls.length() > 0; try { - sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).connectSolr((usesolr) ? new SolrShardingConnector(solrurls, scheme, SolrShardingSelection.Method.MODULO_HOST_MD5, 10000) : null); + sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).connectSolr((usesolr) ? new SolrShardingConnector(solrurls, SolrShardingSelection.Method.MODULO_HOST_MD5, 10000) : null); } catch (final IOException e) { Log.logException(e); sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).connectSolr(null); @@ -138,21 +139,17 @@ public static serverObjects respond(final RequestHeader header, final serverObje } // write scheme - SolrScheme scheme = (sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).getSolr() == null) ? null : sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).getSolr().getScheme(); final String schemename = sb.getConfig("federated.service.solr.indexing.schemefile", "solr.keys.default.list"); - if (scheme == null) { - scheme = new SolrScheme(new File(env.getDataPath(), "DATA/SETTINGS/" + schemename)); - } - final Iterator i = scheme.allIterator(); + final Iterator i = sb.solrScheme.allIterator(); int c = 0; boolean dark = false; ConfigurationSet.Entry entry; - SolrScheme.Field field; + SolrField field; while (i.hasNext()) { entry = i.next(); try { - field = SolrScheme.Field.valueOf(entry.key()); + field = SolrField.valueOf(entry.key()); } catch (IllegalArgumentException e) { continue; } diff --git a/htroot/api/schema_p.java b/htroot/api/schema_p.java index 70867f638e..b4a63ba452 100644 --- a/htroot/api/schema_p.java +++ b/htroot/api/schema_p.java @@ -22,15 +22,12 @@ * If not, see . */ -import java.io.File; import java.util.Iterator; import net.yacy.cora.protocol.RequestHeader; -import net.yacy.cora.services.federated.solr.SolrScheme; -import net.yacy.cora.services.federated.solr.SolrScheme.Field; import net.yacy.cora.storage.ConfigurationSet; import net.yacy.search.Switchboard; -import net.yacy.search.index.Segments; +import net.yacy.search.index.SolrField; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; @@ -42,21 +39,16 @@ public static serverObjects respond(final RequestHeader header, final serverObje final Switchboard sb = (Switchboard) env; // write scheme - SolrScheme scheme = (sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).getSolr() == null) ? null : sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).getSolr().getScheme(); - final String schemename = sb.getConfig("federated.service.solr.indexing.schemefile", "solr.keys.default.list"); - if (scheme == null) { - scheme = new SolrScheme(new File(env.getDataPath(), "DATA/SETTINGS/" + schemename)); - } - final Iterator i = scheme.allIterator(); + final Iterator i = sb.solrScheme.allIterator(); int c = 0; ConfigurationSet.Entry entry; - SolrScheme.Field field = null; + SolrField field = null; while (i.hasNext()) { entry = i.next(); if (!entry.enabled()) continue; //scheme.contains(entry.key()) try { - field = Field.valueOf(entry.key()); + field = SolrField.valueOf(entry.key()); } catch (IllegalArgumentException e) { continue; } diff --git a/source/net/yacy/cora/services/federated/solr/SolrConnector.java b/source/net/yacy/cora/services/federated/solr/SolrConnector.java index 3c4692483a..ec7b07e47c 100644 --- a/source/net/yacy/cora/services/federated/solr/SolrConnector.java +++ b/source/net/yacy/cora/services/federated/solr/SolrConnector.java @@ -27,8 +27,6 @@ import java.io.IOException; import java.util.List; -import net.yacy.cora.protocol.ResponseHeader; -import net.yacy.document.Document; import net.yacy.kelondro.data.meta.DigestURI; import org.apache.solr.common.SolrDocumentList; @@ -37,12 +35,6 @@ public interface SolrConnector { - /** - * with a scheme the fields of a SolrDocument can be translated to actual data values - * @return the solr scheme that can translate the SolrDocument - */ - public SolrScheme getScheme(); - public void close(); /** @@ -73,15 +65,6 @@ public interface SolrConnector { */ public boolean exists(final String id) throws IOException; - /** - * add a YaCy document. This calls the scheme processor to add the document as solr document - * @param id the url hash of the entry - * @param header the http response header - * @param doc the YaCy document - * @throws IOException - */ - public void add(final String id, final ResponseHeader header, final Document doc) throws IOException; - /** * add a solr input document * @param solrdoc diff --git a/source/net/yacy/cora/services/federated/solr/SolrField.java b/source/net/yacy/cora/services/federated/solr/SolrField.java new file mode 100644 index 0000000000..5cdd2a9892 --- /dev/null +++ b/source/net/yacy/cora/services/federated/solr/SolrField.java @@ -0,0 +1,32 @@ +/** + * SolrField + * Copyright 2011 by Michael Peter Christen + * First released 14.04.2011 at http://yacy.net + * + * $LastChangedDate: 2011-04-14 22:05:04 +0200 (Do, 14 Apr 2011) $ + * $LastChangedRevision: 7654 $ + * $LastChangedBy: orbiter $ + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . + */ + +package net.yacy.cora.services.federated.solr; + + +public interface SolrField { + + public String name(); + +} diff --git a/source/net/yacy/cora/services/federated/solr/SolrRetryConnector.java b/source/net/yacy/cora/services/federated/solr/SolrRetryConnector.java index 95a381fdea..88d5808a6b 100644 --- a/source/net/yacy/cora/services/federated/solr/SolrRetryConnector.java +++ b/source/net/yacy/cora/services/federated/solr/SolrRetryConnector.java @@ -27,8 +27,6 @@ import java.io.IOException; import java.util.List; -import net.yacy.cora.protocol.ResponseHeader; -import net.yacy.document.Document; import net.yacy.kelondro.data.meta.DigestURI; import org.apache.solr.common.SolrDocumentList; @@ -45,11 +43,6 @@ public SolrRetryConnector(final SolrConnector solrConnector, final long retryMax this.retryMaxTime = retryMaxTime; } - @Override - public SolrScheme getScheme() { - return this.solrConnector.getScheme(); - } - @Override public void close() { this.solrConnector.close(); @@ -115,21 +108,6 @@ public boolean exists(final String id) throws IOException { return false; } - @Override - public void add(final String id, final ResponseHeader header, final Document doc) throws IOException { - final long t = System.currentTimeMillis() + this.retryMaxTime; - Throwable ee = null; - while (System.currentTimeMillis() < t) try { - this.solrConnector.add(id, header, doc); - return; - } catch (final Throwable e) { - ee = e; - try {Thread.sleep(10);} catch (final InterruptedException e1) {} - continue; - } - if (ee != null) throw (ee instanceof IOException) ? (IOException) ee : new IOException(ee.getMessage()); - } - @Override public void add(final SolrInputDocument solrdoc) throws IOException, SolrException { final long t = System.currentTimeMillis() + this.retryMaxTime; diff --git a/source/net/yacy/cora/services/federated/solr/SolrShardingConnector.java b/source/net/yacy/cora/services/federated/solr/SolrShardingConnector.java index 4c551fff6f..4e7cd3e97f 100644 --- a/source/net/yacy/cora/services/federated/solr/SolrShardingConnector.java +++ b/source/net/yacy/cora/services/federated/solr/SolrShardingConnector.java @@ -31,8 +31,7 @@ import java.util.List; import net.yacy.cora.protocol.Domains; -import net.yacy.cora.protocol.ResponseHeader; -import net.yacy.document.Document; +import net.yacy.cora.services.federated.solr.SolrShardingSelection.Method; import net.yacy.kelondro.data.meta.DigestURI; import org.apache.solr.common.SolrDocument; @@ -43,25 +42,20 @@ public class SolrShardingConnector implements SolrConnector { private final List connectors; - private final SolrScheme scheme; private final SolrShardingSelection sharding; private final String[] urls; - public SolrShardingConnector(final String urlList, final SolrScheme scheme, final SolrShardingSelection.Method method, final long timeout) throws IOException { + public SolrShardingConnector(final String urlList, final SolrShardingSelection.Method method, final long timeout) throws IOException { urlList.replace(' ', ','); this.urls = urlList.split(","); this.connectors = new ArrayList(); for (final String u: this.urls) { - this.connectors.add(new SolrRetryConnector(new SolrSingleConnector(u.trim(), scheme), timeout)); + this.connectors.add(new SolrRetryConnector(new SolrSingleConnector(u.trim()), timeout)); } this.sharding = new SolrShardingSelection(method, this.urls.length); - this.scheme = scheme; - } - - public SolrScheme getScheme() { - return this.scheme; } + @Override public void close() { for (final SolrConnector connector: this.connectors) connector.close(); } @@ -70,6 +64,7 @@ public void close() { * delete everything in the solr index * @throws IOException */ + @Override public void clear() throws IOException { for (final SolrConnector connector: this.connectors) connector.clear(); } @@ -79,6 +74,7 @@ public void clear() throws IOException { * @param id the url hash of the entry * @throws IOException */ + @Override public void delete(final String id) throws IOException { for (final SolrConnector connector: this.connectors) connector.delete(id); } @@ -88,6 +84,7 @@ public void delete(final String id) throws IOException { * @param ids a list of url hashes * @throws IOException */ + @Override public void delete(final List ids) throws IOException { for (final SolrConnector connector: this.connectors) connector.delete(ids); } @@ -98,6 +95,7 @@ public void delete(final List ids) throws IOException { * @return true if any entry in solr exists * @throws IOException */ + @Override public boolean exists(final String id) throws IOException { for (final SolrConnector connector: this.connectors) { if (connector.exists(id)) return true; @@ -105,22 +103,12 @@ public boolean exists(final String id) throws IOException { return false; } - /** - * add a YaCy document. This calls the scheme processor to add the document as solr document - * @param id the url hash of the entry - * @param header the http response header - * @param doc the YaCy document - * @throws IOException - */ - public void add(final String id, final ResponseHeader header, final Document doc) throws IOException { - add(this.scheme.yacy2solr(id, header, doc)); - } - /** * add a Solr document * @param solrdoc * @throws IOException */ + @Override public void add(final SolrInputDocument solrdoc) throws IOException { this.connectors.get(this.sharding.select(solrdoc)).add(solrdoc); } @@ -141,6 +129,7 @@ protected void addSolr(final Collection docs) throws IOExcept * @param httpstatus * @throws IOException */ + @Override public void err(final DigestURI digestURI, final String failReason, final int httpstatus) throws IOException { this.connectors.get(this.sharding.selectURL(digestURI.toNormalform(true, false))).err(digestURI, failReason, httpstatus); } @@ -152,6 +141,7 @@ public void err(final DigestURI digestURI, final String failReason, final int ht * @param querystring * @throws IOException */ + @Override public SolrDocumentList get(final String querystring, final int offset, final int count) throws IOException { final SolrDocumentList list = new SolrDocumentList(); for (final SolrConnector connector: this.connectors) { @@ -181,6 +171,7 @@ public long[] getSizeList() { return size; } + @Override public long getSize() { final long[] size = getSizeList(); long s = 0; diff --git a/source/net/yacy/cora/services/federated/solr/SolrSingleConnector.java b/source/net/yacy/cora/services/federated/solr/SolrSingleConnector.java index 500db723b2..0988303c7e 100644 --- a/source/net/yacy/cora/services/federated/solr/SolrSingleConnector.java +++ b/source/net/yacy/cora/services/federated/solr/SolrSingleConnector.java @@ -36,8 +36,6 @@ import net.yacy.cora.document.ASCII; import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.protocol.Domains; -import net.yacy.cora.protocol.ResponseHeader; -import net.yacy.document.Document; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.logging.Log; @@ -65,7 +63,6 @@ public class SolrSingleConnector implements SolrConnector { private final String solrurl, host, solrpath, solraccount, solrpw; private final int port; private HttpSolrServer server; - private final SolrScheme scheme; private final static int transmissionQueueCount = 4; // allow concurrent http sessions to solr private final static int transmissionQueueSize = 50; // number of documents that are collected until a commit is sent @@ -80,9 +77,8 @@ public class SolrSingleConnector implements SolrConnector { * @throws IOException */ @SuppressWarnings("unchecked") - public SolrSingleConnector(final String url, final SolrScheme scheme) throws IOException { + public SolrSingleConnector(final String url) throws IOException { this.solrurl = url; - this.scheme = scheme; this.transmissionRoundRobinCounter = 0; this.transmissionQueue = new ArrayBlockingQueue[transmissionQueueCount]; for (int i = 0; i < transmissionQueueCount; i++) { @@ -187,11 +183,6 @@ public void close() { } } - @Override - public SolrScheme getScheme() { - return this.scheme; - } - @Override public long getSize() { try { @@ -261,11 +252,6 @@ public void add(final File file, final String solrId) throws IOException { } } - @Override - public void add(final String id, final ResponseHeader header, final Document doc) throws IOException, SolrException { - add(this.scheme.yacy2solr(id, header, doc)); - } - @Override public void add(final SolrInputDocument solrdoc) throws IOException, SolrException { int thisrrc = this.transmissionRoundRobinCounter; @@ -384,7 +370,8 @@ public String getAdminInterface() { public static void main(final String args[]) { SolrSingleConnector solr; try { - solr = new SolrSingleConnector("http://127.0.0.1:8983/solr", new SolrScheme()); + //SolrScheme scheme = new SolrScheme(); + solr = new SolrSingleConnector("http://127.0.0.1:8983/solr"); solr.clear(); final File exampleDir = new File("/Data/workspace2/yacy/test/parsertest/"); long t, t0, a = 0; diff --git a/source/net/yacy/cora/services/federated/solr/SolrType.java b/source/net/yacy/cora/services/federated/solr/SolrType.java new file mode 100644 index 0000000000..d35b0418e2 --- /dev/null +++ b/source/net/yacy/cora/services/federated/solr/SolrType.java @@ -0,0 +1,47 @@ +/** + * SolrType + * Copyright 2011 by Michael Peter Christen + * First released 14.04.2011 at http://yacy.net + * + * $LastChangedDate: 2011-04-14 22:05:04 +0200 (Do, 14 Apr 2011) $ + * $LastChangedRevision: 7654 $ + * $LastChangedBy: orbiter $ + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . + */ + + +package net.yacy.cora.services.federated.solr; + +public enum SolrType { + string, + text_general, + text_en_splitting_tight, + date, + integer("int"), + tdouble, + bool("boolean"); + + private String printName; + private SolrType() { + this.printName = this.name(); + } + private SolrType(String printName) { + this.printName = printName; + } + public String printName() { + return this.printName; + } +} \ No newline at end of file diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 31c907c07e..c940130652 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -91,7 +91,6 @@ import net.yacy.cora.protocol.TimeoutRequest; import net.yacy.cora.protocol.http.HTTPClient; import net.yacy.cora.protocol.http.ProxySettings; -import net.yacy.cora.services.federated.solr.SolrScheme; import net.yacy.cora.services.federated.solr.SolrShardingConnector; import net.yacy.cora.services.federated.solr.SolrShardingSelection; import net.yacy.cora.services.federated.yacy.CacheStrategy; @@ -142,6 +141,7 @@ import net.yacy.repository.LoaderDispatcher; import net.yacy.search.index.Segment; import net.yacy.search.index.Segments; +import net.yacy.search.index.SolrScheme; import net.yacy.search.query.AccessTracker; import net.yacy.search.query.QueryParams; import net.yacy.search.query.SearchEvent; @@ -242,6 +242,7 @@ public final class Switchboard extends serverSwitch public SeedDB peers; public WorkTables tables; public Tray tray; + public SolrScheme solrScheme; public WorkflowProcessor indexingDocumentProcessor; public WorkflowProcessor indexingCondensementProcessor; @@ -640,22 +641,20 @@ public void run() { FileUtils.copy(solrBackupProfile, solrWorkProfile); } final SolrScheme backupScheme = new SolrScheme(solrBackupProfile); - final SolrScheme workingScheme = new SolrScheme(solrWorkProfile); + this.solrScheme = new SolrScheme(solrWorkProfile); // update the working scheme with the backup scheme. This is necessary to include new features. // new features are always activated by default - workingScheme.fill(backupScheme, false); + this.solrScheme.fill(backupScheme, false); // set up the solr interface - final String solrurls = - getConfig("federated.service.solr.indexing.url", "http://127.0.0.1:8983/solr"); - final boolean usesolr = - getConfigBool("federated.service.solr.indexing.enabled", false) & solrurls.length() > 0; + final String solrurls = getConfig("federated.service.solr.indexing.url", "http://127.0.0.1:8983/solr"); + final boolean usesolr = getConfigBool("federated.service.solr.indexing.enabled", false) & solrurls.length() > 0; + try { this.indexSegments.segment(Segments.Process.LOCALCRAWLING).connectSolr( (usesolr) ? new SolrShardingConnector( solrurls, - workingScheme, SolrShardingSelection.Method.MODULO_HOST_MD5, 10000) : null); } catch ( final IOException e ) { @@ -2432,7 +2431,7 @@ && getConfigBool("federated.service.solr.indexing.enabled", false)/*in.queueEntr this.indexSegments .segment(Segments.Process.LOCALCRAWLING) .getSolr() - .add(id, in.queueEntry.getResponseHeader(), doc); + .add(this.solrScheme.yacy2solr(id, in.queueEntry.getResponseHeader(), doc)); } catch ( final IOException e ) { Log.logWarning( "SOLR", diff --git a/source/net/yacy/search/index/SolrField.java b/source/net/yacy/search/index/SolrField.java new file mode 100644 index 0000000000..6768777686 --- /dev/null +++ b/source/net/yacy/search/index/SolrField.java @@ -0,0 +1,173 @@ +/** + * SolrField + * Copyright 2011 by Michael Peter Christen + * First released 14.04.2011 at http://yacy.net + * + * $LastChangedDate: 2011-04-14 22:05:04 +0200 (Do, 14 Apr 2011) $ + * $LastChangedRevision: 7654 $ + * $LastChangedBy: orbiter $ + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . + */ + +package net.yacy.search.index; + +import net.yacy.cora.services.federated.solr.SolrType; + +public enum SolrField implements net.yacy.cora.services.federated.solr.SolrField { + + id(SolrType.string, true, true, "primary key of document, the URL hash"), + sku(SolrType.text_en_splitting_tight, true, true, false, true, "url of document"), + ip_s(SolrType.string, true, true, "ip of host of url (after DNS lookup)"), + host_s(SolrType.string, true, true, "host of the url"), + title(SolrType.text_general, true, true, true, "content of title tag"), + author(SolrType.text_general, true, true, "content of author-tag"), + description(SolrType.text_general, true, true, "content of description-tag"), + content_type(SolrType.string, true, true, true, "mime-type of document"), + last_modified(SolrType.date, true, true, "last-modified from http header"), + keywords(SolrType.text_general, true, true, "content of keywords tag; words are separated by space"), + text_t(SolrType.text_general, true, true, "all visible text"), + wordcount_i(SolrType.integer, true, true, "number of words in visible area"), + paths_txt(SolrType.text_general, true, true, true, "all path elements in the url"), + // encoded as binary value into an integer: + // bit 0: "all" contained in html header meta + // bit 1: "index" contained in html header meta + // bit 2: "noindex" contained in html header meta + // bit 3: "nofollow" contained in html header meta + // bit 8: "noarchive" contained in http header properties + // bit 9: "nosnippet" contained in http header properties + // bit 10: "noindex" contained in http header properties + // bit 11: "nofollow" contained in http header properties + // bit 12: "unavailable_after" contained in http header properties + robots_i(SolrType.integer, true, true, "content of tag and the \"X-Robots-Tag\" HTTP property"), + inboundlinkscount_i(SolrType.integer, true, true, "total number of inbound links"), + inboundlinksnofollowcount_i(SolrType.integer, true, true, "number of inbound links with nofollow tag"), + inboundlinks_tag_txt(SolrType.text_general, true, true, true, "internal links, normalized (absolute URLs), as - tag with anchor text and nofollow"), + inboundlinks_protocol_txt(SolrType.text_general, true, true, true, "internal links, only the protocol"), + inboundlinks_urlstub_txt(SolrType.text_general, true, true, true, "internal links, the url only without the protocol"), + inboundlinks_name_txt(SolrType.text_general, true, true, true, "internal links, the name property of the a-tag"), + inboundlinks_rel_txt(SolrType.text_general, true, true, true, "internal links, the rel property of the a-tag"), + inboundlinks_relflags_txt(SolrType.text_general, true, true, true, "internal links, the rel property of the a-tag, coded binary"), + inboundlinks_text_txt(SolrType.text_general, true, true, true, "internal links, the text content of the a-tag"), + outboundlinkscount_i(SolrType.integer, true, true, "external number of inbound links"), + outboundlinksnofollowcount_i(SolrType.integer, true, true, "number of external links with nofollow tag"), + outboundlinks_tag_txt(SolrType.text_general, true, true, true, "external links, normalized (absolute URLs), as - tag with anchor text and nofollow"), + outboundlinks_protocol_txt(SolrType.text_general, true, true, true, "external links, only the protocol"), + outboundlinks_urlstub_txt(SolrType.text_general, true, true, true, "external links, the url only without the protocol"), + outboundlinks_name_txt(SolrType.text_general, true, true, true, "external links, the name property of the a-tag"), + outboundlinks_rel_txt(SolrType.text_general, true, true, true, "external links, the rel property of the a-tag"), + outboundlinks_relflags_txt(SolrType.text_general, true, true, true, "external links, the rel property of the a-tag, coded binary"), + outboundlinks_text_txt(SolrType.text_general, true, true, true, "external links, the text content of the a-tag"), + charset_s(SolrType.string, true, true, "character encoding"), + lon_coordinate(SolrType.tdouble, true, false, "longitude of location as declared in WSG84"), + lat_coordinate(SolrType.tdouble, true, false, "latitude of location as declared in WSG84"), + httpstatus_i(SolrType.integer, true, true, "html status return code (i.e. \"200\" for ok), -1 if not loaded"), + h1_txt(SolrType.text_general, true, true, true, "h1 header"), + h2_txt(SolrType.text_general, true, true, true, "h2 header"), + h3_txt(SolrType.text_general, true, true, true, "h3 header"), + h4_txt(SolrType.text_general, true, true, true, "h4 header"), + h5_txt(SolrType.text_general, true, true, true, "h5 header"), + h6_txt(SolrType.text_general, true, true, true, "h6 header"), + htags_i(SolrType.integer, true, true, "binary pattern for the existance of h1..h6 headlines"), + canonical_s(SolrType.string, true, true, "url inside the canonical link element"), + metagenerator_t(SolrType.text_general, true, true, "content of tag"), + boldcount_i(SolrType.integer, true, true, "total number of occurrences of or "), + bold_txt(SolrType.text_general, true, true, true, "all texts inside of or tags. no doubles. listed in the order of number of occurrences in decreasing order"), + bold_val(SolrType.integer, true, true, true, "number of occurrences of texts in bold_txt"), + italiccount_i(SolrType.integer, true, true, "total number of occurrences of "), + italic_txt(SolrType.text_general, true, true, true, "all texts inside of tags. no doubles. listed in the order of number of occurrences in decreasing order"), + italic_val(SolrType.integer, true, true, true, "number of occurrences of texts in italic_txt"), + licount_i(SolrType.integer, true, true, "number of
  • tags"), + li_txt(SolrType.text_general, true, true, true, "all texts in
  • tags"), + imagescount_i(SolrType.integer, true, true, "number of images"), + images_tag_txt(SolrType.text_general, true, true, true, " all image tags, encoded as tag inclusive alt- and title property"), + images_protocol_txt(SolrType.text_general, true, true, true, "all image link protocols"), + images_urlstub_txt(SolrType.text_general, true, true, true, "all image links without the protocol and '://'"), + images_alt_txt(SolrType.text_general, true, true, true, "all image link alt tag"), + csscount_i(SolrType.integer, true, true, "number of entries in css_tag_txt and css_url_txt"), + css_tag_txt(SolrType.text_general, true, true, true, "full css tag with normalized url"), + css_url_txt(SolrType.text_general, true, true, true, "normalized urls within a css tag"), + scripts_txt(SolrType.text_general, true, true, true, "normaluzed urls within a scripts tag"), + scriptscount_i(SolrType.integer, true, true, "number of entries in scripts_txt"), + frames_txt(SolrType.text_general, true, true, true, "list of all links to frames"), + framesscount_i(SolrType.integer, true, true, "number of frames_txt"), + iframes_txt(SolrType.text_general, true, true, true, "list of all links to iframes"), + iframesscount_i(SolrType.integer, true, true, "number of iframes_txt"), + flash_b(SolrType.bool, true, true, "flag that shows if a swf file is linked"), + responsetime_i(SolrType.integer, true, true, "response time of target server in milliseconds"), + ext_cms_txt(SolrType.text_general, true, true, true, "names of cms attributes; if several are recognized then they are listen in decreasing order of number of matching criterias"), + ext_cms_val(SolrType.integer, true, true, true, "number of attributes that count for a specific cms in ext_cms_txt"), + ext_ads_txt(SolrType.text_general, true, true, true, "names of ad-servers/ad-services"), + ext_ads_val(SolrType.integer, true, true, true, "number of attributes counts in ext_ads_txt"), + ext_community_txt(SolrType.text_general, true, true, true, "names of recognized community functions"), + ext_community_val(SolrType.integer, true, true, true, "number of attribute counts in attr_community"), + ext_maps_txt(SolrType.text_general, true, true, true, "names of map services"), + ext_maps_val(SolrType.integer, true, true, true, "number of attribute counts in ext_maps_txt"), + ext_tracker_txt(SolrType.text_general, true, true, true, "names of tracker server"), + ext_tracker_val(SolrType.integer, true, true, true, "number of attribute counts in ext_tracker_txt"), + ext_title_txt(SolrType.text_general, true, true, true, "names matching title expressions"), + ext_title_val(SolrType.integer, true, true, true, "number of matching title expressions"), + failreason_t(SolrType.text_general, true, true, "fail reason if a page was not loaded. if the page was loaded then this field is empty"); + + final SolrType type; + final boolean indexed, stored; + boolean multiValued, omitNorms; + final String comment; + + private SolrField(final SolrType type, final boolean indexed, final boolean stored, final String comment) { + this.type = type; + this.indexed = indexed; + this.stored = stored; + this.multiValued = false; + this.omitNorms = false; + this.comment = comment; + } + + private SolrField(final SolrType type, final boolean indexed, final boolean stored, final boolean multiValued, final String comment) { + this(type, indexed, stored, comment); + this.multiValued = multiValued; + } + + private SolrField(final SolrType type, final boolean indexed, final boolean stored, final boolean multiValued, final boolean omitNorms, final String comment) { + this(type, indexed, stored, multiValued, comment); + this.omitNorms = omitNorms; + } + + public final SolrType getType() { + return this.type; + } + + public final boolean isIndexed() { + return this.indexed; + } + + public final boolean isStored() { + return this.stored; + } + + public final boolean isMultiValued() { + return this.multiValued; + } + + public final boolean isOmitNorms() { + return this.omitNorms; + } + + public final String getComment() { + return this.comment; + } + +} + diff --git a/source/net/yacy/cora/services/federated/solr/SolrScheme.java b/source/net/yacy/search/index/SolrScheme.java similarity index 50% rename from source/net/yacy/cora/services/federated/solr/SolrScheme.java rename to source/net/yacy/search/index/SolrScheme.java index 8302b24e93..4c2b30beec 100644 --- a/source/net/yacy/cora/services/federated/solr/SolrScheme.java +++ b/source/net/yacy/search/index/SolrScheme.java @@ -22,7 +22,7 @@ * If not, see . */ -package net.yacy.cora.services.federated.solr; +package net.yacy.search.index; import java.io.File; @@ -67,16 +67,16 @@ public SolrScheme() { */ public SolrScheme(final File configurationFile) { super(configurationFile); - // check consistency: compare with Field enum + // check consistency: compare with YaCyField enum for (String name: this) { try { - Field.valueOf(name); + SolrField.valueOf(name); } catch (IllegalArgumentException e) { Log.logWarning("SolrScheme", "solr scheme file " + configurationFile.getAbsolutePath() + " defines unknown attribute '" + name + "'"); } } /* - for (Field field: Field.values()) { + for (YaCyField field: YaCyField.values()) { if (!this.contains(field.name())) { Log.logWarning("SolrScheme", "solr scheme file " + configurationFile.getAbsolutePath() + " omits known attribute '" + field.name() + "'"); } @@ -84,228 +84,62 @@ public SolrScheme(final File configurationFile) { */ } - protected void addSolr(final SolrInputDocument solrdoc, final Field key, final String value) { + protected void addSolr(final SolrInputDocument solrdoc, final SolrField key, final String value) { if (isEmpty() || contains(key.name())) solrdoc.setField(key.name(), value); } - protected void addSolr(final SolrInputDocument solrdoc, final Field key, final Date value) { + protected void addSolr(final SolrInputDocument solrdoc, final SolrField key, final Date value) { if (isEmpty() || contains(key.name())) solrdoc.setField(key.name(), value); } - protected void addSolr(final SolrInputDocument solrdoc, final Field key, final int value) { + protected void addSolr(final SolrInputDocument solrdoc, final SolrField key, final int value) { if (isEmpty() || contains(key.name())) solrdoc.setField(key.name(), value); } - protected void addSolr(final SolrInputDocument solrdoc, final Field key, final String[] value) { + protected void addSolr(final SolrInputDocument solrdoc, final SolrField key, final String[] value) { if (isEmpty() || contains(key.name())) solrdoc.setField(key.name(), value); } - protected void addSolr(final SolrInputDocument solrdoc, final Field key, final float value) { + protected void addSolr(final SolrInputDocument solrdoc, final SolrField key, final float value) { if (isEmpty() || contains(key.name())) solrdoc.setField(key.name(), value); } - protected void addSolr(final SolrInputDocument solrdoc, final Field key, final boolean value) { + protected void addSolr(final SolrInputDocument solrdoc, final SolrField key, final boolean value) { if (isEmpty() || contains(key.name())) solrdoc.setField(key.name(), value); } - protected void addSolr(final SolrInputDocument solrdoc, final Field key, final String value, final float boost) { + protected void addSolr(final SolrInputDocument solrdoc, final SolrField key, final String value, final float boost) { if (isEmpty() || contains(key.name())) solrdoc.setField(key.name(), value, boost); } - public static enum Types { - string, - text_general, - text_en_splitting_tight, - date, - integer("int"), - tdouble, - bool("boolean"); - - private String printName; - private Types() { - this.printName = this.name(); - } - private Types(String printName) { - this.printName = printName; - } - public String printName() { - return this.printName; - } - } - - public static enum Field { - - id(Types.string, true, true, "primary key of document, the URL hash"), - sku(Types.text_en_splitting_tight, true, true, false, true, "url of document"), - ip_s(Types.string, true, true, "ip of host of url (after DNS lookup)"), - host_s(Types.string, true, true, "host of the url"), - title(Types.text_general, true, true, true, "content of title tag"), - author(Types.text_general, true, true, "content of author-tag"), - description(Types.text_general, true, true, "content of description-tag"), - content_type(Types.string, true, true, true, "mime-type of document"), - last_modified(Types.date, true, true, "last-modified from http header"), - keywords(Types.text_general, true, true, "content of keywords tag; words are separated by space"), - text_t(Types.text_general, true, true, "all visible text"), - wordcount_i(Types.integer, true, true, "number of words in visible area"), - paths_txt(Types.text_general, true, true, true, "all path elements in the url"), - // encoded as binary value into an integer: - // bit 0: "all" contained in html header meta - // bit 1: "index" contained in html header meta - // bit 2: "noindex" contained in html header meta - // bit 3: "nofollow" contained in html header meta - // bit 8: "noarchive" contained in http header properties - // bit 9: "nosnippet" contained in http header properties - // bit 10: "noindex" contained in http header properties - // bit 11: "nofollow" contained in http header properties - // bit 12: "unavailable_after" contained in http header properties - robots_i(Types.integer, true, true, "content of tag and the \"X-Robots-Tag\" HTTP property"), - inboundlinkscount_i(Types.integer, true, true, "total number of inbound links"), - inboundlinksnofollowcount_i(Types.integer, true, true, "number of inbound links with nofollow tag"), - inboundlinks_tag_txt(Types.text_general, true, true, true, "internal links, normalized (absolute URLs), as - tag with anchor text and nofollow"), - inboundlinks_protocol_txt(Types.text_general, true, true, true, "internal links, only the protocol"), - inboundlinks_urlstub_txt(Types.text_general, true, true, true, "internal links, the url only without the protocol"), - inboundlinks_name_txt(Types.text_general, true, true, true, "internal links, the name property of the a-tag"), - inboundlinks_rel_txt(Types.text_general, true, true, true, "internal links, the rel property of the a-tag"), - inboundlinks_relflags_txt(Types.text_general, true, true, true, "internal links, the rel property of the a-tag, coded binary"), - inboundlinks_text_txt(Types.text_general, true, true, true, "internal links, the text content of the a-tag"), - outboundlinkscount_i(Types.integer, true, true, "external number of inbound links"), - outboundlinksnofollowcount_i(Types.integer, true, true, "number of external links with nofollow tag"), - outboundlinks_tag_txt(Types.text_general, true, true, true, "external links, normalized (absolute URLs), as - tag with anchor text and nofollow"), - outboundlinks_protocol_txt(Types.text_general, true, true, true, "external links, only the protocol"), - outboundlinks_urlstub_txt(Types.text_general, true, true, true, "external links, the url only without the protocol"), - outboundlinks_name_txt(Types.text_general, true, true, true, "external links, the name property of the a-tag"), - outboundlinks_rel_txt(Types.text_general, true, true, true, "external links, the rel property of the a-tag"), - outboundlinks_relflags_txt(Types.text_general, true, true, true, "external links, the rel property of the a-tag, coded binary"), - outboundlinks_text_txt(Types.text_general, true, true, true, "external links, the text content of the a-tag"), - charset_s(Types.string, true, true, "character encoding"), - lon_coordinate(Types.tdouble, true, false, "longitude of location as declared in WSG84"), - lat_coordinate(Types.tdouble, true, false, "latitude of location as declared in WSG84"), - httpstatus_i(Types.integer, true, true, "html status return code (i.e. \"200\" for ok), -1 if not loaded"), - h1_txt(Types.text_general, true, true, true, "h1 header"), - h2_txt(Types.text_general, true, true, true, "h2 header"), - h3_txt(Types.text_general, true, true, true, "h3 header"), - h4_txt(Types.text_general, true, true, true, "h4 header"), - h5_txt(Types.text_general, true, true, true, "h5 header"), - h6_txt(Types.text_general, true, true, true, "h6 header"), - htags_i(Types.integer, true, true, "binary pattern for the existance of h1..h6 headlines"), - canonical_s(Types.string, true, true, "url inside the canonical link element"), - metagenerator_t(Types.text_general, true, true, "content of tag"), - boldcount_i(Types.integer, true, true, "total number of occurrences of or "), - bold_txt(Types.text_general, true, true, true, "all texts inside of or tags. no doubles. listed in the order of number of occurrences in decreasing order"), - bold_val(Types.integer, true, true, true, "number of occurrences of texts in bold_txt"), - italiccount_i(Types.integer, true, true, "total number of occurrences of "), - italic_txt(Types.text_general, true, true, true, "all texts inside of tags. no doubles. listed in the order of number of occurrences in decreasing order"), - italic_val(Types.integer, true, true, true, "number of occurrences of texts in italic_txt"), - licount_i(Types.integer, true, true, "number of
  • tags"), - li_txt(Types.text_general, true, true, true, "all texts in
  • tags"), - imagescount_i(Types.integer, true, true, "number of images"), - images_tag_txt(Types.text_general, true, true, true, " all image tags, encoded as tag inclusive alt- and title property"), - images_protocol_txt(Types.text_general, true, true, true, "all image link protocols"), - images_urlstub_txt(Types.text_general, true, true, true, "all image links without the protocol and '://'"), - images_alt_txt(Types.text_general, true, true, true, "all image link alt tag"), - csscount_i(Types.integer, true, true, "number of entries in css_tag_txt and css_url_txt"), - css_tag_txt(Types.text_general, true, true, true, "full css tag with normalized url"), - css_url_txt(Types.text_general, true, true, true, "normalized urls within a css tag"), - scripts_txt(Types.text_general, true, true, true, "normaluzed urls within a scripts tag"), - scriptscount_i(Types.integer, true, true, "number of entries in scripts_txt"), - frames_txt(Types.text_general, true, true, true, "list of all links to frames"), - framesscount_i(Types.integer, true, true, "number of frames_txt"), - iframes_txt(Types.text_general, true, true, true, "list of all links to iframes"), - iframesscount_i(Types.integer, true, true, "number of iframes_txt"), - flash_b(Types.bool, true, true, "flag that shows if a swf file is linked"), - responsetime_i(Types.integer, true, true, "response time of target server in milliseconds"), - ext_cms_txt(Types.text_general, true, true, true, "names of cms attributes; if several are recognized then they are listen in decreasing order of number of matching criterias"), - ext_cms_val(Types.integer, true, true, true, "number of attributes that count for a specific cms in ext_cms_txt"), - ext_ads_txt(Types.text_general, true, true, true, "names of ad-servers/ad-services"), - ext_ads_val(Types.integer, true, true, true, "number of attributes counts in ext_ads_txt"), - ext_community_txt(Types.text_general, true, true, true, "names of recognized community functions"), - ext_community_val(Types.integer, true, true, true, "number of attribute counts in attr_community"), - ext_maps_txt(Types.text_general, true, true, true, "names of map services"), - ext_maps_val(Types.integer, true, true, true, "number of attribute counts in ext_maps_txt"), - ext_tracker_txt(Types.text_general, true, true, true, "names of tracker server"), - ext_tracker_val(Types.integer, true, true, true, "number of attribute counts in ext_tracker_txt"), - ext_title_txt(Types.text_general, true, true, true, "names matching title expressions"), - ext_title_val(Types.integer, true, true, true, "number of matching title expressions"), - failreason_t(Types.text_general, true, true, "fail reason if a page was not loaded. if the page was loaded then this field is empty"); - - final Types type; - final boolean indexed, stored; - boolean multiValued, omitNorms; - final String comment; - - private Field(final Types type, final boolean indexed, final boolean stored, final String comment) { - this.type = type; - this.indexed = indexed; - this.stored = stored; - this.multiValued = false; - this.omitNorms = false; - this.comment = comment; - } - - private Field(final Types type, final boolean indexed, final boolean stored, final boolean multiValued, final String comment) { - this(type, indexed, stored, comment); - this.multiValued = multiValued; - } - - private Field(final Types type, final boolean indexed, final boolean stored, final boolean multiValued, final boolean omitNorms, final String comment) { - this(type, indexed, stored, multiValued, comment); - this.omitNorms = omitNorms; - } - - public final Types getType() { - return this.type; - } - - public final boolean isIndexed() { - return this.indexed; - } - - public final boolean isStored() { - return this.stored; - } - - public final boolean isMultiValued() { - return this.multiValued; - } - - public final boolean isOmitNorms() { - return this.omitNorms; - } - - public final String getComment() { - return this.comment; - } - - } - public SolrInputDocument yacy2solr(final String id, final ResponseHeader header, final Document yacydoc) { // we user the SolrCell design as index scheme final SolrInputDocument solrdoc = new SolrInputDocument(); final DigestURI digestURI = new DigestURI(yacydoc.dc_source()); - addSolr(solrdoc, Field.failreason_t, ""); // overwrite a possible fail reason (in case that there was a fail reason before) - addSolr(solrdoc, Field.id, id); - addSolr(solrdoc, Field.sku, digestURI.toNormalform(true, false)); + addSolr(solrdoc, SolrField.failreason_t, ""); // overwrite a possible fail reason (in case that there was a fail reason before) + addSolr(solrdoc, SolrField.id, id); + addSolr(solrdoc, SolrField.sku, digestURI.toNormalform(true, false)); final InetAddress address = digestURI.getInetAddress(); - if (address != null) addSolr(solrdoc, Field.ip_s, address.getHostAddress()); - if (digestURI.getHost() != null) addSolr(solrdoc, Field.host_s, digestURI.getHost()); - addSolr(solrdoc, Field.title, yacydoc.dc_title()); - addSolr(solrdoc, Field.author, yacydoc.dc_creator()); - addSolr(solrdoc, Field.description, yacydoc.dc_description()); - addSolr(solrdoc, Field.content_type, yacydoc.dc_format()); - addSolr(solrdoc, Field.last_modified, header.lastModified()); - addSolr(solrdoc, Field.keywords, yacydoc.dc_subject(' ')); + if (address != null) addSolr(solrdoc, SolrField.ip_s, address.getHostAddress()); + if (digestURI.getHost() != null) addSolr(solrdoc, SolrField.host_s, digestURI.getHost()); + addSolr(solrdoc, SolrField.title, yacydoc.dc_title()); + addSolr(solrdoc, SolrField.author, yacydoc.dc_creator()); + addSolr(solrdoc, SolrField.description, yacydoc.dc_description()); + addSolr(solrdoc, SolrField.content_type, yacydoc.dc_format()); + addSolr(solrdoc, SolrField.last_modified, header.lastModified()); + addSolr(solrdoc, SolrField.keywords, yacydoc.dc_subject(' ')); final String content = UTF8.String(yacydoc.getTextBytes()); - addSolr(solrdoc, Field.text_t, content); - if (isEmpty() || contains(Field.wordcount_i.name())) { + addSolr(solrdoc, SolrField.text_t, content); + if (isEmpty() || contains(SolrField.wordcount_i.name())) { final int contentwc = content.split(" ").length; - addSolr(solrdoc, Field.wordcount_i, contentwc); + addSolr(solrdoc, SolrField.wordcount_i, contentwc); } // path elements of link final String path = digestURI.getPath(); - if (path != null && (isEmpty() || contains(Field.paths_txt.name()))) { + if (path != null && (isEmpty() || contains(SolrField.paths_txt.name()))) { final String[] paths = path.split("/"); - if (paths.length > 0) addSolr(solrdoc, Field.paths_txt, paths); + if (paths.length > 0) addSolr(solrdoc, SolrField.paths_txt, paths); } // get list of all links; they will be shrinked by urls that appear in other fields of the solr scheme @@ -322,17 +156,17 @@ public SolrInputDocument yacy2solr(final String id, final ResponseHeader header, int f = 1; String[] hs; - hs = html.getHeadlines(1); h = h | (hs.length > 0 ? f : 0); f = f * 2; addSolr(solrdoc, Field.h1_txt, hs); - hs = html.getHeadlines(2); h = h | (hs.length > 0 ? f : 0); f = f * 2; addSolr(solrdoc, Field.h2_txt, hs); - hs = html.getHeadlines(3); h = h | (hs.length > 0 ? f : 0); f = f * 2; addSolr(solrdoc, Field.h3_txt, hs); - hs = html.getHeadlines(4); h = h | (hs.length > 0 ? f : 0); f = f * 2; addSolr(solrdoc, Field.h4_txt, hs); - hs = html.getHeadlines(5); h = h | (hs.length > 0 ? f : 0); f = f * 2; addSolr(solrdoc, Field.h5_txt, hs); - hs = html.getHeadlines(6); h = h | (hs.length > 0 ? f : 0); f = f * 2; addSolr(solrdoc, Field.h6_txt, hs); + hs = html.getHeadlines(1); h = h | (hs.length > 0 ? f : 0); f = f * 2; addSolr(solrdoc, SolrField.h1_txt, hs); + hs = html.getHeadlines(2); h = h | (hs.length > 0 ? f : 0); f = f * 2; addSolr(solrdoc, SolrField.h2_txt, hs); + hs = html.getHeadlines(3); h = h | (hs.length > 0 ? f : 0); f = f * 2; addSolr(solrdoc, SolrField.h3_txt, hs); + hs = html.getHeadlines(4); h = h | (hs.length > 0 ? f : 0); f = f * 2; addSolr(solrdoc, SolrField.h4_txt, hs); + hs = html.getHeadlines(5); h = h | (hs.length > 0 ? f : 0); f = f * 2; addSolr(solrdoc, SolrField.h5_txt, hs); + hs = html.getHeadlines(6); h = h | (hs.length > 0 ? f : 0); f = f * 2; addSolr(solrdoc, SolrField.h6_txt, hs); - addSolr(solrdoc, Field.htags_i, h); + addSolr(solrdoc, SolrField.htags_i, h); // canonical tag - if (html.getCanonical() != null) addSolr(solrdoc, Field.canonical_s, html.getCanonical().toNormalform(false, false)); + if (html.getCanonical() != null) addSolr(solrdoc, SolrField.canonical_s, html.getCanonical().toNormalform(false, false)); // noindex and nofollow attributes // from HTML (meta-tag in HTML header: robots) @@ -366,32 +200,32 @@ public SolrInputDocument yacy2solr(final String id, final ResponseHeader header, if (x_robots_tag.indexOf("nofollow",0) >= 0) b += 2048; // set bit 11 if (x_robots_tag.indexOf("unavailable_after",0) >=0) b += 4096; // set bit 12 } - addSolr(solrdoc, Field.robots_i, b); + addSolr(solrdoc, SolrField.robots_i, b); // meta tags: generator final String generator = html.getMetas().get("generator"); - if (generator != null) addSolr(solrdoc, Field.metagenerator_t, generator); + if (generator != null) addSolr(solrdoc, SolrField.metagenerator_t, generator); // bold, italic final String[] bold = html.getBold(); - addSolr(solrdoc, Field.boldcount_i, bold.length); + addSolr(solrdoc, SolrField.boldcount_i, bold.length); if (bold.length > 0) { - addSolr(solrdoc, Field.bold_txt, bold); - if (isEmpty() || contains(Field.bold_val.name())) { - addSolr(solrdoc, Field.bold_val, html.getBoldCount(bold)); + addSolr(solrdoc, SolrField.bold_txt, bold); + if (isEmpty() || contains(SolrField.bold_val.name())) { + addSolr(solrdoc, SolrField.bold_val, html.getBoldCount(bold)); } } final String[] italic = html.getItalic(); - addSolr(solrdoc, Field.italiccount_i, italic.length); + addSolr(solrdoc, SolrField.italiccount_i, italic.length); if (italic.length > 0) { - addSolr(solrdoc, Field.italic_txt, italic); - if (isEmpty() || contains(Field.italic_val.name())) { - addSolr(solrdoc, Field.italic_val, html.getItalicCount(italic)); + addSolr(solrdoc, SolrField.italic_txt, italic); + if (isEmpty() || contains(SolrField.italic_val.name())) { + addSolr(solrdoc, SolrField.italic_val, html.getItalicCount(italic)); } } final String[] li = html.getLi(); - addSolr(solrdoc, Field.licount_i, li.length); - if (li.length > 0) addSolr(solrdoc, Field.li_txt, li); + addSolr(solrdoc, SolrField.licount_i, li.length); + if (li.length > 0) addSolr(solrdoc, SolrField.li_txt, li); // images final Collection imagesc = html.getImages().values(); @@ -410,14 +244,14 @@ public SolrInputDocument yacy2solr(final String id, final ResponseHeader header, imgalts[c] = ie.alt(); c++; } - addSolr(solrdoc, Field.imagescount_i, imgtags.length); - if (isEmpty() || contains(Field.images_tag_txt.name())) addSolr(solrdoc, Field.images_tag_txt, imgtags); - if (isEmpty() || contains(Field.images_protocol_txt.name())) addSolr(solrdoc, Field.images_protocol_txt, protocolList2indexedList(imgprots)); - if (isEmpty() || contains(Field.images_urlstub_txt.name())) addSolr(solrdoc, Field.images_urlstub_txt, imgstubs); - if (isEmpty() || contains(Field.images_alt_txt.name())) addSolr(solrdoc, Field.images_alt_txt, imgalts); + addSolr(solrdoc, SolrField.imagescount_i, imgtags.length); + if (isEmpty() || contains(SolrField.images_tag_txt.name())) addSolr(solrdoc, SolrField.images_tag_txt, imgtags); + if (isEmpty() || contains(SolrField.images_protocol_txt.name())) addSolr(solrdoc, SolrField.images_protocol_txt, protocolList2indexedList(imgprots)); + if (isEmpty() || contains(SolrField.images_urlstub_txt.name())) addSolr(solrdoc, SolrField.images_urlstub_txt, imgstubs); + if (isEmpty() || contains(SolrField.images_alt_txt.name())) addSolr(solrdoc, SolrField.images_alt_txt, imgalts); // style sheets - if (isEmpty() || contains(Field.css_tag_txt.name())) { + if (isEmpty() || contains(SolrField.css_tag_txt.name())) { final Map csss = html.getCSS(); final String[] css_tag = new String[csss.size()]; final String[] css_url = new String[csss.size()]; @@ -432,13 +266,13 @@ public SolrInputDocument yacy2solr(final String id, final ResponseHeader header, css_url[c] = url; c++; } - addSolr(solrdoc, Field.csscount_i, css_tag.length); - if (css_tag.length > 0) addSolr(solrdoc, Field.css_tag_txt, css_tag); - if (css_url.length > 0) addSolr(solrdoc, Field.css_url_txt, css_url); + addSolr(solrdoc, SolrField.csscount_i, css_tag.length); + if (css_tag.length > 0) addSolr(solrdoc, SolrField.css_tag_txt, css_tag); + if (css_url.length > 0) addSolr(solrdoc, SolrField.css_url_txt, css_url); } // Scripts - if (isEmpty() || contains(Field.scripts_txt.name())) { + if (isEmpty() || contains(SolrField.scripts_txt.name())) { final Set scriptss = html.getScript(); final String[] scripts = new String[scriptss.size()]; c = 0; @@ -447,12 +281,12 @@ public SolrInputDocument yacy2solr(final String id, final ResponseHeader header, ouboundLinks.remove(url); scripts[c++] = url.toNormalform(false, false, false, false); } - addSolr(solrdoc, Field.scriptscount_i, scripts.length); - if (scripts.length > 0) addSolr(solrdoc, Field.scripts_txt, scripts); + addSolr(solrdoc, SolrField.scriptscount_i, scripts.length); + if (scripts.length > 0) addSolr(solrdoc, SolrField.scripts_txt, scripts); } // Frames - if (isEmpty() || contains(Field.frames_txt.name())) { + if (isEmpty() || contains(SolrField.frames_txt.name())) { final Set framess = html.getFrames(); final String[] frames = new String[framess.size()]; c = 0; @@ -461,12 +295,12 @@ public SolrInputDocument yacy2solr(final String id, final ResponseHeader header, ouboundLinks.remove(url); frames[c++] = url.toNormalform(false, false, false, false); } - addSolr(solrdoc, Field.framesscount_i, frames.length); - if (frames.length > 0) addSolr(solrdoc, Field.frames_txt, frames); + addSolr(solrdoc, SolrField.framesscount_i, frames.length); + if (frames.length > 0) addSolr(solrdoc, SolrField.frames_txt, frames); } // IFrames - if (isEmpty() || contains(Field.iframes_txt.name())) { + if (isEmpty() || contains(SolrField.iframes_txt.name())) { final Set iframess = html.getIFrames(); final String[] iframes = new String[iframess.size()]; c = 0; @@ -475,33 +309,33 @@ public SolrInputDocument yacy2solr(final String id, final ResponseHeader header, ouboundLinks.remove(url); iframes[c++] = url.toNormalform(false, false, false, false); } - addSolr(solrdoc, Field.iframesscount_i, iframes.length); - if (iframes.length > 0) addSolr(solrdoc, Field.iframes_txt, iframes); + addSolr(solrdoc, SolrField.iframesscount_i, iframes.length); + if (iframes.length > 0) addSolr(solrdoc, SolrField.iframes_txt, iframes); } // flash embedded - addSolr(solrdoc, Field.flash_b, html.containsFlash()); + addSolr(solrdoc, SolrField.flash_b, html.containsFlash()); // generic evaluation pattern for (final String model: html.getEvaluationModelNames()) { if (isEmpty() || contains("ext_" + model + "_txt")) { final String[] scorenames = html.getEvaluationModelScoreNames(model); if (scorenames.length > 0) { - addSolr(solrdoc, Field.valueOf("ext_" + model + "_txt"), scorenames); - addSolr(solrdoc, Field.valueOf("ext_" + model + "_val"), html.getEvaluationModelScoreCounts(model, scorenames)); + addSolr(solrdoc, SolrField.valueOf("ext_" + model + "_txt"), scorenames); + addSolr(solrdoc, SolrField.valueOf("ext_" + model + "_val"), html.getEvaluationModelScoreCounts(model, scorenames)); } } } // response time - addSolr(solrdoc, Field.responsetime_i, header.get(HeaderFramework.RESPONSE_TIME_MILLIS, "0")); + addSolr(solrdoc, SolrField.responsetime_i, header.get(HeaderFramework.RESPONSE_TIME_MILLIS, "0")); } // list all links final Map alllinks = yacydoc.getAnchors(); c = 0; - if (isEmpty() || contains(Field.inboundlinkscount_i.name())) addSolr(solrdoc, Field.inboundlinkscount_i, inboundLinks.size()); - if (isEmpty() || contains(Field.inboundlinksnofollowcount_i.name())) addSolr(solrdoc, Field.inboundlinksnofollowcount_i, yacydoc.inboundLinkNofollowCount()); + if (isEmpty() || contains(SolrField.inboundlinkscount_i.name())) addSolr(solrdoc, SolrField.inboundlinkscount_i, inboundLinks.size()); + if (isEmpty() || contains(SolrField.inboundlinksnofollowcount_i.name())) addSolr(solrdoc, SolrField.inboundlinksnofollowcount_i, yacydoc.inboundLinkNofollowCount()); final String[] inboundlinksTag = new String[inboundLinks.size()]; final String[] inboundlinksURLProtocol = new String[inboundLinks.size()]; final String[] inboundlinksURLStub = new String[inboundLinks.size()]; @@ -528,17 +362,17 @@ public SolrInputDocument yacy2solr(final String id, final ResponseHeader header, ((text.length() > 0) ? text : "") + ""; c++; } - if (isEmpty() || contains(Field.inboundlinks_tag_txt.name())) addSolr(solrdoc, Field.inboundlinks_tag_txt, inboundlinksTag); - if (isEmpty() || contains(Field.inboundlinks_protocol_txt.name())) addSolr(solrdoc, Field.inboundlinks_protocol_txt, protocolList2indexedList(inboundlinksURLProtocol)); - if (isEmpty() || contains(Field.inboundlinks_urlstub_txt.name())) addSolr(solrdoc, Field.inboundlinks_urlstub_txt, inboundlinksURLStub); - if (isEmpty() || contains(Field.inboundlinks_name_txt.name())) addSolr(solrdoc, Field.inboundlinks_name_txt, inboundlinksName); - if (isEmpty() || contains(Field.inboundlinks_rel_txt.name())) addSolr(solrdoc, Field.inboundlinks_rel_txt, inboundlinksRel); - if (isEmpty() || contains(Field.inboundlinks_relflags_txt.name())) addSolr(solrdoc, Field.inboundlinks_relflags_txt, relEval(inboundlinksRel)); - if (isEmpty() || contains(Field.inboundlinks_text_txt.name())) addSolr(solrdoc, Field.inboundlinks_text_txt, inboundlinksText); + if (isEmpty() || contains(SolrField.inboundlinks_tag_txt.name())) addSolr(solrdoc, SolrField.inboundlinks_tag_txt, inboundlinksTag); + if (isEmpty() || contains(SolrField.inboundlinks_protocol_txt.name())) addSolr(solrdoc, SolrField.inboundlinks_protocol_txt, protocolList2indexedList(inboundlinksURLProtocol)); + if (isEmpty() || contains(SolrField.inboundlinks_urlstub_txt.name())) addSolr(solrdoc, SolrField.inboundlinks_urlstub_txt, inboundlinksURLStub); + if (isEmpty() || contains(SolrField.inboundlinks_name_txt.name())) addSolr(solrdoc, SolrField.inboundlinks_name_txt, inboundlinksName); + if (isEmpty() || contains(SolrField.inboundlinks_rel_txt.name())) addSolr(solrdoc, SolrField.inboundlinks_rel_txt, inboundlinksRel); + if (isEmpty() || contains(SolrField.inboundlinks_relflags_txt.name())) addSolr(solrdoc, SolrField.inboundlinks_relflags_txt, relEval(inboundlinksRel)); + if (isEmpty() || contains(SolrField.inboundlinks_text_txt.name())) addSolr(solrdoc, SolrField.inboundlinks_text_txt, inboundlinksText); c = 0; - if (isEmpty() || contains(Field.outboundlinkscount_i.name())) addSolr(solrdoc, Field.outboundlinkscount_i, ouboundLinks.size()); - if (isEmpty() || contains(Field.outboundlinksnofollowcount_i.name())) addSolr(solrdoc, Field.outboundlinksnofollowcount_i, yacydoc.outboundLinkNofollowCount()); + if (isEmpty() || contains(SolrField.outboundlinkscount_i.name())) addSolr(solrdoc, SolrField.outboundlinkscount_i, ouboundLinks.size()); + if (isEmpty() || contains(SolrField.outboundlinksnofollowcount_i.name())) addSolr(solrdoc, SolrField.outboundlinksnofollowcount_i, yacydoc.outboundLinkNofollowCount()); final String[] outboundlinksTag = new String[ouboundLinks.size()]; final String[] outboundlinksURLProtocol = new String[ouboundLinks.size()]; final String[] outboundlinksURLStub = new String[ouboundLinks.size()]; @@ -565,24 +399,24 @@ public SolrInputDocument yacy2solr(final String id, final ResponseHeader header, ((text.length() > 0) ? text : "") + ""; c++; } - if (isEmpty() || contains(Field.outboundlinks_tag_txt.name())) addSolr(solrdoc, Field.outboundlinks_tag_txt, outboundlinksTag); - if (isEmpty() || contains(Field.outboundlinks_protocol_txt.name())) addSolr(solrdoc, Field.outboundlinks_protocol_txt, protocolList2indexedList(outboundlinksURLProtocol)); - if (isEmpty() || contains(Field.outboundlinks_urlstub_txt.name())) addSolr(solrdoc, Field.outboundlinks_urlstub_txt, outboundlinksURLStub); - if (isEmpty() || contains(Field.outboundlinks_name_txt.name())) addSolr(solrdoc, Field.outboundlinks_name_txt, outboundlinksName); - if (isEmpty() || contains(Field.outboundlinks_rel_txt.name())) addSolr(solrdoc, Field.outboundlinks_rel_txt, outboundlinksRel); - if (isEmpty() || contains(Field.outboundlinks_relflags_txt.name())) addSolr(solrdoc, Field.outboundlinks_relflags_txt, relEval(inboundlinksRel)); - if (isEmpty() || contains(Field.outboundlinks_text_txt.name())) addSolr(solrdoc, Field.outboundlinks_text_txt, outboundlinksText); + if (isEmpty() || contains(SolrField.outboundlinks_tag_txt.name())) addSolr(solrdoc, SolrField.outboundlinks_tag_txt, outboundlinksTag); + if (isEmpty() || contains(SolrField.outboundlinks_protocol_txt.name())) addSolr(solrdoc, SolrField.outboundlinks_protocol_txt, protocolList2indexedList(outboundlinksURLProtocol)); + if (isEmpty() || contains(SolrField.outboundlinks_urlstub_txt.name())) addSolr(solrdoc, SolrField.outboundlinks_urlstub_txt, outboundlinksURLStub); + if (isEmpty() || contains(SolrField.outboundlinks_name_txt.name())) addSolr(solrdoc, SolrField.outboundlinks_name_txt, outboundlinksName); + if (isEmpty() || contains(SolrField.outboundlinks_rel_txt.name())) addSolr(solrdoc, SolrField.outboundlinks_rel_txt, outboundlinksRel); + if (isEmpty() || contains(SolrField.outboundlinks_relflags_txt.name())) addSolr(solrdoc, SolrField.outboundlinks_relflags_txt, relEval(inboundlinksRel)); + if (isEmpty() || contains(SolrField.outboundlinks_text_txt.name())) addSolr(solrdoc, SolrField.outboundlinks_text_txt, outboundlinksText); // charset - addSolr(solrdoc, Field.charset_s, yacydoc.getCharset()); + addSolr(solrdoc, SolrField.charset_s, yacydoc.getCharset()); // coordinates if (yacydoc.lat() != 0.0f && yacydoc.lon() != 0.0f) { - addSolr(solrdoc, Field.lon_coordinate, yacydoc.lon()); - addSolr(solrdoc, Field.lat_coordinate, yacydoc.lat()); + addSolr(solrdoc, SolrField.lon_coordinate, yacydoc.lon()); + addSolr(solrdoc, SolrField.lat_coordinate, yacydoc.lat()); } - addSolr(solrdoc, Field.httpstatus_i, 200); + addSolr(solrdoc, SolrField.httpstatus_i, 200); return solrdoc; } diff --git a/source/net/yacy/search/query/SnippetProcess.java b/source/net/yacy/search/query/SnippetProcess.java index 3d265a0659..e2772b0c28 100644 --- a/source/net/yacy/search/query/SnippetProcess.java +++ b/source/net/yacy/search/query/SnippetProcess.java @@ -502,7 +502,7 @@ public void run() { sd = sdl.get(0); } if (sd != null) { - solrContent = this.solr.getScheme().solrGetText(sd); + solrContent = Switchboard.getSwitchboard().solrScheme.solrGetText(sd); } }