From 7f63d3747d59e8db0fa4d17fa9314546f818537a Mon Sep 17 00:00:00 2001 From: reger Date: Wed, 15 May 2013 23:16:32 +0200 Subject: [PATCH] more generic field selection for reindex option of documents with disabled fields using Luke request to compare config with actual fields in index --- source/net/yacy/migration.java | 102 ++++++++++++------ .../search/index/ReindexSolrBusyThread.java | 41 ++++--- 2 files changed, 94 insertions(+), 49 deletions(-) diff --git a/source/net/yacy/migration.java b/source/net/yacy/migration.java index ec192dc422..74f13d3fb3 100644 --- a/source/net/yacy/migration.java +++ b/source/net/yacy/migration.java @@ -34,25 +34,20 @@ import net.yacy.search.SwitchboardConstants; import com.google.common.io.Files; -import static java.lang.Thread.MIN_PRIORITY; -import java.util.ArrayList; +import java.util.HashSet; import java.util.Iterator; -import java.util.concurrent.Semaphore; -import net.yacy.cora.federate.solr.connector.EmbeddedSolrConnector; +import java.util.Set; import net.yacy.cora.storage.Configuration.Entry; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.index.Index; import net.yacy.kelondro.index.Row; -import net.yacy.kelondro.workflow.AbstractBusyThread; -import net.yacy.kelondro.workflow.AbstractThread; import net.yacy.kelondro.workflow.BusyThread; -import net.yacy.kelondro.workflow.InstantBusyThread; -import net.yacy.kelondro.workflow.WorkflowThread; import net.yacy.search.index.Fulltext; import net.yacy.search.schema.CollectionConfiguration; -import org.apache.solr.common.SolrDocument; -import org.apache.solr.common.SolrDocumentList; -import org.apache.solr.common.SolrInputDocument; +import net.yacy.search.schema.CollectionSchema; +import org.apache.solr.client.solrj.SolrServerException; +import org.apache.solr.client.solrj.request.LukeRequest; +import org.apache.solr.client.solrj.response.LukeResponse; public class migration { //SVN constants @@ -365,34 +360,73 @@ public static int reindexToschema (final Switchboard sb) { // a reindex job is already running if (bt != null) { return bt.getJobCount(); - } - - ReindexSolrBusyThread reidx = new ReindexSolrBusyThread(null); // ("*:*" would reindex all) - - // add all disabled fields + } + + boolean lukeCheckok = false; + Set omitFields = new HashSet(3); + omitFields.add(CollectionSchema.author_sxt.getSolrFieldName()); // special fields to exclude from disabled check + omitFields.add(CollectionSchema.coordinate_p_0_coordinate.getSolrFieldName()); + omitFields.add(CollectionSchema.coordinate_p_1_coordinate.getSolrFieldName()); CollectionConfiguration colcfg = Switchboard.getSwitchboard().index.fulltext().getDefaultConfiguration(); - Iterator itcol = colcfg.entryIterator(); - while (itcol.hasNext()) { - Entry etr = itcol.next(); - if (!etr.enabled()) { - reidx.addSelectFieldname(etr.key()); + ReindexSolrBusyThread reidx = new ReindexSolrBusyThread(null); // ("*:*" would reindex all); + + try { // get all fields contained in index + LukeRequest lukeRequest = new LukeRequest(); + lukeRequest.setNumTerms(1); + LukeResponse lukeResponse = lukeRequest.process(Switchboard.getSwitchboard().index.fulltext().getDefaultEmbeddedConnector().getServer()); + + for (LukeResponse.FieldInfo solrfield : lukeResponse.getFieldInfo().values()) { + if (!colcfg.contains(solrfield.getName()) && !omitFields.contains(solrfield.getName())) { // add found fields not in config for reindexing + reidx.addSelectFieldname(solrfield.getName()); + } } + lukeCheckok = true; + } catch (SolrServerException ex) { + Log.logException(ex); + } catch (IOException ex) { + Log.logException(ex); } + + if (!lukeCheckok) { // if luke failed alternatively use config and manual list + // add all disabled fields + Iterator itcol = colcfg.entryIterator(); + while (itcol.hasNext()) { // check for disabled fields in config + Entry etr = itcol.next(); + if (!etr.enabled() && !omitFields.contains(etr.key())) { + reidx.addSelectFieldname(etr.key()); + } + } - // add obsolete fields (not longer part of main index) - reidx.addSelectFieldname("inboundlinks_tag_txt"); - reidx.addSelectFieldname("inboundlinks_relflags_val"); - reidx.addSelectFieldname("inboundlinks_rel_sxt"); - reidx.addSelectFieldname("inboundlinks_text_txt"); - reidx.addSelectFieldname("inboundlinks_alttag_txt"); + // add obsolete fields (not longer part of main index) + reidx.addSelectFieldname("author_s"); + reidx.addSelectFieldname("css_tag_txt"); + reidx.addSelectFieldname("css_url_txt"); + reidx.addSelectFieldname("scripts_txt"); + reidx.addSelectFieldname("images_tag_txt"); + reidx.addSelectFieldname("images_urlstub_txt"); + reidx.addSelectFieldname("canonical_t"); + reidx.addSelectFieldname("frames_txt"); + reidx.addSelectFieldname("iframes_txt"); - reidx.addSelectFieldname("outboundlinks_tag_txt"); - reidx.addSelectFieldname("outboundlinks_relflags_val"); - reidx.addSelectFieldname("outboundlinks_rel_sxt"); - reidx.addSelectFieldname("outboundlinks_text_txt"); - reidx.addSelectFieldname("outboundlinks_alttag_txt"); - - sb.deployThread("reindexSolr", "Reindex Solr", "reindex documents with obsolete fields in embedded Solr index", "/IndexReIndexMonitor_p.html",reidx /*privateWorkerThread*/, 0); + reidx.addSelectFieldname("inboundlinks_tag_txt"); + reidx.addSelectFieldname("inboundlinks_relflags_val"); + reidx.addSelectFieldname("inboundlinks_name_txt"); + reidx.addSelectFieldname("inboundlinks_rel_sxt"); + reidx.addSelectFieldname("inboundlinks_text_txt"); + reidx.addSelectFieldname("inboundlinks_text_chars_val"); + reidx.addSelectFieldname("inboundlinks_text_words_val"); + reidx.addSelectFieldname("inboundlinks_alttag_txt"); + + reidx.addSelectFieldname("outboundlinks_tag_txt"); + reidx.addSelectFieldname("outboundlinks_relflags_val"); + reidx.addSelectFieldname("outboundlinks_name_txt"); + reidx.addSelectFieldname("outboundlinks_rel_sxt"); + reidx.addSelectFieldname("outboundlinks_text_txt"); + reidx.addSelectFieldname("outboundlinks_text_chars_val"); + reidx.addSelectFieldname("outboundlinks_text_words_val"); + reidx.addSelectFieldname("outboundlinks_alttag_txt"); + } + sb.deployThread("reindexSolr", "Reindex Solr", "reindex documents with obsolete fields in embedded Solr index", "/IndexReIndexMonitor_p.html",reidx , 0); return 0; } } diff --git a/source/net/yacy/search/index/ReindexSolrBusyThread.java b/source/net/yacy/search/index/ReindexSolrBusyThread.java index 1697b25d1e..aac2b6a467 100644 --- a/source/net/yacy/search/index/ReindexSolrBusyThread.java +++ b/source/net/yacy/search/index/ReindexSolrBusyThread.java @@ -24,9 +24,15 @@ import net.yacy.search.Switchboard; import java.util.ArrayList; import java.util.concurrent.Semaphore; +import java.util.logging.Level; +import java.util.logging.Logger; import net.yacy.cora.federate.solr.connector.EmbeddedSolrConnector; import net.yacy.kelondro.workflow.AbstractBusyThread; import net.yacy.search.schema.CollectionConfiguration; +import org.apache.lucene.index.FieldInfo; +import org.apache.solr.client.solrj.SolrServerException; +import org.apache.solr.client.solrj.request.LukeRequest; +import org.apache.solr.client.solrj.response.LukeResponse; import org.apache.solr.common.SolrDocument; import org.apache.solr.common.SolrDocumentList; import org.apache.solr.common.SolrInputDocument; @@ -72,7 +78,7 @@ public ReindexSolrBusyThread(String query) { } setName("reindexSolr"); this.setPriority(Thread.MIN_PRIORITY); - + } /** @@ -126,14 +132,13 @@ public boolean job() { } else { Log.logInfo("MIGRATION-REINDEX", "reindex docs with query=" + query + " found=" + docstoreindex + " start=" + start); start = start + chunksize; - } - - for (SolrDocument doc : xdocs) { - SolrInputDocument idoc = colcfg.toSolrInputDocument(doc); - Switchboard.getSwitchboard().index.fulltext().putDocument(idoc); - processed++; - } - + + for (SolrDocument doc : xdocs) { + SolrInputDocument idoc = colcfg.toSolrInputDocument(doc); + Switchboard.getSwitchboard().index.fulltext().putDocument(idoc); + processed++; + } + } } catch (IOException ex) { Log.logException(ex); } finally { @@ -152,11 +157,15 @@ public boolean job() { } - @Override - public void terminate(final boolean waitFor) { - querylist.clear(); - super.terminate(waitFor); - } + @Override + public void terminate(final boolean waitFor) { + querylist.clear(); + // if interrupted without finished commit to reflect latest changes + if (docstoreindex > 0 && processed > 0) { + esc.commit(true); + } + super.terminate(waitFor); + } /** * @return total number of processed documents @@ -168,7 +177,7 @@ public int getProcessed() { /** * @return the currently processed Solr select query */ - public String getCurrentQuery() { + public String getCurrentQuery() { return querylist.isEmpty() ? "" : querylist.get(0); } @@ -186,6 +195,8 @@ public void freemem() { if (chunksize > 2) { this.chunksize = this.chunksize / 2; } + esc.commit(true); + start = 0; } }