Skip to content

Commit

Permalink
More generic field selection for the reindex option of documents with disabled fields
Browse files Browse the repository at this point in the history

Uses a Luke request to compare the configuration with the fields actually present in the index.
  • Loading branch information
reger committed May 15, 2013
1 parent c91c67c commit 7f63d37
Show file tree
Hide file tree
Showing 2 changed files with 94 additions and 49 deletions.
102 changes: 68 additions & 34 deletions source/net/yacy/migration.java
Expand Up @@ -34,25 +34,20 @@
import net.yacy.search.SwitchboardConstants;

import com.google.common.io.Files;
import static java.lang.Thread.MIN_PRIORITY;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.concurrent.Semaphore;
import net.yacy.cora.federate.solr.connector.EmbeddedSolrConnector;
import java.util.Set;
import net.yacy.cora.storage.Configuration.Entry;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.index.Index;
import net.yacy.kelondro.index.Row;
import net.yacy.kelondro.workflow.AbstractBusyThread;
import net.yacy.kelondro.workflow.AbstractThread;
import net.yacy.kelondro.workflow.BusyThread;
import net.yacy.kelondro.workflow.InstantBusyThread;
import net.yacy.kelondro.workflow.WorkflowThread;
import net.yacy.search.index.Fulltext;
import net.yacy.search.schema.CollectionConfiguration;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrInputDocument;
import net.yacy.search.schema.CollectionSchema;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.request.LukeRequest;
import org.apache.solr.client.solrj.response.LukeResponse;

public class migration {
//SVN constants
Expand Down Expand Up @@ -365,34 +360,73 @@ public static int reindexToschema (final Switchboard sb) {
// a reindex job is already running
if (bt != null) {
return bt.getJobCount();
}

ReindexSolrBusyThread reidx = new ReindexSolrBusyThread(null); // ("*:*" would reindex all)

// add all disabled fields
}

boolean lukeCheckok = false;
Set<String> omitFields = new HashSet<String>(3);
omitFields.add(CollectionSchema.author_sxt.getSolrFieldName()); // special fields to exclude from disabled check
omitFields.add(CollectionSchema.coordinate_p_0_coordinate.getSolrFieldName());
omitFields.add(CollectionSchema.coordinate_p_1_coordinate.getSolrFieldName());
CollectionConfiguration colcfg = Switchboard.getSwitchboard().index.fulltext().getDefaultConfiguration();
Iterator<Entry> itcol = colcfg.entryIterator();
while (itcol.hasNext()) {
Entry etr = itcol.next();
if (!etr.enabled()) {
reidx.addSelectFieldname(etr.key());
ReindexSolrBusyThread reidx = new ReindexSolrBusyThread(null); // ("*:*" would reindex all);

try { // get all fields contained in index
LukeRequest lukeRequest = new LukeRequest();
lukeRequest.setNumTerms(1);
LukeResponse lukeResponse = lukeRequest.process(Switchboard.getSwitchboard().index.fulltext().getDefaultEmbeddedConnector().getServer());

for (LukeResponse.FieldInfo solrfield : lukeResponse.getFieldInfo().values()) {
if (!colcfg.contains(solrfield.getName()) && !omitFields.contains(solrfield.getName())) { // add found fields not in config for reindexing
reidx.addSelectFieldname(solrfield.getName());
}
}
lukeCheckok = true;
} catch (SolrServerException ex) {
Log.logException(ex);
} catch (IOException ex) {
Log.logException(ex);
}

if (!lukeCheckok) { // if luke failed alternatively use config and manual list
// add all disabled fields
Iterator<Entry> itcol = colcfg.entryIterator();
while (itcol.hasNext()) { // check for disabled fields in config
Entry etr = itcol.next();
if (!etr.enabled() && !omitFields.contains(etr.key())) {
reidx.addSelectFieldname(etr.key());
}
}

// add obsolete fields (no longer part of the main index)
reidx.addSelectFieldname("inboundlinks_tag_txt");
reidx.addSelectFieldname("inboundlinks_relflags_val");
reidx.addSelectFieldname("inboundlinks_rel_sxt");
reidx.addSelectFieldname("inboundlinks_text_txt");
reidx.addSelectFieldname("inboundlinks_alttag_txt");
// add obsolete fields (no longer part of the main index)
reidx.addSelectFieldname("author_s");
reidx.addSelectFieldname("css_tag_txt");
reidx.addSelectFieldname("css_url_txt");
reidx.addSelectFieldname("scripts_txt");
reidx.addSelectFieldname("images_tag_txt");
reidx.addSelectFieldname("images_urlstub_txt");
reidx.addSelectFieldname("canonical_t");
reidx.addSelectFieldname("frames_txt");
reidx.addSelectFieldname("iframes_txt");

reidx.addSelectFieldname("outboundlinks_tag_txt");
reidx.addSelectFieldname("outboundlinks_relflags_val");
reidx.addSelectFieldname("outboundlinks_rel_sxt");
reidx.addSelectFieldname("outboundlinks_text_txt");
reidx.addSelectFieldname("outboundlinks_alttag_txt");

sb.deployThread("reindexSolr", "Reindex Solr", "reindex documents with obsolete fields in embedded Solr index", "/IndexReIndexMonitor_p.html",reidx /*privateWorkerThread*/, 0);
reidx.addSelectFieldname("inboundlinks_tag_txt");
reidx.addSelectFieldname("inboundlinks_relflags_val");
reidx.addSelectFieldname("inboundlinks_name_txt");
reidx.addSelectFieldname("inboundlinks_rel_sxt");
reidx.addSelectFieldname("inboundlinks_text_txt");
reidx.addSelectFieldname("inboundlinks_text_chars_val");
reidx.addSelectFieldname("inboundlinks_text_words_val");
reidx.addSelectFieldname("inboundlinks_alttag_txt");

reidx.addSelectFieldname("outboundlinks_tag_txt");
reidx.addSelectFieldname("outboundlinks_relflags_val");
reidx.addSelectFieldname("outboundlinks_name_txt");
reidx.addSelectFieldname("outboundlinks_rel_sxt");
reidx.addSelectFieldname("outboundlinks_text_txt");
reidx.addSelectFieldname("outboundlinks_text_chars_val");
reidx.addSelectFieldname("outboundlinks_text_words_val");
reidx.addSelectFieldname("outboundlinks_alttag_txt");
}
sb.deployThread("reindexSolr", "Reindex Solr", "reindex documents with obsolete fields in embedded Solr index", "/IndexReIndexMonitor_p.html",reidx , 0);
return 0;
}
}
41 changes: 26 additions & 15 deletions source/net/yacy/search/index/ReindexSolrBusyThread.java
Expand Up @@ -24,9 +24,15 @@
import net.yacy.search.Switchboard;
import java.util.ArrayList;
import java.util.concurrent.Semaphore;
import java.util.logging.Level;
import java.util.logging.Logger;
import net.yacy.cora.federate.solr.connector.EmbeddedSolrConnector;
import net.yacy.kelondro.workflow.AbstractBusyThread;
import net.yacy.search.schema.CollectionConfiguration;
import org.apache.lucene.index.FieldInfo;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.request.LukeRequest;
import org.apache.solr.client.solrj.response.LukeResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrInputDocument;
Expand Down Expand Up @@ -72,7 +78,7 @@ public ReindexSolrBusyThread(String query) {
}
setName("reindexSolr");
this.setPriority(Thread.MIN_PRIORITY);

}

/**
Expand Down Expand Up @@ -126,14 +132,13 @@ public boolean job() {
} else {
Log.logInfo("MIGRATION-REINDEX", "reindex docs with query=" + query + " found=" + docstoreindex + " start=" + start);
start = start + chunksize;
}

for (SolrDocument doc : xdocs) {
SolrInputDocument idoc = colcfg.toSolrInputDocument(doc);
Switchboard.getSwitchboard().index.fulltext().putDocument(idoc);
processed++;
}


for (SolrDocument doc : xdocs) {
SolrInputDocument idoc = colcfg.toSolrInputDocument(doc);
Switchboard.getSwitchboard().index.fulltext().putDocument(idoc);
processed++;
}
}
} catch (IOException ex) {
Log.logException(ex);
} finally {
Expand All @@ -152,11 +157,15 @@ public boolean job() {
}


/**
 * Stops this reindex job.
 * <p>
 * The list of pending select queries is emptied first so that no further
 * query is picked up, then termination is delegated to the superclass.
 *
 * @param waitFor passed through unchanged to {@code super.terminate}
 */
@Override
public void terminate(final boolean waitFor) {
    // drop all queries that are still queued
    querylist.clear();

    // hand over to the superclass for the actual shutdown
    super.terminate(waitFor);
}
/**
 * Stops this reindex job.
 * <p>
 * The list of pending select queries is emptied first. If documents were
 * found for reindexing and at least one of them has already been processed,
 * a commit is issued on {@code esc} so that the work done so far is
 * reflected even though the job did not run to completion. Finally,
 * termination is delegated to the superclass.
 *
 * @param waitFor passed through unchanged to {@code super.terminate}
 */
@Override
public void terminate(final boolean waitFor) {
    // drop all queries that are still queued
    querylist.clear();

    // the job was interrupted before finishing: commit so that the
    // changes made up to now are reflected
    final boolean partiallyDone = docstoreindex > 0 && processed > 0;
    if (partiallyDone) {
        esc.commit(true);
    }

    super.terminate(waitFor);
}

/**
* @return total number of processed documents
Expand All @@ -168,7 +177,7 @@ public int getProcessed() {
/**
* @return the currently processed Solr select query
*/
public String getCurrentQuery() {
public String getCurrentQuery() {
return querylist.isEmpty() ? "" : querylist.get(0);
}

Expand All @@ -186,6 +195,8 @@ public void freemem() {
if (chunksize > 2) {
this.chunksize = this.chunksize / 2;
}
esc.commit(true);
start = 0;
}

}
Expand Down

0 comments on commit 7f63d37

Please sign in to comment.