Skip to content

Commit

Permalink
added export option to export the fulltext of the search index text only
Browse files Browse the repository at this point in the history
  • Loading branch information
Orbiter committed Jul 30, 2015
1 parent fbeae20 commit de8cfbe
Show file tree
Hide file tree
Showing 4 changed files with 58 additions and 52 deletions.
8 changes: 5 additions & 3 deletions htroot/IndexExport_p.html
Expand Up @@ -39,19 +39,21 @@ <h2>Index Export</h2>
<dt>Only Domain:</dt>
<dd><input type="radio" name="format" value="dom-text" /> Plain Text List (domains only)<br />
<input type="radio" name="format" value="dom-html" /> HTML (domains as URLs, no title)</dd>
<dt>Only Text:</dt>
<dd><input type="radio" name="format" value="text-text" /> Fulltext of Search Index Text</dd>
</dl>
</dd>
<dt>&nbsp;</dt>
<dd><input type="submit" name="lurlexport" value="Export URLs" class="btn btn-primary" style="width:240px;"/>
<dd><input type="submit" name="lurlexport" value="Export" class="btn btn-primary" style="width:240px;"/>
</dd>
</dl>
</fieldset>
</form>::
<div class="alert alert-info" style="text-decoration:blink">Export to file #[exportfile]# is running .. #[urlcount]# URLs so far</div>::
<div class="alert alert-info" style="text-decoration:blink">Export to file #[exportfile]# is running .. #[urlcount]# Documents so far</div>::
#(/lurlexport)#

#(lurlexportfinished)#::
<div class="alert alert-success">Finished export of #[urlcount]# URLs to file <a href="file://#[exportfile]#" target="_">#[exportfile]#</a><br/>
<div class="alert alert-success">Finished export of #[urlcount]# Documents to file <a href="file://#[exportfile]#" target="_">#[exportfile]#</a><br/>
<em>Import this file by moving it to DATA/SURROGATES/in</em></div>::
#(/lurlexportfinished)#

Expand Down
21 changes: 11 additions & 10 deletions htroot/IndexExport_p.java
Expand Up @@ -93,27 +93,28 @@ public static serverObjects respond(@SuppressWarnings("unused") final RequestHea

if (post.containsKey("lurlexport")) {
// parse format
int format = 0;
Fulltext.ExportFormat format = Fulltext.ExportFormat.text;
final String fname = post.get("format", "url-text");
final boolean dom = fname.startsWith("dom"); // if dom== false complete urls are exported, otherwise only the domain
if (fname.endsWith("text")) format = 0;
if (fname.endsWith("html")) format = 1;
if (fname.endsWith("rss")) format = 2;
if (fname.endsWith("solr")) format = 3;
final boolean text = fname.startsWith("text");
if (fname.endsWith("text")) format = Fulltext.ExportFormat.text;
if (fname.endsWith("html")) format = Fulltext.ExportFormat.html;
if (fname.endsWith("rss")) format = Fulltext.ExportFormat.rss;
if (fname.endsWith("solr")) format = Fulltext.ExportFormat.solr;

// extend export file name
String s = post.get("exportfile", "");
if (s.indexOf('.',0) < 0) {
if (format == 0) s = s + ".txt";
if (format == 1) s = s + ".html";
if (format == 2 ) s = s + "_rss.xml";
if (format == 3) s = s + "_full.xml";
if (format == Fulltext.ExportFormat.text) s = s + ".txt";
if (format == Fulltext.ExportFormat.html) s = s + ".html";
if (format == Fulltext.ExportFormat.rss ) s = s + "_rss.xml";
if (format == Fulltext.ExportFormat.solr) s = s + "_full.xml";
}
final File f = new File(s);
f.getParentFile().mkdirs();
final String filter = post.get("exportfilter", ".*");
final String query = post.get("exportquery", "*:*");
final Fulltext.Export running = segment.fulltext().export(f, filter, query, format, dom);
final Fulltext.Export running = segment.fulltext().export(f, filter, query, format, dom, text);

prop.put("lurlexport_exportfile", s);
prop.put("lurlexport_urlcount", running.count());
Expand Down
12 changes: 5 additions & 7 deletions source/net/yacy/cora/document/id/MultiProtocolURL.java
Expand Up @@ -843,14 +843,12 @@ public static String getFileExtension(final String fileName) {
final int q = fileName.lastIndexOf('?');
if (q < 0) {
return fileName.substring(p + 1).toLowerCase();
} else {
// check last dot in query part
if (p > q) {
return ""; // TODO: last . after ? (file.ext?param=one.txt)
} else {
return fileName.substring(p + 1, q).toLowerCase();
}
}
// check last dot in query part
if (p > q) {
return ""; // TODO: last . after ? (file.ext?param=one.txt)
}
return fileName.substring(p + 1, q).toLowerCase();
}

public String getPath() {
Expand Down
69 changes: 37 additions & 32 deletions source/net/yacy/search/index/Fulltext.java
Expand Up @@ -618,13 +618,17 @@ public void rebootSolr() {
}
}

/**
 * Output formats supported by the index export.
 * <p>
 * The constant names are lower-case on purpose: callers compare against
 * them directly (e.g. {@code ExportFormat.text}), so they must not be renamed.
 */
public enum ExportFormat {
    /** Plain text output, one entry per line. */
    text,
    /** HTML output wrapped in a minimal {@code <html><body>} document. */
    html,
    /** RSS 2.0 XML output with one {@code <item>} per exported document. */
    rss,
    /** Solr XML response dump of the full documents; written gzip-compressed. */
    solr
}

// export methods
public Export export(final File f, final String filter, final String query, final int format, final boolean dom) {
public Export export(final File f, final String filter, final String query, final ExportFormat format, final boolean dom, final boolean text) {
if ((this.exportthread != null) && (this.exportthread.isAlive())) {
ConcurrentLog.warn("LURL-EXPORT", "cannot start another export thread, already one running");
return this.exportthread;
}
this.exportthread = new Export(f, filter, query, format, dom);
this.exportthread = new Export(f, filter, query, format, dom, text);
this.exportthread.start();
return this.exportthread;
}
Expand All @@ -638,10 +642,10 @@ public class Export extends Thread {
private final Pattern pattern;
private int count;
private String failure, query;
private final int format;
private final boolean dom;
private final ExportFormat format;
private final boolean dom, text;

private Export(final File f, final String filter, final String query, final int format, boolean dom) {
private Export(final File f, final String filter, final String query, final ExportFormat format, final boolean dom, final boolean text) {
// format: one of ExportFormat.text, ExportFormat.html, ExportFormat.rss, ExportFormat.solr
this.f = f;
this.pattern = filter == null ? null : Pattern.compile(filter);
Expand All @@ -650,6 +654,7 @@ private Export(final File f, final String filter, final String query, final int
this.failure = null;
this.format = format;
this.dom = dom;
this.text = text;
//if ((dom) && (format == 2)) dom = false;
}

Expand All @@ -658,13 +663,13 @@ public void run() {
try {
final File parentf = this.f.getParentFile();
if (parentf != null) parentf.mkdirs();
OutputStream os = new FileOutputStream(this.format == 3 ? new File(this.f.getAbsolutePath() + ".gz") : this.f);
if (this.format == 3) os = new GZIPOutputStream(os, 65536){{def.setLevel(Deflater.BEST_COMPRESSION);}};
OutputStream os = new FileOutputStream(this.format == ExportFormat.solr ? new File(this.f.getAbsolutePath() + ".gz") : this.f);
if (this.format == ExportFormat.solr) os = new GZIPOutputStream(os, 65536){{def.setLevel(Deflater.BEST_COMPRESSION);}};
final PrintWriter pw = new PrintWriter(new BufferedOutputStream(os));
if (this.format == 1) {
if (this.format == ExportFormat.html) {
pw.println("<html><head></head><body>");
}
if (this.format == 2) {
if (this.format == ExportFormat.rss) {
pw.println("<?xml version=\"1.0\" encoding=\"UTF-8\"?>");
pw.println("<?xml-stylesheet type='text/xsl' href='/yacysearch.xsl' version='1.0'?>");
pw.println("<rss version=\"2.0\" xmlns:yacy=\"http://www.yacy.net/\" xmlns:opensearch=\"http://a9.com/-/spec/opensearch/1.1/\" xmlns:atom=\"http://www.w3.org/2005/Atom\">");
Expand All @@ -673,7 +678,7 @@ public void run() {
pw.println("<description></description>");
pw.println("<link>http://yacy.net</link>");
}
if (this.format == 3) {
if (this.format == ExportFormat.solr) {
pw.println("<?xml version=\"1.0\" encoding=\"UTF-8\"?>");
pw.println("<response>");
pw.println("<result>");
Expand All @@ -683,12 +688,25 @@ public void run() {
ReversibleScoreMap<String> stats = scores.get(CollectionSchema.host_s.getSolrFieldName());
for (final String host: stats) {
if (this.pattern != null && !this.pattern.matcher(host).matches()) continue;
if (this.format == 0) pw.println(host);
if (this.format == 1) pw.println("<a href=\"http://" + host + "\">" + host + "</a><br>");
if (this.format == ExportFormat.text) pw.println(host);
if (this.format == ExportFormat.html) pw.println("<a href=\"http://" + host + "\">" + host + "</a><br>");
this.count++;
}
} else {
if (this.format < 3) {
if (this.format == ExportFormat.solr || (this.text && this.format == ExportFormat.text)) {
BlockingQueue<SolrDocument> docs = Fulltext.this.getDefaultConnector().concurrentDocumentsByQuery(this.query + " AND " + CollectionSchema.httpstatus_i.getSolrFieldName() + ":200", null, 0, 100000000, Long.MAX_VALUE, 100, 1, true);
SolrDocument doc;
while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
String url = getStringFrom(doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()));
if (this.pattern != null && !this.pattern.matcher(url).matches()) continue;
CRIgnoreWriter sw = new CRIgnoreWriter();
if (this.text) sw.write((String) doc.getFieldValue(CollectionSchema.text_t.getSolrFieldName())); else EnhancedXMLResponseWriter.writeDoc(sw, doc);
sw.close();
String d = sw.toString();
pw.println(d);
this.count++;
}
} else {
BlockingQueue<SolrDocument> docs = Fulltext.this.getDefaultConnector().concurrentDocumentsByQuery(this.query + " AND " + CollectionSchema.httpstatus_i.getSolrFieldName() + ":200", null, 0, 100000000, Long.MAX_VALUE, 100, 1, true,
CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName(), CollectionSchema.title.getSolrFieldName(),
CollectionSchema.author.getSolrFieldName(), CollectionSchema.description_txt.getSolrFieldName(), CollectionSchema.size_i.getSolrFieldName(), CollectionSchema.last_modified.getSolrFieldName());
Expand All @@ -705,13 +723,13 @@ public void run() {
size = (Integer) doc.getFieldValue(CollectionSchema.size_i.getSolrFieldName());
date = (Date) doc.getFieldValue(CollectionSchema.last_modified.getSolrFieldName());
if (this.pattern != null && !this.pattern.matcher(url).matches()) continue;
if (this.format == 0) {
if (this.format == ExportFormat.text) {
pw.println(url);
}
if (this.format == 1) {
if (this.format == ExportFormat.html) {
if (title != null) pw.println("<a href=\"" + MultiProtocolURL.escape(url) + "\">" + CharacterCoding.unicode2xml(title, true) + "</a>");
}
if (this.format == 2) {
if (this.format == ExportFormat.rss) {
pw.println("<item>");
if (title != null) pw.println("<title>" + CharacterCoding.unicode2xml(title, true) + "</title>");
pw.println("<link>" + MultiProtocolURL.escape(url) + "</link>");
Expand All @@ -724,29 +742,16 @@ public void run() {
}
this.count++;
}
} else {
BlockingQueue<SolrDocument> docs = Fulltext.this.getDefaultConnector().concurrentDocumentsByQuery(this.query + " AND " + CollectionSchema.httpstatus_i.getSolrFieldName() + ":200", null, 0, 100000000, Long.MAX_VALUE, 100, 1, true);
SolrDocument doc;
while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
String url = getStringFrom(doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()));
if (this.pattern != null && !this.pattern.matcher(url).matches()) continue;
CRIgnoreWriter sw = new CRIgnoreWriter();
EnhancedXMLResponseWriter.writeDoc(sw, doc);
sw.close();
String d = sw.toString();
pw.println(d);
this.count++;
}
}
}
if (this.format == 1) {
if (this.format == ExportFormat.html) {
pw.println("</body></html>");
}
if (this.format == 2) {
if (this.format == ExportFormat.rss) {
pw.println("</channel>");
pw.println("</rss>");
}
if (this.format == 3) {
if (this.format == ExportFormat.solr) {
pw.println("</result>");
pw.println("</response>");
}
Expand Down

0 comments on commit de8cfbe

Please sign in to comment.