From de8cfbe1d7fc894e0ae16704e15b5b22785281c6 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Thu, 30 Jul 2015 03:21:40 +0200 Subject: [PATCH] added export option to export the fulltext of the search index text only --- htroot/IndexExport_p.html | 8 ++- htroot/IndexExport_p.java | 21 +++--- .../cora/document/id/MultiProtocolURL.java | 12 ++-- source/net/yacy/search/index/Fulltext.java | 69 ++++++++++--------- 4 files changed, 58 insertions(+), 52 deletions(-) diff --git a/htroot/IndexExport_p.html b/htroot/IndexExport_p.html index aa03afedfc..5271900498 100644 --- a/htroot/IndexExport_p.html +++ b/htroot/IndexExport_p.html @@ -39,19 +39,21 @@

Index Export

Only Domain:
Plain Text List (domains only)
HTML (domains as URLs, no title)
+
Only Text:
+
Fulltext of Search Index Text
 
-
+
:: -
Export to file #[exportfile]# is running .. #[urlcount]# URLs so far
:: +
Export to file #[exportfile]# is running .. #[urlcount]# Documents so far
:: #(/lurlexport)# #(lurlexportfinished)#:: -
Finished export of #[urlcount]# URLs to file #[exportfile]#
+
Finished export of #[urlcount]# Documents to file #[exportfile]#
Import this file by moving it to DATA/SURROGATES/in
:: #(/lurlexportfinished)# diff --git a/htroot/IndexExport_p.java b/htroot/IndexExport_p.java index 7a7a5e3bc1..4dc8cfb93b 100644 --- a/htroot/IndexExport_p.java +++ b/htroot/IndexExport_p.java @@ -93,27 +93,28 @@ public static serverObjects respond(@SuppressWarnings("unused") final RequestHea if (post.containsKey("lurlexport")) { // parse format - int format = 0; + Fulltext.ExportFormat format = Fulltext.ExportFormat.text; final String fname = post.get("format", "url-text"); final boolean dom = fname.startsWith("dom"); // if dom== false complete urls are exported, otherwise only the domain - if (fname.endsWith("text")) format = 0; - if (fname.endsWith("html")) format = 1; - if (fname.endsWith("rss")) format = 2; - if (fname.endsWith("solr")) format = 3; + final boolean text = fname.startsWith("text"); + if (fname.endsWith("text")) format = Fulltext.ExportFormat.text; + if (fname.endsWith("html")) format = Fulltext.ExportFormat.html; + if (fname.endsWith("rss")) format = Fulltext.ExportFormat.rss; + if (fname.endsWith("solr")) format = Fulltext.ExportFormat.solr; // extend export file name String s = post.get("exportfile", ""); if (s.indexOf('.',0) < 0) { - if (format == 0) s = s + ".txt"; - if (format == 1) s = s + ".html"; - if (format == 2 ) s = s + "_rss.xml"; - if (format == 3) s = s + "_full.xml"; + if (format == Fulltext.ExportFormat.text) s = s + ".txt"; + if (format == Fulltext.ExportFormat.html) s = s + ".html"; + if (format == Fulltext.ExportFormat.rss ) s = s + "_rss.xml"; + if (format == Fulltext.ExportFormat.solr) s = s + "_full.xml"; } final File f = new File(s); f.getParentFile().mkdirs(); final String filter = post.get("exportfilter", ".*"); final String query = post.get("exportquery", "*:*"); - final Fulltext.Export running = segment.fulltext().export(f, filter, query, format, dom); + final Fulltext.Export running = segment.fulltext().export(f, filter, query, format, dom, text); prop.put("lurlexport_exportfile", s); prop.put("lurlexport_urlcount", running.count()); diff --git a/source/net/yacy/cora/document/id/MultiProtocolURL.java b/source/net/yacy/cora/document/id/MultiProtocolURL.java index c8af66661b..6df62a4165 100644 --- a/source/net/yacy/cora/document/id/MultiProtocolURL.java +++ b/source/net/yacy/cora/document/id/MultiProtocolURL.java @@ -843,14 +843,12 @@ public static String getFileExtension(final String fileName) { final int q = fileName.lastIndexOf('?'); if (q < 0) { return fileName.substring(p + 1).toLowerCase(); - } else { - // check last dot in query part - if (p > q) { - return ""; // TODO: last . after ? (file.ext?param=one.txt) - } else { - return fileName.substring(p + 1, q).toLowerCase(); - } } + // check last dot in query part + if (p > q) { + return ""; // TODO: last . after ? (file.ext?param=one.txt) + } + return fileName.substring(p + 1, q).toLowerCase(); } public String getPath() { diff --git a/source/net/yacy/search/index/Fulltext.java b/source/net/yacy/search/index/Fulltext.java index 6428a58bf0..1b460d0a1a 100644 --- a/source/net/yacy/search/index/Fulltext.java +++ b/source/net/yacy/search/index/Fulltext.java @@ -618,13 +618,17 @@ public void rebootSolr() { } } + public static enum ExportFormat { + text, html, rss, solr; + } + // export methods - public Export export(final File f, final String filter, final String query, final int format, final boolean dom) { + public Export export(final File f, final String filter, final String query, final ExportFormat format, final boolean dom, final boolean text) { if ((this.exportthread != null) && (this.exportthread.isAlive())) { ConcurrentLog.warn("LURL-EXPORT", "cannot start another export thread, already one running"); return this.exportthread; } - this.exportthread = new Export(f, filter, query, format, dom); + this.exportthread = new Export(f, filter, query, format, dom, text); this.exportthread.start(); return this.exportthread; } @@ -638,10 +642,10 @@ public class Export extends Thread { private final Pattern pattern; private int count; private String failure, query; - private final int format; - private final boolean dom; + private final ExportFormat format; + private final boolean dom, text; - private Export(final File f, final String filter, final String query, final int format, boolean dom) { + private Export(final File f, final String filter, final String query, final ExportFormat format, final boolean dom, final boolean text) { // format: 0=text, 1=html, 2=rss/xml this.f = f; this.pattern = filter == null ? null : Pattern.compile(filter); @@ -650,6 +654,7 @@ private Export(final File f, final String filter, final String query, final int this.failure = null; this.format = format; this.dom = dom; + this.text = text; //if ((dom) && (format == 2)) dom = false; } @@ -658,13 +663,13 @@ public void run() { try { final File parentf = this.f.getParentFile(); if (parentf != null) parentf.mkdirs(); - OutputStream os = new FileOutputStream(this.format == 3 ? new File(this.f.getAbsolutePath() + ".gz") : this.f); - if (this.format == 3) os = new GZIPOutputStream(os, 65536){{def.setLevel(Deflater.BEST_COMPRESSION);}}; + OutputStream os = new FileOutputStream(this.format == ExportFormat.solr ? new File(this.f.getAbsolutePath() + ".gz") : this.f); + if (this.format == ExportFormat.solr) os = new GZIPOutputStream(os, 65536){{def.setLevel(Deflater.BEST_COMPRESSION);}}; final PrintWriter pw = new PrintWriter(new BufferedOutputStream(os)); - if (this.format == 1) { + if (this.format == ExportFormat.html) { pw.println(""); } - if (this.format == 2) { + if (this.format == ExportFormat.rss) { pw.println(""); pw.println(""); pw.println(""); @@ -673,7 +678,7 @@ public void run() { pw.println(""); pw.println("http://yacy.net"); } - if (this.format == 3) { + if (this.format == ExportFormat.solr) { pw.println(""); pw.println(""); pw.println(""); @@ -683,12 +688,25 @@ public void run() { ReversibleScoreMap stats = scores.get(CollectionSchema.host_s.getSolrFieldName()); for (final String host: stats) { if (this.pattern != null && !this.pattern.matcher(host).matches()) continue; - if (this.format == 0) pw.println(host); - if (this.format == 1) pw.println("" + host + "
"); + if (this.format == ExportFormat.text) pw.println(host); + if (this.format == ExportFormat.html) pw.println("" + host + "
"); this.count++; } } else { - if (this.format < 3) { + if (this.format == ExportFormat.solr || (this.text && this.format == ExportFormat.text)) { + BlockingQueue docs = Fulltext.this.getDefaultConnector().concurrentDocumentsByQuery(this.query + " AND " + CollectionSchema.httpstatus_i.getSolrFieldName() + ":200", null, 0, 100000000, Long.MAX_VALUE, 100, 1, true); + SolrDocument doc; + while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) { + String url = getStringFrom(doc.getFieldValue(CollectionSchema.sku.getSolrFieldName())); + if (this.pattern != null && !this.pattern.matcher(url).matches()) continue; + CRIgnoreWriter sw = new CRIgnoreWriter(); + if (this.text) sw.write((String) doc.getFieldValue(CollectionSchema.text_t.getSolrFieldName())); else EnhancedXMLResponseWriter.writeDoc(sw, doc); + sw.close(); + String d = sw.toString(); + pw.println(d); + this.count++; + } + } else { BlockingQueue docs = Fulltext.this.getDefaultConnector().concurrentDocumentsByQuery(this.query + " AND " + CollectionSchema.httpstatus_i.getSolrFieldName() + ":200", null, 0, 100000000, Long.MAX_VALUE, 100, 1, true, CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName(), CollectionSchema.title.getSolrFieldName(), CollectionSchema.author.getSolrFieldName(), CollectionSchema.description_txt.getSolrFieldName(), CollectionSchema.size_i.getSolrFieldName(), CollectionSchema.last_modified.getSolrFieldName()); @@ -705,13 +723,13 @@ public void run() { size = (Integer) doc.getFieldValue(CollectionSchema.size_i.getSolrFieldName()); date = (Date) doc.getFieldValue(CollectionSchema.last_modified.getSolrFieldName()); if (this.pattern != null && !this.pattern.matcher(url).matches()) continue; - if (this.format == 0) { + if (this.format == ExportFormat.text) { pw.println(url); } - if (this.format == 1) { + if (this.format == ExportFormat.html) { if (title != null) pw.println("" + CharacterCoding.unicode2xml(title, true) + ""); } - if (this.format == 2) { + if (this.format == ExportFormat.rss) { pw.println(""); if (title != null) pw.println("" + CharacterCoding.unicode2xml(title, true) + ""); pw.println("" + MultiProtocolURL.escape(url) + ""); @@ -724,29 +742,16 @@ public void run() { } this.count++; } - } else { - BlockingQueue docs = Fulltext.this.getDefaultConnector().concurrentDocumentsByQuery(this.query + " AND " + CollectionSchema.httpstatus_i.getSolrFieldName() + ":200", null, 0, 100000000, Long.MAX_VALUE, 100, 1, true); - SolrDocument doc; - while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) { - String url = getStringFrom(doc.getFieldValue(CollectionSchema.sku.getSolrFieldName())); - if (this.pattern != null && !this.pattern.matcher(url).matches()) continue; - CRIgnoreWriter sw = new CRIgnoreWriter(); - EnhancedXMLResponseWriter.writeDoc(sw, doc); - sw.close(); - String d = sw.toString(); - pw.println(d); - this.count++; - } } } - if (this.format == 1) { + if (this.format == ExportFormat.html) { pw.println(""); } - if (this.format == 2) { + if (this.format == ExportFormat.rss) { pw.println(""); pw.println("
"); } - if (this.format == 3) { + if (this.format == ExportFormat.solr) { pw.println(""); pw.println(""); }