Permalink
Browse files

Extended Mediawiki dump import to remote URLs.

When a public HTTP URL is given in /IndexImportMediawiki_p.html, the remote
file is now streamed and processed directly, allowing the import of dumps
several GB in size even on a remote peer with little memory, and without
needing to manually download the dump file first.
  • Loading branch information...
luccioman committed Apr 14, 2017
1 parent e5c3b16 commit f66438442ecbb1d1ec29839780c4e19ecc36f923
@@ -1,3 +1,3 @@
#!/usr/bin/env sh
cd "`dirname $0`"
./apicall.sh /IndexImportMediawiki_p.html?file=$1 > /dev/null
./protectedPostApiCall.sh "IndexImportMediawiki_p.html" "file=$1"
@@ -13,29 +13,33 @@
<h2>MediaWiki Dump Import</h2>
#(import)#
<p>#(prevStatus)#
::<div class="alert alert-danger" role="alert">Error on last import : #[message]#</div>
#(/prevStatus)#</p>
<p>#(status)#<div class="alert alert-info" role="alert">No import thread is running, you can start a new thread here</div>
::<div class="alert alert-danger" role="alert">Error : file argument must be a path to a document in the local file system</div>
::<div class="alert alert-danger" role="alert">Error : dump <abbr title="Uniform Resource Locator">URL</abbr> is malformed.</div>
::<div class="alert alert-danger" role="alert">Error : file not found "#[sourceFile]#"</div>
::<div class="alert alert-danger" role="alert">Error : can not read file "#[sourceFile]#"</div>
::<div class="alert alert-danger" role="alert">Error : you selected a directory ("#[sourceFile]#")</div>
#(/status)#</p>
<form action="IndexImportMediawiki_p.html" method="post" accept-charset="UTF-8" class="form-horizontal">
<input type="hidden" name="transactionToken" value="#[transactionToken]#"/>
<fieldset>
<legend>MediaWiki Dump File Selection: select an XML file (which may be bz2- or gz-encoded)</legend>
<legend>MediaWiki Dump File Selection</legend>
<p>
You can import MediaWiki dumps here. An example is the file
<a href="http://dumps.wikimedia.org/dewiki/latest/dewiki-latest-pages-articles.xml.bz2">
http://dumps.wikimedia.org/dewiki/latest/dewiki-latest-pages-articles.xml.bz2</a>.
<a href="https://dumps.wikimedia.org/dewiki/latest/dewiki-latest-pages-articles.xml.bz2">
https://dumps.wikimedia.org/dewiki/latest/dewiki-latest-pages-articles.xml.bz2</a>.
</p>
<p>
Dumps must be stored in the local file system in XML format and may be compressed in gz or bz2.
Dumps can be stored in the local file system or on a remote server in XML format and may be compressed in gz or bz2.
</p>
<div class="form-group">
<div class="col-sm-3 col-md-2 col-lg-2">
<label for="file" class="control-label" >Dump file path</label>
<label for="file" class="control-label" >Dump file path or <abbr title="Uniform Resource Locator">URL</abbr></label>
</div>
<div class="col-sm-9 col-md-8 col-lg-8">
<input id="file" class="form-control" name="file" type="text" title="Dump file path on this YaCy server file system" required="required"/>
<input id="file" class="form-control" name="file" type="text" title="Dump file path on this YaCy server file system, or any remote URL" required="required"/>
</div>
</div>
<input name="submit" class="btn btn-primary" type="submit" value="Import MediaWiki Dump" />
@@ -69,6 +73,8 @@ <h2>MediaWiki Dump Import</h2>
</ul>
<br />
::
<p>#(status)#::<div class="alert alert-danger" role="alert">Error encountered : #[message]#</div>
#(/status)#</p>
<form><fieldset><legend>Import Process</legend>
<dl>
<dt>Thread:</dt><dd>#[thread]#</dd>
@@ -23,8 +23,11 @@
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import java.io.File;
import java.net.MalformedURLException;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.data.TransactionManager;
import net.yacy.document.importer.MediawikiImporter;
import net.yacy.search.Switchboard;
import net.yacy.server.serverObjects;
@@ -54,6 +57,11 @@ public static serverObjects respond(final RequestHeader header, final serverObje
if (MediawikiImporter.job != null && MediawikiImporter.job.isAlive()) {
// one import is running, no option to insert anything
prop.put("import", 1);
final String jobErrorMessage = MediawikiImporter.job.status();
if( jobErrorMessage != null && !jobErrorMessage.isEmpty()) {
prop.put("import_status", 1);
prop.put("import_status_message", jobErrorMessage);
}
prop.put("import_thread", "running");
prop.put("import_dump", MediawikiImporter.job.source());
prop.put("import_count", MediawikiImporter.job.count());
@@ -64,33 +72,63 @@ public static serverObjects respond(final RequestHeader header, final serverObje
prop.put("import_remainingMinutes", (MediawikiImporter.job.remainingTime() / 60) % 60);
} else {
prop.put("import", 0);
if(MediawikiImporter.job != null) {
/* Report any failure message from the last terminated import (for example an HTTP 404 status)
* that could otherwise be missed by the user because of a page refresh */
final String jobErrorMessage = MediawikiImporter.job.status();
if( jobErrorMessage != null && !jobErrorMessage.isEmpty()) {
prop.put("import_prevStatus", 1);
prop.put("import_prevStatus_message", jobErrorMessage);
}
}
if (post == null) {
prop.put("import_status", 0);
/* Acquire a transaction token for the next POST form submission */
final String token = TransactionManager.getTransactionToken(header);
prop.put(TransactionManager.TRANSACTION_TOKEN_PARAM, token);
prop.put("import_" + TransactionManager.TRANSACTION_TOKEN_PARAM, token);
} else {
if (post.containsKey("file")) {
/* Check the transaction is valid */
TransactionManager.checkPostTransaction(header, post);
String file = post.get("file");
if (file.startsWith("file://")) file = file.substring(7);
if (file.startsWith("http")) {
prop.put("import_status", 1);
} else {
final File sourcefile = new File(file);
if (!sourcefile.exists()) {
prop.put("import_status", 2);
prop.put("import_status_sourceFile", sourcefile.getAbsolutePath());
} else if(!sourcefile.canRead()) {
prop.put("import_status", 3);
prop.put("import_status_sourceFile", sourcefile.getAbsolutePath());
} else if(sourcefile.isDirectory()) {
prop.put("import_status", 4);
prop.put("import_status_sourceFile", sourcefile.getAbsolutePath());
} else {
MediawikiImporter.job = new MediawikiImporter(sourcefile, sb.surrogatesInPath);
MediawikiImporter.job.start();
prop.put("import_dump", MediawikiImporter.job.source());
prop.put("import_thread", "started");
prop.put("import", 1);
}
}
MultiProtocolURL sourceURL = null;
int status = 0;
String sourceFilePath = "";
try {
sourceURL = new MultiProtocolURL(file);
if(sourceURL.isFile()) {
final File sourcefile = sourceURL.getFSFile();
sourceFilePath = sourcefile.getAbsolutePath();
if (!sourcefile.exists()) {
status = 2;
} else if (!sourcefile.canRead()) {
status = 3;
} else if (sourcefile.isDirectory()) {
status = 4;
}
}
} catch (MalformedURLException e) {
status = 1;
}
if (status == 0) {
MediawikiImporter.job = new MediawikiImporter(sourceURL, sb.surrogatesInPath);
MediawikiImporter.job.start();
prop.put("import_dump", MediawikiImporter.job.source());
prop.put("import_thread", "started");
prop.put("import", 1);
} else {
prop.put("import_status", status);
prop.put("import_status_sourceFile", sourceFilePath);
/* Acquire a transaction token for the next POST form submission */
final String token = TransactionManager.getTransactionToken(header);
prop.put(TransactionManager.TRANSACTION_TOKEN_PARAM, token);
prop.put("import_" + TransactionManager.TRANSACTION_TOKEN_PARAM, token);
}
prop.put("import_count", 0);
prop.put("import_speed", 0);
prop.put("import_runningHours", 0);
@@ -48,6 +48,8 @@
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.http.HttpStatus;
import jcifs.smb.SmbException;
import jcifs.smb.SmbFile;
import jcifs.smb.SmbFileInputStream;
@@ -62,6 +64,7 @@
import net.yacy.cora.protocol.http.HTTPClient;
import net.yacy.cora.util.CommonPattern;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.HTTPInputStream;
import net.yacy.crawler.retrieval.Response;
/**
@@ -2290,7 +2293,14 @@ public String getName() throws IOException {
return null;
}
public InputStream getInputStream(final ClientIdentification.Agent agent, final String username, final String pass) throws IOException {
/**
* Open an input stream on the resource described by this URL.
* <strong>Please don't forget to release resources by closing the returned stream.</strong>
* @param agent user agent identifier to use when the protocol is HTTP
* @return an open input stream
* @throws IOException when the stream can not be opened
*/
public InputStream getInputStream(final ClientIdentification.Agent agent) throws IOException {
if (isFile()) return new BufferedInputStream(new FileInputStream(getFSFile()));
if (isSMB()) return new BufferedInputStream(new SmbFileInputStream(getSmbFile()));
if (isFTP()) {
@@ -2303,7 +2313,12 @@ public InputStream getInputStream(final ClientIdentification.Agent agent, final
if (isHTTP() || isHTTPS()) {
final HTTPClient client = new HTTPClient(agent);
client.setHost(getHost());
return new ByteArrayInputStream(client.GETbytes(this, username, pass, false));
client.GET(this, false);
if (client.getStatusCode() != HttpStatus.SC_OK) {
throw new IOException("Unable to open http stream on " + this.toString() +
"\nServer returned status: " + client.getHttpResponse().getStatusLine());
}
return new HTTPInputStream(client);
}
return null;
@@ -138,7 +138,7 @@ public Response load(final Request request, boolean acceptOnlyParseable) throws
}
// load the resource
InputStream is = url.getInputStream(ClientIdentification.yacyInternetCrawlerAgent, null, null);
InputStream is = url.getInputStream(ClientIdentification.yacyInternetCrawlerAgent);
byte[] b = FileUtils.read(is);
// create response with loaded content
@@ -156,7 +156,7 @@ public Response load(final Request request, boolean acceptOnlyParseable) throws
}
// load the resource
InputStream is = url.getInputStream(ClientIdentification.yacyInternetCrawlerAgent, null, null);
InputStream is = url.getInputStream(ClientIdentification.yacyInternetCrawlerAgent);
byte[] b = FileUtils.read(is);
// create response with loaded content
Oops, something went wrong.

0 comments on commit f664384

Please sign in to comment.