Permalink
Browse files

Added MediaWiki dump import scheduling feature.

Checking the last modified date by default to prevent unnecessary long
running operations.
  • Loading branch information...
luccioman committed May 3, 2017
1 parent 10c03c6 commit a87281b4988bcf20934b5a6d95a66b297f00fbff
Showing with 167 additions and 13 deletions.
  1. +17 −0 htroot/IndexImportMediawiki_p.html
  2. +125 −0 htroot/IndexImportMediawiki_p.java
  3. +25 −13 source/net/yacy/data/WorkTables.java
@@ -19,6 +19,7 @@ <h2>MediaWiki Dump Import</h2>
::<div class="alert alert-danger" role="alert">Error : file not found "#[sourceFile]#"</div>
::<div class="alert alert-danger" role="alert">Error : can not read file "#[sourceFile]#"</div>
::<div class="alert alert-danger" role="alert">Error : you selected a directory ("#[sourceFile]#")</div>
::<div class="alert alert-danger" role="alert">Error : dump file ("#[sourceFile]#") was not modified since last import (#[lastImportDate]#).</div>
#(/status)#</p>
<form action="IndexImportMediawiki_p.html" method="post" accept-charset="UTF-8" class="form-horizontal">
<input type="hidden" name="transactionToken" value="#[transactionToken]#"/>
@@ -40,6 +41,22 @@ <h2>MediaWiki Dump Import</h2>
<input id="file" class="form-control" name="file" type="text" title="Dump file path on this YaCy server file system, or any remote URL" required="required"/>
</div>
</div>
<div class="form-group">
<div class="col-sm-3">
<div class="checkbox">
<label>
<input name="iffresh" id="iffresh"
type="checkbox" checked="checked"
aria-describedby="iffreshInfo"/>
Import only when modified since last import
</label>
</div>
</div>
<div class="col-sm-5" id="iffreshInfo">
When checked, the dump file is imported only if its last modified date is unknown or is after the last import execution date on this same file
(see <a href="Table_API_p.html?filter=mediawikidump">recorded API calls</a> with the "dump" type).
</div>
</div>
<input name="submit" class="btn btn-primary" type="submit" value="Import MediaWiki Dump" />
</fieldset>
</form>
@@ -23,12 +23,28 @@
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.Date;
import java.util.Iterator;
import org.apache.http.Header;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.http.HTTPClient;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.data.TransactionManager;
import net.yacy.data.WorkTables;
import net.yacy.document.importer.MediawikiImporter;
import net.yacy.kelondro.blob.Tables.Row;
import net.yacy.search.Switchboard;
import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;
@@ -46,6 +62,7 @@
* @param post request parameters. Supported keys :
* <ul>
* <li>file : a dump URL or file path on this YaCy server local file system</li>
* <li>iffresh : when set to true, the dump file is imported only if its last modified date is unknown or after the last import trial date on this same file. </li>
* <li>report : when set, display the currently running thread monitoring info, or the last import report when no one is running.
* Ignored when no import thread is known.</li>
* </ul>
@@ -95,6 +112,11 @@ public static serverObjects respond(final RequestHeader header, final serverObje
MultiProtocolURL sourceURL = null;
int status = 0;
String sourceFilePath = "";
final Row lastExecutedCall = selectLastExecutedCall(post, sb);
Date lastExecutionDate = null;
if (lastExecutedCall != null) {
lastExecutionDate = lastExecutedCall.get(WorkTables.TABLE_API_COL_DATE_LAST_EXEC, (Date) null);
}
try {
sourceURL = new MultiProtocolURL(file);
if(sourceURL.isFile()) {
@@ -108,10 +130,41 @@ public static serverObjects respond(final RequestHeader header, final serverObje
status = 4;
}
}
if (status == 0 && post.getBoolean("iffresh")) {
long lastModified = getLastModified(sourceURL);
if (lastExecutionDate != null && lastModified != 0L
&& lastModified <= lastExecutionDate.getTime()) {
status = 5;
prop.put("import_status_lastImportDate",
GenericFormatter.FORMAT_SIMPLE.format(lastExecutionDate));
/* the import is not performed, but we increase here the api call count */
if(sb.tables != null) {
byte[] lastExecutedCallPk = lastExecutedCall.getPK();
if(lastExecutedCallPk != null && !post.containsKey(WorkTables.TABLE_API_COL_APICALL_PK)) {
post.add(WorkTables.TABLE_API_COL_APICALL_PK, UTF8.String(lastExecutedCallPk));
}
sb.tables.recordAPICall(post, "IndexImportMediawiki_p.html", WorkTables.TABLE_API_TYPE_DUMP, "MediaWiki Dump Import for " + sourceURL);
}
}
}
} catch (MalformedURLException e) {
status = 1;
}
if (status == 0) {
/* store this call as an api call */
if(sb.tables != null) {
/* We avoid creating a duplicate of any already recorded API call with the same parameters */
if(lastExecutedCall != null && !post.containsKey(WorkTables.TABLE_API_COL_APICALL_PK)) {
byte[] lastExecutedCallPk = lastExecutedCall.getPK();
if(lastExecutedCallPk != null) {
post.add(WorkTables.TABLE_API_COL_APICALL_PK, UTF8.String(lastExecutedCallPk));
}
}
sb.tables.recordAPICall(post, "IndexImportMediawiki_p.html", WorkTables.TABLE_API_TYPE_DUMP, "MediaWiki Dump Import for " + sourceURL);
}
MediawikiImporter.job = new MediawikiImporter(sourceURL, sb.surrogatesInPath);
MediawikiImporter.job.start();
prop.put("import_dump", MediawikiImporter.job.source());
@@ -138,4 +191,76 @@ public static serverObjects respond(final RequestHeader header, final serverObje
}
return prop;
}
/**
 * Looks up, in the API calls table, the recorded call matching the current request.
 * When the request carries the primary key of a recorded call, that row is selected directly.
 * Otherwise the table is searched on the recorded API URL generated from the request parameters,
 * and the row holding the most recent last execution date is kept
 * (rows without any last execution date are ignored).
 * Table access errors are logged and do not propagate to the caller.
 *
 * @param post Servlet request parameters. Must not be null.
 * @param sb the {@link Switchboard} instance. Must not be null.
 * @return the most recently executed recorded call to this API with the same parameters,
 *         or null when the tables are not available or when no matching call was found
 */
private static Row selectLastExecutedCall(final serverObjects post, final Switchboard sb) {
    if (sb.tables == null) {
        /* No work tables available : nothing can have been recorded */
        return null;
    }
    Row mostRecentCall = null;
    try {
        if (!post.containsKey(WorkTables.TABLE_API_COL_APICALL_PK)) {
            /* No primary key in the request : search the table on the API URL as recorded (including parameters) */
            final String recordedURL = WorkTables.generateRecordedURL(post, "IndexImportMediawiki_p.html");
            final Iterator<Row> candidates = sb.tables.iterator(WorkTables.TABLE_API_NAME,
                    WorkTables.TABLE_API_COL_URL, UTF8.getBytes(recordedURL));
            /* Last execution date of the row currently held in mostRecentCall */
            Date mostRecentExec = null;
            while (candidates.hasNext()) {
                final Row candidate = candidates.next();
                if (candidate == null) {
                    continue;
                }
                final Date candidateExec = candidate.get(WorkTables.TABLE_API_COL_DATE_LAST_EXEC, (Date) null);
                if (candidateExec == null) {
                    /* Never executed : can not be the last executed call */
                    continue;
                }
                if (mostRecentCall == null || mostRecentExec.before(candidateExec)) {
                    mostRecentCall = candidate;
                    mostRecentExec = candidateExec;
                }
            }
        } else {
            /* Re-execution of a recorded call : search the table on the primary key */
            mostRecentCall = sb.tables.select(WorkTables.TABLE_API_NAME,
                    UTF8.getBytes(post.get(WorkTables.TABLE_API_COL_APICALL_PK)));
        }
    } catch (final IOException e) {
        ConcurrentLog.logException(e);
    } catch (final SpaceExceededException e) {
        ConcurrentLog.logException(e);
    }
    return mostRecentCall;
}
/**
 * Retrieves the last modification time of a dump file.
 * For http(s) URLs a HEAD request is sent and the Last-Modified response header is parsed;
 * for any other protocol the value is delegated to {@link MultiProtocolURL#lastModified()}.
 *
 * @param fileURL the file URL. Must not be null.
 * @return the last modified date for the file at fileURL as a time in milliseconds,
 *         or 0L when unknown or when an error occurred
 */
private static long getLastModified(MultiProtocolURL fileURL) {
    try {
        if (!fileURL.isHTTP() && !fileURL.isHTTPS()) {
            return fileURL.lastModified();
        }

        /* http(s) : we do not use MultiprotocolURL.lastModified() which always returns 0L for these protocols */
        final HTTPClient client = new HTTPClient(ClientIdentification.yacyInternetCrawlerAgent);
        final HttpResponse response = client.HEADResponse(fileURL, false);
        if (response == null || response.getStatusLine() == null
                || response.getStatusLine().getStatusCode() != HttpStatus.SC_OK) {
            /* No usable response : the date remains unknown */
            return 0L;
        }

        final Header lastModifiedField = response.getFirstHeader(HeaderFramework.LAST_MODIFIED);
        if (lastModifiedField == null) {
            return 0L;
        }

        final Date parsedDate = HeaderFramework.parseHTTPDate(lastModifiedField.getValue());
        return parsedDate != null ? parsedDate.getTime() : 0L;
    } catch (IOException ignored) {
        /* Best effort : a failure here only means the date is reported as unknown */
        ConcurrentLog.warn("IndexImportMediawiki_p", "Could not retrieve last modified date for dump file at " + fileURL);
        return 0L;
    }
}
}
@@ -97,22 +97,14 @@ public WorkTables(final File workPath) {
super(workPath, 12);
this.bookmarks = new YMarkTables(this);
}
/**
* recording of a api call. stores the call parameters into the API database table
* @param post the post arguments of the api call
*
* @param post the api call request parameters. Must not be null.
* @param servletName the name of the servlet
* @param type name of the servlet category
* @param comment visual description of the process
* @return the pk of the new entry in the api table
* @return the API URL to be recorded
*/
public byte[] recordAPICall(final serverObjects post, final String servletName, final String type, final String comment) {
// remove the apicall attributes from the post object
String[] pks = post.remove(TABLE_API_COL_APICALL_PK);
byte[] pk = pks == null ? null : UTF8.getBytes(pks[0]);
public static String generateRecordedURL(final serverObjects post, final String servletName) {
/* Before API URL serialization, we set the transaction token value (if any) to empty :
* this will later help identify that a new valid transaction token is necessary,
* without revealing the token in the URL displayed in the process scheduler and without storing an invalid value */
@@ -130,6 +122,26 @@ public WorkTables(final File workPath) {
} else {
post.remove(TransactionManager.TRANSACTION_TOKEN_PARAM);
}
return apiurl;
}
/**
* recording of a api call. stores the call parameters into the API database table
* @param post the post arguments of the api call. Must not be null.
* @param servletName the name of the servlet
* @param type name of the servlet category
* @param comment visual description of the process
* @return the pk of the new entry in the api table
*/
public byte[] recordAPICall(final serverObjects post, final String servletName, final String type, final String comment) {
// remove the apicall attributes from the post object
String[] pks = post.remove(TABLE_API_COL_APICALL_PK);
byte[] pk = pks == null ? null : UTF8.getBytes(pks[0]);
// generate the apicall url - without the apicall attributes
final String apiurl = generateRecordedURL(post, servletName);
// read old entry from the apicall table (if exists)
Row row = null;

0 comments on commit a87281b

Please sign in to comment.