Skip to content

Commit

Permalink
added stub of oai-pmh importer (not working yet)
Browse files Browse the repository at this point in the history
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6437 6c8d7289-2bf4-0310-a012-ef5d649a1542
  • Loading branch information
orbiter committed Oct 30, 2009
1 parent 77c99e5 commit 30f108f
Show file tree
Hide file tree
Showing 7 changed files with 238 additions and 30 deletions.
37 changes: 37 additions & 0 deletions htroot/IndexImportOAIPMH_p.html
@@ -0,0 +1,37 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>YaCy '#[clientname]#': OAI-PMH Import</title>
#%env/templates/metas.template%#
#(import)#::<meta http-equiv="REFRESH" content="10" />#(/import)#
</head>
<body id="IndexImportOAIPMH">
#%env/templates/header.template%#
#%env/templates/submenuIntegration.template%#
<h2>OAI-PMH Import</h2>

#(import)#
<p>#(status)#No import thread is running, you can start a new thread here::Bad input data: #[message]# #(/status)#</p>
<form action="IndexImportOAIPMH_p.html" method="get">
<fieldset>
<legend>OAI-PMH Import: set an OAI-PMH URL</legend>
<input name="oaipmhurl" type="text" value="" size="80" />
<input name="submit" type="submit" value="Import from a OAI-PMH source" />
</fieldset>
</form>
::
<form><fieldset><legend>Import Process</legend>
<dl>
<dt>Thread:</dt><dd>#[thread]#</dd>
<dt>Source:</dt><dd>#[source]#</dd>
<dt>Processed:</dt><dd>#[count]# Records</dd>
<dt>Speed:</dt><dd>#[speed]# records per second</dd>
<dt>Running Time:</dt><dd>#[runningHours]# hours, #[runningMinutes]# minutes</dd>
<dt>Remaining Time:</dt><dd>#[remainingHours]# hours, #[remainingMinutes]# minutes</dd>
</dl>
</fieldset></form>
#(/import)#

#%env/templates/footer.template%#
</body>
</html>
86 changes: 86 additions & 0 deletions htroot/IndexImportOAIPMH_p.java
@@ -0,0 +1,86 @@
// IndexImportOAIPMH_p.java
// -------------------------
// (C) 2009 by Michael Peter Christen; mc@yacy.net
// first published 04.05.2009 on http://yacy.net
// Frankfurt, Germany
//
// $LastChangedDate: 2009-10-11 23:29:18 +0200 (So, 11 Okt 2009) $
// $LastChangedRevision: 6400 $
// $LastChangedBy: orbiter $
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

import java.io.File;
import java.net.MalformedURLException;

import net.yacy.document.importer.OAIPMHImporter;
import net.yacy.kelondro.data.meta.DigestURI;

import de.anomic.http.server.RequestHeader;
import de.anomic.search.Switchboard;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;

public class IndexImportOAIPMH_p {

    /**
     * Servlet entry point for the OAI-PMH import page.
     *
     * While an import thread is alive, only its progress is reported
     * (template alternative {@code import = 1}). Otherwise the start form is
     * shown (alternative {@code import = 0}); if the request carries an
     * {@code oaipmhurl} parameter, a new import thread is created from that
     * URL and started.
     *
     * @param header the request header (not used by this servlet)
     * @param post   the request parameters, or {@code null} if the page was
     *               requested without any
     * @param env    the server environment; cast to {@link Switchboard}
     * @return the template properties for IndexImportOAIPMH_p.html
     */
    public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
        final serverObjects prop = new serverObjects();
        final Switchboard sb = (Switchboard) env;

        if (OAIPMHImporter.job != null && OAIPMHImporter.job.isAlive()) {
            // one import is running, no option to insert anything
            prop.put("import", 1);
            prop.put("import_thread", "running");
            prop.put("import_source", OAIPMHImporter.job.source());
            prop.put("import_count", OAIPMHImporter.job.count());
            prop.put("import_speed", OAIPMHImporter.job.speed());
            prop.put("import_runningHours", (OAIPMHImporter.job.runningTime() / 60) / 60);
            prop.put("import_runningMinutes", (OAIPMHImporter.job.runningTime() / 60) % 60);
            prop.put("import_remainingHours", (OAIPMHImporter.job.remainingTime() / 60) / 60);
            prop.put("import_remainingMinutes", (OAIPMHImporter.job.remainingTime() / 60) % 60);
            return prop;
        }

        // no import is running: offer the start form, initially without an error message
        prop.put("import", 0);
        prop.put("import_status", 0);

        // the form submits its URL in the field "oaipmhurl"; without it there is nothing to start
        if (post == null || !post.containsKey("oaipmhurl")) {
            return prop;
        }

        final String oaipmhurl = post.get("oaipmhurl");
        try {
            final DigestURI url = new DigestURI(oaipmhurl, null);
            OAIPMHImporter.job = new OAIPMHImporter(sb.loader, url);
            OAIPMHImporter.job.start();
            prop.put("import", 1);
            prop.put("import_thread", "started");
            // the template reads #[source]#, so the key must be import_source
            prop.put("import_source", OAIPMHImporter.job.source());
            prop.put("import_count", 0);
            prop.put("import_speed", 0);
            prop.put("import_runningHours", 0);
            prop.put("import_runningMinutes", 0);
            prop.put("import_remainingHours", 0);
            prop.put("import_remainingMinutes", 0);
        } catch (MalformedURLException e) {
            // report the bad URL back to the user through the form's status line
            e.printStackTrace();
            prop.put("import", 0);
            prop.put("import_status", 1);
            prop.put("import_status_message", e.getMessage());
        }
        return prop;
    }
}
25 changes: 13 additions & 12 deletions htroot/IndexImportWikimedia_p.java
Expand Up @@ -25,29 +25,30 @@
import java.io.File;
import java.net.MalformedURLException;

import net.yacy.document.importer.MediawikiImporter;

import de.anomic.http.server.RequestHeader;
import de.anomic.search.Switchboard;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.tools.mediawikiIndex;

public class IndexImportWikimedia_p {

public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
final serverObjects prop = new serverObjects();
final Switchboard sb = (Switchboard) env;

if (mediawikiIndex.job != null && mediawikiIndex.job.isAlive()) {
if (MediawikiImporter.job != null && MediawikiImporter.job.isAlive()) {
// one import is running, no option to insert anything
prop.put("import", 1);
prop.put("import_thread", "running");
prop.put("import_dump", mediawikiIndex.job.sourcefile.getName());
prop.put("import_count", mediawikiIndex.job.count);
prop.put("import_speed", mediawikiIndex.job.speed());
prop.put("import_runningHours", (mediawikiIndex.job.runningTime() / 60) / 60);
prop.put("import_runningMinutes", (mediawikiIndex.job.runningTime() / 60) % 60);
prop.put("import_remainingHours", (mediawikiIndex.job.remainingTime() / 60) / 60);
prop.put("import_remainingMinutes", (mediawikiIndex.job.remainingTime() / 60) % 60);
prop.put("import_dump", MediawikiImporter.job.source());
prop.put("import_count", MediawikiImporter.job.count());
prop.put("import_speed", MediawikiImporter.job.speed());
prop.put("import_runningHours", (MediawikiImporter.job.runningTime() / 60) / 60);
prop.put("import_runningMinutes", (MediawikiImporter.job.runningTime() / 60) % 60);
prop.put("import_remainingHours", (MediawikiImporter.job.remainingTime() / 60) / 60);
prop.put("import_remainingMinutes", (MediawikiImporter.job.remainingTime() / 60) % 60);
} else {
prop.put("import", 0);
if (post == null) {
Expand All @@ -64,11 +65,11 @@ public static serverObjects respond(final RequestHeader header, final serverObje
}
String lang = name.substring(0, 2);
try {
mediawikiIndex.job = new mediawikiIndex(sourcefile, sb.surrogatesInPath, "http://" + lang + ".wikipedia.org/wiki/");
mediawikiIndex.job.start();
MediawikiImporter.job = new MediawikiImporter(sourcefile, sb.surrogatesInPath, "http://" + lang + ".wikipedia.org/wiki/");
MediawikiImporter.job.start();
prop.put("import", 1);
prop.put("import_thread", "started");
prop.put("import_dump", mediawikiIndex.job.sourcefile.getName());
prop.put("import_dump", MediawikiImporter.job.source());
prop.put("import_count", 0);
prop.put("import_speed", 0);
prop.put("import_runningHours", 0);
Expand Down
9 changes: 5 additions & 4 deletions htroot/mediawiki_p.java
Expand Up @@ -27,11 +27,12 @@
import java.io.File;
import java.io.IOException;

import net.yacy.document.importer.MediawikiImporter;

import de.anomic.http.server.RequestHeader;
import de.anomic.search.Switchboard;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.tools.mediawikiIndex;

public class mediawiki_p {

Expand All @@ -53,12 +54,12 @@ public static serverObjects respond(final RequestHeader header, serverObjects po

File dumpFile = new File(sb.getRootPath(), "DATA/HTCACHE/mediawiki/" + dump);
if (!dumpFile.exists()) return post;
mediawikiIndex.checkIndex(dumpFile);
mediawikiIndex.wikisourcerecord w = mediawikiIndex.find(title.replaceAll(" ", "_"), mediawikiIndex.idxFromWikimediaXML(dumpFile));
MediawikiImporter.checkIndex(dumpFile);
MediawikiImporter.wikisourcerecord w = MediawikiImporter.find(title.replaceAll(" ", "_"), MediawikiImporter.idxFromWikimediaXML(dumpFile));
if (w == null) {
return post;
}
String page = new String(mediawikiIndex.read(dumpFile, w.start, (int) (w.end - w.start)), "UTF-8");
String page = new String(MediawikiImporter.read(dumpFile, w.start, (int) (w.end - w.start)), "UTF-8");
int p = page.indexOf("<text");
if (p < 0) return prop;
p = page.indexOf('>', p);
Expand Down
40 changes: 40 additions & 0 deletions source/net/yacy/document/importer/Importer.java
@@ -0,0 +1,40 @@
package net.yacy.document.importer;

public interface Importer extends Runnable {

    /**
     * @return a textual description of the source this importer reads from
     */
    String source();

    /**
     * @return the importer's current record counter
     */
    int count();

    /**
     * @return the number of articles per second
     */
    int speed();

    /**
     * @return the time this import is already running.
     *         NOTE(review): the unit is not fixed here; callers that derive
     *         hours and minutes by dividing by 60 imply seconds - confirm
     *         that every implementation agrees.
     */
    long runningTime();

    /**
     * @return the remaining time until all records are completed.
     *         NOTE(review): the earlier comment named both seconds and
     *         milliseconds; the unit must be confirmed against the
     *         implementations and callers.
     */
    long remainingTime();

    /**
     * @return whether the import is still alive
     */
    boolean isAlive();

    /**
     * starts the import
     */
    void start();

    /**
     * the run method from runnable
     */
    void run();

}
Expand Up @@ -24,7 +24,7 @@
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

package de.anomic.tools;
package net.yacy.document.importer;

import net.yacy.document.Document;
import net.yacy.document.TextParser;
Expand Down Expand Up @@ -71,14 +71,17 @@
* as referenced with xmlns="http://www.mediawiki.org/xml/export-0.3/"
*/

public class mediawikiIndex extends Thread {
public class MediawikiImporter extends Thread implements Importer {

private static final String textstart = "<text";
private static final String textend = "</text>";
private static final String pagestart = "<page>";
private static final String pageend = "</page>";
private static final byte[] pagestartb = pagestart.getBytes();
private static final byte[] pageendb = pageend.getBytes();
private static final int docspermbinxmlbz2 = 800; // documents per megabyte in a xml.bz2 wikimedia dump

public static Importer job; // if started from a servlet, this object is used to store the thread

protected wikiParser wparser;
protected String urlStub;
Expand All @@ -89,11 +92,8 @@ public class mediawikiIndex extends Thread {
private long docsize;
private int approxdocs;

private static final int docspermbinxmlbz2 = 800; // documents per megabyte in a xml.bz2 wikimedia dump

public static mediawikiIndex job; // if started from a servlet, this object is used to store the thread

public mediawikiIndex(File sourcefile, File targetdir, String baseURL) throws MalformedURLException {
public MediawikiImporter(File sourcefile, File targetdir, String baseURL) throws MalformedURLException {
this.sourcefile = sourcefile;
this.docsize = sourcefile.length();
this.approxdocs = (int) (this.docsize * (long) docspermbinxmlbz2 / 1024L / 1024L);
Expand All @@ -104,6 +104,14 @@ public mediawikiIndex(File sourcefile, File targetdir, String baseURL) throws Ma
this.start = 0;
}

/**
 * @return the current value of this importer's record counter
 */
public int count() {
    final int current = this.count;
    return current;
}

/**
 * @return the absolute path of the source file this importer reads
 */
public String source() {
    final String path = this.sourcefile.getAbsolutePath();
    return path;
}

/**
* return the number of articles per second
* @return
Expand Down Expand Up @@ -738,7 +746,7 @@ public static void main(String[] s) {
String urlStub = s[3]; // i.e. http://de.wikipedia.org/wiki/
//String language = urlStub.substring(7,9);
try {
mediawikiIndex mi = new mediawikiIndex(sourcefile, targetdir, urlStub);
MediawikiImporter mi = new MediawikiImporter(sourcefile, targetdir, urlStub);
mi.start();
mi.join();
} catch (InterruptedException e) {
Expand Down
Expand Up @@ -24,7 +24,7 @@
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

package de.anomic.crawler;
package net.yacy.document.importer;

import java.io.ByteArrayInputStream;
import java.io.IOException;
Expand All @@ -35,21 +35,56 @@
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.repository.LoaderDispatcher;

import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.crawler.retrieval.Request;
import de.anomic.crawler.retrieval.Response;

public class PMHReader {
public class OAIPMHImporter extends Thread implements Importer {

LoaderDispatcher loader;
public static Importer job; // if started from a servlet, this object is used to store the thread

public PMHReader(LoaderDispatcher loader) {
private LoaderDispatcher loader;
private DigestURI source;
private int count;
private long startTime;

/**
 * Creates an importer for one OAI-PMH source. The record counter starts at
 * zero and the start time is taken at construction, not when the thread
 * is started.
 *
 * @param loader the loader used to fetch the source
 * @param source the URL of the OAI-PMH source
 */
public OAIPMHImporter(LoaderDispatcher loader, DigestURI source) {
    this.startTime = System.currentTimeMillis();
    this.count = 0;
    this.source = source;
    this.loader = loader;
}


/**
 * @return the current value of this importer's record counter
 */
public int count() {
    final int current = this.count;
    return current;
}

/**
 * The remaining time cannot be estimated for this importer.
 *
 * @return always {@link Long#MAX_VALUE} as a marker for "unknown"
 */
public long remainingTime() {
    final long unknown = Long.MAX_VALUE; // we don't know
    return unknown;
}

/**
 * Returns the time since this importer was constructed, in seconds.
 * The status servlet derives hours and minutes from this value by dividing
 * by 60, so the value must be in seconds, not milliseconds.
 *
 * @return elapsed whole seconds since construction
 */
public long runningTime() {
    return (System.currentTimeMillis() - this.startTime) / 1000L;
}

/**
 * @return the source URL in its normal form
 */
public String source() {
    return source.toNormalform(true, false);
}

/**
 * Returns the average number of records processed per second.
 * A running time below one second is treated as one second, so that a
 * status request right after the start cannot divide by zero.
 *
 * @return records per second, rounded down
 */
public int speed() {
    final long seconds = Math.max(1L, runningTime());
    return (int) (((long) count()) / seconds);
}

public void load(DigestURI source) throws IOException {
Response response = this.loader.load(source, true, true, CrawlProfile.CACHE_STRATEGY_NOCACHE);
load(response);
/**
 * Thread body: fetches the source through the loader, bypassing the cache
 * strategy (CACHE_STRATEGY_NOCACHE), and hands the response to
 * {@code load}. An IOException is printed to the error stream and ends
 * the run.
 */
public void run() {
    try {
        final Response fetched = this.loader.load(this.source, true, true, CrawlProfile.CACHE_STRATEGY_NOCACHE);
        load(fetched);
    } catch (IOException e) {
        e.printStackTrace();
    }
}

public static void load0(DigestURI source) throws IOException {
Expand Down

0 comments on commit 30f108f

Please sign in to comment.