Skip to content

Commit

Permalink
- refactoring of plasmaCrawlLURL.Entry to prepare new Entry format
Browse files Browse the repository at this point in the history
- added test migration method to migrate the old LURL to a new LURL
the new LURL will be split into different tables for each month
this solves several problems:
- the biggest table in YaCy is split into different parts and can
  also be managed in filesystems that are limited to 2GB
- the oldest entries can easily be identified, used for re-crawl and
  deleted
- The complete database can be limited to a specific size (as wanted many times)


git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2755 6c8d7289-2bf4-0310-a012-ef5d649a1542
  • Loading branch information
orbiter committed Oct 12, 2006
1 parent 130cc76 commit a5dd0d4
Show file tree
Hide file tree
Showing 37 changed files with 193 additions and 453 deletions.
4 changes: 2 additions & 2 deletions htroot/Bookmarks.java
Expand Up @@ -56,7 +56,7 @@
import de.anomic.data.bookmarksDB.Tag;
import de.anomic.http.httpHeader;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaCrawlLURLEntry;
import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverObjects;
Expand Down Expand Up @@ -147,7 +147,7 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
bookmarksDB.Bookmark bookmark = switchboard.bookmarksDB.getBookmark(urlHash);
if (bookmark == null) {
// try to get the bookmark from the LURL database
plasmaCrawlLURL.Entry urlentry = switchboard.urlPool.loadedURL.load(urlHash, null);
plasmaCrawlLURLEntry urlentry = switchboard.urlPool.loadedURL.load(urlHash, null);
plasmaParserDocument document = null;
if(urlentry != null){
document = switchboard.snippetCache.retrieveDocument(urlentry.url(), true);
Expand Down
20 changes: 10 additions & 10 deletions htroot/IndexControl_p.java
Expand Up @@ -61,7 +61,7 @@
import de.anomic.index.indexEntryAttribute;
import de.anomic.index.indexURL;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaCrawlLURLEntry;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaWordIndex;
import de.anomic.plasma.urlPattern.plasmaURLPattern;
Expand Down Expand Up @@ -218,7 +218,7 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
}

if (post.containsKey("urlhashdelete")) {
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.load(urlhash, null);
plasmaCrawlLURLEntry entry = switchboard.urlPool.loadedURL.load(urlhash, null);
if (entry == null) {
prop.put("result", "No Entry for URL hash " + urlhash + "; nothing deleted.");
} else {
Expand Down Expand Up @@ -265,7 +265,7 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
HashMap knownURLs = new HashMap();
HashSet unknownURLEntries = new HashSet();
indexEntry iEntry;
plasmaCrawlLURL.Entry lurl;
plasmaCrawlLURLEntry lurl;
while (urlIter.hasNext()) {
iEntry = (indexEntry) urlIter.next();
lurl = switchboard.urlPool.loadedURL.load(iEntry.urlHash(), null);
Expand Down Expand Up @@ -321,7 +321,7 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
URL url = new URL(urlstring);
urlhash = indexURL.urlHash(url);
prop.put("urlhash", urlhash);
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.load(urlhash, null);
plasmaCrawlLURLEntry entry = switchboard.urlPool.loadedURL.load(urlhash, null);
if (entry == null) {
prop.put("urlstring", "unknown url: " + urlstring);
prop.put("urlhash", "");
Expand All @@ -335,7 +335,7 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
}

if (post.containsKey("urlhashsearch")) {
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.load(urlhash, null);
plasmaCrawlLURLEntry entry = switchboard.urlPool.loadedURL.load(urlhash, null);
if (entry == null) {
prop.put("result", "No Entry for URL hash " + urlhash);
} else {
Expand All @@ -351,12 +351,12 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
try {
final Iterator entryIt = switchboard.urlPool.loadedURL.entries(true, true, urlhash);
StringBuffer result = new StringBuffer("Sequential List of URL-Hashes:<br>");
plasmaCrawlLURL.Entry entry;
plasmaCrawlLURLEntry entry;
int i = 0;
int rows = 0, cols = 0;
prop.put("urlhashsimilar", 1);
while (entryIt.hasNext() && i < 256) {
entry = (plasmaCrawlLURL.Entry) entryIt.next();
entry = (plasmaCrawlLURLEntry) entryIt.next();
prop.put("urlhashsimilar_rows_"+rows+"_cols_"+cols+"_urlHash", entry.hash());
cols++;
if (cols==8) {
Expand Down Expand Up @@ -403,7 +403,7 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
return prop;
}

public static serverObjects genUrlProfile(plasmaSwitchboard switchboard, plasmaCrawlLURL.Entry entry, String urlhash) {
public static serverObjects genUrlProfile(plasmaSwitchboard switchboard, plasmaCrawlLURLEntry entry, String urlhash) {
serverObjects prop = new serverObjects();
if (entry == null) {
prop.put("genUrlProfile", 1);
Expand All @@ -412,7 +412,7 @@ public static serverObjects genUrlProfile(plasmaSwitchboard switchboard, plasmaC
}
URL url = entry.url();
String referrer = null;
plasmaCrawlLURL.Entry le = switchboard.urlPool.loadedURL.load(entry.referrerHash(), null);
plasmaCrawlLURLEntry le = switchboard.urlPool.loadedURL.load(entry.referrerHash(), null);
if (le == null) {
referrer = "<unknown>";
} else {
Expand Down Expand Up @@ -463,7 +463,7 @@ public static serverObjects genUrlList(plasmaSwitchboard switchboard, String key
while (en.hasNext()) {
xi = (indexEntry) en.next();
uh = new String[]{xi.urlHash(), Integer.toString(xi.posintext())};
plasmaCrawlLURL.Entry le = switchboard.urlPool.loadedURL.load(uh[0], null);
plasmaCrawlLURLEntry le = switchboard.urlPool.loadedURL.load(uh[0], null);
if (le == null) {
tm.put(uh[0], uh);
} else {
Expand Down
4 changes: 2 additions & 2 deletions htroot/ViewFile.java
Expand Up @@ -55,13 +55,13 @@
import de.anomic.http.httpHeader;
import de.anomic.http.httpc;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlLURLEntry;
import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.cache.IResourceInfo;
import de.anomic.plasma.crawler.plasmaCrawlerException;
import de.anomic.plasma.parser.ParserException;
import de.anomic.plasma.plasmaCrawlLURL.Entry;
import de.anomic.server.serverFileUtils;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
Expand Down Expand Up @@ -107,7 +107,7 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
String viewMode = post.get("viewMode","sentences");

// getting the urlEntry that belongs to the url hash
Entry urlEntry = null;
plasmaCrawlLURLEntry urlEntry = null;
urlEntry = sb.urlPool.loadedURL.load(urlHash, null);
if (urlEntry == null) {
prop.put("error",2);
Expand Down
4 changes: 2 additions & 2 deletions htroot/htdocsdefault/dir.java
Expand Up @@ -64,7 +64,7 @@
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaCondenser;
import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaCrawlLURLEntry;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverCodings;
import de.anomic.server.serverCore;
Expand Down Expand Up @@ -358,7 +358,7 @@ public static void indexPhrase(plasmaSwitchboard switchboard, String urlstring,
try {
final URL url = new URL(urlstring);
final plasmaCondenser condenser = new plasmaCondenser(new ByteArrayInputStream(("yacyshare. " + phrase + ". " + descr).getBytes()));
final plasmaCrawlLURL.Entry newEntry = switchboard.urlPool.loadedURL.newEntry(
final plasmaCrawlLURLEntry newEntry = switchboard.urlPool.loadedURL.newEntry(
url, "YaCyShare: " + descr, new Date(), new Date(),
"AAAAAAAAAAAA", /*referrer*/
0, /*copycount*/
Expand Down
4 changes: 2 additions & 2 deletions htroot/yacy/crawlOrder.java
Expand Up @@ -51,7 +51,7 @@
import de.anomic.http.httpHeader;
import de.anomic.index.indexURL;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaCrawlLURLEntry;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
Expand Down Expand Up @@ -249,7 +249,7 @@ private static Object[] stack(plasmaSwitchboard switchboard, String url, String
// case where we have already the url loaded;
reason = reasonString;
// send lurl-Entry as response
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.load(indexURL.urlHash(url), null);
plasmaCrawlLURLEntry entry = switchboard.urlPool.loadedURL.load(indexURL.urlHash(url), null);
if (entry == null) {
response = "rejected";
lurl = "";
Expand Down
4 changes: 2 additions & 2 deletions htroot/yacy/crawlReceipt.java
Expand Up @@ -51,7 +51,7 @@
import de.anomic.http.httpHeader;
import de.anomic.index.indexURL;
import de.anomic.plasma.plasmaCrawlEURL;
import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaCrawlLURLEntry;
import de.anomic.plasma.plasmaCrawlNURL;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverObjects;
Expand Down Expand Up @@ -124,7 +124,7 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
prop.put("delay", "3600");
} else if (result.equals("fill")) {
// generating a new loaded URL entry
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.newEntry(propStr, true);
plasmaCrawlLURLEntry entry = switchboard.urlPool.loadedURL.newEntry(propStr, true);
if ((entry == null)||(entry.url()==null)) {
log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT for hash " + receivedUrlhash + " from peer " + iam +
"\n\tURL properties: "+ propStr);
Expand Down
6 changes: 3 additions & 3 deletions htroot/yacy/search.java
Expand Up @@ -54,7 +54,7 @@
import de.anomic.http.httpHeader;
import de.anomic.index.indexContainer;
import de.anomic.index.indexURL;
import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaCrawlLURLEntry;
import de.anomic.plasma.plasmaSearchEvent;
import de.anomic.plasma.plasmaSearchQuery;
import de.anomic.plasma.plasmaSearchRankingProfile;
Expand Down Expand Up @@ -244,10 +244,10 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
StringBuffer links = new StringBuffer();
String resource = "";
//plasmaIndexEntry pie;
plasmaCrawlLURL.Entry urlentry;
plasmaCrawlLURLEntry urlentry;
plasmaSnippetCache.Snippet snippet;
while ((acc.hasMoreElements()) && (i < squery.wantedResults)) {
urlentry = acc.nextElement();
urlentry = (plasmaCrawlLURLEntry) acc.nextElement();
if (includesnippet) {
snippet = sb.snippetCache.retrieveSnippet(urlentry.url(), squery.queryHashes, false, 260, 1000);
} else {
Expand Down
4 changes: 2 additions & 2 deletions htroot/yacy/transferURL.java
Expand Up @@ -48,7 +48,7 @@
import java.io.IOException;

import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaCrawlLURLEntry;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.urlPattern.plasmaURLPattern;
import de.anomic.server.serverCore;
Expand Down Expand Up @@ -90,7 +90,7 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
final int sizeBefore = sb.urlPool.loadedURL.size();
// read the urls from the other properties and store
String urls;
plasmaCrawlLURL.Entry lEntry;
plasmaCrawlLURLEntry lEntry;
for (int i = 0; i < urlc; i++) {
serverCore.checkInterruption();
urls = (String) post.get("url" + i);
Expand Down
4 changes: 2 additions & 2 deletions htroot/yacysearch.java
Expand Up @@ -57,7 +57,7 @@
import de.anomic.kelondro.kelondroMSetTools;
import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaCrawlLURLEntry;
import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.plasmaSearchImages;
import de.anomic.plasma.plasmaSearchPreOrder;
Expand Down Expand Up @@ -189,7 +189,7 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
return prop;
}
final String recommendHash = post.get("recommendref", ""); // urlhash
plasmaCrawlLURL.Entry urlentry = sb.urlPool.loadedURL.load(recommendHash, null);
plasmaCrawlLURLEntry urlentry = sb.urlPool.loadedURL.load(recommendHash, null);
if (urlentry != null) {
plasmaParserDocument document = sb.snippetCache.retrieveDocument(urlentry.url(), true);
if (document != null) {
Expand Down
9 changes: 9 additions & 0 deletions source/dbtest.java
Expand Up @@ -13,6 +13,7 @@
import java.util.Random;

import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroFlexSplitTable;
import de.anomic.kelondro.kelondroFlexTable;
import de.anomic.kelondro.kelondroIndex;
import de.anomic.kelondro.kelondroNaturalOrder;
Expand Down Expand Up @@ -186,6 +187,10 @@ public static void main(String[] args) {
File tablepath = new File(tablename).getParentFile();
table = new kelondroFlexTable(tablepath, new File(tablename).getName(), buffer, preload, testRow, kelondroBase64Order.enhancedCoder);
}
if (dbe.equals("kelondroFlexSplitTable")) {
File tablepath = new File(tablename).getParentFile();
table = new kelondroFlexSplitTable(tablepath, new File(tablename).getName(), buffer, preload, testRow, kelondroBase64Order.enhancedCoder);
}
if (dbe.equals("mysql")) {
table = new dbTable("mysql", testRow);
}
Expand Down Expand Up @@ -513,6 +518,10 @@ public kelondroRow.Entry get(byte[] key) throws IOException {
}
}

/**
 * Date-aware variant of {@code put}. This implementation does not use
 * {@code entryDate}; it simply forwards the row to {@code put(row)}.
 *
 * @param row       the row to store
 * @param entryDate accepted for interface compatibility, ignored here
 * @return the result of {@code put(row)}
 * @throws IOException if {@code put(row)} fails
 */
public kelondroRow.Entry put(kelondroRow.Entry row, Date entryDate) throws IOException {
return put(row);
}

public kelondroRow.Entry put(kelondroRow.Entry row) throws IOException {
try {

Expand Down
11 changes: 1 addition & 10 deletions source/de/anomic/index/indexURL.java
Expand Up @@ -47,7 +47,7 @@
public class indexURL {

// day formatter for entry export
protected static final SimpleDateFormat shortDayFormatter = new SimpleDateFormat("yyyyMMdd");
public static final SimpleDateFormat shortDayFormatter = new SimpleDateFormat("yyyyMMdd");

// statics for value lengths
public static final int urlHashLength = yacySeedDB.commonHashLength; // 12
Expand Down Expand Up @@ -428,15 +428,6 @@ public int size() {
}
}

/**
 * Writes an entry either to the in-memory URL index cache or directly to
 * the URL index file.
 *
 * @param entry  the row to store
 * @param cached if true and a cache exists, the entry goes to the cache;
 *               otherwise it is written to the index file
 * @throws IOException if the underlying put fails
 */
public void store(kelondroRow.Entry entry, boolean cached) throws IOException {
    // the cache is only used when requested AND available
    final boolean useCache = cached && (urlIndexCache != null);
    if (!useCache) {
        urlIndexFile.put(entry);
        return;
    }
    synchronized (urlIndexCache) {
        urlIndexCache.put(entry);
    }
}

public void flushCacheSome() {
if (urlIndexCache == null) return;
if (urlIndexCache.size() == 0) return;
Expand Down
2 changes: 1 addition & 1 deletion source/de/anomic/kelondro/kelondroCollectionIndex.java
Expand Up @@ -446,7 +446,7 @@ protected kelondroRowSet getdelete(kelondroRow.Entry indexrow, boolean remove, b
indexEntry.setCol(idx_col_lastread, kelondroRowCollection.daysSince2000(System.currentTimeMillis()));
indexEntry.setCol(idx_col_lastwrote, kelondroRowCollection.daysSince2000(System.currentTimeMillis()));
index.put(indexEntry);
throw new kelondroException(arrayFile(this.path, this.filenameStub, this.loadfactor, chunksize, partitionnumber, serialnumber).toString(), "array contains wrong row '" + new String(arrayrow.getColBytes(0)) + "', expected is '" + new String(indexrow.getColBytes(idx_col_key)) + "', the row has been fixed");
throw new kelondroException(array.filename, "array contains wrong row '" + new String(arrayrow.getColBytes(0)) + "', expected is '" + new String(indexrow.getColBytes(idx_col_key)) + "', the row has been fixed");
}
int chunkcountInArray = collection.size();
if (chunkcountInArray != chunkcount) {
Expand Down
5 changes: 5 additions & 0 deletions source/de/anomic/kelondro/kelondroColumn.java
Expand Up @@ -209,6 +209,11 @@ public String description() {
public String toString() {
StringBuffer s = new StringBuffer();
switch (celltype) {
case celltype_undefined:
s.append(nickname);
s.append('-');
s.append(cellwidth);
break;
case celltype_boolean:
s.append("boolean ");
s.append(nickname);
Expand Down
5 changes: 5 additions & 0 deletions source/de/anomic/kelondro/kelondroFlexTable.java
Expand Up @@ -27,6 +27,7 @@

import java.io.File;
import java.io.IOException;
import java.util.Date;
import java.util.Iterator;

public class kelondroFlexTable extends kelondroFlexWidthArray implements kelondroIndex {
Expand Down Expand Up @@ -137,6 +138,10 @@ public synchronized kelondroRow.Entry get(byte[] key) throws IOException {
return super.get(i);
}

/**
 * Date-aware variant of {@code put}. This implementation does not use
 * {@code entryDate}; it simply forwards the row to {@code put(row)}.
 *
 * @param row       the row to store
 * @param entryDate accepted for interface compatibility, ignored here
 * @return the result of {@code put(row)}
 * @throws IOException if {@code put(row)} fails
 */
public kelondroRow.Entry put(kelondroRow.Entry row, Date entryDate) throws IOException {
return put(row);
}

public synchronized kelondroRow.Entry put(kelondroRow.Entry row) throws IOException {
int i = index.geti(row.getColBytes(0));
if (i < 0) {
Expand Down
2 changes: 2 additions & 0 deletions source/de/anomic/kelondro/kelondroIndex.java
Expand Up @@ -51,6 +51,7 @@
package de.anomic.kelondro;

import java.io.IOException;
import java.util.Date;
import java.util.Iterator;

public interface kelondroIndex {
Expand All @@ -60,6 +61,7 @@ public interface kelondroIndex {
public kelondroRow row() throws IOException;
public kelondroRow.Entry get(byte[] key) throws IOException;
public kelondroRow.Entry put(kelondroRow.Entry row) throws IOException;
public kelondroRow.Entry put(kelondroRow.Entry row, Date entryDate) throws IOException;
public kelondroRow.Entry remove(byte[] key) throws IOException;
public Iterator rows(boolean up, boolean rotating, byte[] firstKey) throws IOException;
public void close() throws IOException;
Expand Down
6 changes: 6 additions & 0 deletions source/de/anomic/kelondro/kelondroRAMIndex.java
Expand Up @@ -26,6 +26,8 @@

package de.anomic.kelondro;

import java.io.IOException;
import java.util.Date;
import java.util.Iterator;
import java.util.TreeMap;

Expand Down Expand Up @@ -59,6 +61,10 @@ public synchronized Entry get(byte[] key) {
return (kelondroRow.Entry) index.get(key);
}

/**
 * Date-aware variant of {@code put}. This implementation does not use
 * {@code entryDate}; it simply forwards the row to {@code put(row)}.
 *
 * @param row       the row to store
 * @param entryDate accepted for interface compatibility, ignored here
 * @return the result of {@code put(row)}
 * @throws IOException declared by the signature
 */
public kelondroRow.Entry put(kelondroRow.Entry row, Date entryDate) throws IOException {
return put(row);
}

/**
 * Stores the given row in {@code index}, keyed by the bytes of the row's
 * column 0.
 *
 * @param row the row to store
 * @return the value returned by {@code index.put}, cast to {@code kelondroRow.Entry}
 *         (presumably the row previously stored under that key, or null -
 *         TODO confirm against the declared type of {@code index})
 */
public synchronized Entry put(Entry row) {
return (kelondroRow.Entry) index.put(row.getColBytes(0), row);
}
Expand Down
4 changes: 2 additions & 2 deletions source/de/anomic/kelondro/kelondroRecords.java
Expand Up @@ -976,7 +976,7 @@ protected final int free() {
return USAGE.FREEC;
}

private final void dispose(Handle h) throws IOException {
private synchronized final void dispose(Handle h) throws IOException {
// delete element with handle h
// this element is then connected to the deleted-chain and can be
// re-used change counter
Expand Down Expand Up @@ -1052,7 +1052,7 @@ protected final Set deletedHandles(long maxTime) throws kelondroException, IOExc
if (markedDeleted.contains(h)) {
// loop detection
this.theLogger.severe("KELONDRO WARNING " + this.filename + ": FREE-Queue contains loops");
return markedDeleted;
return markedDeleted; // TODO: automatic fix
}
markedDeleted.add(h);
seekp = seekpos(h);
Expand Down

0 comments on commit a5dd0d4

Please sign in to comment.