Skip to content

Commit

Permalink
- added correct flagging of word properties
Browse files Browse the repository at this point in the history
- added self-healing to the database for the case where wrong free-pointers exist
- added presentation of media links in snippets (does not yet work correctly)
- code cleanup

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3055 6c8d7289-2bf4-0310-a012-ef5d649a1542
  • Loading branch information
orbiter committed Dec 8, 2006
1 parent 10d888e commit bf0d820
Show file tree
Hide file tree
Showing 12 changed files with 342 additions and 133 deletions.
2 changes: 1 addition & 1 deletion htroot/IndexControl_p.java
Expand Up @@ -300,7 +300,7 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
"true".equalsIgnoreCase(gzipBody),
timeout);
result = (String) resultObj.get("result");
prop.put("result", (result == null) ? ("Successfully transferred " + index.size() + " words in " + ((System.currentTimeMillis() - starttime) / 1000) + " seconds") : result);
prop.put("result", (result == null) ? ("Successfully transferred " + knownURLs.size() + " words in " + ((System.currentTimeMillis() - starttime) / 1000) + " seconds, " + unknownURLEntries + " URL not found") : result);
index = null;
}

Expand Down
2 changes: 1 addition & 1 deletion htroot/yacy/transferRWI.java
Expand Up @@ -203,7 +203,7 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
}
if (unknownURLs.length() > 0) { unknownURLs.delete(0, 1); }
if ((wordhashes.length == 0) || (received == 0)) {
sb.getLog().logInfo("Received 0 RWIs from " + otherPeerName + ", processed in " + (System.currentTimeMillis() - startProcess) + " milliseconds, requesting " + unknownURL.size() + " URLs");
sb.getLog().logInfo("Received 0 RWIs from " + otherPeerName + ", processed in " + (System.currentTimeMillis() - startProcess) + " milliseconds, requesting " + unknownURL.size() + " URLs, blocked " + blocked + " RWIs");
} else {
final double avdist = (yacyDHTAction.dhtDistance(yacyCore.seedDB.mySeed.hash, wordhashes[0]) + yacyDHTAction.dhtDistance(yacyCore.seedDB.mySeed.hash, wordhashes[received - 1])) / 2.0;
sb.getLog().logInfo("Received " + received + " Entries " + wordc + " Words [" + wordhashes[0] + " .. " + wordhashes[received - 1] + "]/" + avdist + " from " + otherPeerName + ", processed in " + (System.currentTimeMillis() - startProcess) + " milliseconds, requesting " + unknownURL.size() + "/" + receivedURL + " URLs, blocked " + blocked + " RWIs");
Expand Down
69 changes: 42 additions & 27 deletions htroot/yacy/transferURL.java
Expand Up @@ -46,10 +46,12 @@
// javac -classpath .:../classes transferRWI.java

import java.io.IOException;
import java.text.ParseException;

import de.anomic.http.httpHeader;
import de.anomic.index.indexURLEntry;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaURL;
import de.anomic.plasma.urlPattern.plasmaURLPattern;
import de.anomic.server.serverCore;
import de.anomic.server.serverObjects;
Expand All @@ -59,11 +61,14 @@

public final class transferURL {


public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch ss) throws InterruptedException {
if (post == null || ss == null) { return null; }

long start = System.currentTimeMillis();

long freshdate = 0;
try {freshdate = plasmaURL.shortDayFormatter.parse("20061101").getTime();} catch (ParseException e1) {}

// return variable that accumulates replacements
final plasmaSwitchboard sb = (plasmaSwitchboard) ss;
final serverObjects prop = new serverObjects();
Expand Down Expand Up @@ -93,35 +98,45 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
indexURLEntry lEntry;
for (int i = 0; i < urlc; i++) {
serverCore.checkInterruption();

// read new lurl-entry
urls = (String) post.get("url" + i);
if (urls == null) {
yacyCore.log.logFine("transferURL: got null URL-string from peer " + otherPeerName);
} else {
lEntry = sb.wordIndex.loadedURL.newEntry(urls);
if (lEntry == null) {
yacyCore.log.logWarning("transferURL: received invalid URL (entry null) from peer " + otherPeerName + "\n\tURL Property: " + urls);
// TODO: should we send back an error message???
} else {
indexURLEntry.Components comp = lEntry.comp();
if (comp.url() == null) {
yacyCore.log.logWarning("transferURL: received invalid URL (url null) from peer " + otherPeerName + "\n\tURL Property: " + urls);
// TODO: should we send back an error message???
} else {
if ((blockBlacklist) && (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_DHT, lEntry.hash(), comp.url()))) {
int deleted = sb.wordIndex.tryRemoveURLs(lEntry.hash());
yacyCore.log.logFine("transferURL: blocked blacklisted URL '" + comp.url().toNormalform() + "' from peer " + otherPeerName + "; deleted " + deleted + " URL entries from RWIs");
lEntry = null;
blocked++;
} else try {
sb.wordIndex.loadedURL.store(lEntry);
sb.wordIndex.loadedURL.stack(lEntry, iam, iam, 3);
yacyCore.log.logFine("transferURL: received URL '" + comp.url().toNormalform() + "' from peer " + otherPeerName);
received++;
} catch (IOException e) {
e.printStackTrace();
}
}
}
continue;
}

// parse new lurl-entry
lEntry = sb.wordIndex.loadedURL.newEntry(urls);
if (lEntry == null) {
yacyCore.log.logWarning("transferURL: received invalid URL (entry null) from peer " + otherPeerName + "\n\tURL Property: " + urls);
continue;
}

// check if entry is well-formed
indexURLEntry.Components comp = lEntry.comp();
if ((comp.url() == null) || (lEntry.freshdate().getTime() <= freshdate)) {
yacyCore.log.logWarning("transferURL: received invalid URL from peer " + otherPeerName + "\n\tURL Property: " + urls);
continue;
}

// check if the entry is blacklisted
if ((blockBlacklist) && (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_DHT, lEntry.hash(), comp.url()))) {
int deleted = sb.wordIndex.tryRemoveURLs(lEntry.hash());
yacyCore.log.logFine("transferURL: blocked blacklisted URL '" + comp.url().toNormalform() + "' from peer " + otherPeerName + "; deleted " + deleted + " URL entries from RWIs");
lEntry = null;
blocked++;
continue;
}

// write entry to database
try {
sb.wordIndex.loadedURL.store(lEntry);
sb.wordIndex.loadedURL.stack(lEntry, iam, iam, 3);
yacyCore.log.logFine("transferURL: received URL '" + comp.url().toNormalform() + "' from peer " + otherPeerName);
received++;
} catch (IOException e) {
e.printStackTrace();
}
}

Expand Down
1 change: 0 additions & 1 deletion source/de/anomic/index/indexRWIEntryNew.java
Expand Up @@ -117,7 +117,6 @@ public indexRWIEntryNew(String urlHash,
int sizeOfPage, // # of bytes of the page TODO: not needed any more
long lastmodified, // last-modified time of the document where word appears
long updatetime, // update time; this is needed to compute a TTL for the word, so it can be removed easily if the TTL is short
int quality, // the entropy value
String language, // (guessed) language of document
char doctype, // type of document
int outlinksSame, // outlinks to same domain
Expand Down
9 changes: 4 additions & 5 deletions source/de/anomic/index/indexRWIEntryOld.java
Expand Up @@ -33,7 +33,6 @@
import de.anomic.kelondro.kelondroRow;
import de.anomic.kelondro.kelondroRow.Entry;
import de.anomic.plasma.plasmaSearchQuery;
import de.anomic.plasma.plasmaURL;
import de.anomic.plasma.plasmaWordIndex;
import de.anomic.yacy.yacySeedDB;

Expand Down Expand Up @@ -66,7 +65,7 @@ public class indexRWIEntryOld implements Cloneable, indexRWIEntry {
private static final int col_hitcount = 3;
private static final int col_language = 4;
private static final int col_doctype = 5;
private static final int col_localflag = 6;
//private static final int col_localflag = 6;
private static final int col_posintext = 7;
private static final int col_posinphrase = 8;
private static final int col_posofphrase = 9;
Expand All @@ -77,6 +76,7 @@ public class indexRWIEntryOld implements Cloneable, indexRWIEntry {

private kelondroRow.Entry entry;

/*
public indexRWIEntryOld(String urlHash,
int urlLength, // byte-length of complete URL
int urlComps, // number of path components
Expand All @@ -91,7 +91,6 @@ public indexRWIEntryOld(String urlHash,
int sizeOfPage, // # of bytes of the page
long lastmodified, //*last-modified time of the document where word appears
long updatetime, // update time; this is needed to compute a TTL for the word, so it can be removed easily if the TTL is short
int quality, //*the entropy value
String language, //*(guessed) language of document
char doctype, //*type of document
int outlinksSame, // outlinks to same domain
Expand All @@ -107,7 +106,7 @@ public indexRWIEntryOld(String urlHash,
if ((language == null) || (language.length() != urlEntryRow.width(col_language))) language = "uk";
this.entry = urlEntryRow.newEntry();
this.entry.setCol(col_urlhash, urlHash, null);
this.entry.setCol(col_quality, quality);
this.entry.setCol(col_quality, 0);
this.entry.setCol(col_lastModified, lastmodified);
this.entry.setCol(col_hitcount, hitcount);
this.entry.setCol(col_language, language, null);
Expand All @@ -121,7 +120,7 @@ public indexRWIEntryOld(String urlHash,
this.entry.setCol(col_phrasecount, phrasecount);
//System.out.println("DEBUG-NEWENTRY " + toPropertyForm());
}

*/
public indexRWIEntryOld(String urlHash, String code) {
// the code is the external form of the row minus the leading urlHash entry
this.entry = urlEntryRow.newEntry((urlHash + code).getBytes());
Expand Down
8 changes: 7 additions & 1 deletion source/de/anomic/kelondro/kelondroBitfield.java
Expand Up @@ -24,7 +24,7 @@

package de.anomic.kelondro;

public class kelondroBitfield {
public class kelondroBitfield implements Cloneable {

// the bitfield implements a binary array. Such arrays may be exported in a base64-String

Expand Down Expand Up @@ -55,6 +55,12 @@ public kelondroBitfield(int bytelength, String exported) {
}
}

/**
 * Produces a copy of this bitfield.
 *
 * A new kelondroBitfield is constructed from a freshly allocated byte array
 * whose length equals that of this instance's backing array; the bytes of
 * this instance are then copied into the new instance's backing array.
 *
 * @return the newly constructed kelondroBitfield carrying this instance's bytes
 */
public Object clone() {
    final int byteCount = this.bb.length;
    final kelondroBitfield copy = new kelondroBitfield(new byte[byteCount]);
    // fill the copy's own backing array after construction, exactly as before
    System.arraycopy(this.bb, 0, copy.bb, 0, byteCount);
    return copy;
}

public void set(int pos, boolean value) {
assert (pos >= 0);
int slot = pos / 8;
Expand Down
17 changes: 12 additions & 5 deletions source/de/anomic/kelondro/kelondroRecords.java
Expand Up @@ -1392,7 +1392,7 @@ protected Handle() throws IOException {
USAGE.FREEC--;
// take link
if (USAGE.FREEH.index == NUL) {
System.out.println("INTERNAL ERROR (DATA INCONSISTENCY): re-use of records failed, lost " + (USAGE.FREEC + 1) + " records. Affected file: " + filename);
serverLog.logSevere("kelondroRecords/" + filename, "INTERNAL ERROR (DATA INCONSISTENCY): re-use of records failed, lost " + (USAGE.FREEC + 1) + " records.");
// try to heal..
USAGE.USEDC = USAGE.allCount() + 1;
USAGE.FREEC = 0;
Expand All @@ -1402,10 +1402,17 @@ protected Handle() throws IOException {
//System.out.println("*DEBUG* ALLOCATED DELETED INDEX " + index);
// check for valid seek position
long seekp = seekpos(USAGE.FREEH);
if (seekp > entryFile.length()) throw new kelondroException("new Handle: seek position " + seekp + "/" + USAGE.FREEH.index + " out of file size " + entryFile.length() + "/" + ((entryFile.length() - POS_NODES) / recordsize));

// read link to next element of FREEH chain
USAGE.FREEH.index = entryFile.readInt(seekp);
if (seekp > entryFile.length()) {
// this is a severe inconsistency. try to heal..
serverLog.logSevere("kelondroRecords/" + filename, "new Handle: lost " + USAGE.FREEC + " marked nodes; seek position " + seekp + "/" + USAGE.FREEH.index + " out of file size " + entryFile.length() + "/" + ((entryFile.length() - POS_NODES) / recordsize));
index = USAGE.allCount(); // a place at the end of the file
USAGE.USEDC += USAGE.FREEC; // to avoid that non-empty records at the end are overwritten
USAGE.FREEC = 0; // discard all possible empty nodes
USAGE.FREEH.index = NUL;
} else {
// read link to next element of FREEH chain
USAGE.FREEH.index = entryFile.readInt(seekp);
}
}
USAGE.write();
}
Expand Down

0 comments on commit bf0d820

Please sign in to comment.