Skip to content

Commit

Permalink
next refactoring step in document indexing to prepare concurrency env…
Browse files Browse the repository at this point in the history
…ironment for document parsing

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4604 6c8d7289-2bf4-0310-a012-ef5d649a1542
  • Loading branch information
orbiter committed Mar 26, 2008
1 parent 7f9f639 commit 9b0e20f
Show file tree
Hide file tree
Showing 5 changed files with 153 additions and 8 deletions.
2 changes: 1 addition & 1 deletion source/de/anomic/index/indexRepositoryReference.java
Expand Up @@ -54,7 +54,7 @@
public final class indexRepositoryReference {

// class objects
private kelondroIndex urlIndexFile;
kelondroIndex urlIndexFile;
private Export exportthread = null; // will have an export thread assigned if the exporter is running

public indexRepositoryReference(File indexSecondaryRoot, String networkName) {
Expand Down
20 changes: 20 additions & 0 deletions source/de/anomic/plasma/plasmaParserDocument.java
Expand Up @@ -54,6 +54,7 @@
import de.anomic.yacy.yacyURL;

import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
Expand Down Expand Up @@ -86,6 +87,7 @@ public class plasmaParserDocument {
private yacyURL favicon;
private boolean resorted;
private InputStream textStream;
// counters for inbound and outbound links; the constructor initializes both to -1
// and they only carry real counts after notifyWebStructure has been called
private int inboundLinks, outboundLinks;

protected plasmaParserDocument(yacyURL location, String mimeType, String charset,
String[] keywords, String title, String author,
Expand All @@ -107,6 +109,8 @@ protected plasmaParserDocument(yacyURL location, String mimeType, String charset
this.applinks = null;
this.emaillinks = null;
this.resorted = false;
this.inboundLinks = -1;
this.outboundLinks = -1;

if (text == null) try {
this.text = new serverCachedFileOutputStream(Parser.MAX_KEEP_IN_MEMORY_SIZE);
Expand Down Expand Up @@ -430,6 +434,22 @@ public void setFavicon(yacyURL faviconURL) {
this.favicon = faviconURL;
}

/**
 * Passes this document to the web structure and remembers the link counts it reports.
 * After this call {@link #inboundLinks()} and {@link #outboundLinks()} return the
 * values delivered by {@code generateCitationReference}.
 *
 * @param webStructure the web structure that generates the citation reference
 * @param condenser    the condenser belonging to this document
 * @param docDate      the document date handed on to the citation reference
 */
public void notifyWebStructure(plasmaWebStructure webStructure, plasmaCondenser condenser, Date docDate) {
    // result layout as annotated at the call site: [outlinksSame, outlinksOther]
    final Integer[] counts = webStructure.generateCitationReference(this, condenser, docDate);
    this.inboundLinks = counts[0];   // unboxed; first element is stored as the inbound counter
    this.outboundLinks = counts[1];  // unboxed; second element is stored as the outbound counter
}

/**
 * Returns the inbound link counter.
 * The counter is expected to be non-negative (checked by an assertion); when
 * assertions are disabled a negative counter is reported as 0.
 *
 * @return the inbound link count, never negative
 */
public int inboundLinks() {
    assert this.inboundLinks >= 0;
    // clamp the "not yet counted" negative value to zero
    return Math.max(0, this.inboundLinks);
}

/**
 * Returns the outbound link counter.
 * The counter is expected to be non-negative (checked by an assertion); when
 * assertions are disabled a negative counter is reported as 0.
 *
 * @return the outbound link count, never negative
 */
public int outboundLinks() {
    assert this.outboundLinks >= 0;
    // clamp the "not yet counted" negative value to zero
    return Math.max(0, this.outboundLinks);
}

public void close() {
// try close the output stream
if (this.textStream != null) {
Expand Down
56 changes: 54 additions & 2 deletions source/de/anomic/plasma/plasmaSwitchboard.java
Expand Up @@ -1862,7 +1862,8 @@ public boolean deQueueProcess() {
if (document != null) {
plasmaCondenser condensement = condenseDocument(nextentry, document);
if (condensement != null) {
indexDocument(nextentry, document, condensement);
document.notifyWebStructure(webStructure, condensement, nextentry.getModificationDate());
storeDocumentIndex(nextentry, document, condensement);
}
}
return true;
Expand Down Expand Up @@ -2228,8 +2229,58 @@ private plasmaCondenser condenseDocument(plasmaSwitchboardQueue.Entry entry, pla
return condenser;
}

/**
 * Stores a parsed and condensed document in the index and does the bookkeeping
 * around it: stopword removal, storing through wordIndex.storeDocument, stacking
 * the result in crawlResults, updating counters and profiling, and sending a
 * crawl receipt when the document was loaded for a remote crawl request.
 *
 * Failures are not thrown to the caller; they are logged and recorded with
 * addURLtoErrorDB, after which the method returns early.
 *
 * @param entry     the queue entry describing the loaded resource
 * @param document  the parsed document
 * @param condenser the condenser computed for the document
 */
private void storeDocumentIndex(plasmaSwitchboardQueue.Entry entry, plasmaParserDocument document, plasmaCondenser condenser) {

    // CREATE INDEX
    String dc_title = document.dc_title();
    yacyURL referrerURL = entry.referrerURL();
    // The referrer may be missing: plasmaWordIndex.storeDocument handles a null
    // referrer URL explicitly, so it must not be dereferenced unguarded here either.
    // NOTE(review): assumes addURLtoErrorDB accepts a null referrer hash - confirm.
    String referrerHash = (referrerURL == null) ? null : referrerURL.hash();
    int processCase = entry.processCase();

    // remove stopwords
    log.logInfo("Excluded " + condenser.excludeWords(stopwords) + " words in URL " + entry.url());

    // STORE URL TO LOADED-URL-DB
    indexURLReference newEntry = null;
    try {
        newEntry = wordIndex.storeDocument(entry, document, condenser);
    } catch (IOException e) {
        log.logFine("Not Indexed Resource '" + entry.url().toNormalform(false, true) + "': process case=" + processCase);
        addURLtoErrorDB(entry.url(), referrerHash, entry.initiator(), dc_title, "error storing url: " + e.getMessage(), new kelondroBitfield());
        return;
    }

    // update statistics
    crawlResults.stack(
            newEntry,                      // loaded url db entry
            entry.initiator(),             // initiator peer hash
            yacyCore.seedDB.mySeed().hash, // executor peer hash
            processCase                    // process case
    );

    // STORE WORD INDEX
    // note: this check runs after the document was already handed to
    // wordIndex.storeDocument and stacked in crawlResults above
    if ((!entry.profile().indexText()) && (!entry.profile().indexMedia())) {
        log.logFine("Not Indexed Resource '" + entry.url().toNormalform(false, true) + "': process case=" + processCase);
        addURLtoErrorDB(entry.url(), referrerHash, entry.initiator(), dc_title, plasmaCrawlEURL.DENIED_UNKNOWN_INDEXING_PROCESS_CASE, new kelondroBitfield());
        return;
    }

    // increment number of indexed urls
    indexedPages++;

    // update profiling info
    plasmaProfiling.updateIndexedPage(entry);

    // if this was performed for a remote crawl request, notify requester
    yacySeed initiatorPeer = entry.initiatorPeer();
    if ((processCase == PROCESSCASE_6_GLOBAL_CRAWLING) && (initiatorPeer != null)) {
        log.logInfo("Sending crawl receipt for '" + entry.url().toNormalform(false, true) + "' to " + initiatorPeer.getName());
        if (clusterhashes != null) initiatorPeer.setAlternativeAddress((String) clusterhashes.get(initiatorPeer.hash));
        yacyClient.crawlReceipt(initiatorPeer, "crawl", "fill", "indexed", newEntry, "");
    }
}
/*
private void indexDocument(plasmaSwitchboardQueue.Entry entry, plasmaParserDocument document, plasmaCondenser condenser) throws InterruptedException {
long indexingStartTime = 0, indexingEndTime = 0,
long indexingStartTime = System.currentTimeMillis(), indexingEndTime = 0,
storageStartTime = 0, storageEndTime = 0;
// CREATE INDEX
Expand Down Expand Up @@ -2348,6 +2399,7 @@ private void indexDocument(plasmaSwitchboardQueue.Entry entry, plasmaParserDocum
yacyClient.crawlReceipt(initiatorPeer, "crawl", "fill", "indexed", newEntry, "");
}
}
*/

private static SimpleDateFormat DateFormatter = new SimpleDateFormat("EEE, dd MMM yyyy");
public static String dateString(Date date) {
Expand Down
8 changes: 4 additions & 4 deletions source/de/anomic/plasma/plasmaWebStructure.java
Expand Up @@ -92,16 +92,16 @@ public plasmaWebStructure(serverLog log, File rankingPath, String crlFile, Strin
}
}

public Integer[] /*(outlinksSame, outlinksOther)*/ generateCitationReference(yacyURL url, String baseurlhash, Date docDate, plasmaParserDocument document, plasmaCondenser condenser) {
assert url.hash().equals(baseurlhash);
public Integer[] /*(outlinksSame, outlinksOther)*/ generateCitationReference(plasmaParserDocument document, plasmaCondenser condenser, Date docDate) {
yacyURL url = document.dc_source();

// generate citation reference
Map<yacyURL, String> hl = document.getHyperlinks();
Iterator<yacyURL> it = hl.keySet().iterator();
String nexturlhash;
StringBuffer cpg = new StringBuffer(12 * (hl.size() + 1) + 1);
StringBuffer cpl = new StringBuffer(12 * (hl.size() + 1) + 1);
String lhp = baseurlhash.substring(6); // local hash part
String lhp = url.hash().substring(6); // local hash part
int GCount = 0;
int LCount = 0;
while (it.hasNext()) {
Expand All @@ -121,7 +121,7 @@ public plasmaWebStructure(serverLog log, File rankingPath, String crlFile, Strin

// append this reference to buffer
// generate header info
String head = baseurlhash + "=" +
String head = url.hash() + "=" +
plasmaWordIndex.microDateHoursStr(docDate.getTime()) + // latest update timestamp of the URL
plasmaWordIndex.microDateHoursStr(System.currentTimeMillis()) + // last visit timestamp of the URL
kelondroBase64Order.enhancedCoder.encodeLongSmart(LCount, 2) + // count of links to local resources
Expand Down
75 changes: 74 additions & 1 deletion source/de/anomic/plasma/plasmaWordIndex.java
Expand Up @@ -75,9 +75,11 @@ public final class plasmaWordIndex implements indexRI {
private final indexRAMRI dhtOutCache, dhtInCache;
private final indexCollectionRI collections; // new database structure to replace AssortmentCluster and FileCluster
private int flushsize;
private final indexRepositoryReference referenceURL;
private serverLog log;
final indexRepositoryReference referenceURL;

public plasmaWordIndex(File indexPrimaryRoot, File indexSecondaryRoot, String networkName, serverLog log) {
this.log = log;
File indexPrimaryPath = new File(indexPrimaryRoot, networkName);
File indexPrimaryTextLocation = new File(indexPrimaryPath, "TEXT");
if (!indexPrimaryTextLocation.exists()) {
Expand Down Expand Up @@ -603,6 +605,77 @@ public synchronized TreeSet<indexContainer> indexContainerSet(String startHash,
return containers; // this may return less containers as demanded
}

/**
 * Writes one parsed document into the index in two steps: first a loaded-URL
 * db entry is created and stored with putURL, then the page content is added
 * to the word index with addPageIndex. When info logging is enabled, a summary
 * including the time spent in both steps is logged.
 *
 * @param entry     the queue entry describing the loaded resource
 * @param document  the parsed document
 * @param condenser the condenser computed for the document
 * @return the loaded-URL db entry that was created and stored
 * @throws IOException passed through from the storage calls made here
 */
public indexURLReference storeDocument(plasmaSwitchboardQueue.Entry entry, plasmaParserDocument document, plasmaCondenser condenser) throws IOException {
    final long begin = System.currentTimeMillis();

    // values needed for the url entry and the log line
    final String title = document.dc_title();
    final yacyURL referrer = entry.referrerURL();
    final Date modified = entry.getModificationDate();

    // build the loaded URL db entry
    final long now = System.currentTimeMillis();
    final indexURLReference urlEntry = new indexURLReference(
            entry.url(),                                                   // URL
            title,                                                         // document description
            document.dc_creator(),                                         // author
            document.dc_subject(' '),                                      // tags
            "",                                                            // ETag
            modified,                                                      // modification date
            new Date(),                                                    // loaded date
            new Date(now + Math.max(0, now - modified.getTime()) / 2),     // freshdate, computed with Proxy-TTL formula
            (referrer == null) ? null : referrer.hash(),                   // referer hash
            new byte[0],                                                   // md5
            (int) entry.size(),                                            // size
            condenser.RESULT_NUMB_WORDS,                                   // word count
            plasmaHTCache.docType(document.dc_format()),                   // doctype
            condenser.RESULT_FLAGS,                                        // flags
            yacyURL.language(entry.url()),                                 // language
            document.inboundLinks(),                                       // inbound links
            document.outboundLinks(),                                      // outbound links
            document.getAudiolinks().size(),                               // laudio
            document.getImages().size(),                                   // limage
            document.getVideolinks().size(),                               // lvideo
            document.getApplinks().size()                                  // lapp
    );

    // step 1: store the url entry in the loaded-URL db
    putURL(urlEntry);
    final long urlStored = System.currentTimeMillis();

    // step 2: store the page index in the word index db
    final int wordCount = addPageIndex(
            entry.url(),                                  // document url
            modified,                                     // document mod date
            (int) entry.size(),                           // document size
            document,                                     // document content
            condenser,                                    // document condenser
            yacyURL.language(entry.url()),                // document language
            plasmaHTCache.docType(document.dc_format()),  // document type
            document.inboundLinks(),                      // inbound links
            document.outboundLinks()                      // outbound links
    );
    final long indexStored = System.currentTimeMillis();

    // nothing more to do unless info logging is switched on
    if (!log.isInfo()) {
        return urlEntry;
    }

    // TODO: UTF-8 docDescription seems not to be displayed correctly because
    // of string concatenation
    final StringBuilder msg = new StringBuilder();
    msg.append("*Indexed ").append(wordCount).append(" words in URL ").append(entry.url());
    msg.append(" [").append(entry.urlHash()).append("]");
    msg.append("\n\tDescription: ").append(title);
    msg.append("\n\tMimeType: ").append(document.dc_format());
    msg.append(" | Charset: ").append(document.getCharset()).append(" | ");
    msg.append("Size: ").append(document.getTextLength()).append(" bytes | ");
    msg.append("Anchors: ").append((document.getAnchors() == null) ? 0 : document.getAnchors().size());
    msg.append("\n\tLinkStorageTime: ").append(urlStored - begin).append(" ms | ");
    msg.append("indexStorageTime: ").append(indexStored - urlStored).append(" ms");
    log.logInfo(msg.toString());

    // finished
    return urlEntry;
}

public synchronized kelondroCloneableIterator<indexContainer> wordContainers(String startHash, boolean ram, boolean rot) {
kelondroCloneableIterator<indexContainer> i = wordContainers(startHash, ram);
if (rot) {
Expand Down

0 comments on commit 9b0e20f

Please sign in to comment.