Skip to content

Commit

Permalink
*) Minor changes
Browse files Browse the repository at this point in the history
   - more debugging output: the storage time for each indexed document is now logged
   - saving memory in plasmaParserDocument.java, plasmaWordIndexEntryContainer.java (not a big deal)

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@798 6c8d7289-2bf4-0310-a012-ef5d649a1542
  • Loading branch information
theli committed Sep 27, 2005
1 parent 3c1d968 commit 9b7f37f
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 17 deletions.
4 changes: 2 additions & 2 deletions source/de/anomic/plasma/plasmaParserDocument.java
Expand Up @@ -81,8 +81,8 @@ public plasmaParserDocument(URL location, String mimeType,
this.sections = (sections==null)?new String[0]:sections;
this.abstrct = (abstrct==null)?"":abstrct;
this.text = (text==null)?new byte[0]:text;
this.anchors = (anchors==null)?new HashMap():anchors;
this.images = (images==null)?new HashMap():images;
this.anchors = (anchors==null)?new HashMap(0):anchors;
this.images = (images==null)?new HashMap(0):images;
this.hyperlinks = null;
this.medialinks = null;
this.emaillinks = null;
Expand Down
32 changes: 22 additions & 10 deletions source/de/anomic/plasma/plasmaSwitchboard.java
Expand Up @@ -118,6 +118,8 @@ this class is also the core of the http crawling.
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.logging.Level;

import de.anomic.data.messageBoard;
import de.anomic.data.robotsParser;
import de.anomic.data.wikiBoard;
Expand Down Expand Up @@ -470,7 +472,7 @@ public boolean onlineCaution() {
}
}

private static String ppRamString(int bytes) {
private static String ppRamString(long bytes) {
if (bytes < 1024) return bytes + " KByte";
bytes = bytes / 1024;
if (bytes < 1024) return bytes + " MByte";
Expand Down Expand Up @@ -942,7 +944,10 @@ public boolean remoteTriggeredCrawlJob() {
private void processResourceStack(plasmaSwitchboardQueue.Entry entry) {
// work off one stack entry with a fresh resource
try {
long stackStartTime = 0, stackEndTime = 0, parsingStartTime = 0, parsingEndTime = 0, indexingStartTime = 0, indexingEndTime;
long stackStartTime = 0, stackEndTime = 0,
parsingStartTime = 0, parsingEndTime = 0,
indexingStartTime = 0, indexingEndTime = 0,
storageStartTime = 0, storageEndTime = 0;

// we must distinguish the following cases: resource-load was initiated by
// 1) global crawling: the index is external, not here (not possible here)
Expand Down Expand Up @@ -1085,18 +1090,25 @@ private void processResourceStack(plasmaSwitchboardQueue.Entry entry) {
(entry.profile().localIndexing())) {
// remove stopwords
log.logInfo("Excluded " + condenser.excludeWords(stopwords) + " words in URL " + entry.url());
indexingEndTime = System.currentTimeMillis();

// do indexing
//log.logDebug("Create Index for '" + entry.normalizedURLString() + "'");
storageStartTime = System.currentTimeMillis();
int words = searchManager.addPageIndex(entry.url(), urlHash, loadDate, condenser, plasmaWordIndexEntry.language(entry.url()), plasmaWordIndexEntry.docType(document.getMimeType()));
indexingEndTime = System.currentTimeMillis();
log.logInfo("*Indexed " + words + " words in URL " + entry.url() +
"\n\tDescription: " + descr + "\n\t" +
"MimeType: " + document.getMimeType() + " | " +
"Size: " + document.text.length + " bytes | " +
"StackingTime: " + (stackEndTime-stackStartTime) + " ms | " +
"ParsingTime: " + (parsingEndTime-parsingStartTime) + " ms | " +
"IndexingTime: " + (indexingEndTime-indexingStartTime) + " ms");
storageEndTime = System.currentTimeMillis();

if (log.isLoggable(Level.INFO)) {
log.logInfo("*Indexed " + words + " words in URL " + entry.url() +
"\n\tDescription: " + descr +
"\n\tMimeType: " + document.getMimeType() + " | " +
"Size: " + document.text.length + " bytes | " +
"Anchors: " + ((document.anchors==null)?0:document.anchors.size()) +
"\n\tStackingTime: " + (stackEndTime-stackStartTime) + " ms | " +
"ParsingTime: " + (parsingEndTime-parsingStartTime) + " ms | " +
"IndexingTime: " + (indexingEndTime-indexingStartTime) + " ms | " +
"StorageTime: " + (storageEndTime-storageStartTime) + " ms");
}

// if this was performed for a remote crawl request, notify requester
if ((processCase == 6) && (initiator != null)) {
Expand Down
14 changes: 9 additions & 5 deletions source/de/anomic/plasma/plasmaWordIndexEntryContainer.java
Expand Up @@ -57,16 +57,20 @@

import de.anomic.server.serverCodings;

public class plasmaWordIndexEntryContainer implements Comparable {
public final class plasmaWordIndexEntryContainer implements Comparable {

private String wordHash;
private HashMap container; // urlHash/plasmaWordIndexEntry - Mapping
private final String wordHash;
private final HashMap container; // urlHash/plasmaWordIndexEntry - Mapping
private long updateTime;

public plasmaWordIndexEntryContainer(String wordHash) {
this(wordHash,16);
}

public plasmaWordIndexEntryContainer(String wordHash, int initContainerSize) {
this.wordHash = wordHash;
this.updateTime = 0;
container = new HashMap(); // a urlhash/plasmaWordIndexEntry - relation
container = new HashMap(initContainerSize); // a urlhash/plasmaWordIndexEntry - relation
}

public int size() {
Expand Down Expand Up @@ -123,7 +127,7 @@ public Iterator entries() {
}

public static plasmaWordIndexEntryContainer instantContainer(String wordHash, long creationTime, plasmaWordIndexEntry entry) {
plasmaWordIndexEntryContainer c = new plasmaWordIndexEntryContainer(wordHash);
plasmaWordIndexEntryContainer c = new plasmaWordIndexEntryContainer(wordHash,1);
c.add(entry);
c.updateTime = creationTime;
return c;
Expand Down

0 comments on commit 9b7f37f

Please sign in to comment.