Skip to content

Commit

Permalink
*) setting htCache.Entry fields to private
Browse files Browse the repository at this point in the history
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2484 6c8d7289-2bf4-0310-a012-ef5d649a1542
  • Loading branch information
theli committed Sep 4, 2006
1 parent ab5a9be commit 393a7d1
Show file tree
Hide file tree
Showing 5 changed files with 84 additions and 47 deletions.
2 changes: 1 addition & 1 deletion source/de/anomic/http/httpdProxyHandler.java
Expand Up @@ -630,7 +630,7 @@ private void fulfillRequestFromWeb(Properties conProp, URL url,String ext, httpH

String storeError = cacheEntry.shallStoreCacheForProxy();
boolean storeHTCache = cacheEntry.profile.storeHTCache();
boolean isSupportedContent = plasmaParser.supportedContent(plasmaParser.PARSER_MODE_PROXY,cacheEntry.url,cacheEntry.responseHeader.mime());
boolean isSupportedContent = plasmaParser.supportedContent(plasmaParser.PARSER_MODE_PROXY,cacheEntry.url(),cacheEntry.responseHeader.mime());
if (
/*
* Now we store the response into the htcache directory if
Expand Down
8 changes: 4 additions & 4 deletions source/de/anomic/plasma/crawler/http/CrawlWorker.java
Expand Up @@ -200,19 +200,19 @@ private plasmaHTCache.Entry load(int crawlingRetryCount) throws IOException {
htCache = createCacheEntry(requestDate, requestHeader, res);

// aborting download if the cache file path is too long ...
if (htCache.cacheFile.getAbsolutePath().length() > serverSystem.maxPathLength) {
if (htCache.cacheFile().getAbsolutePath().length() > serverSystem.maxPathLength) {
remote.close();
this.log.logInfo("REJECTED URL " + this.url.toString() + " because path too long '" + this.cacheManager.cachePath.getAbsolutePath() + "'");
addURLtoErrorDB(plasmaCrawlEURL.DENIED_CACHEFILE_PATH_TOO_LONG);
return (htCache = null);
}

// reserve cache entry
if (!htCache.cacheFile.getCanonicalPath().startsWith(this.cacheManager.cachePath.getCanonicalPath())) {
if (!htCache.cacheFile().getCanonicalPath().startsWith(this.cacheManager.cachePath.getCanonicalPath())) {
// reject the file if its canonical path does not start with the cache path
remote.close();
this.log.logInfo("REJECTED URL " + this.url.toString() + " because of an invalid file path ('" +
htCache.cacheFile.getCanonicalPath() + "' does not start with '" +
htCache.cacheFile().getCanonicalPath() + "' does not start with '" +
this.cacheManager.cachePath.getAbsolutePath() + "').");
addURLtoErrorDB(plasmaCrawlEURL.DENIED_INVALID_CACHEFILE_PATH);
return (htCache = null);
Expand All @@ -231,7 +231,7 @@ private plasmaHTCache.Entry load(int crawlingRetryCount) throws IOException {
try {
fos = new FileOutputStream(cacheFile);
res.writeContent(fos); // superfluous write to array
htCache.cacheArray = null;
htCache.setCacheArray(null);
this.cacheManager.writeFileAnnouncement(cacheFile);
//htCache.cacheArray = res.writeContent(fos); // writes in cacheArray and cache file
} finally {
Expand Down
71 changes: 54 additions & 17 deletions source/de/anomic/plasma/plasmaHTCache.java
Expand Up @@ -701,23 +701,23 @@ public Entry newEntry(Date initDate, int depth, URL url, String name,
public final class Entry {

// the class objects
public Date initDate; // the date when the request happened; will be used as a key
public int depth; // the depth of prefetching
public httpHeader requestHeader; // we carry also the header to prevent too many file system access
public String responseStatus;
public httpHeader responseHeader; // we carry also the header to prevent too many file system access
public File cacheFile; // the cache file
public byte[] cacheArray; // or the cache as byte-array
public URL url;
public String name; // the name of the link, read as anchor from an <a>-tag
public String nomalizedURLHash;
public String nomalizedURLString;
public int status; // cache load/hit/stale etc status
public Date lastModified;
public char doctype;
public String language;
public plasmaCrawlProfile.entry profile;
private String initiator;
private Date initDate; // the date when the request happened; will be used as a key
private int depth; // the depth of prefetching
private httpHeader requestHeader; // we carry also the header to prevent too many file system access
private String responseStatus;
private httpHeader responseHeader; // we carry also the header to prevent too many file system access
private File cacheFile; // the cache file
private byte[] cacheArray; // or the cache as byte-array
private URL url;
private String name; // the name of the link, read as anchor from an <a>-tag
private String nomalizedURLHash;
private String nomalizedURLString;
private int status; // cache load/hit/stale etc status
private Date lastModified;
private char doctype;
private String language;
private plasmaCrawlProfile.entry profile;
private String initiator;

protected Object clone() throws CloneNotSupportedException {
return new Entry(
Expand Down Expand Up @@ -793,6 +793,19 @@ public Entry(Date initDate,
/**
 * Returns the name of the link, as read from the anchor of an
 * {@code <a>}-tag.
 *
 * @return the value of the {@code name} field
 */
public String name() {
    return name;
}

/**
 * Returns the URL this cache entry was created for.
 *
 * @return the value of the {@code url} field
 */
public URL url() {
    return url;
}

/**
 * Returns the hash stored for this entry's URL.
 *
 * @return the value of the {@code nomalizedURLHash} field
 */
public String urlHash() {
    return nomalizedURLHash;
}

/**
 * Returns the crawl profile entry attached to this cache entry.
 *
 * @return the value of the {@code profile} field
 */
public plasmaCrawlProfile.entry profile() {
    return profile;
}

/**
 * Returns the initiator string stored in this cache entry.
 *
 * @return the value of the {@code initiator} field
 */
public String initiator() {
    return initiator;
}
Expand All @@ -804,6 +817,10 @@ public long size() {
return this.cacheArray.length;
}

/**
 * Returns the depth of prefetching recorded for this entry.
 *
 * @return the value of the {@code depth} field
 */
public int depth() {
    return depth;
}

public URL referrerURL() {
if (this.requestHeader == null) return null;
try {
Expand All @@ -813,6 +830,26 @@ public URL referrerURL() {
}
}

/**
 * Returns the cache file belonging to this entry.
 *
 * @return the value of the {@code cacheFile} field
 */
public File cacheFile() {
    return cacheFile;
}

/**
 * Replaces the in-memory content array of this entry.
 * The reference is stored as given; no copy is made.
 *
 * @param data the new content array; callers may pass {@code null}
 *             to drop the in-memory content
 */
public void setCacheArray(byte[] data) {
    cacheArray = data;
}

/**
 * Returns the cached content held in memory as a byte array.
 * The internal reference is returned directly; no copy is made.
 *
 * @return the value of the {@code cacheArray} field, which may be
 *         {@code null}
 */
public byte[] cacheArray() {
    return cacheArray;
}

/**
 * Returns the request header carried along with this entry.
 *
 * @return the value of the {@code requestHeader} field, which may be
 *         {@code null}
 */
public httpHeader requestHeader() {
    return requestHeader;
}

/**
 * Returns the response header carried along with this entry.
 *
 * @return the value of the {@code responseHeader} field, which may be
 *         {@code null}
 */
public httpHeader responseHeader() {
    return responseHeader;
}

/*
public boolean update() {
return ((status == CACHE_FILL) || (status == CACHE_STALE_RELOAD_GOOD));
Expand Down
2 changes: 1 addition & 1 deletion source/de/anomic/plasma/plasmaSnippetCache.java
Expand Up @@ -173,7 +173,7 @@ public result retrieve(URL url, Set queryhashes, boolean fetchOnline, int snippe
if ((fetchOnline) && (resource == null)) {
plasmaHTCache.Entry entry = loadResourceFromWeb(url, 5000);
if (entry != null) {
header = entry.responseHeader;
header = entry.responseHeader();
}
resource = cacheManager.loadResource(url);
source = SOURCE_WEB;
Expand Down
48 changes: 24 additions & 24 deletions source/de/anomic/plasma/plasmaSwitchboard.java
Expand Up @@ -814,7 +814,7 @@ synchronized public boolean htEntryStoreProcess(plasmaHTCache.Entry entry) throw
* Testing if the content type is supported by the available parsers
* ========================================================================= */
boolean isSupportedContent = (entry.responseHeader != null) &&
plasmaParser.supportedContent(entry.url,entry.responseHeader.mime());
plasmaParser.supportedContent(entry.url(),entry.responseHeader.mime());

/* =========================================================================
* INDEX CONTROL HEADER
Expand All @@ -823,10 +823,10 @@ synchronized public boolean htEntryStoreProcess(plasmaHTCache.Entry entry) throw
* yacy to index the response returned as answer to a request
* ========================================================================= */
boolean doIndexing = true;
if (entry.requestHeader != null) {
if (entry.requestHeader() != null) {
if (
(entry.requestHeader.containsKey(httpHeader.X_YACY_INDEX_CONTROL)) &&
(((String) entry.requestHeader.get(httpHeader.X_YACY_INDEX_CONTROL)).toUpperCase().equals("NO-INDEX"))
(entry.requestHeader().containsKey(httpHeader.X_YACY_INDEX_CONTROL)) &&
(((String) entry.requestHeader().get(httpHeader.X_YACY_INDEX_CONTROL)).toUpperCase().equals("NO-INDEX"))
) {
doIndexing = false;
}
Expand All @@ -837,17 +837,17 @@ synchronized public boolean htEntryStoreProcess(plasmaHTCache.Entry entry) throw
*
* check if ip is local ip address
* ========================================================================= */
InetAddress hostAddress = httpc.dnsResolve(entry.url.getHost());
InetAddress hostAddress = httpc.dnsResolve(entry.url().getHost());
if (hostAddress == null) {
if (this.remoteProxyConfig == null || !this.remoteProxyConfig.useProxy()) {
this.log.logFine("Unknown host in URL '" + entry.url + "'. Will not be indexed.");
this.log.logFine("Unknown host in URL '" + entry.url() + "'. Will not be indexed.");
doIndexing = false;
}
} else if (hostAddress.isSiteLocalAddress()) {
this.log.logFine("Host in URL '" + entry.url + "' has private ip address. Will not be indexed.");
this.log.logFine("Host in URL '" + entry.url() + "' has private ip address. Will not be indexed.");
doIndexing = false;
} else if (hostAddress.isLoopbackAddress()) {
this.log.logFine("Host in URL '" + entry.url + "' has loopback ip address. Will not be indexed.");
this.log.logFine("Host in URL '" + entry.url() + "' has loopback ip address. Will not be indexed.");
doIndexing = false;
}

Expand All @@ -859,25 +859,25 @@ synchronized public boolean htEntryStoreProcess(plasmaHTCache.Entry entry) throw
* b) the content should be indexed
* ========================================================================= */
if (
(entry.profile.storeHTCache()) ||
(entry.profile().storeHTCache()) ||
(doIndexing && isSupportedContent)
) {
// store response header
if (entry.responseHeader != null) {
this.cacheManager.storeHeader(entry.nomalizedURLHash, entry.responseHeader);
this.log.logInfo("WROTE HEADER for " + entry.cacheFile);
this.cacheManager.storeHeader(entry.urlHash(), entry.responseHeader);
this.log.logInfo("WROTE HEADER for " + entry.cacheFile());
}

// work off unwritten files
if (entry.cacheArray == null) {
if (entry.cacheArray() == null) {
//this.log.logFine("EXISTING FILE (" + entry.cacheFile.length() + " bytes) for " + entry.cacheFile);
} else {
String error = entry.shallStoreCacheForProxy();
if (error == null) {
this.cacheManager.writeFile(entry.url, entry.cacheArray);
this.log.logFine("WROTE FILE (" + entry.cacheArray.length + " bytes) for " + entry.cacheFile);
this.cacheManager.writeFile(entry.url(), entry.cacheArray());
this.log.logFine("WROTE FILE (" + entry.cacheArray().length + " bytes) for " + entry.cacheFile());
} else {
this.log.logFine("WRITE OF FILE " + entry.cacheFile + " FORBIDDEN: " + error);
this.log.logFine("WRITE OF FILE " + entry.cacheFile() + " FORBIDDEN: " + error);
}
}
}
Expand All @@ -888,24 +888,24 @@ synchronized public boolean htEntryStoreProcess(plasmaHTCache.Entry entry) throw
if (doIndexing && isSupportedContent){

// registering the cachefile as in use
if (entry.cacheFile.exists()) {
plasmaHTCache.filesInUse.add(entry.cacheFile);
if (entry.cacheFile().exists()) {
plasmaHTCache.filesInUse.add(entry.cacheFile());
}

// enqueue for further crawling
enQueue(this.sbQueue.newEntry(
entry.url,
entry.url(),
indexURL.urlHash(entry.referrerURL()),
entry.requestHeader.ifModifiedSince(),
entry.requestHeader.containsKey(httpHeader.COOKIE),
entry.requestHeader().ifModifiedSince(),
entry.requestHeader().containsKey(httpHeader.COOKIE),
entry.initiator(),
entry.depth,
entry.profile.handle(),
entry.depth(),
entry.profile().handle(),
entry.name()
));
} else {
if (!entry.profile.storeHTCache() && entry.cacheFile.exists()) {
this.cacheManager.deleteFile(entry.url);
if (!entry.profile().storeHTCache() && entry.cacheFile().exists()) {
this.cacheManager.deleteFile(entry.url());
}
}

Expand Down

0 comments on commit 393a7d1

Please sign in to comment.