Permalink
Browse files

Fixed the webstructure.xml API when its 'about' parameter is a domain name.

As described in mantis 720 (http://mantis.tokeek.de/view.php?id=720),
when this API was requested with a domain name instead of a complete URL,
only HTTP references on the default port were listed.
  • Loading branch information...
luccioman committed Jan 16, 2017
1 parent 0da1e6b commit ed3dd5e31a9177a4b1c3957fdce7a9b8b2859265
@@ -25,8 +25,10 @@
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.Date;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.document.encoding.ASCII;
@@ -102,12 +104,12 @@ public static serverObjects respond(final RequestHeader header, final serverObje
if (about != null) {
DigestURL url = null;
byte[] urlhash = null;
String hosthash = null;
Set<String> hostHashes = new HashSet<>();
if (about.length() == 6 && Base64Order.enhancedCoder.wellformed(ASCII.getBytes(about))) {
hosthash = about;
hostHashes.add(about);
} else if (about.length() == 12 && Base64Order.enhancedCoder.wellformed(ASCII.getBytes(about))) {
urlhash = ASCII.getBytes(about);
hosthash = about.substring(6);
hostHashes.add(about.substring(6));
try {
url = authenticated ? sb.getURL(urlhash) : null;
} catch (IOException e) {
@@ -119,27 +121,37 @@ public static serverObjects respond(final RequestHeader header, final serverObje
try {
url = new DigestURL(about.indexOf("://") >= 0 ? about : "http://" + about); // accept also domains
urlhash = url.hash();
hosthash = url.hosthash();
if(about.indexOf("://") >= 0) {
hostHashes.add(url.hosthash());
} else {
hostHashes.addAll(sb.webStructure.hostName2HostHashes(about));
}
} catch (final MalformedURLException e) {
}
}
if (hosthash != null) {
if (!hostHashes.isEmpty()) {
prop.put("out", 1);
prop.put("in", 1);
WebStructureGraph.StructureEntry sentry = sb.webStructure.outgoingReferences(hosthash);
if (sentry != null && sentry.references.size() > 0) {
reference(prop, "out", 0, sentry, sb.webStructure);
prop.put("out_domains", 1);
} else {
prop.put("out_domains", 0);
}
sentry = sb.webStructure.incomingReferences(hosthash);
if (sentry != null && sentry.references.size() > 0) {
reference(prop, "in", 0, sentry, sb.webStructure);
prop.put("in_domains", 1);
} else {
prop.put("in_domains", 0);
int inCount = 0, outCount = 0;
for(final String hostHash: hostHashes) {
WebStructureGraph.StructureEntry sentry = sb.webStructure.outgoingReferences(hostHash);
if (sentry != null && sentry.references.size() > 0) {
reference(prop, "out", outCount, sentry, sb.webStructure);
outCount++;
} else {
prop.put("out_domains", 0);
}
sentry = sb.webStructure.incomingReferences(hostHash);
if (sentry != null && sentry.references.size() > 0) {
reference(prop, "in", inCount, sentry, sb.webStructure);
prop.put("in_domains", 1);
inCount++;
} else {
prop.put("in_domains", 0);
}
}
prop.put("out_domains", outCount);
prop.put("in_domains", inCount);
}
if (urlhash != null) {
// anchors
@@ -560,8 +560,12 @@ private void incomingReferencesEnrich(
}
}
/**
*
* @param hosthash
* @return the number of hosts that are referenced by this hosthash
*/
public int referencesCount(final String hosthash) {
// returns the number of hosts that are referenced by this hosthash
assert hosthash.length() == 6 : "hosthash = " + hosthash;
if (hosthash == null || hosthash.length() != 6) return 0;
SortedMap<String, byte[]> tailMap;
@@ -591,6 +595,10 @@ public int referencesCount(final String hosthash) {
return c;
}
/**
* @param hosthash host name hash
* @return the host name corresponding to the given hash or null when the hash is not known
*/
public String hostHash2hostName(final String hosthash) {
// returns the host as string, null if unknown
assert hosthash.length() == 6;
@@ -615,6 +623,42 @@ public String hostHash2hostName(final String hosthash) {
}
return null;
}
/**
 * Look for host hashes corresponding to the given host name. There can be
 * multiple host hashes for one host name as the used hash function
 * {@link DigestURL#hosthash()} returns a different result for each
 * different protocol or port with a same host name.
 * <p>
 * Both the old and the new structure are scanned, each one while holding
 * its own monitor. Host names are compared case-insensitively.
 *
 * @param hostName
 *            host name; may be null, in which case nothing can match
 * @return the host hashes corresponding to the given host name or an empty
 *         set when the host name is null or not known; never null
 */
public Set<String> hostName2HostHashes(final String hostName) {
    final Set<String> hashes = new HashSet<>();
    if (hostName == null) {
        // A null name never equals any key: skip scanning both structures under their locks.
        return hashes;
    }
    synchronized (this.structure_old) {
        collectHostHashes(this.structure_old.keySet(), hostName, hashes);
    }
    synchronized (this.structure_new) {
        collectHostHashes(this.structure_new.keySet(), hostName, hashes);
    }
    return hashes;
}

/**
 * Adds to {@code hashes} the hash part of every key whose host name part
 * equals {@code hostName}, ignoring case. A key is read as: the first 6
 * characters are the host hash, the character at index 6 is a separator,
 * and everything from index 7 on is the host name.
 *
 * @param keys
 *            structure keys to scan; the caller is responsible for holding
 *            the lock protecting the underlying map
 * @param hostName
 *            host name to look for, must not be null
 * @param hashes
 *            result set receiving the matching host hashes
 */
private static void collectHostHashes(final Iterable<String> keys, final String hostName,
        final Set<String> hashes) {
    for (final String key : keys) {
        if (key == null || key.length() < 7) {
            // Too short to hold a hash, a separator and a name: skip instead of
            // failing with a StringIndexOutOfBoundsException on substring(7).
            continue;
        }
        if (key.substring(7).equalsIgnoreCase(hostName)) {
            hashes.add(key.substring(0, 6));
        }
    }
}
protected void learnrefs(final LearnObject lro) {
@@ -252,7 +252,15 @@ public static void main(String args[]) throws MalformedURLException {
graph.outgoingReferences(sourceHash);
}
endTime = System.nanoTime();
System.out.println("testPerfs outgoingReferences running time : " + ((endTime - beginTime) / 1000000000) + " seconds");
System.out.println("testPerfs outgoingReferences running time : " + ((endTime - beginTime) / 1000000000) + " seconds");
beginTime = System.nanoTime();
/* Loop and look for host hashes from host name on each sample generated source */
for(int i = 0; i < WebStructureGraph.maxhosts; i++) {
graph.hostName2HostHashes("source" + i + ".net");
}
endTime = System.nanoTime();
System.out.println("testPerfs hostName2HostHashes running time : " + ((endTime - beginTime) / 1000000000) + " seconds");
} finally {
graph.close();

0 comments on commit ed3dd5e

Please sign in to comment.