Permalink
Browse files

Fixed webstructure.xml API used with a domain name 'about' parameter.

As described in mantis 720 (http://mantis.tokeek.de/view.php?id=720),
when requesting this API with a domain name instead of a complete URL,
only HTTP references on the default port were listed.
  • Loading branch information...
1 parent 0da1e6b commit ed3dd5e31a9177a4b1c3957fdce7a9b8b2859265 @luccioman luccioman committed Jan 16, 2017
@@ -25,8 +25,10 @@
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.Date;
+import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
+import java.util.Set;
import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.document.encoding.ASCII;
@@ -102,12 +104,12 @@ public static serverObjects respond(final RequestHeader header, final serverObje
if (about != null) {
DigestURL url = null;
byte[] urlhash = null;
- String hosthash = null;
+ Set<String> hostHashes = new HashSet<>();
if (about.length() == 6 && Base64Order.enhancedCoder.wellformed(ASCII.getBytes(about))) {
- hosthash = about;
+ hostHashes.add(about);
} else if (about.length() == 12 && Base64Order.enhancedCoder.wellformed(ASCII.getBytes(about))) {
urlhash = ASCII.getBytes(about);
- hosthash = about.substring(6);
+ hostHashes.add(about.substring(6));
try {
url = authenticated ? sb.getURL(urlhash) : null;
} catch (IOException e) {
@@ -119,27 +121,37 @@ public static serverObjects respond(final RequestHeader header, final serverObje
try {
url = new DigestURL(about.indexOf("://") >= 0 ? about : "http://" + about); // accept also domains
urlhash = url.hash();
- hosthash = url.hosthash();
+ if(about.indexOf("://") >= 0) {
+ hostHashes.add(url.hosthash());
+ } else {
+ hostHashes.addAll(sb.webStructure.hostName2HostHashes(about));
+ }
} catch (final MalformedURLException e) {
}
}
- if (hosthash != null) {
+ if (!hostHashes.isEmpty()) {
prop.put("out", 1);
prop.put("in", 1);
- WebStructureGraph.StructureEntry sentry = sb.webStructure.outgoingReferences(hosthash);
- if (sentry != null && sentry.references.size() > 0) {
- reference(prop, "out", 0, sentry, sb.webStructure);
- prop.put("out_domains", 1);
- } else {
- prop.put("out_domains", 0);
- }
- sentry = sb.webStructure.incomingReferences(hosthash);
- if (sentry != null && sentry.references.size() > 0) {
- reference(prop, "in", 0, sentry, sb.webStructure);
- prop.put("in_domains", 1);
- } else {
- prop.put("in_domains", 0);
+ int inCount = 0, outCount = 0;
+ for(final String hostHash: hostHashes) {
+ WebStructureGraph.StructureEntry sentry = sb.webStructure.outgoingReferences(hostHash);
+ if (sentry != null && sentry.references.size() > 0) {
+ reference(prop, "out", outCount, sentry, sb.webStructure);
+ outCount++;
+ } else {
+ prop.put("out_domains", 0);
+ }
+ sentry = sb.webStructure.incomingReferences(hostHash);
+ if (sentry != null && sentry.references.size() > 0) {
+ reference(prop, "in", inCount, sentry, sb.webStructure);
+ prop.put("in_domains", 1);
+ inCount++;
+ } else {
+ prop.put("in_domains", 0);
+ }
}
+ prop.put("out_domains", outCount);
+ prop.put("in_domains", inCount);
}
if (urlhash != null) {
// anchors
@@ -560,8 +560,12 @@ private void incomingReferencesEnrich(
}
}
+ /**
+ *
+ * @param hosthash
+ * @return the number of hosts that are referenced by this hosthash
+ */
public int referencesCount(final String hosthash) {
- // returns the number of hosts that are referenced by this hosthash
assert hosthash.length() == 6 : "hosthash = " + hosthash;
if (hosthash == null || hosthash.length() != 6) return 0;
SortedMap<String, byte[]> tailMap;
@@ -591,6 +595,10 @@ public int referencesCount(final String hosthash) {
return c;
}
+ /**
+ * @param hosthash host name hash
+ * @return the host name corresponding to the given hash or null when the hash is not known
+ */
public String hostHash2hostName(final String hosthash) {
// returns the host as string, null if unknown
assert hosthash.length() == 6;
@@ -615,6 +623,42 @@ public String hostHash2hostName(final String hosthash) {
}
return null;
}
+
+ /**
+ * Look for host hashes corresponding to the given host name. There can be
+ * multiple host hashes for one host name as the used hash function
+ * {@link DigestURL#hosthash()} returns a different result for each
+ * different protocol or port with a same host name.
+ *
+ * @param hostName
+ * host name
+ * @return the host hashes corresponding to the given host name or an empty set when
+ * the host name is not known
+ */
+ public Set<String> hostName2HostHashes(final String hostName) {
+ Set<String> hashes = new HashSet<>();
+ synchronized (this.structure_old) {
+ String keyHostName, hash;
+ for (String key : structure_old.keySet()) {
+ hash = key.substring(0, 6);
+ keyHostName = key.substring(7);
+ if (keyHostName.equalsIgnoreCase(hostName)) {
+ hashes.add(hash);
+ }
+ }
+ }
+ synchronized (this.structure_new) {
+ String keyHostName, hash;
+ for (String key : structure_new.keySet()) {
+ hash = key.substring(0, 6);
+ keyHostName = key.substring(7);
+ if (keyHostName.equalsIgnoreCase(hostName)) {
+ hashes.add(hash);
+ }
+ }
+ }
+ return hashes;
+ }
protected void learnrefs(final LearnObject lro) {
@@ -252,7 +252,15 @@ public static void main(String args[]) throws MalformedURLException {
graph.outgoingReferences(sourceHash);
}
endTime = System.nanoTime();
- System.out.println("testPerfs outgoingReferences running time : " + ((endTime - beginTime) / 1000000000) + " seconds");
+ System.out.println("testPerfs outgoingReferences running time : " + ((endTime - beginTime) / 1000000000) + " seconds");
+
+ beginTime = System.nanoTime();
+ /* Loop and look for host hashes from host name on each sample generated source */
+ for(int i = 0; i < WebStructureGraph.maxhosts; i++) {
+ graph.hostName2HostHashes("source" + i + ".net");
+ }
+ endTime = System.nanoTime();
+ System.out.println("testPerfs hostName2HostHashes running time : " + ((endTime - beginTime) / 1000000000) + " seconds");
} finally {
graph.close();

0 comments on commit ed3dd5e

Please sign in to comment.