Permalink
Browse files

Updated the generic top-level known domains list.

Using current IANA reference list at
https://www.iana.org/domains/root/db

The generated URL hashes on these domains stay the same but performance
is greatly improved as a DNS resolve request is required on URL hash
computation when the TLD part of the host name is unknown.

Hash computation mean time measured on 1541 sample URLs (one on each
TLD) and a computer with a DSL connection : about 230ms before change,
then only 20ms.
  • Loading branch information...
luccioman committed Nov 14, 2017
1 parent 938d8a9 commit ac209cac2e408f17966d323ac1d902ff4ffe6e11
@@ -221,7 +221,7 @@ public String hosthash() {
/**
* calculated YaCy-Hash of this URI
*
* @note needs DNS lookup to check if the addresses domain is local
* @note needs DNS lookup to check if the addresses domain is local (when the top-level domain part in the host name is unknown)
* that causes that this method may be very slow
*
* @return hash
@@ -478,21 +478,28 @@ public void run() {
"ZW=Zimbabwe",
"YT=Mayotte"
};
private static final String[] TLD_Generic = {
"COM=US Commercial",
private static final String[] TLD_Sponsored = {
"AERO=The air-transport industry",
"ARPA=operationally-critical infrastructural identifier spaces",
"BIZ=Business",
"COOP=cooperative associations",
"INFO=",
"JOBS=human resource managers",
"MOBI=mobile products and services",
"MUSEUM=Museums",
"NAME=Individuals",
"PRO=Credentialed professionals",
"TEL=Published contact data",
"TRAVEL=The travel industry",
"INT=International",
};
private static final String[] TLD_Infrastructure = {
"ARPA=operationally-critical infrastructural identifier spaces",
};
private static final String[] TLD_GenericRestricted = {
"BIZ=Business",
"NAME=Individuals",
"PRO=Credentialed professionals",
};
private static final String[] TLD_OpenNIC = {
// domains from the OpenNIC project, http://www.opennicproject.org, see also http://wiki.opennic.glue/OpenNICNamespaces
"GLUE=OpenNIC Internal Architectural use",
"BBS=OpenNIC Bulletin Board System servers",
@@ -658,7 +665,10 @@ public void run() {
ccSLD_TLD.addAll(Arrays.asList(ccSLD_TLD_list));
}
private static Map<String, Integer> TLDID = new ConcurrentHashMap<String, Integer>(32);
/**
* Map top-level domains (lower caes) to TLD category identifiers.
*/
private static Map<String, Integer> TLDID = new ConcurrentHashMap<String, Integer>();
//private static HashMap<String, String> TLDName = new HashMap<String, String>();
private static void insertTLDProps(final String[] TLDList, final int id) {
@@ -697,8 +707,21 @@ private static void insertTLDProps(final String[] TLDList, final int id) {
insertTLDProps(TLD_MiddleEastWestAsia, TLD_MiddleEastWestAsia_ID);
insertTLDProps(TLD_NorthAmericaOceania, TLD_NorthAmericaOceania_ID);
insertTLDProps(TLD_Africa, TLD_Africa_ID);
insertTLDProps(TLD_Generic, TLD_Generic_ID);
// the id=7 is used to flag local addresses
for(GenericTLD tld : GenericTLD.values()) {
TLDID.put(tld.getDomainName(), TLD_Generic_ID);
}
/*
* IANA lists the following top-level domains in other catetories than 'generic' but we
* still associate them with YaCy's TLD_Generic_ID otherwise the URLs hash would
* be modified
*/
insertTLDProps(TLD_GenericRestricted, TLD_Generic_ID);
insertTLDProps(TLD_Infrastructure, TLD_Generic_ID);
insertTLDProps(TLD_Sponsored, TLD_Generic_ID);
insertTLDProps(TLD_OpenNIC, TLD_Generic_ID);
// the id=7 (TLD_Local_ID) is used to flag local addresses
}
private static KeyList globalHosts = null;
@@ -1113,6 +1136,7 @@ public static boolean isThisHostIP(final InetAddress clientAddress) {
public static int getDomainID(final String host, final InetAddress hostaddress) {
if (host == null || host.isEmpty()) return TLD_Local_ID;
final int p = host.lastIndexOf('.');
// TODO (must be careful as this would change URL hash generation) : lower case the TLD part before checking its category id, as the TLDID map contains lower cased TLDs as keys */
final String tld = (p > 0) ? host.substring(p + 1) : "";
final Integer i = TLDID.get(tld);
if (i != null) return i.intValue();
@@ -1182,6 +1206,7 @@ private static boolean isLocal(final String host, InetAddress hostaddress, final
// check simply if the tld in the host is a known tld
final int p = host.lastIndexOf('.');
// TODO (must be careful as this would change URL hash generation) : lower case the TLD part before checking its category id, as the TLDID map contains lower cased TLDs as keys */
final String tld = (p > 0) ? host.substring(p + 1) : "";
final Integer i = TLDID.get(tld);
if (i != null) return false;
@@ -1195,7 +1220,8 @@ private static boolean isLocal(final String host, InetAddress hostaddress, final
private static boolean isLocal(final InetAddress a) {
final boolean
localp = noLocalCheck || // DO NOT REMOVE THIS! it is correct to return true if the check is off
a == null ||
a == null || // TODO returning true here after dns resolution failed can make hash generation inconsistent on some hosts
// (hash is marked with TLD_LOCAL_ID when host name is not found within timeout, but then is marked again with TLD_Generic when the host name is found within timeout on another request)
a.isAnyLocalAddress() ||
a.isLinkLocalAddress() ||
a.isLoopbackAddress() ||
@@ -1313,4 +1339,4 @@ public static void main(final String[] args) {
System.out.println("Intranet IP: " + b);
}
}
}
}
Oops, something went wrong.

0 comments on commit ac209ca

Please sign in to comment.