Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement rules using a trie data structure. #97

Merged
merged 8 commits into from
Sep 1, 2017
Merged
28 changes: 14 additions & 14 deletions index.js
Original file line number Diff line number Diff line change
@@ -1,16 +1,17 @@
"use strict";

var allRules = require('./rules.json');
// Load rules
var Trie = require('./lib/suffix-trie.js');
var allRules = Trie.fromJson(require('./rules.json'));

var cleanHostValue = require('./lib/clean-host.js');
var escapeRegExp = require('./lib/escape-regexp.js');
var getRulesForTld = require('./lib/tld-rules.js');
var getDomain = require('./lib/domain.js');
var getPublicSuffix = require('./lib/public-suffix.js');
var getSubdomain = require('./lib/subdomain.js');
var isValid = require('./lib/is-valid.js');
var getPublicSuffix = require('./lib/public-suffix.js');
var tldExists = require('./lib/tld-exists.js');


/**
* Creates a new instance of tldjs
* @param {Object.<rules,validHosts>} options Object providing the `rules` and the `validHosts` used by the returned instance
Expand All @@ -22,19 +23,17 @@ function factory(options) {

return {
cleanHostValue: cleanHostValue,
escapeRegExp: escapeRegExp,
getRulesForTld: getRulesForTld,
getDomain: function (host) {
return getDomain(rules, validHosts, host);
getDomain: function (hostname) {
return getDomain(rules, validHosts, hostname);
},
getSubdomain: function (host) {
return getSubdomain(rules, validHosts, host);
getSubdomain: function (hostname) {
return getSubdomain(rules, validHosts, hostname);
},
isValid: function (host) {
return isValid(validHosts, host);
isValid: function (hostname) {
return isValid(validHosts, hostname);
},
getPublicSuffix: function (host) {
return getPublicSuffix(rules, host);
getPublicSuffix: function (hostname) {
return getPublicSuffix(rules, hostname);
},
tldExists: function (tld) {
return tldExists(rules, tld);
Expand All @@ -43,4 +42,5 @@ function factory(options) {
};
}


module.exports = factory({ validHosts: [], rules: allRules });
46 changes: 0 additions & 46 deletions lib/canditate-rule.js

This file was deleted.

15 changes: 10 additions & 5 deletions lib/clean-host.js
Original file line number Diff line number Diff line change
Expand Up @@ -16,18 +16,23 @@ var URL = require('url');
// scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
var hasPrefixRE = /^(([a-z][a-z0-9+.-]*)?:)?\/\//;
var invalidHostnameChars = /[^A-Za-z0-9.-]/;
var trailingDotsRE = /[.]+$/g;

//@see https://github.com/oncletom/tld.js/issues/95
// @see https://github.com/oncletom/tld.js/issues/95
function rtrim(value) {
return String(value).replace(/[.]+$/g, '');
return String(value).replace(trailingDotsRE, '');
}

module.exports = function cleanHostValue(value){
module.exports = function cleanHostValue(value) {
value = String(value).trim().toLowerCase();

var parts = URL.parse(hasPrefixRE.test(value) ? value : '//' + value, null, true);

if (parts.hostname && !invalidHostnameChars.test(parts.hostname)) { return rtrim(parts.hostname); }
if (!invalidHostnameChars.test(value)) { return rtrim(value); }
if (parts.hostname && !invalidHostnameChars.test(parts.hostname)) {
return rtrim(parts.hostname);
} else if (!invalidHostnameChars.test(value)) {
return rtrim(value);
}

return '';
};
121 changes: 103 additions & 18 deletions lib/domain.js
Original file line number Diff line number Diff line change
@@ -1,9 +1,85 @@
var Rule = require('./rule.js');
"use strict";

var isValid = require('./is-valid.js');
var cleanHostValue = require('./clean-host.js');
var extractTldFromHost = require('./from-host.js');
var getCandidateRule = require('./canditate-rule.js');
var getRulesForTld = require('./tld-rules.js');
var getPublicSuffix = require('./public-suffix.js');


/**
 * Polyfill for `String.prototype.endsWith`.
 *
 * Returns `true` only when `pattern` occupies the last `pattern.length`
 * characters of `str`. An empty `pattern` is a suffix of every string.
 *
 * A `pattern` longer than `str` can never match. This must be checked
 * explicitly: comparing a "not found" index (-1) against
 * `str.length - pattern.length` would wrongly succeed whenever `pattern` is
 * exactly one character longer than `str` (e.g. `endsWith('om', 'com')`).
 *
 * @param {string} str
 * @param {string} pattern
 * @return {boolean}
 */
function endsWith(str, pattern) {
  var start = str.length - pattern.length;

  // `pattern` is longer than `str`: it cannot be a suffix.
  if (start < 0) {
    return false;
  }

  // Searching from `start` leaves exactly `pattern.length` characters, so the
  // only position where a match can be found is `start` itself.
  return str.indexOf(pattern, start) === start;
}


/**
 * Tell whether `vhost` matches the end of `hostname` on a label boundary.
 *
 * Being a plain textual suffix is not enough: `vhost` must either be the
 * whole `hostname`, or start right after a '.' in `hostname`, so that a
 * partial label is never accepted.
 *
 * Examples with hostname = 'not.evil.com':
 *  - vhost = 'vil.com'      => false (cuts the label `evil` in half)
 *  - vhost = 'evil.com'     => true
 *  - vhost = 'not.evil.com' => true
 *
 * @param {string} hostname
 * @param {string} vhost
 * @return {boolean}
 */
function shareSameDomainSuffix(hostname, vhost) {
  // Not even a textual suffix: nothing else to check.
  if (!endsWith(hostname, vhost)) {
    return false;
  }

  // Both strings are identical.
  if (hostname.length === vhost.length) {
    return true;
  }

  // Otherwise the character just before the suffix must be a label separator.
  var precedingIndex = hostname.length - vhost.length - 1;
  return hostname[precedingIndex] === '.';
}


/**
 * Build the general domain of `hostname`: its public suffix plus the single
 * label located immediately before it.
 *
 * @param {string} hostname
 * @param {string} publicSuffix
 * @return {string}
 */
function extractDomainWithSuffix(hostname, publicSuffix) {
  // Start the backward search on the last character of the label preceding
  // the public suffix (i.e. skip the suffix itself and the '.' before it).
  //
  //   not.evil.co.uk  (suffix: co.uk)
  //      ^   ^
  //      |   `-- search starts here, going backward
  //      `------ separator found => result is 'evil.co.uk'
  //
  //   example.co.uk   (suffix: co.uk)
  //         ^
  //         `-- search starts here, no separator found (-1)
  //             => result is the whole hostname
  var searchFrom = hostname.length - publicSuffix.length - 2;
  var separatorIndex = hostname.lastIndexOf('.', searchFrom);

  // Without a separator there is no sub-domain to strip; otherwise keep
  // everything located after that separator.
  return separatorIndex === -1
    ? hostname
    : hostname.slice(separatorIndex + 1);
}


/**
* Detects the domain based on the rules and a host string
Expand All @@ -12,26 +88,35 @@ var getRulesForTld = require('./tld-rules.js');
* @param {string} host
* @return {String}
*/
module.exports = function getDomain (allRules, validHosts, host) {
var domain = null, hostTld, rules, rule;
var _validHosts = validHosts || [];
module.exports = function getDomain(rules, validHosts, hostname) {
hostname = cleanHostValue(hostname);

if (isValid(_validHosts, host) === false) {
if (isValid(validHosts, hostname) === false) {
return null;
}

host = cleanHostValue(host);
hostTld = extractTldFromHost(host);
rules = getRulesForTld(allRules, hostTld, new Rule({"firstLevel": hostTld, "isHost": _validHosts.indexOf(hostTld) !== -1}));
rule = getCandidateRule(host, rules);
// Check if `hostname` ends with a member of `validHosts`.
for (var i = 0; i < validHosts.length; i += 1) {
var vhost = validHosts[i];
if (shareSameDomainSuffix(hostname, vhost)) {
return vhost;
}
}

// To extract the general domain, we start by identifying the public suffix
// (if any), then consider the domain to be the public suffix with one added
// level of depth. (e.g.: if hostname is `not.evil.co.uk` and public suffix:
// `co.uk`, then we take one more level: `evil`, giving the final result:
// `evil.co.uk`).
var suffix = getPublicSuffix(rules, hostname);

if (rule === null) {
// If `hostname` is a valid public suffix, then there is no domain to return.
// Since we already know that `getPublicSuffix` returns a suffix of `hostname`
// there is no need to perform a string comparison and we only compare the
// size.
if (suffix.length === hostname.length) {
return null;
}

host.replace(new RegExp(rule.getPattern()), function (m, d) {
domain = d;
});

return domain;
return extractDomainWithSuffix(hostname, suffix);
};
11 changes: 0 additions & 11 deletions lib/escape-regexp.js

This file was deleted.

34 changes: 0 additions & 34 deletions lib/exports/standard-json.js

This file was deleted.

11 changes: 9 additions & 2 deletions lib/from-host.js
Original file line number Diff line number Diff line change
@@ -1,9 +1,16 @@
"use strict";

/**
* Utility to extract the TLD from a host string
*
* @param {string} host
* @return {String}
*/
module.exports = function extractTldFromHost(host){
return host.split('.').pop();
module.exports = function extractTldFromHost(host) {
var lastDotIndex = host.lastIndexOf('.');
if (lastDotIndex === -1) {
return null;
}

return host.substr(lastDotIndex + 1);
};
4 changes: 3 additions & 1 deletion lib/is-valid.js
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
"use strict";

/**
* Checking if a host string is valid
* It's usually a preliminary check before trying to use getDomain or anything else
Expand All @@ -8,6 +10,6 @@
* @param host {String}
* @return {Boolean}
*/
module.exports = function isValid (validHosts, host) {
module.exports = function isValid(validHosts, host) {
return typeof host === 'string' && (validHosts.indexOf(host) !== -1 || (host.indexOf('.') !== -1 && host[0] !== '.'));
};
Loading