Skip to content

Commit

Permalink
Switch to whatwg-url for url parsing + add ip validation
Browse files Browse the repository at this point in the history
  • Loading branch information
remusao committed Sep 10, 2017
1 parent 9a7a3f2 commit bd5b82d
Show file tree
Hide file tree
Showing 7 changed files with 266 additions and 52 deletions.
70 changes: 48 additions & 22 deletions bin/benchmark.js
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ var tld = require('../index.js');
var Benchmark = require('benchmark');


var DOMAINS = [
var HOSTNAMES = [
// No public suffix
'example.foo.edu.au', // null
'example.foo.edu.sh', // null
Expand All @@ -30,7 +30,10 @@ var DOMAINS = [
'example.www.ck', // !www.ck
'foo.bar.baz.city.yokohama.jp', // !city.yokohama.jp
'example.city.kobe.jp', // !city.kobe.jp
];


var URLS = [
// IDN labels
'example.北海道.jp', // 北海道.jp
'example.和歌山.jp', // 和歌山.jp
Expand All @@ -44,54 +47,62 @@ var DOMAINS = [
'FOO.bar.BAZ.ortsinfo.AT', // null

// Full URLs
// '2001:0DB8:0100:F101:0210:A4FF:FEE3:9566',
// 'http://user:pass@www.examplegoogle.com:21/blah#baz',
// 'http://iris.test.ing/ḍ̇/?ḍ̇#ḍ̇',
// 'http://0000000000000300.0xffffffffFFFFFFFF.3022415481470977',
'2001:0DB8:0100:F101:0210:A4FF:FEE3:9566',
'http://user:pass@www.examplegoogle.com:21/blah#baz',
'http://iris.test.ing/ḍ̇/?ḍ̇#ḍ̇',
'http://0000000000000300.0xffffffffFFFFFFFF.3022415481470977',
'http://192.168.0.1/',
'http://%30%78%63%30%2e%30%32%35%30.01%2e',
'http://user:pass@[::1]/segment/index.html?query#frag',
'https://[::1]',
];


// TODO - Compare to other libraries
function main() {
function bench(values) {
console.log(
'While interpreting the results, keep in mind that each "op" reported' +
' by the benchmark is processing ' + DOMAINS.length + ' domains'
' by the benchmark is processing ' + values.length + ' domains'
);

new Benchmark.Suite()
.add('tldjs#isIp', () => {
for (var i = 0; i < values.length; i += 1) {
tld.isIp(values[i]);
}
})
.add('tldjs#isValid', () => {
for (var i = 0; i < DOMAINS.length; i += 1) {
tld.isValid(DOMAINS[i]);
for (var i = 0; i < values.length; i += 1) {
tld.isValid(values[i]);
}
})
.add('tldjs#extractHostname', () => {
for (var i = 0; i < DOMAINS.length; i += 1) {
tld.extractHostname(DOMAINS[i]);
for (var i = 0; i < values.length; i += 1) {
tld.extractHostname(values[i]);
}
})
.add('tldjs#tldExists', () => {
for (var i = 0; i < DOMAINS.length; i += 1) {
tld.tldExists(DOMAINS[i]);
for (var i = 0; i < values.length; i += 1) {
tld.tldExists(values[i]);
}
})
.add('tldjs#getPublicSuffix', () => {
for (var i = 0; i < DOMAINS.length; i += 1) {
tld.getPublicSuffix(DOMAINS[i]);
for (var i = 0; i < values.length; i += 1) {
tld.getPublicSuffix(values[i]);
}
})
.add('tldjs#getDomain', () => {
for (var i = 0; i < DOMAINS.length; i += 1) {
tld.getDomain(DOMAINS[i]);
for (var i = 0; i < values.length; i += 1) {
tld.getDomain(values[i]);
}
})
.add('tldjs#getSubdomain', () => {
for (var i = 0; i < DOMAINS.length; i += 1) {
tld.getSubdomain(DOMAINS[i]);
for (var i = 0; i < values.length; i += 1) {
tld.getSubdomain(values[i]);
}
})
.add('tldjs#parse', () => {
for (var i = 0; i < DOMAINS.length; i += 1) {
tld.parse(DOMAINS[i]);
for (var i = 0; i < values.length; i += 1) {
tld.parse(values[i]);
}
})
.on('cycle', function (event) {
Expand All @@ -101,4 +112,19 @@ function main() {
}


// TODO - Compare to other libraries
/**
 * Benchmark entry point.
 *
 * Runs `bench` once per input set (first `HOSTNAMES`, then `URLS`), printing
 * a three-line banner before each run and a blank line between the two runs.
 */
function main() {
  var sections = [
    {
      rule: '>>> -------------------- <<<',
      title: '>>> Only valid hostnames <<<',
      values: HOSTNAMES,
    },
    {
      rule: '>>> ----------- <<<',
      title: '>>> Random URLs <<<',
      values: URLS,
    },
  ];

  sections.forEach(function (section, position) {
    // Separate consecutive sections with an empty line.
    if (position > 0) {
      console.log();
    }

    console.log(section.rule);
    console.log(section.title);
    console.log(section.rule);
    bench(section.values);
  });
}


main();
19 changes: 18 additions & 1 deletion index.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
'use strict';


// Load rules
var Trie = require('./lib/suffix-trie.js');
var allRules = Trie.fromJson(require('./rules.json'));
Expand All @@ -10,6 +11,7 @@ var getDomain = require('./lib/domain.js');
var getPublicSuffix = require('./lib/public-suffix.js');
var getSubdomain = require('./lib/subdomain.js');
var isValid = require('./lib/is-valid.js');
var isIp = require('./lib/is-ip.js');
var tldExists = require('./lib/tld-exists.js');


Expand Down Expand Up @@ -50,12 +52,26 @@ function factory(options) {
var result = {
hostname: _extractHostname(url),
isValid: null,
tldExists: null,
isIp: null,
tldExists: false,
publicSuffix: null,
domain: null,
subdomain: null,
};

if (result.hostname === null) {
result.isIp = false;
result.isValid = false;
return result;
}

// Check if `hostname` is a valid ip address
result.isIp = isIp(result.hostname);
if (result.isIp) {
result.isValid = true;
return result;
}

// Check if `hostname` is valid
result.isValid = isValid(result.hostname);
if (result.isValid === false) return result;
Expand Down Expand Up @@ -83,6 +99,7 @@ function factory(options) {

return {
extractHostname: _extractHostname,
isIp: isIp,
isValid: isValid,
parse: parse,
tldExists: function (url) {
Expand Down
21 changes: 15 additions & 6 deletions lib/clean-host.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@

var URL = require('url');
var URL = require('whatwg-url').URL;
var isValid = require('./is-valid.js');


Expand All @@ -8,7 +8,6 @@ var isValid = require('./is-valid.js');
*
* Works for:
* - hostname
* - //hostname
* - scheme://hostname
* - scheme+scheme://hostname
*
Expand All @@ -18,7 +17,7 @@ var isValid = require('./is-valid.js');

// scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
var hasPrefixRE = /^(([a-z][a-z0-9+.-]*)?:)?\/\//;
var invalidHostnameChars = /[^A-Za-z0-9.-]/;


// @see https://github.com/oncletom/tld.js/issues/95
function rtrim(value) {
Expand All @@ -28,6 +27,7 @@ function rtrim(value) {
return value;
}


module.exports = function extractHostname(value) {
if (isValid(value)) {
return rtrim(value);
Expand All @@ -39,11 +39,20 @@ module.exports = function extractHostname(value) {
return rtrim(url);
}

if (!hasPrefixRE.test(url)) {
url = 'http://' + url;
}

// Proceed with heavier url parsing to extract the hostname.
var parts = URL.parse(hasPrefixRE.test(url) ? url : '//' + url, null, true);
var hostname;
try {
hostname = new URL(url).hostname;
} catch (ex) {
return null;
}

if (parts.hostname && !invalidHostnameChars.test(parts.hostname)) {
return rtrim(parts.hostname);
if (hostname) {
return rtrim(hostname);
}

return null;
Expand Down
59 changes: 59 additions & 0 deletions lib/is-ip.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
'use strict';


var isIpv4 = require('is-ip').v4;


/**
 * Cheap pre-filter for ipv4 candidates: tells whether `hostname` is made up
 * exclusively of ASCII digits and dots. It does not validate the address
 * itself; it only rules out values which cannot possibly be an ipv4, so that
 * the more expensive 'is-ip' check can be skipped for them.
 *
 * @param {string} hostname
 * @return {boolean} `false` as soon as a character outside of [0-9.] is
 * found, `true` otherwise (including for the empty string).
 */
function isProbablyIpv4(hostname) {
  var DOT = 46; // '.'
  var ZERO = 48; // '0'
  var NINE = 57; // '9'

  // Scan order does not matter here: any single offending character is
  // enough to reject the value.
  var index = hostname.length;
  while (index > 0) {
    index -= 1;

    var charCode = hostname.charCodeAt(index);
    var outsideDigits = charCode < ZERO || charCode > NINE;
    if (outsideDigits && charCode !== DOT) {
      return false;
    }
  }

  return true;
}


/**
* Check if `hostname` is a valid ip addr (either ipv6 or ipv4).
*
* @param {string} hostname
* @return {boolean}
*/
module.exports = function isIp(hostname) {
if (typeof hostname !== 'string') {
return false;
}

// A valid ipv4 would be at least 7 chars long (e.g.: 1.1.1.1)
// A valid ipv6 would be at least 4 chars long (e.g.: [::])
if (hostname.length < 4) {
return false;
}

// We already know that `hostname` is a valid hostname, since it's the output
// of `whatwg-url` parser. So it's enough to check if the hostname is
// contained between '[' and ']' characters.
if (hostname[0] === '[' && hostname[hostname.length - 1] === ']') {
return true; // ipv6
}

// Here we perform a first, very naive check to make sure `hostname` has a
// chance of being a valid ipv4 address (contains only characters: [0-9.]).
// If this is the case, then we perform a more expensive check using an
// exhaustive regex.
return isProbablyIpv4(hostname) && isIpv4(hostname);
};
Loading

0 comments on commit bd5b82d

Please sign in to comment.