-
Notifications
You must be signed in to change notification settings - Fork 510
/
extractUrlsWithIndices.js
executable file
·95 lines (82 loc) · 2.96 KB
/
extractUrlsWithIndices.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
// Copyright 2018 Twitter, Inc.
// Licensed under the Apache License, Version 2.0
// http://www.apache.org/licenses/LICENSE-2.0
import extractUrl from './regexp/extractUrl';
import invalidUrlWithoutProtocolPrecedingChars from './regexp/invalidUrlWithoutProtocolPrecedingChars';
import idna from './lib/idna';
import validAsciiDomain from './regexp/validAsciiDomain';
import validTcoUrl from './regexp/validTcoUrl';
// Protocol assumed when a matched URL carries none; only its length is used,
// as part of the overall URL length check in isValidUrl.
const DEFAULT_PROTOCOL = 'https://';
// Options applied when the caller passes none: URLs written without a
// protocol are extracted as well.
const DEFAULT_PROTOCOL_OPTIONS = { extractUrlsWithoutProtocol: true };
// Upper bound (in String#length units) on protocol length plus URL length,
// with the domain counted in its idna.toAscii form.
const MAX_URL_LENGTH = 4096;
// t.co URLs whose captured slug is longer than this are skipped.
const MAX_TCO_SLUG_LENGTH = 40;
/**
 * Finds every URL in `text` and reports where each one sits.
 *
 * @param {string} text - Text to scan. Falsy input yields an empty result.
 * @param {{extractUrlsWithoutProtocol: boolean}} [options] - When
 *   `extractUrlsWithoutProtocol` is truthy (the default), URLs written without
 *   a protocol are extracted too; otherwise they are skipped.
 * @returns {Array<{url: string, indices: number[]}>} One entry per extracted
 *   URL in order of appearance; `indices` is `[start, end]` with `end`
 *   exclusive, both offsets into `text`.
 */
const extractUrlsWithIndices = function(text, options = DEFAULT_PROTOCOL_OPTIONS) {
  if (!text) {
    return [];
  }
  // Cheap pre-filter: a protocol-less URL needs a '.', a URL with a protocol needs a ':'.
  const requiredChar = options.extractUrlsWithoutProtocol ? '.' : ':';
  if (!text.includes(requiredChar)) {
    return [];
  }

  const urls = [];

  // extractUrl is a shared, stateful regex. Rewind it so that a previous scan
  // that exited early (e.g. an exception thrown mid-loop) cannot make this
  // call start in the middle of the text.
  extractUrl.lastIndex = 0;

  let match;
  while ((match = extractUrl.exec(text)) !== null) {
    // Read the capture groups from the match itself rather than from the
    // deprecated RegExp.$N statics. Groups that did not participate default
    // to '' to mirror what the statics used to report.
    const [, , before = '', matchedUrl = '', protocol = '', domain = '', , path = ''] = match;

    let url = matchedUrl;
    let endPosition = extractUrl.lastIndex;
    const startPosition = endPosition - url.length;

    if (!isValidUrl(url, protocol || DEFAULT_PROTOCOL, domain)) {
      continue;
    }

    if (!protocol) {
      // extract ASCII-only domains.
      if (!options.extractUrlsWithoutProtocol || before.match(invalidUrlWithoutProtocolPrecedingChars)) {
        continue;
      }

      let lastUrl = null;
      let asciiEndPosition = 0;
      // replace() is used only to visit each validAsciiDomain match inside the
      // domain; its return value is discarded.
      domain.replace(validAsciiDomain, (asciiDomain) => {
        // Search from the end of the previous hit so a repeated substring maps
        // to its own offset rather than to the first occurrence.
        const asciiStartPosition = domain.indexOf(asciiDomain, asciiEndPosition);
        asciiEndPosition = asciiStartPosition + asciiDomain.length;
        lastUrl = {
          url: asciiDomain,
          indices: [startPosition + asciiStartPosition, startPosition + asciiEndPosition]
        };
        urls.push(lastUrl);
      });

      // no ASCII-only domain found. Skip the entire URL.
      if (lastUrl == null) {
        continue;
      }

      // lastUrl only contains domain. Need to add path and query if they exist.
      if (path) {
        lastUrl.url = url.replace(domain, lastUrl.url);
        lastUrl.indices[1] = endPosition;
      }
    } else {
      // In the case of t.co URLs, don't allow additional path characters.
      // NOTE(review): relies on validTcoUrl being a non-global pattern so that
      // match() exposes the slug capture group - confirm in ./regexp/validTcoUrl.
      const tcoMatch = url.match(validTcoUrl);
      if (tcoMatch) {
        const tcoUrlSlug = tcoMatch[1];
        if (tcoUrlSlug && tcoUrlSlug.length > MAX_TCO_SLUG_LENGTH) {
          continue;
        }
        // Trim the URL down to the part the t.co pattern accepted.
        url = tcoMatch[0];
        endPosition = startPosition + url.length;
      }
      urls.push({
        url: url,
        indices: [startPosition, endPosition]
      });
    }
  }

  return urls;
};
/**
 * Checks that a matched URL has a convertible domain and fits the length limit.
 *
 * The domain is converted with idna.toAscii; the URL is rejected when that
 * yields nothing. Otherwise the URL length is adjusted by the difference
 * between the converted and the original domain, and the protocol length is
 * added before comparing against MAX_URL_LENGTH.
 *
 * @param {string} url - The matched URL text.
 * @param {string} protocol - Protocol whose length counts towards the limit.
 * @param {string} domain - Domain portion of `url`.
 * @returns {boolean} true when the URL is acceptable.
 */
const isValidUrl = function(url, protocol, domain) {
  const rawLength = url.length;
  const asciiDomain = idna.toAscii(domain);

  // Reject when the conversion produced no usable domain.
  if (!(asciiDomain && asciiDomain.length)) {
    return false;
  }

  // How much the domain grew (or shrank) once converted.
  const growth = asciiDomain.length - domain.length;
  return protocol.length + rawLength + growth <= MAX_URL_LENGTH;
};
export default extractUrlsWithIndices;