forked from locutusjs/locutus
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstr_word_count.js
102 lines (99 loc) · 3.7 KB
/
str_word_count.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
function str_word_count (str, format, charlist) {
  // http://kevin.vanzonneveld.net
  // +   original by: Ole Vrijenhoek
  // +   bugfixed by: Kevin van Zonneveld (http://kevin.vanzonneveld.net)
  // +   bugfixed by: Brett Zamir (http://brett-zamir.me)
  // +   input by: Bug?
  // +   bugfixed by: Brett Zamir (http://brett-zamir.me)
  // +   improved by: Brett Zamir (http://brett-zamir.me)
  // -    depends on: ctype_alpha (optional: when `this.ctype_alpha` is not a function, ASCII letters A-Z/a-z are used)
  // %        note 1: Scans `str` and collects "words". A character belongs to a word when it is
  // %        note 1: alphabetic (per ctype_alpha), is listed in `charlist`, is a hyphen that is neither
  // %        note 1: the first nor the last character of `str`, or is an apostrophe that is not the
  // %        note 1: first character of `str`.
  // %        note 2: `format` falsy  -> returns the number of words
  // %        note 2: `format` === 1  -> returns an array of the words
  // %        note 2: `format` === 2  -> returns an object mapping each word's start index in `str` to the word
  // %        note 2: any other value -> throws the string 'You have supplied an incorrect format'
  // %        note 3: Unpaired surrogates in `str` or `charlist` make the function throw a descriptive string.
  // %        note 3: Plain strings (not Error objects) are thrown to keep the existing contract for callers.
  // *     example 1: str_word_count("Hello fri3nd, you're\r\n       looking          good today!", 1);
  // *     returns 1: ['Hello', 'fri', 'nd', "you're", 'looking', 'good', 'today']
  // *     example 2: str_word_count("Hello fri3nd, you're\r\n       looking          good today!", 2);
  // *     returns 2: {0: 'Hello', 6: 'fri', 10: 'nd', 14: "you're", 29: 'looking', 46: 'good', 51: 'today'}
  // *     example 3: str_word_count("Hello fri3nd, you're\r\n       looking          good today!", 1, '\u00e0\u00e1\u00e3\u00e73');
  // *     returns 3: ['Hello', 'fri3nd', "you're", 'looking', 'good', 'today']
  // *     example 4: str_word_count("Hello!!");
  // *     returns 4: 1
  var len = str.length,
    cl = charlist && charlist.length,
    receiver = this,
    asciiAlpha = /^[A-Za-z]$/,
    chr = '',
    tmpStr = '', // word currently being accumulated
    i = 0,
    c = '',
    wArr = [], // words, used unless format === 2
    wC = 0, // number of completed words
    assoc = {}, // start index -> word, used when format === 2
    aC = 0, // start index of the word currently being accumulated
    parts = [],
    reg = null, // matches exactly one whole character from charlist, or null without charlist
    match = false;

  // BEGIN STATIC
  // Prefer the host's ctype_alpha when the function is invoked with a receiver providing it;
  // otherwise fall back to ASCII letters so a plain call does not fail on an undefined `this`.
  var _isAlpha = (receiver && typeof receiver.ctype_alpha === 'function') ?
    function (ch) {
      return receiver.ctype_alpha(ch);
    } :
    function (ch) {
      return asciiAlpha.test(ch);
    };

  // Escapes regular-expression metacharacters so a charlist entry is matched literally.
  var _preg_quote = function (str) {
    return (str + '').replace(/([\\\.\+\*\?\[\^\]\$\(\)\{\}\=\!<>\|\:])/g, '\\$1');
  };

  // Returns the full character starting at code unit `i` (two code units for a surrogate pair),
  // or false when `i` is the low half of a pair that was already returned at `i - 1`.
  var _getWholeChar = function (str, i) { // Use for rare cases of non-BMP characters
    var code = str.charCodeAt(i);
    if (code < 0xD800 || code > 0xDFFF) {
      return str.charAt(i);
    }
    if (0xD800 <= code && code <= 0xDBFF) { // High surrogate (could change last hex to 0xDB7F to treat high private surrogates as single characters)
      if (str.length <= (i + 1)) {
        throw 'High surrogate without following low surrogate';
      }
      var next = str.charCodeAt(i + 1);
      if (0xDC00 > next || next > 0xDFFF) {
        throw 'High surrogate without following low surrogate';
      }
      return str.charAt(i) + str.charAt(i + 1);
    }
    // Low surrogate (0xDC00 <= code && code <= 0xDFFF)
    if (i === 0) {
      throw 'Low surrogate without preceding high surrogate';
    }
    var prev = str.charCodeAt(i - 1);
    if (0xD800 > prev || prev > 0xDBFF) { // (could change last hex to 0xDB7F to treat high private surrogates as single characters)
      throw 'Low surrogate without preceding high surrogate';
    }
    return false; // We can pass over low surrogates now as the second component in a pair which we have already processed
  };

  // Stores the accumulated word (if any) in the container selected by `format` and resets the accumulator.
  var _storeWord = function () {
    if (tmpStr === '') {
      return; // nothing accumulated: never record an empty word
    }
    if (format !== 2) {
      wArr[wArr.length] = tmpStr;
    } else {
      assoc[aC] = tmpStr;
    }
    tmpStr = '';
    wC++;
  };
  // END STATIC

  if (cl) {
    // Build an anchored alternation of every whole character in charlist: ^(a|b|c)$
    for (i = 0; i < cl; i++) {
      if ((chr = _getWholeChar(charlist, i)) === false) {
        continue;
      }
      parts[parts.length] = _preg_quote(chr);
    }
    reg = new RegExp('^(' + parts.join('|') + ')$');
  }

  for (i = 0; i < len; i++) {
    if ((c = _getWholeChar(str, i)) === false) {
      continue;
    }
    match = _isAlpha(c) || (reg !== null && reg.test(c)) ||
      (c === '-' && i !== 0 && i !== len - 1) || // No hyphen at beginning or end unless allowed in charlist (or locale)
      (c === "'" && i !== 0); // No apostrophe at beginning unless allowed in charlist (or locale)
    if (match) {
      if (tmpStr === '') {
        aC = i; // remember where this word starts (reported when format === 2)
      }
      tmpStr = tmpStr + c;
    } else {
      _storeWord();
    }
  }
  // Flush the trailing word here rather than on `i === len - 1` inside the loop: that index is
  // skipped when the string ends with a surrogate pair, and it must not record an empty word
  // when the string ends with a non-word character.
  _storeWord();

  if (!format) {
    return wC;
  } else if (format === 1) {
    return wArr;
  } else if (format === 2) {
    return assoc;
  }
  throw 'You have supplied an incorrect format';
}