feat(string-tokenize): upgrade tokenizer
sanjayaksaxena committed May 18, 2018
1 parent bf72aa7 commit 337425c
Showing 6 changed files with 131 additions and 68 deletions.
14 changes: 14 additions & 0 deletions README.md
@@ -47,6 +47,20 @@ console.log( nlp.string.sentences( para ) );
// 'I work for AI Inc.',
// 'My mail is r2d2@yahoo.com' ]

// Tokenize a sentence.
var s = 'For details on wink, check out http://winkjs.org/ URL!';
console.log( nlp.string.tokenize( s, true ) );
// -> [ { value: 'For', tag: 'word' },
// { value: 'details', tag: 'word' },
// { value: 'on', tag: 'word' },
// { value: 'wink', tag: 'word' },
// { value: ',', tag: 'punctuation' },
// { value: 'check', tag: 'word' },
// { value: 'out', tag: 'word' },
// { value: 'http://winkjs.org/', tag: 'url' },
// { value: 'URL', tag: 'word' },
// { value: '!', tag: 'punctuation' } ]
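
// Default mode (detailed omitted or false): an illustrative sketch showing that
// the same call then returns plain string tokens, i.e. just the values above.
console.log( nlp.string.tokenize( s ) );
// -> [ 'For', 'details', 'on', 'wink', ',', 'check', 'out',
//      'http://winkjs.org/', 'URL', '!' ]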

// Remove stop words:
var t = nlp.tokens.removeWords( [ 'mary', 'had', 'a', 'little', 'lamb' ] );
console.log( t );
76 changes: 58 additions & 18 deletions docs/index.html
@@ -524,6 +524,20 @@ <h2>Getting Started</h2>
<span class="hljs-comment">// 'I work for AI Inc.',</span>
<span class="hljs-comment">// 'My mail is r2d2@yahoo.com' ]</span>

<span class="hljs-comment">// Tokenize a sentence.</span>
<span class="hljs-keyword">var</span> s = <span class="hljs-string">'For details on wink, check out http://winkjs.org/ URL!'</span>;
<span class="hljs-built_in">console</span>.log( nlp.string.tokenize( s, <span class="hljs-literal">true</span> ) );
<span class="hljs-comment">// -&gt; [ { value: 'For', tag: 'word' },</span>
<span class="hljs-comment">// { value: 'details', tag: 'word' },</span>
<span class="hljs-comment">// { value: 'on', tag: 'word' },</span>
<span class="hljs-comment">// { value: 'wink', tag: 'word' },</span>
<span class="hljs-comment">// { value: ',', tag: 'punctuation' },</span>
<span class="hljs-comment">// { value: 'check', tag: 'word' },</span>
<span class="hljs-comment">// { value: 'out', tag: 'word' },</span>
<span class="hljs-comment">// { value: 'http://winkjs.org/', tag: 'url' },</span>
<span class="hljs-comment">// { value: 'URL', tag: 'word' },</span>
<span class="hljs-comment">// { value: '!', tag: 'punctuation' } ]</span>

<span class="hljs-comment">// Remove stop words:</span>
<span class="hljs-keyword">var</span> t = nlp.tokens.removeWords( [ <span class="hljs-string">'mary'</span>, <span class="hljs-string">'had'</span>, <span class="hljs-string">'a'</span>, <span class="hljs-string">'little'</span>, <span class="hljs-string">'lamb'</span> ] );
<span class="hljs-built_in">console</span>.log( t );
@@ -2550,19 +2564,12 @@ <h3 class='fl m0' id='stringtokenize'>
</div>


<p>The function uses the following set of rules to tokenize:</p>
<ol>
<li>Single quotes are processed first as they may be part of elisions; and
<code>...</code> are converted to ellipses.</li>
<li><code>Not</code> elisions are amplified and then split on elisions. Thus words with elisions get tokenized.</li>
<li>The word <code>cannot</code> is split into <code>can not</code>.</li>
<li><code>. , -</code> punctuations that are commonly embedded in numbers are left intact;</li>
<li>All other punctuations are tokenized.</li>
<li>Currency symbols are padded with spaces, i.e. they become separate tokens.</li>
<li>Underscore (<code>_</code>) embedded in a word is preserved.</li>
<li>Special characters are left untouched and may or may not become separate tokens.</li>
<li>Finally, after removing extra/leading/trailing spaces, the string is split on spaces to tokenize.</li>
</ol>
<p>Tokenizes the input <code>sentence</code> according to the value of the <code>detailed</code> flag.
Any occurrence of <code>...</code> in the <code>sentence</code> is
converted to an ellipsis. In <code>detailed = true</code> mode, it
tags every token with its type; the supported tags are currency, email,
emoji, emoticon, hashtag, number, ordinal, punctuation, quoted_phrase, symbol,
time, mention, url, and word.</p>


<div class='pre p1 fill-light mt0'>string.tokenize</div>
@@ -2582,13 +2589,31 @@ <h3 class='fl m0' id='stringtokenize'>

<div class='space-bottom0'>
<div>
<span class='code bold'>str</span> <code class='quiet'>(<a href="#string">string</a>)</code>
<span class='code bold'>sentence</span> <code class='quiet'>(<a href="#string">string</a>)</code>
— the input string.

</div>

</div>

<div class='space-bottom0'>
<div>
<span class='code bold'>detailed</span> <code class='quiet'>(<a href="https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/Boolean">boolean</a>
= <code>false</code>)</code>
— if true, each token is an object containing the

<code>value</code>
and
<code>tag</code>
of the token; otherwise each token is a string. Its default
value of
<strong>false</strong>
ensures compatibility with the previous version.

</div>

</div>

</div>


@@ -2597,8 +2622,11 @@ <h3 class='fl m0' id='stringtokenize'>


<div class='py1 quiet mt1 prose-big'>Returns</div>
<code><a href="https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/Array">Array</a>&#x3C;<a href="#string">string</a>></code>:
of tokens.
<code>(<a href="https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/Array">Array</a>&#x3C;<a href="#string">string</a>> | <a href="https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/Array">Array</a>&#x3C;<a href="https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/Object">object</a>>)</code>:
an array of strings if
<code>detailed</code>
is false, otherwise
an array of objects.



@@ -2611,8 +2639,20 @@ <h3 class='fl m0' id='stringtokenize'>


<pre class='p1 overflow-auto round fill-light'>tokenize( <span class="hljs-string">"someone's wallet, isn't it? I'll return!"</span> );
<span class="hljs-comment">// -&gt; [ 'someone\'s', 'wallet', ',', 'is', 'not', 'it',</span>
<span class="hljs-comment">// '?', 'i', '\'ll', 'return', '!' ]</span></pre>
<span class="hljs-comment">// -&gt; [ 'someone', '\'s', 'wallet', ',', 'is', 'n\'t', 'it', '?',</span>
<span class="hljs-comment">// 'I', '\'ll', 'return', '!' ]</span>

tokenize( <span class="hljs-string">'For details on wink, check out http://winkjs.org/ URL!'</span>, <span class="hljs-literal">true</span> );
<span class="hljs-comment">// -&gt; [ { value: 'For', tag: 'word' },</span>
<span class="hljs-comment">// { value: 'details', tag: 'word' },</span>
<span class="hljs-comment">// { value: 'on', tag: 'word' },</span>
<span class="hljs-comment">// { value: 'wink', tag: 'word' },</span>
<span class="hljs-comment">// { value: ',', tag: 'punctuation' },</span>
<span class="hljs-comment">// { value: 'check', tag: 'word' },</span>
<span class="hljs-comment">// { value: 'out', tag: 'word' },</span>
<span class="hljs-comment">// { value: 'http://winkjs.org/', tag: 'url' },</span>
<span class="hljs-comment">// { value: 'URL', tag: 'word' },</span>
<span class="hljs-comment">// { value: '!', tag: 'punctuation' } ]</span></pre>



5 changes: 5 additions & 0 deletions package-lock.json


3 changes: 2 additions & 1 deletion package.json
@@ -47,6 +47,7 @@
},
"dependencies": {
"wink-helpers": "^1.4.0",
"wink-porter2-stemmer": "^1.0.8"
"wink-porter2-stemmer": "^1.0.8",
"wink-tokenizer": "^4.0.0"
}
}
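
The new wink-tokenizer dependency supplies the tagging engine used by src/string-tokenize.js below. A minimal sketch, assuming only the factory-plus-tokenize pattern visible in that file (the sample sentence and the two tokens called out are illustrative):

// wink-tokenizer exports a factory; an instance's tokenize() returns an
// array of { value, tag } objects that string.tokenize() passes through.
var winkTokenizer = require( 'wink-tokenizer' );
var myTokenizer = winkTokenizer();
console.log( myTokenizer.tokenize( 'check out http://winkjs.org/!' ) );
// -> includes { value: 'http://winkjs.org/', tag: 'url' } and
//    { value: '!', tag: 'punctuation' }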
80 changes: 35 additions & 45 deletions src/string-tokenize.js
@@ -22,62 +22,52 @@
// If not, see <http://www.gnu.org/licenses/>.

//
var splitElisions = require( './string-split-elisions.js' );
var amplifyNotElision = require( './string-amplify-not-elision.js' );
var rgx = require( './util_regexes.js' );
var winkTokenize = require( 'wink-tokenizer' )().tokenize;

// ## string

// ### tokenize
/**
*
* The function uses the following set of rules to tokenize:
*
* 1. Single quotes are processed first as they may be part of elisions; and
* `...` are converted to ellipses.
* 2. `Not` elisions are amplified and then split on elisions. Thus words with elisions get tokenized.
 * 3. The word `cannot` is split into `can not`.
 * 4. `. , -` punctuations that are commonly embedded in numbers are left intact;
 * 5. All other punctuations are tokenized.
 * 6. Currency symbols are padded with spaces, i.e. they become separate tokens.
 * 7. Underscore (`_`) embedded in a word is preserved.
 * 8. Special characters are left untouched and may or may not become separate tokens.
 * 9. Finally, after removing extra/leading/trailing spaces, the string is split on spaces to tokenize.
 * Tokenizes the input `sentence` according to the value of the `detailed` flag.
 * Any occurrence of `...` in the `sentence` is
 * converted to an ellipsis. In `detailed = true` mode, it
 * tags every token with its type; the supported tags are currency, email,
 * emoji, emoticon, hashtag, number, ordinal, punctuation, quoted_phrase, symbol,
 * time, mention, url, and word.
*
* @name string.tokenize
* @param {string} str — the input string.
* @return {string[]} of tokens.
* @param {string} sentence — the input string.
 * @param {boolean} [detailed=false] — if true, each token is an object containing the
 * `value` and `tag` of the token; otherwise each token is a string. Its default
 * value of **false** ensures compatibility with the previous version.
 * @return {(string[]|object[])} an array of strings if `detailed` is false, otherwise
 * an array of objects.
* @example
* tokenize( "someone's wallet, isn't it? I'll return!" );
* // -> [ 'someone\'s', 'wallet', ',', 'is', 'not', 'it',
* // '?', 'i', '\'ll', 'return', '!' ]
* // -> [ 'someone', '\'s', 'wallet', ',', 'is', 'n\'t', 'it', '?',
* // 'I', '\'ll', 'return', '!' ]
*
* tokenize( 'For details on wink, check out http://winkjs.org/ URL!', true );
* // -> [ { value: 'For', tag: 'word' },
* // { value: 'details', tag: 'word' },
* // { value: 'on', tag: 'word' },
* // { value: 'wink', tag: 'word' },
* // { value: ',', tag: 'punctuation' },
* // { value: 'check', tag: 'word' },
* // { value: 'out', tag: 'word' },
* // { value: 'http://winkjs.org/', tag: 'url' },
* // { value: 'URL', tag: 'word' },
* // { value: '!', tag: 'punctuation' } ]
*/
var tokenize = function ( str ) {
// Handle single quotes first & ellipses.
var su = str
// > TODO: promote to regex utils after adding more test cases
.replace( /(^|[^a-z0-9])(\’|\')/gi, '$1 $2 ')
.replace( /([a-z0-9])(\’|\')(\W)/gi, '$1 $2 $3')
.replace( '...', '…' )
.replace( '…', ' … ' );
var tokens = splitElisions( amplifyNotElision( su ) )
// Handle cannot.
.replace( rgx.cannot, '$1 $2' )
// Separate out punctuations that are not part of a number.
.replace( rgx.nonNumPunctuations, ' $& ' )
// Separate out all other punctuations.
.replace( /[\‘\’\`\“\”\"\[\]\(\)\{\}\…\!\;\?\/\:]/ig, ' $& ' )
// Separate out currency symbol; all separated stuff becomes a token.
.replace( rgx.currency, ' $& ')
.replace( rgx.spaces, ' ' )
.trim()
// Handle period sign in the end specially.
.replace( /\.$/, ' .' )
// Now tokenize on space!
.split( ' ' );
// Splitting an empty string on space leaves an empty string in the array,
// get rid of it.
return ( ( tokens.length === 1 && tokens[ 0 ] === '' ) ? [] : tokens );
var tokenize = function ( sentence, detailed ) {
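// Convert '...' to the ellipsis character and delegate to wink-tokenizer,
// which returns an array of { value, tag } token objects.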
var tokens = winkTokenize( sentence.replace( '...', '…' ) );
var i;
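// In the default (non-detailed) mode, flatten each token object to its value
// so that an array of plain strings is returned, as in earlier versions.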
if ( !detailed ) {
for ( i = 0; i < tokens.length; i += 1 ) tokens[ i ] = tokens[ i ].value;
}

return tokens;
}; // tokenize()

module.exports = tokenize;
21 changes: 17 additions & 4 deletions test/wink-nlp-utils-specs.js
@@ -605,12 +605,12 @@ describe( 'string.tokenize()', function () {
{ whenInputIs: [ '' ], expectedOutputIs: [ ] },
{ whenInputIs: [ ' ' ], expectedOutputIs: [ ] },
{ whenInputIs: [ 'rain rain go away, come again another day' ], expectedOutputIs: [ 'rain', 'rain', 'go', 'away', ',', 'come', 'again', 'another', 'day' ] },
{ whenInputIs: [ 'rain rain_ go away, come again another day' ], expectedOutputIs: [ 'rain', 'rain_', 'go', 'away', ',', 'come', 'again', 'another', 'day' ] },
{ whenInputIs: [ 'rain rain_ go away, come again another day' ], expectedOutputIs: [ 'rain', 'rain', '_', 'go', 'away', ',', 'come', 'again', 'another', 'day' ] },
{ whenInputIs: [ 'what\'s ended in the year 1919 ~? The $1 was equal to 1.2 rupees.' ], expectedOutputIs: [ 'what', '\'s', 'ended', 'in', 'the', 'year', '1919', '~', '?', 'The', '$', '1', 'was', 'equal', 'to', '1.2', 'rupees', '.' ] },
{ whenInputIs: [ 'what ended in the 1919 year~? The £1 was equal to 1.2 rupees.' ], expectedOutputIs: [ 'what', 'ended', 'in', 'the', '1919', 'year~', '?', 'The', '£', '1', 'was', 'equal', 'to', '1.2', 'rupees', '.' ] },
{ whenInputIs: [ 'what ended in the 1919 year~? The £1 was equal to 1.2 rupees.' ], expectedOutputIs: [ 'what', 'ended', 'in', 'the', '1919', 'year', '~', '?', 'The', '£', '1', 'was', 'equal', 'to', '1.2', 'rupees', '.' ] },
{ whenInputIs: [ 'what\'ll \'end in the year 1919\'? The ¥1 was equal to 1.2 rupees.' ], expectedOutputIs: [ 'what', '\'ll', '\'', 'end', 'in', 'the', 'year', '1919', '\'', '?', 'The', '¥', '1', 'was', 'equal', 'to', '1.2', 'rupees', '.' ] },
{ whenInputIs: [ 'what ended in the year\'s last month ? The €1 cannot be equal to 1.2 rupees.' ], expectedOutputIs: [ 'what', 'ended', 'in', 'the', 'year\'s', 'last', 'month', '?', 'The', '€', '1', 'can', 'not', 'be', 'equal', 'to', '1.2', 'rupees', '.' ] },
{ whenInputIs: [ 'Isn\'t... it? ' ], expectedOutputIs: [ 'Is', 'not', '…', 'it', '?' ] },
{ whenInputIs: [ 'what ended in the year\'s last month ? The €1 cannot be equal to 1.2 rupees.' ], expectedOutputIs: [ 'what', 'ended', 'in', 'the', 'year', '\'s', 'last', 'month', '?', 'The', '€', '1', 'cannot', 'be', 'equal', 'to', '1.2', 'rupees', '.' ] },
{ whenInputIs: [ 'Isn\'t... it? ' ], expectedOutputIs: [ 'Is', 'n\'t', '…', 'it', '?' ] },
];

tests.forEach( function ( test ) {
@@ -619,6 +619,19 @@ describe( 'string.tokenize()', function () {
} );
} );

it( 'should tokenize a sentence with multiple contractions & containing extra spaces', function () {
var output = [ { value: 'I', tag: 'word' },
{ value: '\'ll', tag: 'word' },
{ value: 'eat', tag: 'word' },
{ value: 'John', tag: 'word' },
{ value: '\'s', tag: 'word' },
{ value: 'food', tag: 'word' },
{ value: 'today', tag: 'word' },
{ value: 'with', tag: 'word' },
{ value: 'O\'kelly', tag: 'word' } ];
expect( prepare.string.tokenize( ' I\'ll eat John\'s food today with O\'kelly ', true ) ).to.deep.equal( output );
} );
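
// A hypothetical companion spec (illustrative only, not part of this suite) for
// the hashtag and mention tags listed in the documentation above; the token
// values assume wink-tokenizer keeps the '#' and '@' prefixes in the value.
it( 'should tag hashtags and mentions in detailed mode', function () {
  var output = [ { value: 'thanks', tag: 'word' },
                 { value: '@winkjs', tag: 'mention' },
                 { value: 'for', tag: 'word' },
                 { value: '#wink', tag: 'hashtag' },
                 { value: '!', tag: 'punctuation' } ];
  expect( prepare.string.tokenize( 'thanks @winkjs for #wink!', true ) ).to.deep.equal( output );
} );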

errors.slice( 0, 2 ).forEach( function ( error ) {
it( 'should throw ' + error.expectedOutputIs + ' if the input is ' + JSON.stringify( error.whenInputIs ), function () {
expect( prepare.string.tokenize.bind( null, error.whenInputIs ) ).to.throw( error.expectedOutputIs );
