feat(string-tokenize): upgrade tokenizer
sanjayaksaxena committed May 18, 2018
1 parent bf72aa7 commit 337425c
Showing 6 changed files with 131 additions and 68 deletions.
14 changes: 14 additions & 0 deletions README.md
@@ -47,6 +47,20 @@ console.log( nlp.string.sentences( para ) );
// 'I work for AI Inc.',
// 'My mail is r2d2@yahoo.com' ]

// Tokenize a sentence.
var s = 'For details on wink, check out http://winkjs.org/ URL!';
console.log( nlp.string.tokenize( s, true ) );
// -> [ { value: 'For', tag: 'word' },
// { value: 'details', tag: 'word' },
// { value: 'on', tag: 'word' },
// { value: 'wink', tag: 'word' },
// { value: ',', tag: 'punctuation' },
// { value: 'check', tag: 'word' },
// { value: 'out', tag: 'word' },
// { value: 'http://winkjs.org/', tag: 'url' },
// { value: 'URL', tag: 'word' },
// { value: '!', tag: 'punctuation' } ]
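
// Default mode (detailed omitted or false): an illustrative sketch showing that
// the same call then returns plain string tokens, i.e. just the values above.
console.log( nlp.string.tokenize( s ) );
// -> [ 'For', 'details', 'on', 'wink', ',', 'check', 'out',
//      'http://winkjs.org/', 'URL', '!' ]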

// Remove stop words:
var t = nlp.tokens.removeWords( [ 'mary', 'had', 'a', 'little', 'lamb' ] );
console.log( t );
76 changes: 58 additions & 18 deletions docs/index.html
@@ -524,6 +524,20 @@ <h2>Getting Started</h2>
<span class="hljs-comment">// 'I work for AI Inc.',</span>
<span class="hljs-comment">// 'My mail is r2d2@yahoo.com' ]</span>

<span class="hljs-comment">// Tokenize a sentence.</span>
<span class="hljs-keyword">var</span> s = <span class="hljs-string">'For details on wink, check out http://winkjs.org/ URL!'</span>;
<span class="hljs-built_in">console</span>.log( nlp.string.tokenize( s, <span class="hljs-literal">true</span> ) );
<span class="hljs-comment">// -&gt; [ { value: 'For', tag: 'word' },</span>
<span class="hljs-comment">// { value: 'details', tag: 'word' },</span>
<span class="hljs-comment">// { value: 'on', tag: 'word' },</span>
<span class="hljs-comment">// { value: 'wink', tag: 'word' },</span>
<span class="hljs-comment">// { value: ',', tag: 'punctuation' },</span>
<span class="hljs-comment">// { value: 'check', tag: 'word' },</span>
<span class="hljs-comment">// { value: 'out', tag: 'word' },</span>
<span class="hljs-comment">// { value: 'http://winkjs.org/', tag: 'url' },</span>
<span class="hljs-comment">// { value: 'URL', tag: 'word' },</span>
<span class="hljs-comment">// { value: '!', tag: 'punctuation' } ]</span>

<span class="hljs-comment">// Remove stop words:</span>
<span class="hljs-keyword">var</span> t = nlp.tokens.removeWords( [ <span class="hljs-string">'mary'</span>, <span class="hljs-string">'had'</span>, <span class="hljs-string">'a'</span>, <span class="hljs-string">'little'</span>, <span class="hljs-string">'lamb'</span> ] );
<span class="hljs-built_in">console</span>.log( t );
@@ -2550,19 +2564,12 @@ <h3 class='fl m0' id='stringtokenize'>
</div>


<p>The function uses the following set of rules to tokenize:</p>
<ol>
<li>Single quotes are processed first as they may be part of elisions; and
<code>...</code> are converted to ellipses.</li>
<li><code>Not</code> elisions are amplified and then split on elisions. Thus words with elisions get tokenized.</li>
<li>The word <code>cannot</code> is split into <code>can not</code>.</li>
<li><code>. , -</code> punctuations that are commonly embedded in numbers are left intact;</li>
<li>All other punctuations are tokenized.</li>
<li>Currency symbols are padded with spaces, i.e. they become separate tokens.</li>
<li>Underscore (<code>_</code>) embedded in a word is preserved.</li>
<li>Special characters are left untouched and may or may not become separate tokens.</li>
<li>Finally, after removing extra/leading/trailing spaces, the string is split on spaces to tokenize.</li>
</ol>
<p>Tokenizes the input <code>sentence</code> according to the value of the <code>detailed</code> flag.
Any occurrence of <code>...</code> in the <code>sentence</code> is
converted to an ellipsis. In <code>detailed = true</code> mode, it
tags every token with its type; the supported tags are currency, email,
emoji, emoticon, hashtag, number, ordinal, punctuation, quoted_phrase, symbol,
time, mention, url, and word.</p>


<div class='pre p1 fill-light mt0'>string.tokenize</div>
@@ -2582,13 +2589,31 @@ <h3 class='fl m0' id='stringtokenize'>

<div class='space-bottom0'>
<div>
<span class='code bold'>str</span> <code class='quiet'>(<a href="#string">string</a>)</code>
<span class='code bold'>sentence</span> <code class='quiet'>(<a href="#string">string</a>)</code>
— the input string.

</div>

</div>

<div class='space-bottom0'>
<div>
<span class='code bold'>detailed</span> <code class='quiet'>(<a href="https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/Boolean">boolean</a>
= <code>false</code>)</code>
— if true, each token is an object containing the

<code>value</code>
and
<code>tag</code>
of the token; otherwise each token is a string. Its default
value of
<strong>false</strong>
ensures compatibility with the previous version.

</div>

</div>

</div>


@@ -2597,8 +2622,11 @@ <h3 class='fl m0' id='stringtokenize'>


<div class='py1 quiet mt1 prose-big'>Returns</div>
<code><a href="https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/Array">Array</a>&#x3C;<a href="#string">string</a>></code>:
of tokens.
<code>(<a href="https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/Array">Array</a>&#x3C;<a href="#string">string</a>> | <a href="https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/Array">Array</a>&#x3C;<a href="https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/Object">object</a>>)</code>:
an array of strings if
<code>detailed</code>
is false, otherwise
an array of objects.



@@ -2611,8 +2639,20 @@ <h3 class='fl m0' id='stringtokenize'>


<pre class='p1 overflow-auto round fill-light'>tokenize( <span class="hljs-string">"someone's wallet, isn't it? I'll return!"</span> );
<span class="hljs-comment">// -&gt; [ 'someone\'s', 'wallet', ',', 'is', 'not', 'it',</span>
<span class="hljs-comment">// '?', 'i', '\'ll', 'return', '!' ]</span></pre>
<span class="hljs-comment">// -&gt; [ 'someone', '\'s', 'wallet', ',', 'is', 'n\'t', 'it', '?',</span>
<span class="hljs-comment">// 'I', '\'ll', 'return', '!' ]</span>

tokenize( <span class="hljs-string">'For details on wink, check out http://winkjs.org/ URL!'</span>, <span class="hljs-literal">true</span> );
<span class="hljs-comment">// -&gt; [ { value: 'For', tag: 'word' },</span>
<span class="hljs-comment">// { value: 'details', tag: 'word' },</span>
<span class="hljs-comment">// { value: 'on', tag: 'word' },</span>
<span class="hljs-comment">// { value: 'wink', tag: 'word' },</span>
<span class="hljs-comment">// { value: ',', tag: 'punctuation' },</span>
<span class="hljs-comment">// { value: 'check', tag: 'word' },</span>
<span class="hljs-comment">// { value: 'out', tag: 'word' },</span>
<span class="hljs-comment">// { value: 'http://winkjs.org/', tag: 'url' },</span>
<span class="hljs-comment">// { value: 'URL', tag: 'word' },</span>
<span class="hljs-comment">// { value: '!', tag: 'punctuation' } ]</span></pre>



5 changes: 5 additions & 0 deletions package-lock.json


3 changes: 2 additions & 1 deletion package.json
@@ -47,6 +47,7 @@
},
"dependencies": {
"wink-helpers": "^1.4.0",
"wink-porter2-stemmer": "^1.0.8"
"wink-porter2-stemmer": "^1.0.8",
"wink-tokenizer": "^4.0.0"
}
}
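
The new wink-tokenizer dependency supplies the tagging engine used by src/string-tokenize.js below. A minimal sketch, assuming only the factory-plus-tokenize pattern visible in that file (the sample sentence and the two tokens called out are illustrative):

// wink-tokenizer exports a factory; an instance's tokenize() returns an
// array of { value, tag } objects that string.tokenize() passes through.
var winkTokenizer = require( 'wink-tokenizer' );
var myTokenizer = winkTokenizer();
console.log( myTokenizer.tokenize( 'check out http://winkjs.org/!' ) );
// -> includes { value: 'http://winkjs.org/', tag: 'url' } and
//    { value: '!', tag: 'punctuation' }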
80 changes: 35 additions & 45 deletions src/string-tokenize.js
@@ -22,62 +22,52 @@
// If not, see <http://www.gnu.org/licenses/>.

//
var splitElisions = require( './string-split-elisions.js' );
var amplifyNotElision = require( './string-amplify-not-elision.js' );
var rgx = require( './util_regexes.js' );
var winkTokenize = require( 'wink-tokenizer' )().tokenize;

// ## string

// ### tokenize
/**
*
* The function uses the following set of rules to tokenize:
*
* 1. Single quotes are processed first as they may be part of elisions; and
* `...` are converted to ellipses.
* 2. `Not` elisions are amplified and then split on elisions. Thus words with elisions get tokenized.
 * 3. The word `cannot` is split into `can not`.
 * 4. `. , -` punctuations that are commonly embedded in numbers are left intact;
 * 5. All other punctuations are tokenized.
 * 6. Currency symbols are padded with spaces, i.e. they become separate tokens.
 * 7. Underscore (`_`) embedded in a word is preserved.
 * 8. Special characters are left untouched and may or may not become separate tokens.
 * 9. Finally, after removing extra/leading/trailing spaces, the string is split on spaces to tokenize.
 * Tokenizes the input `sentence` according to the value of the `detailed` flag.
 * Any occurrence of `...` in the `sentence` is
 * converted to an ellipsis. In `detailed = true` mode, it
 * tags every token with its type; the supported tags are currency, email,
 * emoji, emoticon, hashtag, number, ordinal, punctuation, quoted_phrase, symbol,
 * time, mention, url, and word.
*
* @name string.tokenize
* @param {string} str — the input string.
* @return {string[]} of tokens.
* @param {string} sentence — the input string.
 * @param {boolean} [detailed=false] — if true, each token is an object containing the
 * `value` and `tag` of the token; otherwise each token is a string. Its default
 * value of **false** ensures compatibility with the previous version.
 * @return {(string[]|object[])} an array of strings if `detailed` is false, otherwise
 * an array of objects.
* @example
* tokenize( "someone's wallet, isn't it? I'll return!" );
* // -> [ 'someone\'s', 'wallet', ',', 'is', 'not', 'it',
* // '?', 'i', '\'ll', 'return', '!' ]
* // -> [ 'someone', '\'s', 'wallet', ',', 'is', 'n\'t', 'it', '?',
* // 'I', '\'ll', 'return', '!' ]
*
* tokenize( 'For details on wink, check out http://winkjs.org/ URL!', true );
* // -> [ { value: 'For', tag: 'word' },
* // { value: 'details', tag: 'word' },
* // { value: 'on', tag: 'word' },
* // { value: 'wink', tag: 'word' },
* // { value: ',', tag: 'punctuation' },
* // { value: 'check', tag: 'word' },
* // { value: 'out', tag: 'word' },
* // { value: 'http://winkjs.org/', tag: 'url' },
* // { value: 'URL', tag: 'word' },
* // { value: '!', tag: 'punctuation' } ]
*/
var tokenize = function ( str ) {
// Handle single quotes first & ellipses.
var su = str
// > TODO: promote to regex utils after adding more test cases
.replace( /(^|[^a-z0-9])(\’|\')/gi, '$1 $2 ')
.replace( /([a-z0-9])(\’|\')(\W)/gi, '$1 $2 $3')
.replace( '...', '…' )
.replace( '…', ' … ' );
var tokens = splitElisions( amplifyNotElision( su ) )
// Handle cannot.
.replace( rgx.cannot, '$1 $2' )
// Separate out punctuations that are not part of a number.
.replace( rgx.nonNumPunctuations, ' $& ' )
// Separate out all other punctuations.
.replace( /[\‘\’\`\“\”\"\[\]\(\)\{\}\…\!\;\?\/\:]/ig, ' $& ' )
// Separate out currency symbol; all separated stuff becomes a token.
.replace( rgx.currency, ' $& ')
.replace( rgx.spaces, ' ' )
.trim()
// Handle period sign in the end specially.
.replace( /\.$/, ' .' )
// Now tokenize on space!
.split( ' ' );
// Splitting an empty string on space leaves an empty string in the array,
// get rid of it.
return ( ( tokens.length === 1 && tokens[ 0 ] === '' ) ? [] : tokens );
var tokenize = function ( sentence, detailed ) {
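// Convert '...' to the ellipsis character and delegate to wink-tokenizer,
// which returns an array of { value, tag } token objects.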
var tokens = winkTokenize( sentence.replace( '...', '…' ) );
var i;
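// In the default (non-detailed) mode, flatten each token object to its value
// so that an array of plain strings is returned, as in earlier versions.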
if ( !detailed ) {
for ( i = 0; i < tokens.length; i += 1 ) tokens[ i ] = tokens[ i ].value;
}

return tokens;
}; // tokenize()

module.exports = tokenize;
21 changes: 17 additions & 4 deletions test/wink-nlp-utils-specs.js
@@ -605,12 +605,12 @@ describe( 'string.tokenize()', function () {
{ whenInputIs: [ '' ], expectedOutputIs: [ ] },
{ whenInputIs: [ ' ' ], expectedOutputIs: [ ] },
{ whenInputIs: [ 'rain rain go away, come again another day' ], expectedOutputIs: [ 'rain', 'rain', 'go', 'away', ',', 'come', 'again', 'another', 'day' ] },
{ whenInputIs: [ 'rain rain_ go away, come again another day' ], expectedOutputIs: [ 'rain', 'rain_', 'go', 'away', ',', 'come', 'again', 'another', 'day' ] },
{ whenInputIs: [ 'rain rain_ go away, come again another day' ], expectedOutputIs: [ 'rain', 'rain', '_', 'go', 'away', ',', 'come', 'again', 'another', 'day' ] },
{ whenInputIs: [ 'what\'s ended in the year 1919 ~? The $1 was equal to 1.2 rupees.' ], expectedOutputIs: [ 'what', '\'s', 'ended', 'in', 'the', 'year', '1919', '~', '?', 'The', '$', '1', 'was', 'equal', 'to', '1.2', 'rupees', '.' ] },
{ whenInputIs: [ 'what ended in the 1919 year~? The £1 was equal to 1.2 rupees.' ], expectedOutputIs: [ 'what', 'ended', 'in', 'the', '1919', 'year~', '?', 'The', '£', '1', 'was', 'equal', 'to', '1.2', 'rupees', '.' ] },
{ whenInputIs: [ 'what ended in the 1919 year~? The £1 was equal to 1.2 rupees.' ], expectedOutputIs: [ 'what', 'ended', 'in', 'the', '1919', 'year', '~', '?', 'The', '£', '1', 'was', 'equal', 'to', '1.2', 'rupees', '.' ] },
{ whenInputIs: [ 'what\'ll \'end in the year 1919\'? The ¥1 was equal to 1.2 rupees.' ], expectedOutputIs: [ 'what', '\'ll', '\'', 'end', 'in', 'the', 'year', '1919', '\'', '?', 'The', '¥', '1', 'was', 'equal', 'to', '1.2', 'rupees', '.' ] },
{ whenInputIs: [ 'what ended in the year\'s last month ? The €1 cannot be equal to 1.2 rupees.' ], expectedOutputIs: [ 'what', 'ended', 'in', 'the', 'year\'s', 'last', 'month', '?', 'The', '€', '1', 'can', 'not', 'be', 'equal', 'to', '1.2', 'rupees', '.' ] },
{ whenInputIs: [ 'Isn\'t... it? ' ], expectedOutputIs: [ 'Is', 'not', '…', 'it', '?' ] },
{ whenInputIs: [ 'what ended in the year\'s last month ? The €1 cannot be equal to 1.2 rupees.' ], expectedOutputIs: [ 'what', 'ended', 'in', 'the', 'year', '\'s', 'last', 'month', '?', 'The', '€', '1', 'cannot', 'be', 'equal', 'to', '1.2', 'rupees', '.' ] },
{ whenInputIs: [ 'Isn\'t... it? ' ], expectedOutputIs: [ 'Is', 'n\'t', '…', 'it', '?' ] },
];

tests.forEach( function ( test ) {
@@ -619,6 +619,19 @@ describe( 'string.tokenize()', function () {
} );
} );

it( 'should tokenize a sentence with multiple contractions & containing extra spaces', function () {
var output = [ { value: 'I', tag: 'word' },
{ value: '\'ll', tag: 'word' },
{ value: 'eat', tag: 'word' },
{ value: 'John', tag: 'word' },
{ value: '\'s', tag: 'word' },
{ value: 'food', tag: 'word' },
{ value: 'today', tag: 'word' },
{ value: 'with', tag: 'word' },
{ value: 'O\'kelly', tag: 'word' } ];
expect( prepare.string.tokenize( ' I\'ll eat John\'s food today with O\'kelly ', true ) ).to.deep.equal( output );
} );
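
// A hypothetical companion spec (illustrative only, not part of this suite) for
// the hashtag and mention tags listed in the documentation above; the token
// values assume wink-tokenizer keeps the '#' and '@' prefixes in the value.
it( 'should tag hashtags and mentions in detailed mode', function () {
  var output = [ { value: 'thanks', tag: 'word' },
                 { value: '@winkjs', tag: 'mention' },
                 { value: 'for', tag: 'word' },
                 { value: '#wink', tag: 'hashtag' },
                 { value: '!', tag: 'punctuation' } ];
  expect( prepare.string.tokenize( 'thanks @winkjs for #wink!', true ) ).to.deep.equal( output );
} );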

errors.slice( 0, 2 ).forEach( function ( error ) {
it( 'should throw ' + error.expectedOutputIs + ' if the input is ' + JSON.stringify( error.whenInputIs ), function () {
expect( prepare.string.tokenize.bind( null, error.whenInputIs ) ).to.throw( error.expectedOutputIs );
