docs(*): complete JSDoc for tokenize & tokenize0

winkjs · Oct 12, 2017 · 06ff19f · 06ff19f
1 parent 81d063c
commit 06ff19f
Show file tree

Hide file tree

Showing 5 changed files with 332 additions and 37 deletions.
diff --git a/docs-toc.yml b/docs-toc.yml
@@ -19,6 +19,8 @@ toc:
   - string.song
   - string.splitElisions
   - string.stem
+  - string.tokenize
+  - string.tokenize0
   - string.trim
   - string.upperCase
   - name: helper

diff --git a/docs/index.html b/docs/index.html
@@ -224,6 +224,26 @@ <h3 class='mb0 no-anchor'>wink-nlp-utils</h3>
                 </li>
 
 
+                <li><a
+                  href='#stringtokenize'
+                  class="">
+                  string.tokenize
+
+                </a>
+
+                </li>
+
+
+                <li><a
+                  href='#stringtokenize0'
+                  class="">
+                  string.tokenize0
+
+                </a>
+
+                </li>
+
+
                 <li><a
                   href='#stringtrim'
                   class="">
@@ -2004,6 +2024,174 @@ <h3 class='fl m0' id='stringstem'>
 
 
 
+</section>
+
+
+
+
+            <section class='p2 mb2 clearfix bg-white minishadow'>
+
+
+  <div class='clearfix'>
+
+    <h3 class='fl m0' id='stringtokenize'>
+      string.tokenize
+    </h3>
+
+
+  </div>
+
+
+  <p>The function uses the following set of rules to tokenize: </p>
+<ol>
+<li>Single quotes are processed first as they may be part of elisions; and
+<code>...</code> are converted to ellipses.</li>
+<li><code>Not</code> elisions are amplified and then split on elisions. Thus words with elisions get tokenized.</li>
+<li>The word <code>cannot</code> is split into <code>can not</code>.</li>
+<li><code>. , -</code> punctuations that are commonly embedded in numbers are left intact.</li>
+<li>All other punctuations are tokenized.</li>
+<li>The currency symbols are padded by space i.e. become separate tokens.</li>
+<li>Underscore (<code>_</code>) embedded in the word is preserved.</li>
+<li>Special characters are left untouched and may/may not become separate tokens.</li>
+<li>Finally after removing extra/leading/trailing spaces, split on space to tokenize.</li>
+</ol>
+
+
+  <div class='pre p1 fill-light mt0'>string.tokenize</div>
+
+
+
+
+
+
+
+
+
+
+
+    <div class='py1 quiet mt1 prose-big'>Parameters</div>
+    <div class='prose'>
+
+        <div class='space-bottom0'>
+          <div>
+            <span class='code bold'>str</span> <code class='quiet'>(<a href="#string">string</a>)</code>
+	    — the input string.
+
+          </div>
+
+        </div>
+
+    </div>
+
+
+
+
+
+
+      <div class='py1 quiet mt1 prose-big'>Returns</div>
+      <code><a href="https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Array">Array</a>&#x3C;<a href="#string">string</a>></code>:
+        of tokens.
+
+
+
+
+
+
+
+
+    <div class='py1 quiet mt1 prose-big'>Example</div>
+
+
+      <pre class='p1 overflow-auto round fill-light'>tokenize( <span class="hljs-string">"someone's wallet, isn't it? I'll return!"</span> );
+<span class="hljs-comment">// -&gt; [ 'someone\'s', 'wallet', ',', 'is', 'not', 'it',</span>
+<span class="hljs-comment">//      '?', 'i', '\'ll', 'return', '!' ]</span></pre>
+
+
+
+
+
+
+
+
+</section>
+
+
+
+
+            <section class='p2 mb2 clearfix bg-white minishadow'>
+
+
+  <div class='clearfix'>
+
+    <h3 class='fl m0' id='stringtokenize0'>
+      string.tokenize0
+    </h3>
+
+
+  </div>
+
+
+  <p>Tokenizes by splitting the input string on <strong>non-words</strong>. This means tokens would
+consist of only alphas, numerals and underscores; all other characters will
+be stripped as they are treated as separators. It also removes all elisions;
+however negations are retained and amplified.</p>
+
+
+  <div class='pre p1 fill-light mt0'>string.tokenize0</div>
+
+
+
+
+
+
+
+
+
+
+
+    <div class='py1 quiet mt1 prose-big'>Parameters</div>
+    <div class='prose'>
+
+        <div class='space-bottom0'>
+          <div>
+            <span class='code bold'>str</span> <code class='quiet'>(<a href="#string">string</a>)</code>
+	    — the input string.
+
+          </div>
+
+        </div>
+
+    </div>
+
+
+
+
+
+
+      <div class='py1 quiet mt1 prose-big'>Returns</div>
+      <code><a href="https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Array">Array</a>&#x3C;<a href="#string">string</a>></code>:
+        of tokens.
+
+
+
+
+
+
+
+
+    <div class='py1 quiet mt1 prose-big'>Example</div>
+
+
+      <pre class='p1 overflow-auto round fill-light'>tokenize0( <span class="hljs-string">"someone's wallet, isn't it?"</span> );
+<span class="hljs-comment">// -&gt; [ 'someone', 's', 'wallet', 'is', 'not', 'it' ]</span></pre>
+
+
+
+
+
+
+
+
 </section>
 
 

diff --git a/src/string-tokenize.js b/src/string-tokenize.js
@@ -0,0 +1,83 @@
+//     wink-nlp-utils
+//     NLP Functions for removing HTML Tags, Managing Elisions,
+//     NGrams, Stemming, Phoneticising to Tokenizing and more.
+//
+//     Copyright (C) 2017  GRAYPE Systems Private Limited
+//
+//     This file is part of “wink-nlp-utils”.
+//
+//     “wink-nlp-utils” is free software: you can redistribute it
+//     and/or modify it under the terms of the GNU Affero
+//     General Public License as published by the Free
+//     Software Foundation, version 3 of the License.
+//
+//     “wink-nlp-utils” is distributed in the hope that it will
+//     be useful, but WITHOUT ANY WARRANTY; without even
+//     the implied warranty of MERCHANTABILITY or FITNESS
+//     FOR A PARTICULAR PURPOSE.  See the GNU Affero General
+//     Public License for more details.
+//
+//     You should have received a copy of the GNU Affero
+//     General Public License along with “wink-nlp-utils”.
+//     If not, see <http://www.gnu.org/licenses/>.
+
+//
+var splitElisions = require( './string-split-elisions.js' );
+var amplifyNotElision = require( './string-amplify-not-elision.js' );
+var rgx = require( './util_regexes.js' );
+
+// ## string
+
+// ### tokenize
+/**
+ *
+ * The function uses the following set of rules to tokenize: 
+ *
+ * 1. Single quotes are processed first as they may be part of elisions; and
+ * `...` are converted to ellipses.
+ * 2. `Not` elisions are amplified and then split on elisions. Thus words with elisions get tokenized.
+ * 3. The word `cannot` is split in to `can not`.
+ * 4. `. , -` punctuations that commonly embedded in numbers are left intact,
+ * 5. All other punctuations are tokenized.
+ * 6. The currency symbols are padded by space i.e. become separate tokens.
+ * 7. Underscore (`_`) embedded in the word is preserved.
+ * 8. Spacial characters are left untouched and may/may not become separate token.
+ * 9. Finally after removing extra/leading/trailing spaces, split on space to tokenize.
+ *
+ * @name string.tokenize
+ * @param {string} str — the input string.
+ * @return {string[]} of tokens.
+ * @example
+ * tokenize( "someone's wallet, isn't it? I'll return!" );
+ * // -> [ 'someone\'s', 'wallet', ',', 'is', 'not', 'it',
+ * //      '?', 'i', '\'ll', 'return', '!' ]
+ */
+var tokenize = function ( str ) {
+  // Handle single quotes first & ellipses.
+  var su = str
+            // > TODO: promote to regex utils after adding more test cases
+            .replace( /(^|[^a-z0-9])(\’|\')/gi, '$1 $2 ')
+            .replace( /([a-z0-9])(\’|\')(\W)/gi, '$1 $2 $3')
+            .replace( '...', '…' )
+            .replace( '…', ' … ' );
+  var tokens = splitElisions( amplifyNotElision( su ) )
+            // Handle cannot.
+            .replace( rgx.cannot, '$1 $2' )
+            // Separate out punctuations that are not part of a number.
+            .replace( rgx.nonNumPunctuations, ' $& ' )
+            // Separate out all other punctuations.
+            .replace( /[\‘\’\`\“\”\"\[\]\(\)\{\}\…\!\;\?\/\:]/ig, ' $& ' )
+            // Separate out currency symbol; all separated stuff becomes a token.
+            .replace( rgx.currency, ' $& ')
+            .replace( rgx.spaces, ' ' )
+            .trim()
+            // Handle period sign in the end specially.
+            .replace( /\.$/, ' .' )
+            // Now tokenize on space!
+            .split( ' ' );
+  // Splitting an empty string on space leaves an empty string in the array,
+  // get rid of it.
+  return ( ( tokens.length === 1 && tokens[ 0 ] === '' ) ? [] : tokens );
+}; // tokenize()
+
+module.exports = tokenize;
diff --git a/src/string-tokenize0.js b/src/string-tokenize0.js
@@ -0,0 +1,57 @@
+//     wink-nlp-utils
+//     NLP Functions for removing HTML Tags, Managing Elisions,
+//     NGrams, Stemming, Phoneticising to Tokenizing and more.
+//
+//     Copyright (C) 2017  GRAYPE Systems Private Limited
+//
+//     This file is part of “wink-nlp-utils”.
+//
+//     “wink-nlp-utils” is free software: you can redistribute it
+//     and/or modify it under the terms of the GNU Affero
+//     General Public License as published by the Free
+//     Software Foundation, version 3 of the License.
+//
+//     “wink-nlp-utils” is distributed in the hope that it will
+//     be useful, but WITHOUT ANY WARRANTY; without even
+//     the implied warranty of MERCHANTABILITY or FITNESS
+//     FOR A PARTICULAR PURPOSE.  See the GNU Affero General
+//     Public License for more details.
+//
+//     You should have received a copy of the GNU Affero
+//     General Public License along with “wink-nlp-utils”.
+//     If not, see <http://www.gnu.org/licenses/>.
+
+//
+var removeElisions = require( './string-remove-elisions.js' );
+var amplifyNotElision = require( './string-amplify-not-elision.js' );
+var rgx = require( './util_regexes.js' );
+
+// ## string
+
+// ### tokenize0
+/**
+ *
+ * Tokenizes by splitting the input string on **non-words**. This means tokens would
+ * consists of only alphas, numerals and underscores; all other characters will
+ * be stripped as they are treated as separators. It also removes all elisions;
+ * however negations are retained and amplified.
+ *
+ * @name string.tokenize0
+ * @param {string} str — the input string.
+ * @return {string[]} of tokens.
+ * @example
+ * tokenize0( "someone's wallet, isn't it?" );
+ * // -> [ 'someone', 's', 'wallet', 'is', 'not', 'it' ]
+ */
+var tokenize0 = function ( str ) {
+  var tokens = removeElisions( amplifyNotElision( str ) )
+                .replace( rgx.cannot, '$1 $2' )
+                .split( rgx.nonWords );
+  // Check the 0th and last element of array for empty string because if
+  // fisrt/last characters are non-words then these will be empty stings!
+  if ( tokens[ 0 ] === '' ) tokens.shift();
+  if ( tokens[ tokens.length - 1 ] === '' ) tokens.pop();
+  return tokens;
+}; // tokenize0()
+
+module.exports = tokenize0;