diff --git a/docs-toc.yml b/docs-toc.yml
index c32804c..ff72b34 100644
--- a/docs-toc.yml
+++ b/docs-toc.yml
@@ -19,6 +19,8 @@ toc:
       - string.song
       - string.splitElisions
       - string.stem
+      - string.tokenize
+      - string.tokenize0
       - string.trim
       - string.upperCase
   - name: helper
diff --git a/docs/index.html b/docs/index.html
index c85a15c..a98fb2d 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -224,6 +224,26 @@

[docs/index.html additions, rendered as text]

wink-nlp-utils

  • string.tokenize
  • string.tokenize0

@@ -2004,6 +2024,174 @@

string.tokenize

The function uses the following set of rules to tokenize:

1. Single quotes are processed first as they may be part of elisions; and `...` is converted to an ellipsis.
2. `Not` elisions are amplified and then split on elisions; thus words with elisions get tokenized.
3. The word `cannot` is split into `can not`.
4. The punctuations `. , -` that are commonly embedded in numbers are left intact.
5. All other punctuations are tokenized.
6. Currency symbols are padded with spaces, i.e. they become separate tokens.
7. An underscore (`_`) embedded in a word is preserved.
8. Special characters are left untouched and may or may not become separate tokens.
9. Finally, after removing extra/leading/trailing spaces, the string is split on space to tokenize.

Parameters

  str (string) — the input string.

Returns

  Array<string>: array of tokens.

Example

tokenize( "someone's wallet, isn't it? I'll return!" );
// -> [ 'someone\'s', 'wallet', ',', 'is', 'not', 'it',
//      '?', 'i', '\'ll', 'return', '!' ]

string.tokenize0

Tokenizes by splitting the input string on non-words. This means tokens consist of only alphas, numerals and underscores; all other characters will be stripped as they are treated as separators. It also removes all elisions; however, negations are retained and amplified.

Parameters

  str (string) — the input string.

Returns

  Array<string>: array of tokens.

Example
tokenize0( "someone's wallet, isn't it?" );
// -> [ 'someone', 's', 'wallet', 'is', 'not', 'it' ]

diff --git a/src/string-tokenize.js b/src/string-tokenize.js
new file mode 100644
index 0000000..96763e5
--- /dev/null
+++ b/src/string-tokenize.js
@@ -0,0 +1,83 @@
+// wink-nlp-utils
+// NLP Functions for removing HTML Tags, Managing Elisions,
+// NGrams, Stemming, Phoneticising to Tokenizing and more.
+//
+// Copyright (C) 2017 GRAYPE Systems Private Limited
+//
+// This file is part of “wink-nlp-utils”.
+//
+// “wink-nlp-utils” is free software: you can redistribute it
+// and/or modify it under the terms of the GNU Affero
+// General Public License as published by the Free
+// Software Foundation, version 3 of the License.
+//
+// “wink-nlp-utils” is distributed in the hope that it will
+// be useful, but WITHOUT ANY WARRANTY; without even
+// the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Affero General
+// Public License for more details.
+//
+// You should have received a copy of the GNU Affero
+// General Public License along with “wink-nlp-utils”.
+// If not, see <http://www.gnu.org/licenses/>.
+
+//
+var splitElisions = require( './string-split-elisions.js' );
+var amplifyNotElision = require( './string-amplify-not-elision.js' );
+var rgx = require( './util_regexes.js' );
+
+// ## string
+
+// ### tokenize
+/**
+ *
+ * The function uses the following set of rules to tokenize:
+ *
+ * 1. Single quotes are processed first as they may be part of elisions; and
+ * `...` is converted to an ellipsis.
+ * 2. `Not` elisions are amplified and then split on elisions; thus words with elisions get tokenized.
+ * 3. The word `cannot` is split into `can not`.
+ * 4. The punctuations `. , -` that are commonly embedded in numbers are left intact.
+ * 5. All other punctuations are tokenized.
+ * 6. Currency symbols are padded with spaces, i.e. they become separate tokens.
+ * 7. An underscore (`_`) embedded in a word is preserved.
+ * 8. Special characters are left untouched and may or may not become separate tokens.
+ * 9. Finally, after removing extra/leading/trailing spaces, the string is split on space to tokenize.
+ *
+ * @name string.tokenize
+ * @param {string} str — the input string.
+ * @return {string[]} array of tokens.
+ * @example
+ * tokenize( "someone's wallet, isn't it? I'll return!" );
+ * // -> [ 'someone\'s', 'wallet', ',', 'is', 'not', 'it',
+ * //      '?', 'i', '\'ll', 'return', '!' ]
+*/
+var tokenize = function ( str ) {
+  // Handle single quotes first & ellipses.
+  var su = str
+    // > TODO: promote to regex utils after adding more test cases
+    .replace( /(^|[^a-z0-9])(\’|\')/gi, '$1 $2 ')
+    .replace( /([a-z0-9])(\’|\')(\W)/gi, '$1 $2 $3')
+    .replace( '...', '…' )
+    .replace( '…', ' … ' );
+  var tokens = splitElisions( amplifyNotElision( su ) )
+    // Handle cannot.
+    .replace( rgx.cannot, '$1 $2' )
+    // Separate out punctuations that are not part of a number.
+    .replace( rgx.nonNumPunctuations, ' $& ' )
+    // Separate out all other punctuations.
+    .replace( /[\‘\’\`\“\”\"\[\]\(\)\{\}\…\!\;\?\/\:]/ig, ' $& ' )
+    // Separate out currency symbol; all separated stuff becomes a token.
+    .replace( rgx.currency, ' $& ')
+    .replace( rgx.spaces, ' ' )
+    .trim()
+    // Handle period sign in the end specially.
+    .replace( /\.$/, ' .' )
+    // Now tokenize on space!
+    .split( ' ' );
+  // Splitting an empty string on space leaves an empty string in the array,
+  // get rid of it.
+  return ( ( tokens.length === 1 && tokens[ 0 ] === '' ) ? [] : tokens );
+}; // tokenize()
+
+module.exports = tokenize;
diff --git a/src/string-tokenize0.js b/src/string-tokenize0.js
new file mode 100644
index 0000000..9c35a8c
--- /dev/null
+++ b/src/string-tokenize0.js
@@ -0,0 +1,57 @@
+// wink-nlp-utils
+// NLP Functions for removing HTML Tags, Managing Elisions,
+// NGrams, Stemming, Phoneticising to Tokenizing and more.
+//
+// Copyright (C) 2017 GRAYPE Systems Private Limited
+//
+// This file is part of “wink-nlp-utils”.
+//
+// “wink-nlp-utils” is free software: you can redistribute it
+// and/or modify it under the terms of the GNU Affero
+// General Public License as published by the Free
+// Software Foundation, version 3 of the License.
+//
+// “wink-nlp-utils” is distributed in the hope that it will
+// be useful, but WITHOUT ANY WARRANTY; without even
+// the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Affero General
+// Public License for more details.
+//
+// You should have received a copy of the GNU Affero
+// General Public License along with “wink-nlp-utils”.
+// If not, see <http://www.gnu.org/licenses/>.
+
+//
+var removeElisions = require( './string-remove-elisions.js' );
+var amplifyNotElision = require( './string-amplify-not-elision.js' );
+var rgx = require( './util_regexes.js' );
+
+// ## string
+
+// ### tokenize0
+/**
+ *
+ * Tokenizes by splitting the input string on **non-words**. This means tokens
+ * consist of only alphas, numerals and underscores; all other characters will
+ * be stripped as they are treated as separators. It also removes all elisions;
+ * however, negations are retained and amplified.
+ *
+ * @name string.tokenize0
+ * @param {string} str — the input string.
+ * @return {string[]} array of tokens.
+ * @example
+ * tokenize0( "someone's wallet, isn't it?" );
+ * // -> [ 'someone', 's', 'wallet', 'is', 'not', 'it' ]
+*/
+var tokenize0 = function ( str ) {
+  var tokens = removeElisions( amplifyNotElision( str ) )
+    .replace( rgx.cannot, '$1 $2' )
+    .split( rgx.nonWords );
+  // Check the 0th and the last element of the array for empty strings because
+  // if the first/last characters are non-words then these will be empty strings!
+  if ( tokens[ 0 ] === '' ) tokens.shift();
+  if ( tokens[ tokens.length - 1 ] === '' ) tokens.pop();
+  return tokens;
+}; // tokenize0()
+
+module.exports = tokenize0;
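A matching sketch for the stricter tokenizer. The first call illustrates the split-on-non-words behaviour described above, assuming rgx.nonWords matches runs of non-word characters as the JSDoc implies; the second call repeats the documented example. The require path again presumes a script at the repository root.

// Usage sketch for src/string-tokenize0.js; path assumes the script lives at
// the repository root.
var tokenize0 = require( './src/string-tokenize0.js' );

// Only word characters (alphas, numerals, underscore) survive; the trailing
// '!' is a separator, and the empty string it leaves behind is popped off.
console.log( tokenize0( 'hello_world 42!' ) );
// -> [ 'hello_world', '42' ]

// Documented example from the JSDoc above.
console.log( tokenize0( "someone's wallet, isn't it?" ) );
// -> [ 'someone', 's', 'wallet', 'is', 'not', 'it' ]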
diff --git a/src/wink-nlp-utils.js b/src/wink-nlp-utils.js
index 13f3fbc..1e0b6ff 100644
--- a/src/wink-nlp-utils.js
+++ b/src/wink-nlp-utils.js
@@ -218,16 +218,7 @@ prepare.string.composeCorpus = require( './string-compose-corpus.js' );
 // This means tokens would consists of only alphas, numerals and underscores;
 // all other characters will be stripped as they are treated as separators.
 // However negations are retained and amplified but all other elisions are removed.
-prepare.string.tokenize0 = function ( s ) {
-  var tokens = prepare.string.removeElisions( prepare.string.amplifyNotElision( s ) )
-    .replace( rgx.cannot, '$1 $2' )
-    .split( rgx.nonWords );
-  // Check the 0th and last element of array for empty string because if
-  // fisrt/last characters are non-words then these will be empty stings!
-  if ( tokens[ 0 ] === '' ) tokens.shift();
-  if ( tokens[ tokens.length - 1 ] === '' ) tokens.pop();
-  return tokens;
-}; // tokenize0()
+prepare.string.tokenize0 = require( './string-tokenize0.js' );

 // #### tokenize
 //
@@ -242,33 +233,7 @@ prepare.string.tokenize0 = function ( s ) {
 // 6. Retains `_` as is - no tokenization,
 // 7. Spacial characters are left untouched and may/may not become separate token.
 // 8. Finally after removing extra/leading/trailing spaces, split on space to tokenize.
-prepare.string.tokenize = function ( s ) {
-  // Handle single quotes first & ellipses.
-  var su = s
-    // > TODO: promote to regex utils after adding more test cases
-    .replace( /(^|[^a-z0-9])(\’|\')/gi, '$1 $2 ')
-    .replace( /([a-z0-9])(\’|\')(\W)/gi, '$1 $2 $3')
-    .replace( '...', '…' )
-    .replace( '…', ' … ' );
-  var tokens = prepare.string.splitElisions( prepare.string.amplifyNotElision( su ) )
-    // Handle cannot.
-    .replace( rgx.cannot, '$1 $2' )
-    // Separate out punctuations that are not part of a number.
-    .replace( rgx.nonNumPunctuations, ' $& ' )
-    // Separate out all other punctuations.
-    .replace( /[\‘\’\`\“\”\"\[\]\(\)\{\}\…\!\;\?\/\:]/ig, ' $& ' )
-    // Separate out currency symbol; all separated stuff becomes a token.
-    .replace( rgx.currency, ' $& ')
-    .replace( rgx.spaces, ' ' )
-    .trim()
-    // Handle period sign in the end specially.
-    .replace( /\.$/, ' .' )
-    // Now tokenize on space!
-    .split( ' ' );
-  // Splitting an empty string on space leaves an empty string in the array,
-  // get rid of it.
-  return ( ( tokens.length === 1 && tokens[ 0 ] === '' ) ? [] : tokens );
-}; // tokenize()
+prepare.string.tokenize = require( './string-tokenize.js' );

 // #### stem
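Since the refactor only moves the two implementations into their own modules, the public API should be unchanged. A quick equivalence check, assuming src/wink-nlp-utils.js still exports the prepare object as its module.exports (not confirmed by this diff):

// Node caches modules by resolved path, so if the facade simply re-requires
// the split-out files, the properties and the modules are the same objects.
var prepare   = require( './src/wink-nlp-utils.js' );
var tokenize  = require( './src/string-tokenize.js' );
var tokenize0 = require( './src/string-tokenize0.js' );

console.log( prepare.string.tokenize === tokenize );   // -> true
console.log( prepare.string.tokenize0 === tokenize0 ); // -> true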