-
Notifications
You must be signed in to change notification settings - Fork 12
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
docs(*): complete JSDoc for tokenize & tokenize0
- Loading branch information
sanjayaksaxena
committed
Oct 12, 2017
1 parent
81d063c
commit 06ff19f
Showing
5 changed files
with
332 additions
and
37 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,83 @@ | ||
// wink-nlp-utils | ||
// NLP Functions for removing HTML Tags, Managing Elisions, | ||
// NGrams, Stemming, Phoneticising to Tokenizing and more. | ||
// | ||
// Copyright (C) 2017 GRAYPE Systems Private Limited | ||
// | ||
// This file is part of “wink-nlp-utils”. | ||
// | ||
// “wink-nlp-utils” is free software: you can redistribute it | ||
// and/or modify it under the terms of the GNU Affero | ||
// General Public License as published by the Free | ||
// Software Foundation, version 3 of the License. | ||
// | ||
// “wink-nlp-utils” is distributed in the hope that it will | ||
// be useful, but WITHOUT ANY WARRANTY; without even | ||
// the implied warranty of MERCHANTABILITY or FITNESS | ||
// FOR A PARTICULAR PURPOSE. See the GNU Affero General | ||
// Public License for more details. | ||
// | ||
// You should have received a copy of the GNU Affero | ||
// General Public License along with “wink-nlp-utils”. | ||
// If not, see <http://www.gnu.org/licenses/>. | ||
|
||
// | ||
var splitElisions = require( './string-split-elisions.js' ); | ||
var amplifyNotElision = require( './string-amplify-not-elision.js' ); | ||
var rgx = require( './util_regexes.js' ); | ||
|
||
// ## string | ||
|
||
// ### tokenize | ||
/**
 *
 * Tokenizes the input string using the following set of rules:
 *
 * 1. Single quotes are processed first as they may be part of elisions; and
 * every `...` is converted to an ellipsis (`…`).
 * 2. `Not` elisions are amplified and then the string is split on elisions.
 * Thus words with elisions get tokenized.
 * 3. The word `cannot` is split into `can not`.
 * 4. `. , -` punctuations that are commonly embedded in numbers are left intact.
 * 5. All other punctuations are tokenized.
 * 6. The currency symbols are padded by space i.e. become separate tokens.
 * 7. Underscore (`_`) embedded in the word is preserved.
 * 8. Special characters are left untouched and may/may not become separate tokens.
 * 9. Finally after removing extra/leading/trailing spaces, split on space to tokenize.
 *
 * @name string.tokenize
 * @param {string} str — the input string.
 * @return {string[]} of tokens; an empty array if nothing is left to tokenize.
 * @example
 * tokenize( "someone's wallet, isn't it? I'll return!" );
 * // -> [ 'someone\'s', 'wallet', ',', 'is', 'not', 'it',
 * //      '?', 'i', '\'ll', 'return', '!' ]
 */
var tokenize = function ( str ) {
  // Handle single quotes first & ellipses.
  var su = str
            // > TODO: promote to regex utils after adding more test cases
            // Quote that does not follow an alphanumeric: pad it on both sides.
            .replace( /(^|[^a-z0-9])(\’|\')/gi, '$1 $2 ')
            // Quote between an alphanumeric and a non-word character: detach it.
            .replace( /([a-z0-9])(\’|\')(\W)/gi, '$1 $2 $3')
            // Global regexes are required here: a string pattern would only
            // replace the first occurrence, leaving later `...` as raw periods.
            .replace( /\.\.\./g, '…' )
            .replace( /…/g, ' … ' );
  var tokens = splitElisions( amplifyNotElision( su ) )
                // Handle cannot.
                .replace( rgx.cannot, '$1 $2' )
                // Separate out punctuations that are not part of a number.
                .replace( rgx.nonNumPunctuations, ' $& ' )
                // Separate out all other punctuations.
                .replace( /[\‘\’\`\“\”\"\[\]\(\)\{\}\…\!\;\?\/\:]/ig, ' $& ' )
                // Separate out currency symbol; all separated stuff becomes a token.
                .replace( rgx.currency, ' $& ')
                .replace( rgx.spaces, ' ' )
                .trim()
                // Handle period sign in the end specially.
                .replace( /\.$/, ' .' )
                // Now tokenize on space!
                .split( ' ' );
  // Splitting an empty string on space leaves an empty string in the array,
  // get rid of it.
  return ( ( tokens.length === 1 && tokens[ 0 ] === '' ) ? [] : tokens );
}; // tokenize()
|
||
module.exports = tokenize; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
// wink-nlp-utils | ||
// NLP Functions for removing HTML Tags, Managing Elisions, | ||
// NGrams, Stemming, Phoneticising to Tokenizing and more. | ||
// | ||
// Copyright (C) 2017 GRAYPE Systems Private Limited | ||
// | ||
// This file is part of “wink-nlp-utils”. | ||
// | ||
// “wink-nlp-utils” is free software: you can redistribute it | ||
// and/or modify it under the terms of the GNU Affero | ||
// General Public License as published by the Free | ||
// Software Foundation, version 3 of the License. | ||
// | ||
// “wink-nlp-utils” is distributed in the hope that it will | ||
// be useful, but WITHOUT ANY WARRANTY; without even | ||
// the implied warranty of MERCHANTABILITY or FITNESS | ||
// FOR A PARTICULAR PURPOSE. See the GNU Affero General | ||
// Public License for more details. | ||
// | ||
// You should have received a copy of the GNU Affero | ||
// General Public License along with “wink-nlp-utils”. | ||
// If not, see <http://www.gnu.org/licenses/>. | ||
|
||
// | ||
var removeElisions = require( './string-remove-elisions.js' ); | ||
var amplifyNotElision = require( './string-amplify-not-elision.js' ); | ||
var rgx = require( './util_regexes.js' ); | ||
|
||
// ## string | ||
|
||
// ### tokenize0 | ||
/**
 *
 * Tokenizes by splitting the input string on **non-words**. This means tokens
 * would consist of only alphas, numerals and underscores; all other characters
 * are treated as separators and get stripped. Elisions are removed as well;
 * negations, however, are retained and amplified.
 *
 * @name string.tokenize0
 * @param {string} str — the input string.
 * @return {string[]} of tokens.
 * @example
 * tokenize0( "someone's wallet, isn't it?" );
 * // -> [ 'someone', 's', 'wallet', 'is', 'not', 'it' ]
 */
var tokenize0 = function ( str ) {
  // Amplify the negations before the remaining elisions are dropped.
  var amplified = amplifyNotElision( str );
  // Drop elisions and break `cannot` apart.
  var cleaned = removeElisions( amplified ).replace( rgx.cannot, '$1 $2' );
  var tokens = cleaned.split( rgx.nonWords );
  var lastIndex;
  // When the input starts or ends with non-word characters, the split leaves
  // an empty string at the corresponding end of the array; discard those.
  if ( tokens[ 0 ] === '' ) tokens.shift();
  lastIndex = tokens.length - 1;
  if ( tokens[ lastIndex ] === '' ) tokens.pop();
  return tokens;
}; // tokenize0()
|
||
module.exports = tokenize0; |
Oops, something went wrong.