diff --git a/README.md b/README.md index de23129..f040af3 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ var nlp = require( 'wink-nlp-utils' ); // Extract person's name from a string: var name = nlp.string.extractPersonsName( 'Dr. Sarah Connor M. Tech., PhD. - AI' ); console.log( name ); -// name -> 'Sarah Connor' +// -> 'Sarah Connor' // Compose all possible sentences from a string: var str = '[I] [am having|have] [a] [problem|question]'; diff --git a/docs-toc.yml b/docs-toc.yml index 0fdbe26..c32804c 100644 --- a/docs-toc.yml +++ b/docs-toc.yml @@ -1,7 +1,8 @@ toc: - name: string - - string.bong - string.amplifyNotElision + - string.bong + - string.composeCorpus - string.extractPersonsName - string.extractRunOfCapitalWords - string.lowerCase @@ -13,9 +14,11 @@ toc: - string.removePunctuations - string.removeSplChars - string.retainAlphaNums + - string.sentences - string.soc - string.song - string.splitElisions + - string.stem - string.trim - string.upperCase - name: helper diff --git a/docs/index.html b/docs/index.html index 8bea798..c85a15c 100644 --- a/docs/index.html +++ b/docs/index.html @@ -34,6 +34,16 @@

wink-nlp-utils

+
  • + string.amplifyNotElision + + + +
  • + +
  • @@ -45,9 +55,9 @@

    wink-nlp-utils

  • - string.amplifyNotElision + string.composeCorpus @@ -164,6 +174,16 @@

    wink-nlp-utils

  • +
  • + string.sentences + + + +
  • + +
  • @@ -194,6 +214,16 @@

    wink-nlp-utils

  • +
  • + string.stem + + + +
  • + +
  • @@ -308,6 +338,83 @@

    +
    + + +
    + +

    + string.amplifyNotElision +

    + + +
    + + +

    Amplifies the not elision by converting it into not; for example isn't +becomes is not.

    + + +
    string.amplifyNotElision
    + + + + + + + + + + + +
    Parameters
    +
    + +
    +
    + str (string) + — the input string. + +
    + +
    + +
    + + + + + + +
    Returns
    + string: + input string after not elision amplification. + + + + + + + + +
    Example
    + + +
    amplifyNotElision( "someone's wallet, isn't it?" );
    +// -> "someone's wallet, is not it?"
    + + + + + + + + +
    + + + +
    @@ -446,19 +553,24 @@

    -

    - string.amplifyNotElision +

    + string.composeCorpus

    -

    Amplifies the not elision by converting it into not; for example isn't -becomes is not.

    +

    Generates all possible sentences from the input argument string. +The string s must follow a special syntax as illustrated in the +example below:
    +'[I] [am having|have] [a] [problem|question]'

    +

    Each phrase must be quoted between [ ] and each possible option of phrases +(if any) must be separated by a | character. The corpus is composed by +computing the cartesian product of all the phrases.

    -
    string.amplifyNotElision
    +
    string.composeCorpus
    @@ -490,8 +602,8 @@

    Returns
    - string: - input string after not elision amplification. + Array<string>: + of all possible sentences. @@ -503,8 +615,11 @@

    Example
    -
    amplifyNotElision( "someone's wallet, isn't it?" );
    -// -> "someone's wallet, is not it?"
    +
    composeCorpus( '[I] [am having|have] [a] [problem|question]' );
    +// -> [ 'I am having a problem',
    +//      'I am having a question',
    +//      'I have a problem',
    +//      'I have a question' ]
    @@ -1393,6 +1508,99 @@

    +

    + + + + +
    + + +
    + +

    + string.sentences +

    + + +
    + + +

    Splits the input string into sentences. Punctuation marks found at the end +of a sentence are retained. The function can handle sentences beginning with +numbers as well, though it is not a good English practice. It uses ~ as the +special character for splitting and therefore it must not be present in the +input string; else you may give another special character as the second argument.

    + + +
    string.sentences
    + + + + + + + + + + + +
    Parameters
    +
    + +
    +
    + str (string) + — the input string. + +
    + +
    + +
    +
    + splChar (char + = '~') + — a single character to be used for splitting into sentences; +it must not be present in the +str +. + 
    + +
    + +
    + + + + + + +
    Returns
    + Array<string>: + of sentences. + + + + + + + + +
    Example
    + + +
    sentences( 'There is a cat. 2 dogs are running!' );
    +// -> [ 'There is a cat.', '2 dogs are running!' ]
    + + + + + + + +
    @@ -1720,6 +1928,82 @@

    + + + + + +
    + + +
    + +

    + string.stem +

    + + +
    + + +

    Stems an inflected word using Porter2 stemming algorithm.

    + + +
    string.stem
    + + + + + + + + + + + +
    Parameters
    +
    + +
    +
    + word (string) + — to be stemmed. + +
    + +
    + +
    + + + + + + +
    Returns
    + string: + the stemmed word. + + + + + + + + +
    Example
    + + +
    stem( 'consisting' );
    +// -> 'consist'
    + + + + + + + +
    diff --git a/src/string-compose-corpus.js b/src/string-compose-corpus.js new file mode 100644 index 0000000..b0aed55 --- /dev/null +++ b/src/string-compose-corpus.js @@ -0,0 +1,70 @@ +// wink-nlp-utils +// NLP Functions for removing HTML Tags, Managing Elisions, +// NGrams, Stemming, Phoneticising to Tokenizating and more. +// +// Copyright (C) 2017 GRAYPE Systems Private Limited +// +// This file is part of “wink-nlp-utils”. +// +// “wink-nlp-utils” is free software: you can redistribute it +// and/or modify it under the terms of the GNU Affero +// General Public License as published by the Free +// Software Foundation, version 3 of the License. +// +// “wink-nlp-utils” is distributed in the hope that it will +// be useful, but WITHOUT ANY WARRANTY; without even +// the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Affero General +// Public License for more details. +// +// You should have received a copy of the GNU Affero +// General Public License along with “wink-nlp-utils”. +// If not, see . + +// +var helpers = require( 'wink-helpers' ); +var returnQuotedTextExtractor = require( './helper-return-quoted-text-extractor.js' ); +var extractQuotedText = returnQuotedTextExtractor( '[', ']' ); +// ## string + +// ### composeCorpus +/** + * + * Generates all possible sentences from the input argument string. + * The string s must follow a special syntax as illustrated in the + * example below:
    + * `'[I] [am having|have] [a] [problem|question]'`
    + * + * Each phrase must be quoted between `[ ]` and each possible option of phrases + * (if any) must be separated by a `|` character. The corpus is composed by + * computing the cartesian product of all the phrases. + * + * @name string.composeCorpus + * @param {string} str — the input string. + * @return {string[]} of all possible sentences. + * @example + * composeCorpus( '[I] [am having|have] [a] [problem|question]' ); + * // -> [ 'I am having a problem', + * // 'I am having a question', + * // 'I have a problem', + * // 'I have a question' ] + */ +var composeCorpus = function ( str ) { + if ( !str || ( typeof str !== 'string' ) ) return []; + + var quotedTextElems = extractQuotedText( str ); + var corpus = []; + var finalCorpus = []; + + if ( !quotedTextElems ) return []; + quotedTextElems.forEach( function ( e ) { + corpus.push( e.split( '|' ) ); + } ); + + helpers.array.product( corpus ).forEach( function ( e ) { + finalCorpus.push( e.join( ' ' ) ); + } ); + return ( finalCorpus ); +}; // composeCorpus() + +module.exports = composeCorpus; diff --git a/src/string-sentences.js b/src/string-sentences.js new file mode 100644 index 0000000..57427c9 --- /dev/null +++ b/src/string-sentences.js @@ -0,0 +1,58 @@ +// wink-nlp-utils +// NLP Functions for removing HTML Tags, Managing Elisions, +// NGrams, Stemming, Phoneticising to Tokenizating and more. +// +// Copyright (C) 2017 GRAYPE Systems Private Limited +// +// This file is part of “wink-nlp-utils”. +// +// “wink-nlp-utils” is free software: you can redistribute it +// and/or modify it under the terms of the GNU Affero +// General Public License as published by the Free +// Software Foundation, version 3 of the License. +// +// “wink-nlp-utils” is distributed in the hope that it will +// be useful, but WITHOUT ANY WARRANTY; without even +// the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Affero General +// Public License for more details. 
+// +// You should have received a copy of the GNU Affero +// General Public License along with “wink-nlp-utils”. +// If not, see . + +// +var rgx = require( './util_regexes.js' ); +var trim = require( './string-trim.js' ); +// ## string + +// ### sentences +/** + * + * Splits the input string into sentences. Punctuation marks found at the end + * of a sentence are retained. The function can handle sentences beginning with + * numbers as well, though it is not a good English practice. It uses `~` as the + * special character for splitting and therefore it must not be present in the + * input string; else you may give another special character as the second argument. + * + * @name string.sentences + * @param {string} str — the input string. + * @param {char} [splChar='~'] — a single character to be used for splitting into sentences; + * it must not be present in the `str`. + * @return {string[]} of sentences. + * @example + * sentences( 'There is a cat. 2 dogs are running!' ); + * // -> [ 'There is a cat.', '2 dogs are running!' ] + */ +var sentences = function ( str, splChar ) { + var splCh = splChar || '~'; + var substitute = '$1' + splCh; + return ( str + .replace( '...', '…' ) + .replace( rgx.eosPunctuations, substitute ) + .split( splCh ) + .map( trim ) + ); +}; // sentences() + +module.exports = sentences; diff --git a/src/string-stem.js b/src/string-stem.js new file mode 100644 index 0000000..f0809d7 --- /dev/null +++ b/src/string-stem.js @@ -0,0 +1,46 @@ +// wink-nlp-utils +// NLP Functions for removing HTML Tags, Managing Elisions, +// NGrams, Stemming, Phoneticising to Tokenizating and more. +// +// Copyright (C) 2017 GRAYPE Systems Private Limited +// +// This file is part of “wink-nlp-utils”. +// +// “wink-nlp-utils” is free software: you can redistribute it +// and/or modify it under the terms of the GNU Affero +// General Public License as published by the Free +// Software Foundation, version 3 of the License. 
+// +// “wink-nlp-utils” is distributed in the hope that it will +// be useful, but WITHOUT ANY WARRANTY; without even +// the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Affero General +// Public License for more details. +// +// You should have received a copy of the GNU Affero +// General Public License along with “wink-nlp-utils”. +// If not, see . + +// +var porter2Stemmer = require( 'wink-porter2-stemmer' ); + +// ## string + +// ### stem +/** + * + * Stems an inflected word using Porter2 stemming algorithm. + * + * @name string.stem + * @param {string} word — to be stemmed. + * @return {string} the stemmed word. + * + * @example + * stem( 'consisting' ); + * // -> 'consist' + */ +var stem = function ( word ) { + return ( porter2Stemmer( word ) ); +}; // stem() + +module.exports = stem; diff --git a/src/wink-nlp-utils.js b/src/wink-nlp-utils.js index 8cf384f..13f3fbc 100644 --- a/src/wink-nlp-utils.js +++ b/src/wink-nlp-utils.js @@ -27,7 +27,7 @@ var rgx = require( './util_regexes.js' ); var porter2Stemmer = require( 'wink-porter2-stemmer' ); var phnrgx = require( './phonetize_regexes.js' ); var defaultStopWords = require( './dictionaries/stop_words.json' ); -var helpers = require( 'wink-helpers' ); +// var helpers = require( 'wink-helpers' ); // ### Prepare Name Space @@ -201,16 +201,7 @@ prepare.string.song = require( './string-song.js' ); // It uses `~` as the `splChar` for splitting and therefore // it must not be present in the input string; you may give another `splChar` // as the second argument. 
-prepare.string.sentences = function ( s, splChar ) { - var splCh = splChar || '~'; - var substitute = '$1' + splCh; - return ( s - .replace( '...', '…' ) - .replace( rgx.eosPunctuations, substitute ) - .split( splCh ) - .map( prepare.string.trim ) - ); -}; // sentences() +prepare.string.sentences = require( './string-sentences.js' ); // #### compose corpus @@ -219,23 +210,7 @@ prepare.string.sentences = function ( s, splChar ) { // `'[I] [am having|have] [a] [problem|question]'`
    // The corpus is composed by computing the cartesian product of all the phrases. // It returns an array of sentences (i.e. strings). -prepare.string.composeCorpus = function ( s ) { - if ( !s || ( typeof s !== 'string' ) ) return []; - var extractQuotedText = prepare.helper.returnQuotedTextExtractor( '[', ']' ); - var quotedTextElems = extractQuotedText( s ); - var corpus = []; - var finalCorpus = []; - - if ( !quotedTextElems ) return []; - quotedTextElems.forEach( function ( e ) { - corpus.push( e.split( '|' ) ); - } ); - - helpers.array.product( corpus ).forEach( function ( e ) { - finalCorpus.push( e.join( ' ' ) ); - } ); - return ( finalCorpus ); -}; // composeCorpus() +prepare.string.composeCorpus = require( './string-compose-corpus.js' ); // #### tokenize0