diff --git a/docs-toc.yml b/docs-toc.yml index 9cacfb2..aa5da89 100644 --- a/docs-toc.yml +++ b/docs-toc.yml @@ -26,6 +26,8 @@ toc: - string.trim - string.upperCase - name: tokens + - tokens.appendBigrams + - tokens.bigrams - tokens.bow - tokens.phonetize - tokens.propagateNegations diff --git a/docs/index.html b/docs/index.html index 1402c88..f71d352 100644 --- a/docs/index.html +++ b/docs/index.html @@ -294,6 +294,26 @@

wink-nlp-utils

+
  • + tokens.appendBigrams + + + +
  • + + +
  • + tokens.bigrams + + + +
  • + +
  • @@ -2635,6 +2655,166 @@

    +
    + + +
    + +

    + tokens.appendBigrams +

    + + +
    + + +

    Generates bigrams from the input tokens and appends them to the input tokens.

    + + +
    tokens.appendBigrams
    + + + + + + + + + + + +
    Parameters
    +
    + +
    +
    + tokens (Array<string>) + — the input tokens. + +
    + +
    + +
    + + + + + + +
    Returns
    + Array<string>: + the input tokens appended with their bigrams. + + + + + + + + +
    Example
    + + +
    appendBigrams( [ 'he', 'acted', 'decisively', 'today' ] );
    +// -> [ 'he',
    +//      'acted',
    +//      'decisively',
    +//      'today',
    +//      'he_acted',
    +//      'acted_decisively',
    +//      'decisively_today' ]
    + + + + + + + + +
    + + + + +
    + + +
    + +

    + tokens.bigrams +

    + + +
    + + +

    Generates bigrams from the input tokens.

    + + +
    tokens.bigrams
    + + + + + + + + + + + +
    Parameters
    +
    + +
    +
    + tokens (Array<string>) + — the input tokens. + +
    + +
    + +
    + + + + + + +
    Returns
    + Array<Array<string>>: + the bigrams. + + + + + + + + +
    Example
    + + +
    bigrams( [ 'he', 'acted', 'decisively', 'today' ] );
    +// -> [ [ 'he', 'acted' ],
    +//      [ 'acted', 'decisively' ],
    +//      [ 'decisively', 'today' ] ]
    + + + + + + + + +
    + + + +
    @@ -2818,7 +2998,7 @@

    Returns
    - string: + Array<string>: phonetized tokens. @@ -3083,7 +3263,7 @@

    Returns
    - string: + Array<string>: soundex coded tokens. @@ -3273,7 +3453,7 @@

    Returns
    - string: + Array<string>: stemmed tokens. diff --git a/src/tokens-append-bigrams.js b/src/tokens-append-bigrams.js new file mode 100644 index 0000000..9d716ef --- /dev/null +++ b/src/tokens-append-bigrams.js @@ -0,0 +1,54 @@ +// wink-nlp-utils +// NLP Functions for removing HTML Tags, Managing Elisions, +// NGrams, Stemming, Phoneticising to Tokenizating and more. +// +// Copyright (C) 2017 GRAYPE Systems Private Limited +// +// This file is part of “wink-nlp-utils”. +// +// “wink-nlp-utils” is free software: you can redistribute it +// and/or modify it under the terms of the GNU Affero +// General Public License as published by the Free +// Software Foundation, version 3 of the License. +// +// “wink-nlp-utils” is distributed in the hope that it will +// be useful, but WITHOUT ANY WARRANTY; without even +// the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Affero General +// Public License for more details. +// +// You should have received a copy of the GNU Affero +// General Public License along with “wink-nlp-utils”. +// If not, see <http://www.gnu.org/licenses/>. + +// + +// ## tokens + +// ### appendBigrams +/** + * + * Generates bigrams from the input tokens and appends them to the input tokens. + * + * @name tokens.appendBigrams + * @param {string[]} tokens — the input tokens. + * @return {string[]} the input tokens appended with their bigrams. 
+ * @example + * appendBigrams( [ 'he', 'acted', 'decisively', 'today' ] ); + * // -> [ 'he', + * // 'acted', + * // 'decisively', + * // 'today', + * // 'he_acted', + * // 'acted_decisively', + * // 'decisively_today' ] + */ +var appendBigrams = function ( tokens ) { + var i, imax; + for ( i = 0, imax = tokens.length - 1; i < imax; i += 1 ) { + tokens.push( tokens[ i ] + '_' + tokens[ i + 1 ] ); + } + return tokens; +}; // appendBigrams() + +module.exports = appendBigrams; diff --git a/src/tokens-bigrams.js b/src/tokens-bigrams.js new file mode 100644 index 0000000..70a0cbb --- /dev/null +++ b/src/tokens-bigrams.js @@ -0,0 +1,54 @@ +// wink-nlp-utils +// NLP Functions for removing HTML Tags, Managing Elisions, +// NGrams, Stemming, Phoneticising to Tokenizating and more. +// +// Copyright (C) 2017 GRAYPE Systems Private Limited +// +// This file is part of “wink-nlp-utils”. +// +// “wink-nlp-utils” is free software: you can redistribute it +// and/or modify it under the terms of the GNU Affero +// General Public License as published by the Free +// Software Foundation, version 3 of the License. +// +// “wink-nlp-utils” is distributed in the hope that it will +// be useful, but WITHOUT ANY WARRANTY; without even +// the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Affero General +// Public License for more details. +// +// You should have received a copy of the GNU Affero +// General Public License along with “wink-nlp-utils”. +// If not, see <http://www.gnu.org/licenses/>. + +// + +// ## tokens + +// ### bigrams +/** + * + * Generates bigrams from the input tokens. + * + * @name tokens.bigrams + * @param {string[]} tokens — the input tokens. + * @return {Array<string[]>} the bigrams. + * @example + * bigrams( [ 'he', 'acted', 'decisively', 'today' ] ); + * // -> [ [ 'he', 'acted' ], + * // [ 'acted', 'decisively' ], + * // [ 'decisively', 'today' ] ] + */ +var bigrams = function ( tokens ) { + // Bigrams will be stored here. 
+ var bgs = []; + // Helper variables. + var i, imax; + // Create bigrams. + for ( i = 0, imax = tokens.length - 1; i < imax; i += 1 ) { + bgs.push( [ tokens[ i ], tokens[ i + 1 ] ] ); + } + return bgs; +}; // bigrams() + +module.exports = bigrams; diff --git a/src/tokens-phonetize.js b/src/tokens-phonetize.js index 079e98a..3889bdb 100644 --- a/src/tokens-phonetize.js +++ b/src/tokens-phonetize.js @@ -33,7 +33,7 @@ var stringPhonetize = require( './string-phonetize.js' ); * * @name tokens.phonetize * @param {string[]} tokens — the input tokens. - * @return {string} phonetized tokens. + * @return {string[]} phonetized tokens. * @example * phonetize( [ 'he', 'acted', 'decisively', 'today' ] ); * // -> [ 'h', 'aktd', 'dssvl', 'td' ] diff --git a/src/tokens-soundex.js b/src/tokens-soundex.js index 21c4457..d75f43d 100644 --- a/src/tokens-soundex.js +++ b/src/tokens-soundex.js @@ -33,7 +33,7 @@ var stringSoundex = require( './string-soundex.js' ); * * @name tokens.soundex * @param {string[]} tokens — the input tokens. - * @return {string} soundex coded tokens. + * @return {string[]} soundex coded tokens. * @example * soundex( [ 'he', 'acted', 'decisively', 'today' ] ); * // -> [ 'H000', 'A233', 'D221', 'T300' ] diff --git a/src/tokens-stem.js b/src/tokens-stem.js index 4e72480..8fa31a6 100644 --- a/src/tokens-stem.js +++ b/src/tokens-stem.js @@ -33,7 +33,7 @@ var porter2Stemmer = require( 'wink-porter2-stemmer' ); * * @name tokens.stem * @param {string[]} tokens — the input tokens. - * @return {string} stemmed tokens. + * @return {string[]} stemmed tokens. * @example * stem( [ 'he', 'acted', 'decisively', 'today' ] ); * // -> [ 'he', 'act', 'decis', 'today' ] diff --git a/src/wink-nlp-utils.js b/src/wink-nlp-utils.js index 9c24fde..3d24822 100644 --- a/src/wink-nlp-utils.js +++ b/src/wink-nlp-utils.js @@ -22,12 +22,7 @@ // If not, see . 
// -// var rgx = require( './util_regexes.js' ); -// var ncrgx = require( './name_cleaner_regexes.js' ); var porter2Stemmer = require( 'wink-porter2-stemmer' ); -// var phnrgx = require( './phonetize_regexes.js' ); -// var defaultStopWords = require( './dictionaries/stop_words.json' ); -// var helpers = require( 'wink-helpers' ); // ### Prepare Name Space @@ -39,39 +34,16 @@ var prepare = Object.create( null ); // Create prepare.helper name space. prepare.helper = Object.create( null ); -// #### Words - -// Returns an object containing functions (a) `set`, which returns a `Set` of -// words given in the input array `w` and (b) `exclude` that is suitable for -// filtering operations. If the second argment `givenMappers` is given as an -// array of **mapper** functions then these are applied on the input array -// before converting in to a set. Typical example of mapper functions are -// `prepare.string.stem()` and `prepare.string.phonetize()`. +// Words prepare.helper.words = require( './helper-return-words-filter.js' ); - // Make better **alias** name for the `word()` function. prepare.helper.returnWordsFilter = prepare.helper.words; - -// Create default stop words here - an internal variable. -// defaultStopWords = prepare.helper.words( defaultStopWords ); - -// #### index - -// Builds index - returns 2 functions viz. (a) `build` and `result`. Useful with -// bag & set creation functions, where by bassing the build function, they can -// also build an index of each key/member. +// Index prepare.helper.index = require( './helper-return-indexer.js' ); - // Make better **alias** name for the `index()` function. prepare.helper.returnIndexer = prepare.helper.index; -// #### return Quoted Text Extractor - -// Returns a uoated text extractor function. The (returned) extractor function -// takes `s` string argument; extracts all the text elements quoted between -// `lq` (left quote) and `rq` (right quote) string; and finally returns an -// array of those text elements. 
Note elements do not contain quote strings. -// If `lq` and/or `rq` is not defined or is not a string then it defaults to `'"'`. +// Return Quoted Text Extractor prepare.helper.returnQuotedTextExtractor = require( './helper-return-quoted-text-extractor.js' ); // ### Prepare.String Name Space @@ -79,179 +51,55 @@ prepare.helper.returnQuotedTextExtractor = require( './helper-return-quoted-text // Create prepare.string name space. prepare.string = Object.create( null ); -// #### Lower Case - -// Converts the input string `s` to lower case. +// Lower Case prepare.string.lowerCase = require( './string-lower-case.js' ); - -// #### Upper Case - -// Converts the input sting `s` to upper case. +// Upper Case prepare.string.upperCase = require( './string-upper-case.js' ); - -// #### Trim - -// Trims leading and trailing spaces from the input string `s`. +// Trim prepare.string.trim = require( './string-trim.js' ); - -// #### Remove Extra Spaces - -// Removes leading & trailing whitespaces, extra in-between spaces from the input -// string `s`. +// Remove Extra Spaces prepare.string.removeExtraSpaces = require( './string-remove-extra-spaces.js' ); - -// #### Retain Alpha-numerics - -// Retains only apha, numerals, and spaces and removes all other characters from -// the input string `s`. +// Retain Alpha-numerics prepare.string.retainAlphaNums = require( './string-retain-alpha-nums.js' ); - -// #### Extract Person's Name - -// Attemts to extract person's name from input string `s` in formats like -// **Dr. Ashwini Kumar Sharma B. Tech., M. Tech., PhD. - Electrical** by dropping -// the titles and degrees. -// It assmues the following name format: -// `[] []`. +// Extract Person's Name prepare.string.extractPersonsName = require( './string-extract-persons-name.js' ); - -// #### Extract Run of Capital Words - -// Returns an array of **run of captial words** from thr input string `s`, -// if any; otherwise returns `null`. 
+// Extract Run of Capital Words prepare.string.extractRunOfCapitalWords = require( './string-extract-run-of-capital-words.js' ); - -// #### Remove Punctuations - -// Removes punctuations from the input string `s` by replacing each one of them -// by a single space character. +// Remove Punctuations prepare.string.removePunctuations = require( './string-remove-punctuations.js' ); - -// #### Remove Special Chars - -// Removes special characters from the input string `s`. +// Remove Special Chars prepare.string.removeSplChars = require( './string-remove-spl-chars.js' ); - -// #### Remove HTML Tags - -// Removes HTML tags from the input string `s` and replaces them by a space char. +// Remove HTML Tags prepare.string.removeHTMLTags = require( './string-remove-html-tags.js' ); - -// #### Remove Elisions - -// Removes elisions from the input string `s`. +// Remove Elisions prepare.string.removeElisions = require( './string-remove-elisions.js' ); - -// #### Split Elisions - -// Splits elisions from the input string `s` by inserting a space. +// Split Elisions prepare.string.splitElisions = require( './string-split-elisions.js' ); - -// #### Amplify Not Elision - -// Amplifies the not elision by replacing it by the word **not** in the input string `s`; -// it must be used before calling the `removeElisions()`. +// Amplify Not Elision prepare.string.amplifyNotElision = require( './string-amplify-not-elision' ); - -// #### Marker - -// Generate a **marker** for the input string `s` - an 1-gram sorted and joined back as -// string again; it is useful for in determining a quick but approximate degree -// of match between short strings (with potentially more false positives). +// Marker prepare.string.marker = require( './string-marker.js' ); - -// #### SOC - -// Creates a **s**et **o**f **c**hars from the input string `s`. This is useful -// in even more aggressive string matching using Jaccard or Tversky compared to -// `marker()`. 
-// -// If `ifn` and `idx` arguments are passed then it builds an *alphabetic -// index* of `s`. In other words, only the first character of `s` is passed to the -// indexer function (`ifn`) along with `idx`. This pattern is also used in `song()`, -// `bong()`, `bow()`, and `sow()`. However for these functions either ngram or -// word/token (whatever is applicable) is passed along with `idx` to `ifn`. Note: -// usage of `ifn` are limited by the developer's imagination! +// SOC prepare.string.soc = require( './string-soc.js' ); - -// #### ngrams - -// Generates an array of ngrams of `size` from the input string `s`. -// The default value of `size` is 2. The `size` 0 is forced to 2. +// NGrams prepare.string.ngrams = require( './string-ngram.js' ); - -// #### BONG - -// Generates the **b**ag **o**f **ng**rams of `size` from the input string `s`. -// The default value of `size` is 2. The `size` 0 is forced to 2. +// BONG prepare.string.bong = require( './string-bong.js' ); - -// #### SONG - -// Generates the **s**et **o**f **ng**rams of `size` from the input string `s`. -// The default value of `size` is 2. The `size` 0 is forced to 2. +// SONG prepare.string.song = require( './string-song.js' ); - -// #### sentences - -// Splits the text contained in the input string `s` into sentences returned -// in form of an array. Note, the end-of-sentence punctuations are retained in -// each of the sentence. It can handle sentences started from numeric values as -// well, though it is not a good english practice. -// It uses `~` as the `splChar` for splitting and therefore -// it must not be present in the input string; you may give another `splChar` -// as the second argument. +// Sentences prepare.string.sentences = require( './string-sentences.js' ); - -// #### compose corpus - -// Generates all possible sentences from the input argument string — s. -// The string s must follow a special syntax:
    -// `'[I] [am having|have] [a] [problem|question]'`
    -// The corpus is composed by computing the cartesian product of all the phrases. -// It returns an array of sentences (i.e. strings). +// Compose Corpus prepare.string.composeCorpus = require( './string-compose-corpus.js' ); - -// #### tokenize0 - -// Quick & dirty tokenizer by splitting the input string `s` on non-words. -// This means tokens would consists of only alphas, numerals and underscores; -// all other characters will be stripped as they are treated as separators. -// However negations are retained and amplified but all other elisions are removed. +// Tokenize0 prepare.string.tokenize0 = require( './string-tokenize0.js' ); - -// #### tokenize - -// Tokenizes the input string `s` by applying following rules: -// 0. Single quotes are processed first as they may be part of elisions; and -// `...` are converted to ellipses. -// 1. Split elisions after amplifying not elisions i.e. balance elisions get tokenized, -// 2. `cannot` is split in to `can not`. -// 3. `. , -` punctuations that commonly embedded in numbers are left intact, -// 4. All other punctuations are tokenized, -// 5. currency symbols are padded by space i.e. become separate tokens, -// 6. Retains `_` as is - no tokenization, -// 7. Spacial characters are left untouched and may/may not become separate token. -// 8. Finally after removing extra/leading/trailing spaces, split on space to tokenize. +// Tokenize prepare.string.tokenize = require( './string-tokenize.js' ); - -// #### stem - -// Stems the input string using Porter V2 stemmer. Details in `porter2_stemmer.js` -// file. +// #### Stem prepare.string.stem = porter2Stemmer; - -// #### phonetize - -// Phonetize the input string `s` using an algorithmic adaption of Metaphone. -/* eslint no-underscore-dangle: "off" */ +// Phonetize prepare.string.phonetize = require( './string-phonetize.js' ); - - -// #### soundex - -// Produces the soundex code from the input `word`. Default value of maxLength -// is **4**. 
+// Soundex prepare.string.soundex = require( './string-soundex.js' ); // ### Prepare.Tokens Name Space @@ -259,74 +107,24 @@ prepare.string.soundex = require( './string-soundex.js' ); // Create prepare.tokens name space. prepare.tokens = Object.create( null ); -// #### stem - -// Stems the input token `t` using `string.stem()`. +// Stem prepare.tokens.stem = require( './tokens-stem.js' ); - -// #### phonetize - -// Phonetize the input tokens `t` using an algorithmic adaption of Metaphone. +// Phonetize prepare.tokens.phonetize = require( './tokens-phonetize.js' ); - -// #### soundex - -// Produces the soundex code from the input `word`. +// Soundex prepare.tokens.soundex = require( './tokens-soundex.js' ); - -// #### Remove Words - -// Removes the `givenStopWords` or the `defaultStopWords` from the input -// array of tokens `t`. The input stop words must be created using -// `prepare.words()`. +// Remove Words prepare.tokens.removeWords = require( './tokens-remove-words.js' ); - -// #### bow - -// Creates Bag of Words from the input array of tokens `t`. The `logCounts` flags -// to use log2( word counts ) instead of counts directly. The idea behind using -// log2 is to ensure that a word's importance does not increase linearly with its -// count. It is required as an input for computing similarity using Cosine similarity. +// BOW prepare.tokens.bow = require( './tokens-bow.js' ); - -// #### sow - -// Creates a Set of tokens from the input array `t`. It is required as an input -// for computing similarity using Jaccard or Tversky Indexes. +// SOW prepare.tokens.sow = require( './tokens-sow.js' ); - -// #### Propagate Negations - -// It looks for neagtion tokens in `t` and propagate negation in subsequent `upto` -// tokens by prefixing them by a `!`. +// Propagate Negations prepare.tokens.propagateNegations = require( './tokens-propagate-negations.js' ); - -// #### Bigrams - -// Creates bigrams from the input `t` tokens. 
-prepare.tokens.bigrams = function ( t ) { - // Bigrams will be stored here. - var bgs = []; - // Helper variables. - var i, imax; - // Create bigrams. - for ( i = 0, imax = t.length - 1; i < imax; i += 1 ) { - bgs.push( [ t[ i ], t[ i + 1 ] ] ); - } - return bgs; -}; // bigrams() - -// ### Append Bigrams - -// Generates bigrams from the input `t` tokens and returns them by -// appending to `t`. -prepare.tokens.appendBigrams = function ( t ) { - var i, imax; - for ( i = 0, imax = t.length - 1; i < imax; i += 1 ) { - t.push( t[ i ] + '_' + t[ i + 1 ] ); - } - return t; -}; // appendBigrams() +// Bigrams +prepare.tokens.bigrams = require( './tokens-bigrams.js' ); +// Append Bigrams +prepare.tokens.appendBigrams = require( './tokens-append-bigrams.js' ); // Export prepare. module.exports = prepare;