feat(*): enable nbsp handling
references #135

Co-authored-by: Rachna <rachna@graype.in>
sanjayaksaxena and rachnachakraborty committed Mar 27, 2024
1 parent b48cb11 commit e139a5a
Showing 12 changed files with 187 additions and 94 deletions.
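
The mechanism in one paragraph: each token occupies `tkSize` slots in a flat `tokens` array, and the slot at offset +1 carries the count of preceding spaces (extracted with `psMask`). This commit reserves the maximum encodable count, `0xFFFF`, as a sentinel meaning "the literal whitespace for this token lives in `nonBreakingSpaces`". A minimal sketch of that scheme — the constant values and helper name here are illustrative assumptions, not the library's actual configuration:

```js
// Illustrative sketch only — not the library's actual implementation.
var tkSize = 4;       // assumed: slots per token
var psMask = 0xFFFF;  // assumed: low 16 bits of slot +1 hold the space count

var tokens = [];             // flat token store: [ lexemeId, ps, lemma/pos, flags, ... ]
var nonBreakingSpaces = [];  // literal whitespace, keyed by normalized token index

function addTokenSketch( lexemeId, precedingWhitespace ) {
  if ( /\u00a0/.test( precedingWhitespace ) ) {
    // Non-breaking space present: store the sentinel and park the literal string.
    var idx = tokens.push( lexemeId, 0xFFFF, 0, 0 );
    nonBreakingSpaces[ ( idx / tkSize ) - 1 ] = precedingWhitespace;
  } else {
    // Plain spaces: just store the count.
    tokens.push( lexemeId, precedingWhitespace.length, 0, 0 );
  }
  return true;
}
```

The rest of the commit threads this nbsp detail from the tokenizer down to the output helpers.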
10 changes: 3 additions & 7 deletions src/api/col-tokens-out.js
@@ -33,11 +33,7 @@
var its = require( '../its.js' );
var as = require( '../as.js' );
var allowed = require( '../allowed.js' );
-var constants = require( '../constants.js' );
-// Size of a single token.
-var tkSize = constants.tkSize;
-// Mask for preceding spaces.
-var psMask = constants.psMask;
+var reconstructSpaces = require( '../reconstruct-spaces.js' );

// ## colTokensOut
/**
@@ -65,11 +61,11 @@ var colTokensOut = function ( start, end, rdd, itsf, asf, addons ) {
  // Note, `as.text/markedUpText` needs special attention to include preceding spaces.
  if ( asfn === as.text || asfn === as.markedUpText ) {
    for ( let i = start; i <= end; i += 1 ) {
-      mappedTkns.push( ''.padEnd( rdd.tokens[ ( i * tkSize ) + 1 ] & psMask ), itsf( i, rdd.tokens, rdd.cache, addons ) ); // eslint-disable-line no-bitwise
+      mappedTkns.push( reconstructSpaces( i, rdd ), itsf( i, rdd, addons ) );
    }
  } else {
    for ( let i = start; i <= end; i += 1 ) {
-      mappedTkns.push( itsfn( i, rdd.tokens, rdd.cache, addons ) );
+      mappedTkns.push( itsfn( i, rdd, addons ) );
    }
  }

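
The net effect of the changed `push` calls above: for `as.text`/`as.markedUpText`, the output array interleaves each token's reconstructed leading whitespace with its value, so joining restores the original spacing — including non-breaking spaces. A tiny worked example of that interleaving (the data values are made up):

```js
// Sketch of the interleaved shape colTokensOut builds for as.text.
var spaces = [ '', '\u00a0' ];      // what reconstructSpaces( i, rdd ) would return
var values = [ 'Hello', 'world' ];  // what itsf( i, rdd, addons ) would return
var mappedTkns = [];
for ( var i = 0; i < values.length; i += 1 ) {
  mappedTkns.push( spaces[ i ], values[ i ] );
}
console.log( mappedTkns.join( '' ) === 'Hello\u00a0world' ); // true
```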
2 changes: 1 addition & 1 deletion src/api/itm-token-out.js
@@ -47,7 +47,7 @@ var allowed = require( '../allowed.js' );
var itmTokenOut = function ( index, rdd, itsf, addons ) {
  // Not a vector request, map using `itsf`.
  var f = ( allowed.its4token.has( itsf ) ) ? itsf : its.value;
-  return f( index, rdd.tokens, rdd.cache, addons );
+  return f( index, rdd, addons );
}; // itmTokenOut()

module.exports = itmTokenOut;
5 changes: 3 additions & 2 deletions src/api/sel-tokens-out.js
@@ -33,6 +33,7 @@
var its = require( '../its.js' );
var as = require( '../as.js' );
var allowed = require( '../allowed.js' );
+var reconstructSpaces = require( '../reconstruct-spaces.js' );
var constants = require( '../constants.js' );
// Size of a single token.
var tkSize = constants.tkSize;
@@ -65,11 +66,11 @@ var selTokensOut = function ( selTokens, rdd, itsf, asf, addons ) {
  // No `markedUpText` allowed here.
  if ( asfn === as.text ) {
    for ( let i = 0; i < selTokens.length; i += 1 ) {
-      mappedTkns.push( ''.padEnd( rdd.tokens[ ( selTokens[ i ] * tkSize ) + 1 ] & psMask ), itsf( selTokens[ i ], rdd.tokens, rdd.cache, addons ) ); // eslint-disable-line no-bitwise
+      mappedTkns.push( reconstructSpaces( selTokens[ i ], rdd ), itsf( selTokens[ i ], rdd, addons ) );
    }
  } else {
    for ( let i = 0; i < selTokens.length; i += 1 ) {
-      mappedTkns.push( itsfn( selTokens[ i ], rdd.tokens, rdd.cache, addons ) );
+      mappedTkns.push( itsfn( selTokens[ i ], rdd, addons ) );
    }
  }

29 changes: 23 additions & 6 deletions src/dd-wrapper.js
@@ -42,6 +42,8 @@ var xpSize = constants.xpSize;
var bits4lemma = constants.bits4lemma;
// The UNK!
var UNK = constants.UNK;
+// Size of a single token.
+var tkSize = constants.tkSize;

var docDataWrapper = function ( data ) {
  // Extract frequently referred data elements:
@@ -63,12 +65,16 @@ var docDataWrapper = function ( data ) {
   * @param {string} text to be added as token.
   * @param {string} category of the token.
   * @param {number} precedingSpaces to the `text` as parsed by tokenizer.
-  * @param {number[]} tokens, where the token is added.
+  * @param {array} nbsp containing non-breaking space details.
   * @returns {boolean} always `true`.
   * @private
  */
-  var addToken = function ( text, category, precedingSpaces ) {
-    tokens.push( cache.add( text, category ), precedingSpaces, 0, 0 );
+  var addToken = function ( text, category, precedingSpaces, nbsp ) {
+    // Non-normalized index of the token being pushed.
+    var idx;
+    idx = tokens.push( cache.add( text, category ), precedingSpaces, 0, 0 );
+    // See comments in `addTokenIfInCache()`.
+    if ( nbsp !== null && precedingSpaces > 0 ) data.nonBreakingSpaces[ ( idx / tkSize ) - 1 ] = nbsp;
     return true;
   }; // addToken()

@@ -84,10 +90,11 @@ var docDataWrapper = function ( data ) {
   *
   * @param {string} text to be added as token.
   * @param {number} precedingSpaces to the `text` as parsed by tokenizer.
+  * @param {string} nbsp non-breaking spaces preceding the `text`.
   * @returns {boolean} `truthy` if `text` is found in cache otherwise `falsy`.
   * @private
  */
-  var addTokenIfInCache = function ( text, precedingSpaces ) {
+  var addTokenIfInCache = function ( text, precedingSpaces, nbsp ) {
     // The array `tokenIndex` will contain one element if `text` is not a predefined
     // contraction; otherwise it will contain `n x 4` elements, where `n` is the
     // number of expansions.
@@ -96,12 +103,20 @@
     var ps;
     // Temp for lemma & pos.
     var lemma, pos;
+    // Non-normalized index of the token being pushed.
+    var idx;

     // `UNK` means 0 or `falsy`; it flags that token has not been added.
     if ( tokenIndex === null ) return UNK;

     if ( tokenIndex.length === 1 ) {
-      tokens.push( tokenIndex[ 0 ], precedingSpaces, 0, 0 );
+      idx = tokens.push( tokenIndex[ 0 ], precedingSpaces, 0, 0 );
+      // Store non-breaking spaces preceding this token. Do it only if `precedingSpaces > 0` (Note:
+      // it is zero in case of expansion of a contraction) AND `nbsp` is defined (Note: in this case
+      // precedingSpaces would be set to the max i.e. 0xFFFF, with the only exception when the token
+      // is being expanded: the first one will have nbsp but the subsequent ones will have 0 preceding
+      // spaces). The storage index should be the normalized token index.
+      if ( nbsp !== null && precedingSpaces > 0 ) data.nonBreakingSpaces[ ( idx / tkSize ) - 1 ] = nbsp;
     } else {
       // Contraction, iterate through each expansion.
       for ( let k = 0; k < tokenIndex.length; k += xpSize ) {
@@ -114,7 +129,9 @@
         lemma = tokenIndex[ k + 2 ];
         pos = tokenIndex[ k + 3 ];
         // Add token; annotations may be filled later in the pipeline.
-        tokens.push( tokenIndex[ k ], ps, ( lemma | ( pos << bits4lemma ) ), 0 ); // eslint-disable-line no-bitwise
+        idx = tokens.push( tokenIndex[ k ], ps, ( lemma | ( pos << bits4lemma ) ), 0 ); // eslint-disable-line no-bitwise
+        // See comment above in the then block of this if-statement.
+        if ( nbsp !== null && precedingSpaces > 0 ) data.nonBreakingSpaces[ ( idx / tkSize ) - 1 ] = nbsp;
       }
     }
     // Return `truthy`, indicating that token(s) has been added successfully.
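
The indexing trick used throughout `dd-wrapper.js`: `Array.prototype.push` returns the array's new length, so after pushing one `tkSize`-wide record, `( idx / tkSize ) - 1` is exactly the zero-based index of the token just added — the right key for `nonBreakingSpaces`. A quick check of that arithmetic (assuming `tkSize` is 4):

```js
// Verifying the normalized-index arithmetic used above.
var tkSize = 4;
var tokens = [];

var idx = tokens.push( 101, 0xFFFF, 0, 0 ); // idx === 4 (the new length)
console.log( ( idx / tkSize ) - 1 );        // 0 → index of the first token

idx = tokens.push( 102, 1, 0, 0 );          // idx === 8
console.log( ( idx / tkSize ) - 1 );        // 1 → index of the second token
```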
75 changes: 39 additions & 36 deletions src/its.js
@@ -34,63 +34,64 @@ var sort4FT = require( './sort4FT.js' );
var constants = require( './constants.js' );
var caseMap = [ 'other', 'lowerCase', 'upperCase', 'titleCase' ];
var swi = require( './sentence-wise-importance.js' );
+var reconstructSpaces = require( './reconstruct-spaces.js' );

// Size of a single token.
var tkSize = constants.tkSize;
// Bits reserved for `lemma`.
var bits4lemma = constants.bits4lemma;
// Mask for extracting pos
var posMask = constants.posMask;
-// Mask for preceding spaces.
-var psMask = constants.psMask;
// Mask for lemma in case of contraction.
var lemmaMask = constants.lemmaMask;

var its = Object.create( null );

-its.case = function ( index, tokens, cache ) {
-  return caseMap[ cache.property( tokens[ index * tkSize ], 'lutCase' ) ];
+its.case = function ( index, rdd ) {
+  return caseMap[ rdd.cache.property( rdd.tokens[ index * tkSize ], 'lutCase' ) ];
}; // case()

-its.uniqueId = function ( index, tokens ) {
-  return tokens[ index * tkSize ];
+its.uniqueId = function ( index, rdd ) {
+  return rdd.tokens[ index * tkSize ];
}; // uniqueId()

-its.negationFlag = function ( index, tokens ) {
-  return tokens[ ( index * tkSize ) + 3 ] >= constants.negationFlag;
+its.negationFlag = function ( index, rdd ) {
+  return rdd.tokens[ ( index * tkSize ) + 3 ] >= constants.negationFlag;
}; // negationFlag()

-its.normal = function ( index, tokens, cache ) {
+its.normal = function ( index, rdd ) {
+  var tokens = rdd.tokens;
+  var cache = rdd.cache;
  return (
    ( tokens[ ( index * tkSize ) + 1 ] > 65535 ) ?
      cache.value( cache.nox( tokens[ ( index * tkSize ) + 1 ] ) ) :
      cache.value( cache.normal( tokens[ index * tkSize ] ) )
  );
}; // normal()

-its.contractionFlag = function ( index, tokens ) {
-  return ( tokens[ ( index * tkSize ) + 1 ] > 65535 );
+its.contractionFlag = function ( index, rdd ) {
+  return ( rdd.tokens[ ( index * tkSize ) + 1 ] > 65535 );
}; // contractionFlag()

-its.pos = function ( index, tokens, cache ) {
-  return cache.valueOf( 'pos', ( tokens[ ( index * tkSize ) + 2 ] & posMask ) >>> bits4lemma ); // eslint-disable-line no-bitwise
+its.pos = function ( index, rdd ) {
+  return rdd.cache.valueOf( 'pos', ( rdd.tokens[ ( index * tkSize ) + 2 ] & posMask ) >>> bits4lemma ); // eslint-disable-line no-bitwise
}; // pos()

-its.precedingSpaces = function ( index, tokens ) {
-  var token = tokens[ ( index * tkSize ) + 1 ];
-  var count = token & psMask; // eslint-disable-line no-bitwise
-  return ( ''.padEnd( count ) );
+its.precedingSpaces = function ( index, rdd ) {
+  return reconstructSpaces( index, rdd );
}; // precedingSpaces()

-its.prefix = function ( index, tokens, cache ) {
-  return cache.property( tokens[ index * tkSize ], 'prefix' );
+its.prefix = function ( index, rdd ) {
+  return rdd.cache.property( rdd.tokens[ index * tkSize ], 'prefix' );
}; // prefix()

-its.shape = function ( index, tokens, cache ) {
-  return cache.property( tokens[ index * tkSize ], 'shape' );
+its.shape = function ( index, rdd ) {
+  return rdd.cache.property( rdd.tokens[ index * tkSize ], 'shape' );
}; // shape()

-its.stopWordFlag = function ( index, tokens, cache ) {
+its.stopWordFlag = function ( index, rdd ) {
+  var tokens = rdd.tokens;
+  var cache = rdd.cache;
  // Apply check on normalized token and not the original value, because
  // stop words are always defined in the lowercase.
  var normal = ( tokens[ ( index * tkSize ) + 1 ] > 65535 ) ?
@@ -99,27 +100,29 @@ its.stopWordFlag = function ( index, tokens, cache ) {
  return ( cache.property( normal, 'isStopWord' ) === 1 );
}; // stopWordFlag()

-its.abbrevFlag = function ( index, tokens, cache ) {
-  return ( cache.property( tokens[ index * tkSize ], 'isAbbrev' ) === 1 );
+its.abbrevFlag = function ( index, rdd ) {
+  return ( rdd.cache.property( rdd.tokens[ index * tkSize ], 'isAbbrev' ) === 1 );
}; // abbrevFlag()

-its.suffix = function ( index, tokens, cache ) {
-  return cache.property( tokens[ index * tkSize ], 'suffix' );
+its.suffix = function ( index, rdd ) {
+  return rdd.cache.property( rdd.tokens[ index * tkSize ], 'suffix' );
}; // suffix()

-its.type = function ( index, tokens, cache ) {
-  return cache.property( tokens[ index * tkSize ], 'tokenType' );
+its.type = function ( index, rdd ) {
+  return rdd.cache.property( rdd.tokens[ index * tkSize ], 'tokenType' );
}; // type()

-its.value = function ( index, tokens, cache ) {
-  return cache.value( tokens[ index * tkSize ] );
+its.value = function ( index, rdd ) {
+  return rdd.cache.value( rdd.tokens[ index * tkSize ] );
}; // value()

-its.stem = function ( index, tokens, cache, addons ) {
-  return addons.stem( cache.value( tokens[ index * tkSize ] ) );
+its.stem = function ( index, rdd, addons ) {
+  return addons.stem( rdd.cache.value( rdd.tokens[ index * tkSize ] ) );
}; // stem()

-its.lemma = function ( index, tokens, cache, addons ) {
+its.lemma = function ( index, rdd, addons ) {
+  var tokens = rdd.tokens;
+  var cache = rdd.cache;
  // If it is a contraction, the lemma is already available in the token's data structure.
  if ( tokens[ ( index * tkSize ) + 1 ] > 65535 ) {
    return cache.value( tokens[ ( index * tkSize ) + 2 ] & lemmaMask ); // eslint-disable-line no-bitwise
@@ -131,7 +134,7 @@ its.lemma = function ( index, tokens, cache, addons ) {
    return cache.value( cache.property( mappedIdx, 'lemma' ) );
  }
  // Exhausted all possibilities to avoid processing! Now lemmatize!
-  const pos = its.pos( index, tokens, cache );
+  const pos = its.pos( index, rdd );
  const value = cache.value( cache.normal( tokens[ index * tkSize ] ) );
  return addons.lemmatize( value, pos, cache );
}; // lemma()
@@ -144,11 +147,11 @@ its.detail = function ( ) {
  return true;
}; // detail()

-its.markedUpText = function ( index, tokens, cache ) {
+its.markedUpText = function ( index, rdd ) {
  // This is a special case because `tokens.out()` allows `as.markedUpText`.
  // Therefore simply return the value and the rest is handled by `colTokensOut` with
  // `as.markedUpText()` or `as.text()` as one of the arguments.
-  return its.value( index, tokens, cache );
+  return its.value( index, rdd );
}; // markedUpText()

its.span = function ( spanItem ) {
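
The common thread in this file: every `its.*` accessor now takes `( index, rdd, addons )` instead of `( index, tokens, cache, addons )`, so helpers like `its.precedingSpaces` can reach `rdd.nonBreakingSpaces` as well as `rdd.tokens` and `rdd.cache`. A hedged sketch of the calling convention — the exact `rdd` shape shown is an assumption inferred from this diff:

```js
// Assumed shape of the raw document data (rdd), inferred from the accessors above.
var rdd = {
  tokens: [],            // flat array, tkSize slots per token
  cache: null,           // lexeme cache exposing value()/property()/normal()
  nonBreakingSpaces: []  // literal whitespace keyed by token index
};

// Before this commit:  its.value( index, rdd.tokens, rdd.cache, addons )
// After this commit:   its.value( index, rdd, addons )
```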
46 changes: 46 additions & 0 deletions src/reconstruct-spaces.js
@@ -0,0 +1,46 @@
// wink-nlp
//
// Copyright (C) GRAYPE Systems Private Limited
//
// This file is part of “wink-nlp”.
//
// Permission is hereby granted, free of charge, to any
// person obtaining a copy of this software and
// associated documentation files (the "Software"), to
// deal in the Software without restriction, including
// without limitation the rights to use, copy, modify,
// merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to
// whom the Software is furnished to do so, subject to
// the following conditions:
//
// The above copyright notice and this permission notice
// shall be included in all copies or substantial
// portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
// ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
// TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
// PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
// CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
// CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS IN THE SOFTWARE.

//

var constants = require( './constants.js' );

// Size of a single token.
var tkSize = constants.tkSize;
// Mask for preceding spaces.
var psMask = constants.psMask;

var reconstructSpaces = function ( index, rdd ) {
var token = rdd.tokens[ ( index * tkSize ) + 1 ];
var count = token & psMask; // eslint-disable-line no-bitwise
return ( count < 0xFFFF ) ? ( ''.padEnd( count ) ) : rdd.nonBreakingSpaces[ index ];
}; // reconstructSpaces()

module.exports = reconstructSpaces;
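
A small usage sketch of the new helper, with hand-built `rdd` data (assuming `tkSize` is 4 and `psMask` is `0xFFFF`): counts below `0xFFFF` expand to that many plain spaces, while the sentinel redirects to the stored literal whitespace.

```js
// Hypothetical usage of reconstructSpaces; data values are illustrative.
var reconstructSpaces = require( './src/reconstruct-spaces.js' );

var rdd = {
  // Two tokens: the first has 2 plain preceding spaces; the second carries the sentinel.
  tokens: [ 11, 2, 0, 0, 12, 0xFFFF, 0, 0 ],
  nonBreakingSpaces: [ undefined, '\u00a0\u00a0' ]
};

console.log( reconstructSpaces( 0, rdd ) === '  ' );           // true — two plain spaces
console.log( reconstructSpaces( 1, rdd ) === '\u00a0\u00a0' ); // true — stored nbsp string
```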
12 changes: 8 additions & 4 deletions src/recursive-tokenizer.js
@@ -62,6 +62,8 @@ var tokenizer = function ( categories, preserve ) {
  var isLexeme;
  // Preceding Spaces — special need for recursive tokenizer.
  var ps = 0;
+  // Will only be needed for the first token; after that it is all zero (ps)!
+  var nonBreakingSpaces = null;

  // ### pushHyphenatedToken
  /**
@@ -293,7 +295,7 @@
      // No regex left, this is the true **unk**.
      // Because it is `UNK`, we can use `addToken` instead of attempting
      // `addTokenIfInCache`.
-      addToken( text, categories.unk, ps );
+      addToken( text, categories.unk, ps, nonBreakingSpaces );
      ps = 0;
      return;
    }
@@ -309,8 +311,8 @@
      // Use the passed value of preceding spaces only once!
      // First try cache, otherwise make a direct addition. This ensures
      // processing of expansions.
-      cat = addTokenIfInCache( tokens[ i ][ 0 ], ps );
-      if ( cat === categories.unk ) addToken( tokens[ i ][ 0 ], tokens[ i ][ 1 ], ps );
+      cat = addTokenIfInCache( tokens[ i ][ 0 ], ps, nonBreakingSpaces );
+      if ( cat === categories.unk ) addToken( tokens[ i ][ 0 ], tokens[ i ][ 1 ], ps, nonBreakingSpaces );
      // Reset `ps` to **0** as there can never be spaces in a text passed to
      // this tokenizer.
      ps = 0;
@@ -329,18 +331,20 @@
  * @param {string} text the input sentence.
  * @param {number} precedingSpaces to the text.
  * @param {object} doc contains the document; used here for adding tokens.
+ * @param {array} nbsp contains non-breaking space details.
  * @return {void} nothing!
  * @private
 */
-var tokenize = function ( rgxs, text, precedingSpaces, doc ) {
+var tokenize = function ( rgxs, text, precedingSpaces, doc, nbsp ) {
  // Cache frequently used doc methods.
  addToken = doc._addToken;
  addTokenIfInCache = doc._addTokenIfInCache;
  isLexeme = doc.isLexeme;
  // Set `ps` to the passed value of preceding spaces; it will be reset to **0**
  // after first use during recursion.
  ps = precedingSpaces;
+  nonBreakingSpaces = nbsp;
  tokenizeTextRecursively( text, rgxs, precedingSpaces );
}; // tokenize()
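
A hedged sketch of how a caller might thread the nbsp details into `tokenize()`; only the first token of the fragment can carry them, since `ps` is reset to 0 during recursion. The stubs and argument values here are illustrative assumptions, not the pipeline's real objects:

```js
// Hedged call-site sketch; in the real pipeline `rgxs` comes from the language
// model and `doc` is the document wrapper. Stubbed here only to stay runnable.
var rgxs = [];  // stub regex table driving the recursive split
var doc = {     // stub exposing the hooks tokenize() caches
  _addToken: function () { return true; },
  _addTokenIfInCache: function () { return 1; },
  isLexeme: function () { return null; }
};

// After this commit, callers pass the nbsp details as the fifth argument:
//   tokenize( rgxs, 'e.g.', 0xFFFF, doc, '\u00a0' );
// where 0xFFFF is the assumed preceding-spaces sentinel and '\u00a0' is the
// literal whitespace that preceded the fragment.
```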
