feat(*): enable nbsp handling
references #135

Co-authored-by: Rachna <rachna@graype.in>
sanjayaksaxena and rachnachakraborty committed Mar 27, 2024
1 parent b48cb11 commit e139a5a
Showing 12 changed files with 187 additions and 94 deletions.
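
The mechanism in one paragraph: each token occupies `tkSize` slots in a flat `tokens` array, and the slot at offset +1 carries the count of preceding spaces (extracted with `psMask`). This commit reserves the maximum encodable count, `0xFFFF`, as a sentinel meaning "the literal whitespace for this token lives in `nonBreakingSpaces`". A minimal sketch of that scheme — the constant values and helper name here are illustrative assumptions, not the library's actual configuration:

```js
// Illustrative sketch only — not the library's actual implementation.
var tkSize = 4;       // assumed: slots per token
var psMask = 0xFFFF;  // assumed: low 16 bits of slot +1 hold the space count

var tokens = [];             // flat token store: [ lexemeId, ps, lemma/pos, flags, ... ]
var nonBreakingSpaces = [];  // literal whitespace, keyed by normalized token index

function addTokenSketch( lexemeId, precedingWhitespace ) {
  if ( /\u00a0/.test( precedingWhitespace ) ) {
    // Non-breaking space present: store the sentinel and park the literal string.
    var idx = tokens.push( lexemeId, 0xFFFF, 0, 0 );
    nonBreakingSpaces[ ( idx / tkSize ) - 1 ] = precedingWhitespace;
  } else {
    // Plain spaces: just store the count.
    tokens.push( lexemeId, precedingWhitespace.length, 0, 0 );
  }
  return true;
}
```

The rest of the commit threads this nbsp detail from the tokenizer down to the output helpers.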
10 changes: 3 additions & 7 deletions src/api/col-tokens-out.js
@@ -33,11 +33,7 @@
var its = require( '../its.js' );
var as = require( '../as.js' );
var allowed = require( '../allowed.js' );
-var constants = require( '../constants.js' );
-// Size of a single token.
-var tkSize = constants.tkSize;
-// Mask for preceding spaces.
-var psMask = constants.psMask;
+var reconstructSpaces = require( '../reconstruct-spaces.js' );

// ## colTokensOut
/**
@@ -65,11 +61,11 @@ var colTokensOut = function ( start, end, rdd, itsf, asf, addons ) {
  // Note, `as.text/markedUpText` needs special attention to include preceding spaces.
  if ( asfn === as.text || asfn === as.markedUpText ) {
    for ( let i = start; i <= end; i += 1 ) {
-      mappedTkns.push( ''.padEnd( rdd.tokens[ ( i * tkSize ) + 1 ] & psMask ), itsf( i, rdd.tokens, rdd.cache, addons ) ); // eslint-disable-line no-bitwise
+      mappedTkns.push( reconstructSpaces( i, rdd ), itsf( i, rdd, addons ) );
    }
  } else {
    for ( let i = start; i <= end; i += 1 ) {
-      mappedTkns.push( itsfn( i, rdd.tokens, rdd.cache, addons ) );
+      mappedTkns.push( itsfn( i, rdd, addons ) );
    }
  }

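
The net effect of the changed `push` calls above: for `as.text`/`as.markedUpText`, the output array interleaves each token's reconstructed leading whitespace with its value, so joining restores the original spacing — including non-breaking spaces. A tiny worked example of that interleaving (the data values are made up):

```js
// Sketch of the interleaved shape colTokensOut builds for as.text.
var spaces = [ '', '\u00a0' ];      // what reconstructSpaces( i, rdd ) would return
var values = [ 'Hello', 'world' ];  // what itsf( i, rdd, addons ) would return
var mappedTkns = [];
for ( var i = 0; i < values.length; i += 1 ) {
  mappedTkns.push( spaces[ i ], values[ i ] );
}
console.log( mappedTkns.join( '' ) === 'Hello\u00a0world' ); // true
```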
2 changes: 1 addition & 1 deletion src/api/itm-token-out.js
@@ -47,7 +47,7 @@ var allowed = require( '../allowed.js' );
var itmTokenOut = function ( index, rdd, itsf, addons ) {
  // Not a vector request, map using `itsf`.
  var f = ( allowed.its4token.has( itsf ) ) ? itsf : its.value;
-  return f( index, rdd.tokens, rdd.cache, addons );
+  return f( index, rdd, addons );
}; // itmTokenOut()

module.exports = itmTokenOut;
5 changes: 3 additions & 2 deletions src/api/sel-tokens-out.js
@@ -33,6 +33,7 @@
var its = require( '../its.js' );
var as = require( '../as.js' );
var allowed = require( '../allowed.js' );
+var reconstructSpaces = require( '../reconstruct-spaces.js' );
var constants = require( '../constants.js' );
// Size of a single token.
var tkSize = constants.tkSize;
@@ -65,11 +66,11 @@ var selTokensOut = function ( selTokens, rdd, itsf, asf, addons ) {
  // No `markedUpText` allowed here.
  if ( asfn === as.text ) {
    for ( let i = 0; i < selTokens.length; i += 1 ) {
-      mappedTkns.push( ''.padEnd( rdd.tokens[ ( selTokens[ i ] * tkSize ) + 1 ] & psMask ), itsf( selTokens[ i ], rdd.tokens, rdd.cache, addons ) ); // eslint-disable-line no-bitwise
+      mappedTkns.push( reconstructSpaces( selTokens[ i ], rdd ), itsf( selTokens[ i ], rdd, addons ) );
    }
  } else {
    for ( let i = 0; i < selTokens.length; i += 1 ) {
-      mappedTkns.push( itsfn( selTokens[ i ], rdd.tokens, rdd.cache, addons ) );
+      mappedTkns.push( itsfn( selTokens[ i ], rdd, addons ) );
    }
  }

29 changes: 23 additions & 6 deletions src/dd-wrapper.js
@@ -42,6 +42,8 @@ var xpSize = constants.xpSize;
var bits4lemma = constants.bits4lemma;
// The UNK!
var UNK = constants.UNK;
+// Size of a single token.
+var tkSize = constants.tkSize;

var docDataWrapper = function ( data ) {
  // Extract frequently referred data elements:
@@ -63,12 +65,16 @@ var docDataWrapper = function ( data ) {
   * @param {string} text to be added as token.
   * @param {string} category of the token.
   * @param {number} precedingSpaces to the `text` as parsed by tokenizer.
-  * @param {number[]} tokens, where the token is added.
+  * @param {array} nbsp containing non-breaking space details.
   * @returns {boolean} always `true`.
   * @private
  */
-  var addToken = function ( text, category, precedingSpaces ) {
-    tokens.push( cache.add( text, category ), precedingSpaces, 0, 0 );
+  var addToken = function ( text, category, precedingSpaces, nbsp ) {
+    // Non-normalized index of the token being pushed.
+    var idx;
+    idx = tokens.push( cache.add( text, category ), precedingSpaces, 0, 0 );
+    // See comments in `addTokenIfInCache()`.
+    if ( nbsp !== null && precedingSpaces > 0 ) data.nonBreakingSpaces[ ( idx / tkSize ) - 1 ] = nbsp;
     return true;
   }; // addToken()

@@ -84,10 +90,11 @@ var docDataWrapper = function ( data ) {
   *
   * @param {string} text to be added as token.
   * @param {number} precedingSpaces to the `text` as parsed by tokenizer.
+  * @param {string} nbsp non-breaking spaces preceding the `text`.
   * @returns {boolean} `truthy` if `text` is found in cache otherwise `falsy`.
   * @private
  */
-  var addTokenIfInCache = function ( text, precedingSpaces ) {
+  var addTokenIfInCache = function ( text, precedingSpaces, nbsp ) {
     // The array `tokenIndex` will contain one element if `text` is not a predefined
     // contraction; otherwise it will contain `n x 4` elements, where `n` is the
     // number of expansions.
@@ -96,12 +103,20 @@
     var ps;
     // Temp for lemma & pos.
     var lemma, pos;
+    // Non-normalized index of the token being pushed.
+    var idx;

     // `UNK` means 0 or `falsy`; it flags that token has not been added.
     if ( tokenIndex === null ) return UNK;

     if ( tokenIndex.length === 1 ) {
-      tokens.push( tokenIndex[ 0 ], precedingSpaces, 0, 0 );
+      idx = tokens.push( tokenIndex[ 0 ], precedingSpaces, 0, 0 );
+      // Store non-breaking spaces preceding this token. Do it only if `precedingSpaces > 0` (Note:
+      // it is zero in case of expansion of a contraction) AND `nbsp` is defined (Note: in this case
+      // precedingSpaces would be set to the max i.e. 0xFFFF, with the only exception when the token
+      // is being expanded: the first one will have nbsp but the subsequent ones will have 0 preceding
+      // spaces). The storage index should be the normalized token index.
+      if ( nbsp !== null && precedingSpaces > 0 ) data.nonBreakingSpaces[ ( idx / tkSize ) - 1 ] = nbsp;
     } else {
       // Contraction, iterate through each expansion.
       for ( let k = 0; k < tokenIndex.length; k += xpSize ) {
@@ -114,7 +129,9 @@
         lemma = tokenIndex[ k + 2 ];
         pos = tokenIndex[ k + 3 ];
         // Add token; annotations may be filled later in the pipeline.
-        tokens.push( tokenIndex[ k ], ps, ( lemma | ( pos << bits4lemma ) ), 0 ); // eslint-disable-line no-bitwise
+        idx = tokens.push( tokenIndex[ k ], ps, ( lemma | ( pos << bits4lemma ) ), 0 ); // eslint-disable-line no-bitwise
+        // See comment above in the then block of this if-statement.
+        if ( nbsp !== null && precedingSpaces > 0 ) data.nonBreakingSpaces[ ( idx / tkSize ) - 1 ] = nbsp;
       }
     }
     // Return `truthy`, indicating that token(s) has been added successfully.
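
The indexing trick used throughout `dd-wrapper.js`: `Array.prototype.push` returns the array's new length, so after pushing one `tkSize`-wide record, `( idx / tkSize ) - 1` is exactly the zero-based index of the token just added — the right key for `nonBreakingSpaces`. A quick check of that arithmetic (assuming `tkSize` is 4):

```js
// Verifying the normalized-index arithmetic used above.
var tkSize = 4;
var tokens = [];

var idx = tokens.push( 101, 0xFFFF, 0, 0 ); // idx === 4 (the new length)
console.log( ( idx / tkSize ) - 1 );        // 0 → index of the first token

idx = tokens.push( 102, 1, 0, 0 );          // idx === 8
console.log( ( idx / tkSize ) - 1 );        // 1 → index of the second token
```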
75 changes: 39 additions & 36 deletions src/its.js
@@ -34,63 +34,64 @@ var sort4FT = require( './sort4FT.js' );
var constants = require( './constants.js' );
var caseMap = [ 'other', 'lowerCase', 'upperCase', 'titleCase' ];
var swi = require( './sentence-wise-importance.js' );
+var reconstructSpaces = require( './reconstruct-spaces.js' );

// Size of a single token.
var tkSize = constants.tkSize;
// Bits reserved for `lemma`.
var bits4lemma = constants.bits4lemma;
// Mask for extracting pos
var posMask = constants.posMask;
-// Mask for preceding spaces.
-var psMask = constants.psMask;
// Mask for lemma in case of contraction.
var lemmaMask = constants.lemmaMask;

var its = Object.create( null );

-its.case = function ( index, tokens, cache ) {
-  return caseMap[ cache.property( tokens[ index * tkSize ], 'lutCase' ) ];
+its.case = function ( index, rdd ) {
+  return caseMap[ rdd.cache.property( rdd.tokens[ index * tkSize ], 'lutCase' ) ];
}; // case()

-its.uniqueId = function ( index, tokens ) {
-  return tokens[ index * tkSize ];
+its.uniqueId = function ( index, rdd ) {
+  return rdd.tokens[ index * tkSize ];
}; // uniqueId()

-its.negationFlag = function ( index, tokens ) {
-  return tokens[ ( index * tkSize ) + 3 ] >= constants.negationFlag;
+its.negationFlag = function ( index, rdd ) {
+  return rdd.tokens[ ( index * tkSize ) + 3 ] >= constants.negationFlag;
}; // negationFlag()

-its.normal = function ( index, tokens, cache ) {
+its.normal = function ( index, rdd ) {
+  var tokens = rdd.tokens;
+  var cache = rdd.cache;
  return (
    ( tokens[ ( index * tkSize ) + 1 ] > 65535 ) ?
      cache.value( cache.nox( tokens[ ( index * tkSize ) + 1 ] ) ) :
      cache.value( cache.normal( tokens[ index * tkSize ] ) )
  );
}; // normal()

-its.contractionFlag = function ( index, tokens ) {
-  return ( tokens[ ( index * tkSize ) + 1 ] > 65535 );
+its.contractionFlag = function ( index, rdd ) {
+  return ( rdd.tokens[ ( index * tkSize ) + 1 ] > 65535 );
}; // contractionFlag()

-its.pos = function ( index, tokens, cache ) {
-  return cache.valueOf( 'pos', ( tokens[ ( index * tkSize ) + 2 ] & posMask ) >>> bits4lemma ); // eslint-disable-line no-bitwise
+its.pos = function ( index, rdd ) {
+  return rdd.cache.valueOf( 'pos', ( rdd.tokens[ ( index * tkSize ) + 2 ] & posMask ) >>> bits4lemma ); // eslint-disable-line no-bitwise
}; // pos()

-its.precedingSpaces = function ( index, tokens ) {
-  var token = tokens[ ( index * tkSize ) + 1 ];
-  var count = token & psMask; // eslint-disable-line no-bitwise
-  return ( ''.padEnd( count ) );
+its.precedingSpaces = function ( index, rdd ) {
+  return reconstructSpaces( index, rdd );
}; // precedingSpaces()

-its.prefix = function ( index, tokens, cache ) {
-  return cache.property( tokens[ index * tkSize ], 'prefix' );
+its.prefix = function ( index, rdd ) {
+  return rdd.cache.property( rdd.tokens[ index * tkSize ], 'prefix' );
}; // prefix()

-its.shape = function ( index, tokens, cache ) {
-  return cache.property( tokens[ index * tkSize ], 'shape' );
+its.shape = function ( index, rdd ) {
+  return rdd.cache.property( rdd.tokens[ index * tkSize ], 'shape' );
}; // shape()

-its.stopWordFlag = function ( index, tokens, cache ) {
+its.stopWordFlag = function ( index, rdd ) {
+  var tokens = rdd.tokens;
+  var cache = rdd.cache;
  // Apply check on normalized token and not the original value, because
  // stop words are always defined in the lowercase.
  var normal = ( tokens[ ( index * tkSize ) + 1 ] > 65535 ) ?
@@ -99,27 +100,29 @@ its.stopWordFlag = function ( index, tokens, cache ) {
  return ( cache.property( normal, 'isStopWord' ) === 1 );
}; // stopWordFlag()

-its.abbrevFlag = function ( index, tokens, cache ) {
-  return ( cache.property( tokens[ index * tkSize ], 'isAbbrev' ) === 1 );
+its.abbrevFlag = function ( index, rdd ) {
+  return ( rdd.cache.property( rdd.tokens[ index * tkSize ], 'isAbbrev' ) === 1 );
}; // abbrevFlag()

-its.suffix = function ( index, tokens, cache ) {
-  return cache.property( tokens[ index * tkSize ], 'suffix' );
+its.suffix = function ( index, rdd ) {
+  return rdd.cache.property( rdd.tokens[ index * tkSize ], 'suffix' );
}; // suffix()

-its.type = function ( index, tokens, cache ) {
-  return cache.property( tokens[ index * tkSize ], 'tokenType' );
+its.type = function ( index, rdd ) {
+  return rdd.cache.property( rdd.tokens[ index * tkSize ], 'tokenType' );
}; // type()

-its.value = function ( index, tokens, cache ) {
-  return cache.value( tokens[ index * tkSize ] );
+its.value = function ( index, rdd ) {
+  return rdd.cache.value( rdd.tokens[ index * tkSize ] );
}; // value()

-its.stem = function ( index, tokens, cache, addons ) {
-  return addons.stem( cache.value( tokens[ index * tkSize ] ) );
+its.stem = function ( index, rdd, addons ) {
+  return addons.stem( rdd.cache.value( rdd.tokens[ index * tkSize ] ) );
}; // stem()

-its.lemma = function ( index, tokens, cache, addons ) {
+its.lemma = function ( index, rdd, addons ) {
+  var tokens = rdd.tokens;
+  var cache = rdd.cache;
  // If it is a contraction, the lemma is already available in the token's data structure.
  if ( tokens[ ( index * tkSize ) + 1 ] > 65535 ) {
    return cache.value( tokens[ ( index * tkSize ) + 2 ] & lemmaMask ); // eslint-disable-line no-bitwise
@@ -131,7 +134,7 @@ its.lemma = function ( index, tokens, cache, addons ) {
    return cache.value( cache.property( mappedIdx, 'lemma' ) );
  }
  // Exhausted all possibilities to avoid processing! Now lemmatize!
-  const pos = its.pos( index, tokens, cache );
+  const pos = its.pos( index, rdd );
  const value = cache.value( cache.normal( tokens[ index * tkSize ] ) );
  return addons.lemmatize( value, pos, cache );
}; // lemma()
@@ -144,11 +147,11 @@ its.detail = function ( ) {
  return true;
}; // detail()

-its.markedUpText = function ( index, tokens, cache ) {
+its.markedUpText = function ( index, rdd ) {
  // This is a special case because `tokens.out()` allows `as.markedUpText`.
  // Therefore simply return the value and the rest is handled by `colTokensOut` with
  // `as.markedUpText()` or `as.text()` as one of the arguments.
-  return its.value( index, tokens, cache );
+  return its.value( index, rdd );
}; // markedUpText()

its.span = function ( spanItem ) {
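
The common thread in this file: every `its.*` accessor now takes `( index, rdd, addons )` instead of `( index, tokens, cache, addons )`, so helpers like `its.precedingSpaces` can reach `rdd.nonBreakingSpaces` as well as `rdd.tokens` and `rdd.cache`. A hedged sketch of the calling convention — the exact `rdd` shape shown is an assumption inferred from this diff:

```js
// Assumed shape of the raw document data (rdd), inferred from the accessors above.
var rdd = {
  tokens: [],            // flat array, tkSize slots per token
  cache: null,           // lexeme cache exposing value()/property()/normal()
  nonBreakingSpaces: []  // literal whitespace keyed by token index
};

// Before this commit:  its.value( index, rdd.tokens, rdd.cache, addons )
// After this commit:   its.value( index, rdd, addons )
```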
46 changes: 46 additions & 0 deletions src/reconstruct-spaces.js
@@ -0,0 +1,46 @@
// wink-nlp
//
// Copyright (C) GRAYPE Systems Private Limited
//
// This file is part of “wink-nlp”.
//
// Permission is hereby granted, free of charge, to any
// person obtaining a copy of this software and
// associated documentation files (the "Software"), to
// deal in the Software without restriction, including
// without limitation the rights to use, copy, modify,
// merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to
// whom the Software is furnished to do so, subject to
// the following conditions:
//
// The above copyright notice and this permission notice
// shall be included in all copies or substantial
// portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
// ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
// TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
// PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
// CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
// CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS IN THE SOFTWARE.

//

var constants = require( './constants.js' );

// Size of a single token.
var tkSize = constants.tkSize;
// Mask for preceding spaces.
var psMask = constants.psMask;

var reconstructSpaces = function ( index, rdd ) {
var token = rdd.tokens[ ( index * tkSize ) + 1 ];
var count = token & psMask; // eslint-disable-line no-bitwise
return ( count < 0xFFFF ) ? ( ''.padEnd( count ) ) : rdd.nonBreakingSpaces[ index ];
}; // reconstructSpaces()

module.exports = reconstructSpaces;
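
A small usage sketch of the new helper, with hand-built `rdd` data (assuming `tkSize` is 4 and `psMask` is `0xFFFF`): counts below `0xFFFF` expand to that many plain spaces, while the sentinel redirects to the stored literal whitespace.

```js
// Hypothetical usage of reconstructSpaces; data values are illustrative.
var reconstructSpaces = require( './src/reconstruct-spaces.js' );

var rdd = {
  // Two tokens: the first has 2 plain preceding spaces; the second carries the sentinel.
  tokens: [ 11, 2, 0, 0, 12, 0xFFFF, 0, 0 ],
  nonBreakingSpaces: [ undefined, '\u00a0\u00a0' ]
};

console.log( reconstructSpaces( 0, rdd ) === '  ' );           // true — two plain spaces
console.log( reconstructSpaces( 1, rdd ) === '\u00a0\u00a0' ); // true — stored nbsp string
```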
12 changes: 8 additions & 4 deletions src/recursive-tokenizer.js
@@ -62,6 +62,8 @@ var tokenizer = function ( categories, preserve ) {
  var isLexeme;
  // Preceding Spaces — special need for recursive tokenizer.
  var ps = 0;
+  // Will only be needed for the first token; after that it is all zero (ps)!
+  var nonBreakingSpaces = null;

  // ### pushHyphenatedToken
  /**
@@ -293,7 +295,7 @@
      // No regex left, this is the true **unk**.
      // Because it is `UNK`, we can use `addToken` instead of attempting
      // `addTokenIfInCache`.
-      addToken( text, categories.unk, ps );
+      addToken( text, categories.unk, ps, nonBreakingSpaces );
      ps = 0;
      return;
    }
@@ -309,8 +311,8 @@
      // Use the passed value of preceding spaces only once!
      // First try cache, otherwise make a direct addition. This ensures
      // processing of expansions.
-      cat = addTokenIfInCache( tokens[ i ][ 0 ], ps );
-      if ( cat === categories.unk ) addToken( tokens[ i ][ 0 ], tokens[ i ][ 1 ], ps );
+      cat = addTokenIfInCache( tokens[ i ][ 0 ], ps, nonBreakingSpaces );
+      if ( cat === categories.unk ) addToken( tokens[ i ][ 0 ], tokens[ i ][ 1 ], ps, nonBreakingSpaces );
      // Reset `ps` to **0** as there can never be spaces in a text passed to
      // this tokenizer.
      ps = 0;
@@ -329,18 +331,20 @@
  * @param {string} text the input sentence.
  * @param {number} precedingSpaces to the text.
  * @param {object} doc contains the document; used here for adding tokens.
+ * @param {array} nbsp contains non-breaking space details.
  * @return {void} nothing!
  * @private
 */
-var tokenize = function ( rgxs, text, precedingSpaces, doc ) {
+var tokenize = function ( rgxs, text, precedingSpaces, doc, nbsp ) {
  // Cache frequently used doc methods.
  addToken = doc._addToken;
  addTokenIfInCache = doc._addTokenIfInCache;
  isLexeme = doc.isLexeme;
  // Set `ps` to the passed value of preceding spaces; it will be reset to **0**
  // after first use during recursion.
  ps = precedingSpaces;
+  nonBreakingSpaces = nbsp;
  tokenizeTextRecursively( text, rgxs, precedingSpaces );
}; // tokenize()
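
A hedged sketch of how a caller might thread the nbsp details into `tokenize()`; only the first token of the fragment can carry them, since `ps` is reset to 0 during recursion. The stubs and argument values here are illustrative assumptions, not the pipeline's real objects:

```js
// Hedged call-site sketch; in the real pipeline `rgxs` comes from the language
// model and `doc` is the document wrapper. Stubbed here only to stay runnable.
var rgxs = [];  // stub regex table driving the recursive split
var doc = {     // stub exposing the hooks tokenize() caches
  _addToken: function () { return true; },
  _addTokenIfInCache: function () { return 1; },
  isLexeme: function () { return null; }
};

// After this commit, callers pass the nbsp details as the fifth argument:
//   tokenize( rgxs, 'e.g.', 0xFFFF, doc, '\u00a0' );
// where 0xFFFF is the assumed preceding-spaces sentinel and '\u00a0' is the
// literal whitespace that preceded the fragment.
```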
