Skip to content

Commit

Permalink
feat(*): add support for non-regular spaces
Browse files Browse the repository at this point in the history
support em/en, third/quarter, thin/hair, medium math spaces &
regular/narrow nbsp

references winkjs/wink-eng-lite-web-model#15
  • Loading branch information
sanjayaksaxena committed May 19, 2024
1 parent 8baf2dd commit 5623c32
Show file tree
Hide file tree
Showing 3 changed files with 51 additions and 2 deletions.
2 changes: 1 addition & 1 deletion src/tokenizer.js
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,7 @@ var tokenizer = function ( trex, categories, preserve ) {
// Skip empty (`''`) token.
if ( !t ) continue; // eslint-disable-line no-continue
// Non-empty token:
const hasNBSP = ( /\u00a0/ ).test( t );
const hasNBSP = ( /[\u00a0\u2002-\u2005\u2009\u200a\u202f\u205f]/ ).test( t );
if ( t[ 0 ] === ' ' || hasNBSP ) {
// This indicates spaces: count them.
precedingSpaces = t.length;
Expand Down

Large diffs are not rendered by default.

49 changes: 49 additions & 0 deletions test/wink-nlp-specs.js
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,16 @@ describe( 'wink-nlp test-coverage and basic behavior', function () {
expect( nlp.readDoc( nbspTokensArray.join(' \u00a0\u00a0') ).out() ).to.equal( nbspTokensArray.join(' \u00a0\u00a0') );
} );

it( 'should tokenize/detokenize the text with non-regular spaces', function () {
  // Round-trip check: joining the tokens with a separator and reading the
  // result back through `nlp.readDoc().out()` must reproduce the input exactly.
  // Separators are built from U+2002–U+2005, U+2009, U+200A, U+202F and U+205F,
  // as a single run and as a doubled run, with and without a leading regular space.
  // NOTE(review): the 2nd and 3rd entries of each group are identical as written —
  // confirm whether differing counts of leading regular spaces were intended.
  const separators = [
    '\u2002\u2003\u2004\u2005\u2009\u200a\u202f\u205f',
    ' \u2002\u2003\u2004\u2005\u2009\u200a\u202f\u205f',
    ' \u2002\u2003\u2004\u2005\u2009\u200a\u202f\u205f',
    '\u2002\u2003\u2004\u2005\u2009\u200a\u202f\u205f\u2002\u2003\u2004\u2005\u2009\u200a\u202f\u205f',
    ' \u2002\u2003\u2004\u2005\u2009\u200a\u202f\u205f\u2002\u2003\u2004\u2005\u2009\u200a\u202f\u205f',
    ' \u2002\u2003\u2004\u2005\u2009\u200a\u202f\u205f\u2002\u2003\u2004\u2005\u2009\u200a\u202f\u205f'
  ];

  separators.forEach( ( separator ) => {
    const joinedText = nbspTokensArray.join( separator );
    // Reconstruction.
    expect( nlp.readDoc( joinedText ).out() ).to.equal( joinedText );
  } );
} );

it( 'should tokenize/detokenize a sentence with non-breaking spaces', function () {
var textWith2S = 'I met Mr.\u00a0Gandhi. Mr.\u00a0Gandhi is a nice person.';
var sentences = nlp.readDoc( textWith2S ).sentences();
Expand All @@ -154,6 +164,16 @@ describe( 'wink-nlp test-coverage and basic behavior', function () {
} );
} );

it( 'should tokenize/detokenize a sentence with non-regular spaces', function () {
  // Two sentences; in each, "Mr." and "Gandhi" are separated by a run of
  // non-regular space characters instead of a plain space.
  const inputText = 'I met Mr.\u2002\u2003\u2004\u2005\u2009\u200a\u202f\u205fGandhi. Mr.\u2002\u2003\u2004\u2005\u2009\u200a\u202f\u205fGandhi is a nice person.';
  // Expected text of each sentence, indexed by sentence position.
  const expectedSentences = [
    'I met Mr.\u2002\u2003\u2004\u2005\u2009\u200a\u202f\u205fGandhi.',
    'Mr.\u2002\u2003\u2004\u2005\u2009\u200a\u202f\u205fGandhi is a nice person.'
  ];
  const detected = nlp.readDoc( inputText ).sentences();

  // Reconstruction: every detected sentence must match its expected text.
  detected.each( function ( sentence, index ) {
    expect( sentence.out() ).to.equal( expectedSentences[ index ] );
  } );
} );

it( 'should tokenize/detokenize the entities\' value as text with non-breaking spaces', function () {
var textWith2S = 'I purchased 10 mangoes on March\u00a010th for US$\u00a099.00.';
var entities = nlp.readDoc( textWith2S ).entities();
Expand All @@ -165,6 +185,18 @@ describe( 'wink-nlp test-coverage and basic behavior', function () {
} );
} );

it( 'should tokenize/detokenize the entities\' value as text with non-regular spaces', function () {
  // The date and the money amount each contain a run of non-regular space
  // characters between their parts; the entity text must retain that run.
  var textWith2S = 'I purchased 10 mangoes on March\u2002\u2003\u2004\u2005\u2009\u200a\u202f\u205f10th for US$\u2002\u2003\u2004\u2005\u2009\u200a\u202f\u205f99.00.';
  var entities = nlp.readDoc( textWith2S ).entities();
  // Expected entity texts, indexed by entity position.
  var entitiesText = [ '10', 'March\u2002\u2003\u2004\u2005\u2009\u200a\u202f\u205f10th', 'US$\u2002\u2003\u2004\u2005\u2009\u200a\u202f\u205f99.00' ];
  // Reconstruction: the default output and the explicit `its.value, as.text`
  // output must both equal the expected entity text.
  entities.each( ( e, k ) => {
    expect( e.out( ) ).to.equal( entitiesText[ k ] );
    expect( e.out( its.value, as.text ) ).to.equal( entitiesText[ k ] );
  } );
} );

it( 'should preserve non-breaking spaces with mark up', function () {
var textWith2S = 'I purchased mangoes on March\u00a010th for US$\u00a099.00.';
var doc4mark = nlp.readDoc( textWith2S );
Expand All @@ -174,6 +206,15 @@ describe( 'wink-nlp test-coverage and basic behavior', function () {
expect( doc4mark.out(its.markedUpText) ).to.equal( markedText );
} );

it( 'should preserve non-regular spaces with mark up', function () {
  const inputText = 'I purchased mangoes on March\u2002\u2003\u2004\u2005\u2009\u200a\u202f\u205f10th for US$\u2002\u2003\u2004\u2005\u2009\u200a\u202f\u205f99.00.';
  // Same text with each entity wrapped in <mark>…</mark>; the non-regular
  // spaces inside the entities must be left untouched.
  const expectedMarkup = 'I purchased mangoes on <mark>March\u2002\u2003\u2004\u2005\u2009\u200a\u202f\u205f10th</mark> for <mark>US$\u2002\u2003\u2004\u2005\u2009\u200a\u202f\u205f99.00</mark>.';

  const markedDoc = nlp.readDoc( inputText );
  // Mark up every detected entity.
  markedDoc.entities().each( function ( entity ) {
    entity.markup();
  } );

  // Reconstruction.
  expect( markedDoc.out( its.markedUpText ) ).to.equal( expectedMarkup );
} );

it( 'should correctly reconstruct non-breaking spaces with its.precedingSpaces', function () {
var text = 'U.S.A is my birth place. \u00a0 I was born\u00a0on 06.12.1924.';
var reconstructed = [];
Expand All @@ -182,6 +223,14 @@ describe( 'wink-nlp test-coverage and basic behavior', function () {
expect( reconstructed.join( '' ) ).to.equal( ' \u00a0 I was born\u00a0on 06.12.1924.' );
} );

it( 'should correctly reconstruct non-regular spaces with its.precedingSpaces', function () {
  const inputText = 'U.S.A is my birth place. \u2002\u2003\u2004\u2005\u2009\u200a\u202f\u205f I was born\u2002\u2003\u2004\u2005\u2009\u200a\u202f\u205fon 06.12.1924.';
  // Alternating pieces: each token's preceding spaces followed by the token text.
  const pieces = [];
  const secondSentenceTokens = nlp.readDoc( inputText ).sentences().itemAt( 1 ).tokens();

  secondSentenceTokens.each( function ( token ) {
    pieces.push( token.out( its.precedingSpaces ) );
    pieces.push( token.out() );
  } );

  // Reconstruction: concatenating the pieces must yield the second sentence
  // together with the spacing that precedes it.
  expect( pieces.join( '' ) ).to.equal( ' \u2002\u2003\u2004\u2005\u2009\u200a\u202f\u205f I was born\u2002\u2003\u2004\u2005\u2009\u200a\u202f\u205fon 06.12.1924.' );
} );

it( 'should not contain empty tokens', function () {
var doc = nlp.readDoc( sentence );
expect( findEmptyTokens( doc ) ).deep.equal( [] );
Expand Down

0 comments on commit 5623c32

Please sign in to comment.