feat(*): add support for non-regular spaces

Support em/en, third/quarter, thin/hair, and medium math spaces, plus regular/narrow nbsp. References winkjs/wink-eng-lite-web-model#15.
winkjs · May 19, 2024 · 5623c32 · 5623c32
1 parent 8baf2dd
commit 5623c32
Show file tree

Hide file tree

Showing 3 changed files with 51 additions and 2 deletions.
diff --git a/src/tokenizer.js b/src/tokenizer.js
@@ -197,7 +197,7 @@ var tokenizer = function ( trex, categories, preserve ) {
       // Skip empty (`''`) token.
       if ( !t ) continue; // eslint-disable-line no-continue
       // Non-empty token:
-      const hasNBSP = ( /\u00a0/ ).test( t );
+      const hasNBSP = ( /[\u00a0\u2002-\u2005\u2009\u200a\u202f\u205f]/ ).test( t );
       if ( t[ 0 ] === ' ' || hasNBSP ) {
         // This indicates spaces: count them.
         precedingSpaces = t.length;

diff --git a/test/test-model/languages/cur/models/eng-core-web-model.json b/test/test-model/languages/cur/models/eng-core-web-model.json
diff --git a/test/wink-nlp-specs.js b/test/wink-nlp-specs.js
@@ -144,6 +144,16 @@ describe( 'wink-nlp test-coverage and basic behavior', function () {
     expect( nlp.readDoc( nbspTokensArray.join('  \u00a0\u00a0') ).out() ).to.equal( nbspTokensArray.join('  \u00a0\u00a0') );
   } );
 
+  it( 'should tokenize/detokenize the text with non-regular spaces', function () {
+    // En, em, three-per-em, four-per-em, thin, hair, narrow no-break & medium math spaces.
+    var spaces = '\u2002\u2003\u2004\u2005\u2009\u200a\u202f\u205f';
+    // Reconstruction: one or two runs of them, led by zero, one or two regular spaces.
+    [ spaces, ' ' + spaces, '  ' + spaces, spaces + spaces, ' ' + spaces + spaces, '  ' + spaces + spaces ].forEach( function ( separator ) {
+      var text = nbspTokensArray.join( separator );
+      expect( nlp.readDoc( text ).out() ).to.equal( text );
+    } );
+  } );
+
   it( 'should tokenize/detokenize a sentence with non-breaking spaces', function () {
     var textWith2S = 'I met Mr.\u00a0Gandhi. Mr.\u00a0Gandhi is a nice person.';
     var sentences = nlp.readDoc( textWith2S ).sentences();
@@ -154,6 +164,16 @@ describe( 'wink-nlp test-coverage and basic behavior', function () {
     } );
   } );
 
+  it( 'should tokenize/detokenize a sentence with non-regular spaces', function () {
+    var spaces = '\u2002\u2003\u2004\u2005\u2009\u200a\u202f\u205f';
+    var expected = [ 'I met Mr.' + spaces + 'Gandhi.', 'Mr.' + spaces + 'Gandhi is a nice person.' ];
+    // Reconstruction: the input is the expected sentences, a single regular space apart.
+    var sentences = nlp.readDoc( expected.join( ' ' ) ).sentences();
+    sentences.each( function ( sentence, index ) {
+      expect( sentence.out() ).to.equal( expected[ index ] );
+    } );
+  } );
+
   it( 'should tokenize/detokenize the entities\' value as text with non-breaking spaces', function () {
     var textWith2S = 'I purchased 10 mangoes on March\u00a010th for US$\u00a099.00.';
     var entities = nlp.readDoc( textWith2S ).entities();
@@ -165,6 +185,18 @@ describe( 'wink-nlp test-coverage and basic behavior', function () {
     } );
   } );
 
+  it( 'should tokenize/detokenize the entities\' value as text with non-regular spaces', function () {
+    var textWith2S = 'I purchased 10 mangoes on March\u2002\u2003\u2004\u2005\u2009\u200a\u202f\u205f10th for US$\u2002\u2003\u2004\u2005\u2009\u200a\u202f\u205f99.00.';
+    var entities = nlp.readDoc( textWith2S ).entities();
+    var entitiesText = [ '10',  'March\u2002\u2003\u2004\u2005\u2009\u200a\u202f\u205f10th', 'US$\u2002\u2003\u2004\u2005\u2009\u200a\u202f\u205f99.00' ];
+    // Reconstruction: both the default output and the value rendered as text
+    // must give back each entity with its non-regular spaces intact.
+    entities.each( ( e, k ) => {
+      expect( e.out( ) ).to.equal( entitiesText[ k ] );
+      expect( e.out( its.value, as.text ) ).to.equal( entitiesText[ k ] );
+    } );
+  } );
+
   it( 'should preserve non-breaking spaces with mark up', function () {
     var textWith2S = 'I purchased mangoes on March\u00a010th for US$\u00a099.00.';
     var doc4mark = nlp.readDoc( textWith2S );
@@ -174,6 +206,15 @@ describe( 'wink-nlp test-coverage and basic behavior', function () {
     expect( doc4mark.out(its.markedUpText) ).to.equal( markedText );
   } );
 
+  it( 'should preserve non-regular spaces with mark up', function () {
+    var spaces = '\u2002\u2003\u2004\u2005\u2009\u200a\u202f\u205f';
+    var phrases = [ 'March' + spaces + '10th', 'US$' + spaces + '99.00' ];
+    var doc = nlp.readDoc( 'I purchased mangoes on ' + phrases[ 0 ] + ' for ' + phrases[ 1 ] + '.' );
+    doc.entities().each( function ( entity ) { return entity.markup(); } );
+    // Reconstruction: each phrase is expected back inside <mark> tags, spaces untouched.
+    expect( doc.out( its.markedUpText ) ).to.equal( 'I purchased mangoes on <mark>' + phrases[ 0 ] + '</mark> for <mark>' + phrases[ 1 ] + '</mark>.' );
+  } );
+
   it( 'should correctly reconstruct non-breaking spaces with its.precedingSpaces', function () {
     var text = 'U.S.A is my birth place.  \u00a0 I was born\u00a0on 06.12.1924.';
     var reconstructed = [];
@@ -182,6 +223,14 @@ describe( 'wink-nlp test-coverage and basic behavior', function () {
     expect( reconstructed.join( '' ) ).to.equal( '  \u00a0 I was born\u00a0on 06.12.1924.' );
   } );
 
+  it( 'should correctly reconstruct non-regular spaces with its.precedingSpaces', function () {
+    var tail = [ '  ', ' I was born', 'on 06.12.1924.' ].join( '\u2002\u2003\u2004\u2005\u2009\u200a\u202f\u205f' );
+    var pieces = [];
+    // Reconstruction: rebuild the sentence at index 1 from each token and the spaces before it.
+    nlp.readDoc( 'U.S.A is my birth place.' + tail ).sentences().itemAt( 1 ).tokens().each( function ( token ) { return pieces.push( token.out( its.precedingSpaces ), token.out() ); } );
+    expect( pieces.join( '' ) ).to.equal( tail );
+  } );
+
   it( 'should not contain empty tokens', function () {
     var doc = nlp.readDoc( sentence  );
     expect( findEmptyTokens( doc ) ).deep.equal( [] );