test(*): add accented chars & word joiner test cases

winkjs · Apr 12, 2024 · fe891c6
1 parent ab70a32
commit fe891c6
Show file tree

Hide file tree

Showing 3 changed files with 14 additions and 1 deletion.
diff --git a/test/its-specs.js b/test/its-specs.js
@@ -85,8 +85,19 @@ describe( 'its functions for .out()', function () {
   it( 'its.shape', function () {
     expect( nlp.readDoc( 'The' ).tokens().itemAt( 0 ).out( its.shape ) ).to.equal( 'Xxx' );
     expect( nlp.readDoc( 'TheOne' ).tokens().itemAt( 0 ).out( its.shape ) ).to.equal( 'XxxXxx' );
+    expect( nlp.readDoc( 'The\u2060One' ).tokens().itemAt( 0 ).out( its.shape ) ).to.equal( 'XxxXxx' );
     expect( nlp.readDoc( 'A1' ).tokens().itemAt( 0 ).out( its.shape ) ).to.equal( 'Xd' );
     expect( nlp.readDoc( 'Abcdef123456' ).tokens().itemAt( 0 ).out( its.shape ) ).to.equal( 'Xxxxxdddd' );
+    expect( nlp.readDoc( 'Poincar\u00e9' ).tokens().itemAt( 0 ).out( its.shape ) ).to.equal( 'Xxxxx' );
+    expect( nlp.readDoc( 'Poin\u2060car\u00e9' ).tokens().itemAt( 0 ).out( its.shape ) ).to.equal( 'Xxxxx' );
+    expect( nlp.readDoc( 'Poincare\u0301' ).tokens().itemAt( 0 ).out( its.shape ) ).to.equal( 'Xxxxx' );
+  } );
+
+  it( 'its.shape special cases', function () {
+    expect( nlp.readDoc( 'The\u2060One' ).tokens().itemAt( 0 ).out( its.shape ) ).to.equal( 'XxxXxx' );
+    expect( nlp.readDoc( 'Poincar\u00e9' ).tokens().itemAt( 0 ).out( its.shape ) ).to.equal( 'Xxxxx' );
+    expect( nlp.readDoc( 'Poin\u2060car\u00e9' ).tokens().itemAt( 0 ).out( its.shape ) ).to.equal( 'Xxxxx' );
+    expect( nlp.readDoc( 'Poincare\u0301' ).tokens().itemAt( 0 ).out( its.shape ) ).to.equal( 'Xxxxx' );
   } );
 
   it( 'its.type', function () {

diff --git a/test/test-model/feature.js b/test/test-model/feature.js
@@ -96,6 +96,7 @@ var feature = function ( config, lang, featuresData, isLexicographer ) {
   const rgxLC = /^[a-z][a-z\-\–\—\.]*$/;
   const rgxUC = /^[A-Z][A-Z\-\–\—\.]*$/;
   const rgxTC = /^[A-Z][a-z\-\–\—\.]*$/;
+  var rgxDiacriticalWordJoiner = /[\u0300-\u036f\u2060]/g;
 
   // The Regex, Category  pair goes in to this array for category detection &
   // assignment.
@@ -114,6 +115,7 @@ var feature = function ( config, lang, featuresData, isLexicographer ) {
   var shape = function ( word ) {
     return (
       word
+      .normalize( 'NFD' ).replace( rgxDiacriticalWordJoiner, '' )
         .replace( /[A-Z]{4,}/g, 'XXXX' )
         // Handle <4 Caps
         .replace( /[A-Z]/g, 'X' )

diff --git a/test/test-model/languages/cur/models/eng-core-web-model.json b/test/test-model/languages/cur/models/eng-core-web-model.json