Skip to content

Commit

Permalink
test(*): add accented chars & word joiner test cases
Browse files Browse the repository at this point in the history
  • Loading branch information
sanjayaksaxena committed Apr 12, 2024
1 parent ab70a32 commit fe891c6
Show file tree
Hide file tree
Showing 3 changed files with 14 additions and 1 deletion.
11 changes: 11 additions & 0 deletions test/its-specs.js
Expand Up @@ -85,8 +85,19 @@ describe( 'its functions for .out()', function () {
// Checks the word-shape string returned by `its.shape` for the first token of
// each sample: upper-case letters map to `X`, lower-case to `x`, digits to `d`.
it( 'its.shape', function () {
  // Each entry is [ input text, expected shape of its first token ].
  var samples = [
    [ 'The', 'Xxx' ],
    [ 'TheOne', 'XxxXxx' ],
    // U+2060 (word joiner) inside the token must not alter the shape.
    [ 'The\u2060One', 'XxxXxx' ],
    [ 'A1', 'Xd' ],
    // Runs of 5 lower-case letters and 6 digits each show up as 4 symbols.
    [ 'Abcdef123456', 'Xxxxxdddd' ],
    // Precomposed e-acute (U+00E9) is expected to count as a lower-case letter.
    [ 'Poincar\u00e9', 'Xxxxx' ],
    [ 'Poin\u2060car\u00e9', 'Xxxxx' ],
    // Decomposed form: plain `e` followed by combining acute accent (U+0301).
    [ 'Poincare\u0301', 'Xxxxx' ]
  ];

  samples.forEach( function ( sample ) {
    var firstToken = nlp.readDoc( sample[ 0 ] ).tokens().itemAt( 0 );
    expect( firstToken.out( its.shape ) ).to.equal( sample[ 1 ] );
  } );
} );

// Checks `its.shape` on tokens containing a word joiner (U+2060) and/or an
// accented character, in both precomposed and decomposed form.
it( 'its.shape special cases', function () {
  // Each entry is [ input text, expected shape of its first token ].
  var samples = [
    [ 'The\u2060One', 'XxxXxx' ],
    [ 'Poincar\u00e9', 'Xxxxx' ],
    [ 'Poin\u2060car\u00e9', 'Xxxxx' ],
    // Plain `e` followed by combining acute accent (U+0301).
    [ 'Poincare\u0301', 'Xxxxx' ]
  ];

  samples.forEach( function ( sample ) {
    var firstToken = nlp.readDoc( sample[ 0 ] ).tokens().itemAt( 0 );
    expect( firstToken.out( its.shape ) ).to.equal( sample[ 1 ] );
  } );
} );

it( 'its.type', function () {
Expand Down
2 changes: 2 additions & 0 deletions test/test-model/feature.js
Expand Up @@ -96,6 +96,7 @@ var feature = function ( config, lang, featuresData, isLexicographer ) {
// Matches a word that starts with a lower-case ASCII letter and continues with
// only lower-case ASCII letters, hyphens, dashes or periods.
const rgxLC = /^[a-z][a-z\-\–\—\.]*$/;
// Same as above, but for upper-case ASCII letters throughout.
const rgxUC = /^[A-Z][A-Z\-\–\—\.]*$/;
// Matches an initial upper-case ASCII letter followed by only lower-case ASCII
// letters, hyphens, dashes or periods.
const rgxTC = /^[A-Z][a-z\-\–\—\.]*$/;
// Matches every combining diacritical mark (U+0300–U+036F) and every word
// joiner (U+2060); global so that `.replace()` strips all occurrences. Meant
// to be applied after NFD normalization, which splits accented letters into a
// base letter plus combining marks. Declared `const` like its siblings so it
// cannot be reassigned by accident.
const rgxDiacriticalWordJoiner = /[\u0300-\u036f\u2060]/g;

// Each (regex, category) pair goes into this array, which is used for
// category detection and assignment.
Expand All @@ -114,6 +115,7 @@ var feature = function ( config, lang, featuresData, isLexicographer ) {
var shape = function ( word ) {
return (
word
.normalize( 'NFD' ).replace( rgxDiacriticalWordJoiner, '' )
.replace( /[A-Z]{4,}/g, 'XXXX' )
// Handle <4 Caps
.replace( /[A-Z]/g, 'X' )
Expand Down

Large diffs are not rendered by default.

0 comments on commit fe891c6

Please sign in to comment.