Skip to content

Commit

Permalink
feat(*): add jaroWinkler function
Browse files Browse the repository at this point in the history
  • Loading branch information
sanjayaksaxena committed Oct 17, 2017
1 parent 123bec2 commit 20a66d3
Show file tree
Hide file tree
Showing 6 changed files with 288 additions and 7 deletions.
1 change: 1 addition & 0 deletions docs-toc.yml
Expand Up @@ -12,6 +12,7 @@ toc:
- string.hamming
- string.hammingNormalized
- string.jaro
- string.jaroWinkler
- string.soundex
- name: vector
- vector.chebyshev
Expand Down
133 changes: 133 additions & 0 deletions docs/index.html
Expand Up @@ -144,6 +144,16 @@ <h3 class='mb0 no-anchor'>wink-distance</h3>
</li>


<li><a
href='#stringjarowinkler'
class="">
string.jaroWinkler

</a>

</li>


<li><a
href='#stringsoundex'
class="">
Expand Down Expand Up @@ -973,6 +983,129 @@ <h3 class='fl m0' id='stringjaro'>



</section>




<section class='p2 mb2 clearfix bg-white minishadow'>


<div class='clearfix'>

<h3 class='fl m0' id='stringjarowinkler'>
string.jaroWinkler
</h3>


</div>


<p>Computes the jaro winkler distance between two strings. This distance,
controlled by the <code>scalingFactor</code>, is always between 0 and 1.</p>


<div class='pre p1 fill-light mt0'>string.jaroWinkler</div>











<div class='py1 quiet mt1 prose-big'>Parameters</div>
<div class='prose'>

<div class='space-bottom0'>
<div>
<span class='code bold'>str1</span> <code class='quiet'>(<a href="#string">string</a>)</code>
— first string.

</div>

</div>

<div class='space-bottom0'>
<div>
<span class='code bold'>str2</span> <code class='quiet'>(<a href="#string">string</a>)</code>
— second string.

</div>

</div>

<div class='space-bottom0'>
<div>
<span class='code bold'>boostThreshold</span> <code class='quiet'>(<a href="#number">number</a>
= <code>0.3</code>)</code>
— threshold up to which scaling is applied: it is
applied only if the jaro distance between the input strings is less than or
equal to this value. Any value &gt; 1 is capped at 1 automatically.

</div>

</div>

<div class='space-bottom0'>
<div>
<span class='code bold'>scalingFactor</span> <code class='quiet'>(<a href="#number">number</a>
= <code>0.1</code>)</code>
— is used to scale the distance.
Such scaling, if applied, is proportional to the number of shared
consecutive characters from the first character of
<code>str1</code>
and
<code>str2</code>
.
Any value &gt; 0.25 is capped at 0.25 automatically.

</div>

</div>

</div>






<div class='py1 quiet mt1 prose-big'>Returns</div>
<code><a href="#number">number</a></code>:
jaro winkler distance between
<code>str1</code>
and
<code>str2</code>
.








<div class='py1 quiet mt1 prose-big'>Example</div>


<pre class='p1 overflow-auto round fill-light'>jaroWinkler( <span class="hljs-string">'martha'</span>, <span class="hljs-string">'marhta'</span> );
<span class="hljs-comment">// -&gt; 0.03888888888888883</span>
jaroWinkler( <span class="hljs-string">'martha'</span>, <span class="hljs-string">'marhta'</span>, <span class="hljs-number">0.3</span>, <span class="hljs-number">0.2</span> );
<span class="hljs-comment">// -&gt; 0.022222222222222185</span>
jaroWinkler( <span class="hljs-string">'duane'</span>, <span class="hljs-string">'dwayne'</span> );
<span class="hljs-comment">// -&gt; .15999999999999992</span></pre>








</section>


Expand Down
82 changes: 82 additions & 0 deletions src/string-jaro-winkler.js
@@ -0,0 +1,82 @@
// wink-distance
// Distance functions for Bag of Words, Strings,
// Vectors and more.
//
// Copyright (C) 2017 GRAYPE Systems Private Limited
//
// This file is part of “wink-distance”.
//
// “wink-distance” is free software: you can redistribute
// it and/or modify it under the terms of the GNU Affero
// General Public License as published by the Free
// Software Foundation, version 3 of the License.
//
// “wink-distance” is distributed in the hope that it will
// be useful, but WITHOUT ANY WARRANTY; without even
// the implied warranty of MERCHANTABILITY or FITNESS
// FOR A PARTICULAR PURPOSE. See the GNU Affero General
// Public License for more details.
//
// You should have received a copy of the GNU Affero
// General Public License along with “wink-distance”.
// If not, see <http://www.gnu.org/licenses/>.

var jaro = require( './string-jaro.js' );
// ## string

// ### jaroWinkler
/**
 *
 * Measures how far apart two strings are using the jaro winkler scheme: the
 * jaro distance is reduced in proportion to the length of the prefix that the
 * two strings share, as governed by the `scalingFactor`. The resulting
 * distance is always between 0 and 1.
 *
 * @name string.jaroWinkler
 * @param {string} str1 — first string.
 * @param {string} str2 — second string.
 * @param {number} [boostThreshold=0.3] — highest jaro distance for which the
 * prefix scaling is still applied; a jaro distance above this value is returned
 * unchanged. Its absolute value is used and anything > 1 is capped at 1.
 * @param {number} [scalingFactor=0.1] — weight of every leading character (at
 * most 4 are considered) that `str1` and `str2` have in common; the distance
 * shrinks by this fraction for each such character. Its absolute value is used
 * and anything > 0.25 is capped at 0.25.
 * @return {number} jaro winkler distance between `str1` and `str2`.
 * @example
 * jaroWinkler( 'martha', 'marhta' );
 * // -> 0.03888888888888883
 * jaroWinkler( 'martha', 'marhta', 0.3, 0.2 );
 * // -> 0.022222222222222185
 * jaroWinkler( 'duane', 'dwayne' );
 * // -> 0.15999999999999992
 */
var jaroWinkler = function ( str1, str2, boostThreshold, scalingFactor ) {
  var scale, threshold, jaroDistance, maxPrefix, prefixLength;

  // Identical inputs are at zero distance — nothing to compute.
  if ( str1 === str2 ) return 0;

  // Fall back to the defaults for whatever was not supplied.
  scale = scalingFactor;
  if ( scale === undefined ) scale = 0.1;
  threshold = boostThreshold;
  if ( threshold === undefined ) threshold = 0.3;

  // Ignore the sign and clamp both knobs to their upper limits.
  scale = Math.min( Math.abs( scale ), 0.25 );
  threshold = Math.min( Math.abs( threshold ), 1 );

  jaroDistance = jaro( str1, str2 );

  // Strings that are too far apart do not receive the prefix boost.
  if ( jaroDistance > threshold ) return jaroDistance;

  // Count the leading characters shared by both strings, 4 at the most.
  maxPrefix = Math.min( str1.length, str2.length, 4 );
  prefixLength = 0;
  while ( prefixLength < maxPrefix && str1[ prefixLength ] === str2[ prefixLength ] ) {
    prefixLength += 1;
  }

  // Shrink the distance in proportion to the shared prefix.
  return jaroDistance - ( prefixLength * scale * jaroDistance );
}; // jaroWinkler()

module.exports = jaroWinkler;
8 changes: 5 additions & 3 deletions src/wink-distance.js
Expand Up @@ -55,9 +55,11 @@ wd.string.hamming = require( './string-hamming.js' );
// 2. hammingNormalized
wd.string.hammingNormalized = require( './string-hamming-normalized.js' );
// 3. jaro
// NOTE(review): the extension-less requires just below and the `.js` ones that
// follow them look like removed and added diff lines shown together (jaro and
// soundex are each assigned twice) — confirm against the actual file.
wd.string.jaro = require( './string-jaro' );
// 4. soundex
wd.string.soundex = require( './string-soundex' );
wd.string.jaro = require( './string-jaro.js' );
// 4. jaroWinkler
wd.string.jaroWinkler = require( './string-jaro-winkler.js' );
// 5. soundex
wd.string.soundex = require( './string-soundex.js' );

// Vector name space.
// 1. taxicab
Expand Down
16 changes: 12 additions & 4 deletions test/string-jaro-specs.js
Expand Up @@ -32,16 +32,24 @@ var it = mocha.it;

// Table-driven spec for `jaro`: every entry pairs two input strings with the
// distance expected after rounding, and one `it` case is generated per entry.
// NOTE(review): this hunk appears to show removed and added diff lines together
// (lowercase cases with 4-decimal expectations next to uppercase cases with
// 3-decimal ones, and two `expect` lines with different `toFixed` precision) —
// confirm against the actual file before editing.
describe( 'string-jaro normal behaviour', function () {
var tests = [
{ whenInputIs: { str1: 'john', str2: 'johny' }, expectedOutputIs: 0.0667 },
{ whenInputIs: { str1: 'sam', str2: 'sat' }, expectedOutputIs: 0.2222 },
{ whenInputIs: { str1: 'summer', str2: 'samuel' }, expectedOutputIs: 0.3056 },
{ whenInputIs: { str1: 'SHACKLEFORD', str2: 'SHACKELFORD' }, expectedOutputIs: 0.030 },
{ whenInputIs: { str1: 'DUNNINGHAM', str2: 'CUNNIGHAM' }, expectedOutputIs: 0.104 },
{ whenInputIs: { str1: 'JONES', str2: 'JOHNSON' }, expectedOutputIs: 0.210 },
{ whenInputIs: { str1: 'MASSEY', str2: 'MASSIE' }, expectedOutputIs: 0.111 },
{ whenInputIs: { str1: 'ABROMS', str2: 'ABRAMS' }, expectedOutputIs: 0.111 },
{ whenInputIs: { str1: 'DWAYNE', str2: 'DUANE' }, expectedOutputIs: 0.178 },
{ whenInputIs: { str1: 'SEAN', str2: 'SUSAN' }, expectedOutputIs: 0.217 },
{ whenInputIs: { str1: 'MICHELLE', str2: 'MICHAEL' }, expectedOutputIs: 0.131 },
{ whenInputIs: { str1: 'MARHTA', str2: 'MARTHA' }, expectedOutputIs: 0.056 },
{ whenInputIs: { str1: 'TANYA', str2: 'TONYA' }, expectedOutputIs: 0.133 },
// Boundary cases: nothing in common, identical strings, and two empty strings.
{ whenInputIs: { str1: 'sat', str2: 'urn' }, expectedOutputIs: 1 },
{ whenInputIs: { str1: 'saturn', str2: 'saturn' }, expectedOutputIs: 0 },
{ whenInputIs: { str1: '', str2: '' }, expectedOutputIs: 0 },
];

tests.forEach( function ( test ) {
it( 'should return ' + JSON.stringify( test.expectedOutputIs ) + ' if the input is ' + JSON.stringify( test.whenInputIs ), function () {
// The unary plus turns the `toFixed` string back into a number for the comparison.
expect( +jaro( test.whenInputIs.str1, test.whenInputIs.str2 ).toFixed( 4 ) ).to.equal( test.expectedOutputIs );
expect( +jaro( test.whenInputIs.str1, test.whenInputIs.str2 ).toFixed( 3 ) ).to.equal( test.expectedOutputIs );
} );
} );
} );
55 changes: 55 additions & 0 deletions test/string-jaro-winkler-specs.js
@@ -0,0 +1,55 @@
// wink-distance
// Distance functions for Bag of Words, Strings,
// Vectors and more.
//
// Copyright (C) 2017 GRAYPE Systems Private Limited
//
// This file is part of “wink-distance”.
//
// “wink-distance” is free software: you can redistribute
// it and/or modify it under the terms of the GNU Affero
// General Public License as published by the Free
// Software Foundation, version 3 of the License.
//
// “wink-distance” is distributed in the hope that it will
// be useful, but WITHOUT ANY WARRANTY; without even
// the implied warranty of MERCHANTABILITY or FITNESS
// FOR A PARTICULAR PURPOSE. See the GNU Affero General
// Public License for more details.
//
// You should have received a copy of the GNU Affero
// General Public License along with “wink-distance”.
// If not, see <http://www.gnu.org/licenses/>.

//
var chai = require( 'chai' );
var mocha = require( 'mocha' );
var jaroWinkler = require( '../src/wink-distance.js' ).string.jaroWinkler;

var expect = chai.expect;
var describe = mocha.describe;
var it = mocha.it;

// Table-driven spec for `string.jaroWinkler`. Every entry pairs the inputs with
// the distance expected after rounding to 3 decimals; one `it` case is
// generated per entry. `boostThreshold` and `scalingFactor` are optional in an
// entry — when absent they are passed as `undefined`, which makes the function
// fall back to its defaults.
describe( 'string-jaro-winkler normal behaviour', function () {
  var tests = [
    { whenInputIs: { str1: 'SHACKLEFORD', str2: 'SHACKELFORD' }, expectedOutputIs: 0.018 },
    { whenInputIs: { str1: 'DUNNINGHAM', str2: 'CUNNIGHAM' }, expectedOutputIs: 0.104 },
    { whenInputIs: { str1: 'JONES', str2: 'JOHNSON' }, expectedOutputIs: 0.168 },
    { whenInputIs: { str1: 'MASSEY', str2: 'MASSIE' }, expectedOutputIs: 0.067 },
    { whenInputIs: { str1: 'ABROMS', str2: 'ABRAMS' }, expectedOutputIs: 0.078 },
    { whenInputIs: { str1: 'DWAYNE', str2: 'DUANE' }, expectedOutputIs: 0.160 },
    { whenInputIs: { str1: 'SEAN', str2: 'SUSAN' }, expectedOutputIs: 0.195 },
    { whenInputIs: { str1: 'MICHELLE', str2: 'MICHAEL' }, expectedOutputIs: 0.079 },
    { whenInputIs: { str1: 'MARHTA', str2: 'MARTHA' }, expectedOutputIs: 0.039 },
    { whenInputIs: { str1: 'TANYA', str2: 'TONYA' }, expectedOutputIs: 0.120 },
    // Boundary cases: nothing in common, identical strings, and two empty strings.
    { whenInputIs: { str1: 'sat', str2: 'urn' }, expectedOutputIs: 1 },
    { whenInputIs: { str1: 'saturn', str2: 'saturn' }, expectedOutputIs: 0 },
    { whenInputIs: { str1: '', str2: '' }, expectedOutputIs: 0 },
    // Explicit scaling factor: a larger factor shrinks the distance further.
    { whenInputIs: { str1: 'MARHTA', str2: 'MARTHA', boostThreshold: 0.3, scalingFactor: 0.2 }, expectedOutputIs: 0.022 },
    // Explicit boost threshold below the jaro distance: no scaling, so the plain jaro distance is returned.
    { whenInputIs: { str1: 'JONES', str2: 'JOHNSON', boostThreshold: 0.2 }, expectedOutputIs: 0.210 },
  ];

  tests.forEach( function ( test ) {
    var input = test.whenInputIs;
    it( 'should return ' + JSON.stringify( test.expectedOutputIs ) + ' if the input is ' + JSON.stringify( input ), function () {
      // The unary plus turns the `toFixed` string back into a number for the comparison.
      expect( +jaroWinkler( input.str1, input.str2, input.boostThreshold, input.scalingFactor ).toFixed( 3 ) ).to.equal( test.expectedOutputIs );
    } );
  } );
} );

0 comments on commit 20a66d3

Please sign in to comment.