Skip to content

Commit

Permalink
feat: add bowOf() method in bm25 vectorizer
Browse files Browse the repository at this point in the history
  • Loading branch information
sanjayaksaxena committed Jan 30, 2022
1 parent 68fb316 commit 0b7b72b
Show file tree
Hide file tree
Showing 3 changed files with 57 additions and 2 deletions.
16 changes: 16 additions & 0 deletions test/bm25-vectorizer-specs.js
Expand Up @@ -120,6 +120,10 @@ describe( 'bm25-vectorizer', function () {
expect( v.vectorOf( [ 'rain', 'is', 'going', 'away' ] ) ).to.deep.equal( [ 0.287682, 0, 0.287682 ] );
} );

it( 'bowOf() should return bow of tokens', function () {
  // The expected bow carries entries for 'rain' and 'away' only.
  const tokens = [ 'rain', 'is', 'going', 'away' ];
  const expectedBow = { away: 0.287682, rain: 0.287682 };
  expect( v.bowOf( tokens ) ).to.deep.equal( expectedBow );
} );

it( 'doc.out( its.tf ) should return freq table of terms', function () {
  // Each entry of the table is a [ term, weight ] pair.
  const expectedTable = [ [ 'rain', 0.395563 ], [ 'away', 0.287682 ], [ 'go', 0.287682 ] ];
  const firstDoc = v.doc( 0 );
  expect( firstDoc.out( its.tf ) ).to.deep.equal( expectedTable );
} );
Expand Down Expand Up @@ -183,6 +187,10 @@ describe( 'bm25-vectorizer', function () {
it( 'vectorOf() should return its vector', function () {
  // Whitespace-split the sentence to obtain the input tokens.
  const tokens = 'rats were blue'.split( /\s+/g );
  const expectedVector = [ 0, 0, 0.901808, 0, 0.432138, 0, 0 ];
  expect( v.vectorOf( tokens ) ).to.deep.equal( expectedVector );
} );

it( 'bowOf() should return its bow', function () {
  // The expected bow has no entry for 'were'; only 'blue' and 'rats' carry weights.
  const tokens = 'rats were blue'.split( /\s+/g );
  const expectedBow = { blue: 0.901808, rats: 0.432138 };
  expect( v.bowOf( tokens ) ).to.deep.equal( expectedBow );
} );
} );

describe( 'learn from multiple documents with l1 norm', function () {
Expand Down Expand Up @@ -252,6 +260,10 @@ describe( 'bm25-vectorizer', function () {
it( 'should return 0-vector', function () {
  // The expected vector is all zeros for this token list.
  const tokens = [ 'cat', 'cat', 'green', 'is' ];
  const zeroVector = [ 0, 0, 0, 0, 0 ];
  expect( v.vectorOf( tokens ) ).to.deep.equal( zeroVector );
} );

it( 'should return empty bow', function () {
  // The bow for this token list is expected to have no entries at all.
  const tokens = [ 'cat', 'cat', 'green', 'is' ];
  const emptyBow = {};
  expect( v.bowOf( tokens ) ).to.deep.equal( emptyBow );
} );
} );

describe( 'completely OOV tokens with l2 norm', function () {
Expand All @@ -264,6 +276,10 @@ describe( 'bm25-vectorizer', function () {
it( 'should return 0-vector', function () {
  // The expected vector is all zeros for this token list.
  const tokens = [ 'cat', 'cat', 'green', 'is' ];
  const zeroVector = [ 0, 0, 0, 0, 0 ];
  expect( v.vectorOf( tokens ) ).to.deep.equal( zeroVector );
} );

it( 'should return empty bow', function () {
  // The bow for this token list is expected to have no entries at all.
  const tokens = [ 'cat', 'cat', 'green', 'is' ];
  const emptyBow = {};
  expect( v.bowOf( tokens ) ).to.deep.equal( emptyBow );
} );
} );

describe( 'load model json', function () {
Expand Down
3 changes: 2 additions & 1 deletion types/index.d.ts
Expand Up @@ -304,7 +304,7 @@ declare module 'wink-nlp/utilities/bm25-vectorizer' {
// turn off exporting by default since we don't want to expose internal details
export { };

import { Tokens, Document, ItsFunction } from 'wink-nlp';
import { Tokens, Document, ItsFunction, Bow } from 'wink-nlp';

/** Names of the normalization options understood by the BM25 vectorizer. */
export type Norm = "l1" | "l2" | "none";

Expand All @@ -320,6 +320,7 @@ declare module 'wink-nlp/utilities/bm25-vectorizer' {
out<T>(f: ItsFunction<T>): T;
doc(n: number): Document;
vectorOf(tokens: Tokens): number[];
bowOf(tokens: Tokens): Bow;
config(): BM25VectorizerConfig;
loadModel(json: string): void;
}
Expand Down
40 changes: 39 additions & 1 deletion utilities/bm25-vectorizer.js
Expand Up @@ -276,7 +276,7 @@ var bm25Vectorizer = function ( config ) {
/**
* Computes the vector of the input document given in form of tokens using
* the tf-idf learned so far.
* @param {string} tokens tokenized document, usually obtained via winkNLP.
* @param {string[]} tokens tokenized document, usually obtained via winkNLP.
* @return {number[]} its vector.
*/
methods.vectorOf = function ( tokens ) {
Expand All @@ -302,9 +302,47 @@ var bm25Vectorizer = function ( config ) {
} else if ( norm === NONE ) thisNorm = 1;

// `thisNorm || 1` ensures that there is no attempt to divide by zero!
// This may happen if all tokens are unseen.
return arr.map( ( v ) => +( v / ( thisNorm || 1 ) ).toFixed( precision ) );
}; // vectorOf()

// ## bowOf
/**
 * Builds the bag-of-words of a tokenized text: every token with a non-zero
 * learned idf is mapped to its BM25 weight, divided by the configured norm
 * and rounded to `precision` decimal places. All other tokens are left out.
 * @param {string[]} tokens tokenized text, usually obtained via winkNLP.
 * @return {object} its bow; it has no entries if no token has a usable idf.
 */
methods.bowOf = function ( tokens ) {
  computeWeights();
  const weights = Object.create( null );
  const avgDL = sumOfAllDLs / docId;
  // Length-dependent part of the BM25 denominator; identical for every term,
  // so it is computed once up front.
  const lengthTerm = k1 * ( 1 - b + ( b * ( tokens.length / avgDL ) ) );

  // Pass 1: count occurrences, skipping tokens for which no idf is available.
  for ( let idx = 0; idx < tokens.length; idx += 1 ) {
    const token = tokens[ idx ];
    if ( idf[ token ] ) {
      weights[ token ] = ( weights[ token ] || 0 ) + 1;
    }
  }

  // `weights` has no prototype, so its own keys are all that there is to visit.
  const terms = Object.keys( weights );

  // Pass 2: turn each count into its BM25 weight and accumulate the norm.
  let normSum = 0;
  terms.forEach( ( term ) => {
    const tf = weights[ term ];
    const score = idf[ term ] * ( ( k1 + 1 ) * tf ) / ( lengthTerm + tf );
    weights[ term ] = score;
    normSum += normFn[ norm ]( score );
  } );

  let divisor = normSum;
  if ( norm === L2 ) {
    divisor = Math.sqrt( normSum );
  } else if ( norm === NONE ) {
    divisor = 1;
  }

  // Pass 3: normalize and round. A zero divisor needs no guard here, because
  // a zero norm implies that there are no terms left to divide.
  terms.forEach( ( term ) => {
    weights[ term ] = +( weights[ term ] / divisor ).toFixed( precision );
  } );

  return weights;
}; // bowOf()

methods.config = ( () => ( { k: k, k1: k1, b: b, norm: norm } ) );

// ## loadModel
Expand Down

0 comments on commit 0b7b72b

Please sign in to comment.