diff --git a/.eslintrc.json b/.eslintrc.json index 5d76fd8..c8004ba 100644 --- a/.eslintrc.json +++ b/.eslintrc.json @@ -28,6 +28,7 @@ "no-restricted-syntax": ["warn"], "no-shadow": ["off"], "no-underscore-dangle": ["off"], + "prefer-const": ["off"], "quotes" :["off"], "strict": ["off"] } diff --git a/README.md b/README.md index 99aa957..e0528bf 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,7 @@ $ npm install string-dsa List of supported Data Structures and Algorithms are : +- [Aho Corasick](https://github.com/thsubaku9/string-dsa/blob/main/src/AhoCorasick.js) - [Bloom Filter](https://github.com/thsubaku9/string-dsa/blob/main/src/BloomFilter.js) - [Bracket Balance](https://github.com/thsubaku9/string-dsa/blob/main/src/bracketBalance.js) - [Custom Sort](https://github.com/thsubaku9/string-dsa/blob/main/src/stringSort.js) diff --git a/package.json b/package.json index 926e5dc..c01f279 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "string-dsa", - "version": "1.2.2", + "version": "1.3.0", "description": "String Data Structures and Algorithms Library in JavaScript", "main": "src/index.js", "files": [ @@ -21,6 +21,7 @@ "keywords": [ "String Algorithms", "String Data Structures", + "Aho Corasick", "Bloom Filter", "Custom Sort", "Levenshtein Distance", diff --git a/src/index.js b/src/index.js index 24e4def..2637afd 100644 --- a/src/index.js +++ b/src/index.js @@ -3,12 +3,12 @@ const BloomFilter = require('./BloomFilter'); const customSort = require('./stringSort'); const diceCoeff = require('./dice_coeff'); const editDist = require('./edit_distance'); -const { kmp } = require('./search'); +const { AhoCorasick, kmp, rabinKarp } = require('./search'); const lcs = require('./lcs'); -const { rabinKarp } = require('./search'); const Trie = require('./Trie'); module.exports = { + AhoCorasick, BloomFilter, bracketBalance, customSort, diff --git a/src/search/AhoCorasick.js b/src/search/AhoCorasick.js index e229947..390fc14 100644 --- 
/**
 * Aho-Corasick multi-keyword searcher.
 *
 * Internal representation:
 *  - stateTransitions:  Map<state, Map<character, nextState>> (the keyword trie, state 0 is the root)
 *  - failureTransition: Array indexed by state, holding the state to fall back to on a mismatch
 *  - stateOutput:       Array indexed by state, holding the ascending list of keyword indices
 *                       (positions in keyWords) that end when the automaton is in that state
 */
class AC {
  /**
   * Builds the automaton for a fixed list of keywords.
   *
   * @param {String[]} keywordList keywords to look for; empty strings are ignored
   * @param {Number} maxStates the number of states that are allowed (maxStates >= total number of letters in keywordlist). Set as -1 to allow the algorithm to figure out by itself
   */
  constructor(keywordList, maxStates = -1) {
    // Worst case is one state per keyword letter, plus the root state.
    this.totalStates = maxStates > 0
      ? maxStates
      : keywordList.reduce((letterCount, str) => letterCount + str.length, 0) + 1;
    // Spread copy: `new Array(...list)` would treat a single numeric element as a length.
    this.keyWords = [...keywordList];
    // Highest state number handed out so far (the root, 0, always exists).
    this.states = 0;

    this.stateTransitions = new Map();
    this.stateTransitions.set(0, new Map());

    this.failureTransition = new Array(this.totalStates);
    this.stateOutput = Array.from({ length: this.totalStates }, () => []);

    // Build the trie first, then derive failure links and inherited outputs.
    for (let i = 0; i < this.keyWords.length; i++) {
      this._initialInsert(this.keyWords[i], i);
    }
    this._exploreBFS();
  }

  /**
   * Computes failure links and propagates outputs, visiting states level by level.
   *
   * The traversal has to be breadth-first (FIFO): the failure link of a state is
   * derived from the failure links of strictly shallower states, so those must
   * already be final when the state is processed.
   */
  _exploreBFS() {
    const queue = [];

    this.failureTransition[0] = 0;
    // Every depth-1 state falls back to the root.
    for (const state of this.stateTransitions.get(0).values()) {
      this.failureTransition[state] = 0;
      queue.push(state);
    }

    // A moving head index gives FIFO order without the cost of Array#shift.
    for (let head = 0; head < queue.length; head++) {
      const currentState = queue[head];

      for (const [symbol, nextState] of this.stateTransitions.get(currentState)) {
        // Walk the failure chain of the parent until a state can consume `symbol`.
        let failureState = this.failureTransition[currentState];
        while (failureState !== 0 && !this.stateTransitions.get(failureState).has(symbol)) {
          failureState = this.failureTransition[failureState];
        }
        const fallback = this.stateTransitions.get(failureState).get(symbol);
        if (fallback !== undefined) failureState = fallback;

        this.failureTransition[nextState] = failureState;

        // Keywords ending at the failure state are suffixes of the current path,
        // so they end here as well.
        const inherited = this.stateOutput[failureState];
        if (inherited.length > 0) {
          const merged = new Set(this.stateOutput[nextState].concat(inherited));
          this.stateOutput[nextState] = Array.from(merged).sort((a, b) => a - b);
        }

        queue.push(nextState);
      }
    }
  }

  /**
   * Records that the keyword at `keywordIndex` ends in state `currentStateNumber`.
   *
   * @param {Number} currentStateNumber state in which the keyword ends
   * @param {Number} keywordIndex position of the keyword in keyWords
   */
  _setOutput(currentStateNumber, keywordIndex) {
    if (this.stateOutput[currentStateNumber] === undefined) {
      this.stateOutput[currentStateNumber] = [];
    }
    const outputs = this.stateOutput[currentStateNumber];
    if (outputs.indexOf(keywordIndex) === -1) outputs.push(keywordIndex);
  }

  /**
   * Adds one keyword to the trie, creating states as needed.
   *
   * @param {String} keyword the keyword to insert
   * @param {Number} keywordIndex position of the keyword in keyWords
   */
  _initialInsert(keyword, keywordIndex) {
    // An empty keyword has no final state to attach an output to.
    if (keyword.length === 0) return;

    let currentState = 0;
    // Indexed access (UTF-16 code units) so insertion matches how find() reads the text.
    for (let i = 0; i < keyword.length; i++) {
      const symbol = keyword.charAt(i);
      const transitions = this.stateTransitions.get(currentState);
      let nextState = transitions.get(symbol);

      if (nextState === undefined) {
        this.states += 1;
        nextState = this.states;
        transitions.set(symbol, nextState);
        this.stateTransitions.set(nextState, new Map());
        // Covers a caller-supplied maxStates that is smaller than the states actually needed.
        if (this.stateOutput[nextState] === undefined) this.stateOutput[nextState] = [];
      }

      currentState = nextState;
    }

    this._setOutput(currentState, keywordIndex);
  }

  /**
   * Scans the text once and reports every keyword occurrence.
   *
   * @param {String} searchSpace the text to be searched over
   *
   * @returns {Array[]} one [start, end] pair (inclusive indices into searchSpace) per
   * occurrence, ordered by end position and then by keyword index
   */
  find(searchSpace) {
    const resultLocation = [];
    let currentState = 0;

    for (let i = 0; i < searchSpace.length; i++) {
      const symbol = searchSpace[i];

      // Fall back until some state can consume the character; the root absorbs anything.
      while (currentState !== 0 && !this.stateTransitions.get(currentState).has(symbol)) {
        currentState = this.failureTransition[currentState];
      }
      const nextState = this.stateTransitions.get(currentState).get(symbol);
      if (nextState !== undefined) currentState = nextState;

      for (const keywordIndex of this.stateOutput[currentState]) {
        resultLocation.push([i - (this.keyWords[keywordIndex].length - 1), i]);
      }
    }

    return resultLocation;
  }
}