diff --git a/src/packages/BooleanQueryTokenizer/BooleanQueryTokenizer.ts b/src/packages/BooleanQueryTokenizer/BooleanQueryTokenizer.ts index 84249f918..fa40533ae 100644 --- a/src/packages/BooleanQueryTokenizer/BooleanQueryTokenizer.ts +++ b/src/packages/BooleanQueryTokenizer/BooleanQueryTokenizer.ts @@ -15,6 +15,7 @@ * form the keyword. For identifiers and some other tokens, the pattern is more complex * structure that is matched by many strings. */ +/* eslint-disable class-methods-use-this */ export enum TokenType { unknown, @@ -36,6 +37,9 @@ type Token = { type: TokenType; lexeme: string; }; +type Extract = { + wordsAndPhrases: string[]; +}; // Character classes const spaceSymbols = '\\s\\t\\n'; @@ -62,7 +66,7 @@ const separatorOrEnd = `(?:${separator}|$)`; const parentheses = '(?:\\(|\\)|\\[|\\]|\\{|\\}|\\<|\\>)'; const wildcard = '\\*'; const phrase = '(?:"[^"]*"|\'[^\']*\')'; -const simpleOperator = '(?:OR|AND|NEAR|NOT|\\&|\\!|\\|)'; +const simpleOperator = '(?:OR|AND|NEAR|NOT|\\&|\\|)'; const andNotOperator = `AND${separator}NOT`; const aroundOperator = `AROUND${separator}\\d+`; @@ -79,7 +83,7 @@ export class BooleanQueryTokenizer { private static readonly tokenPatterns: TokenPattern[] = [ [TokenType.separator, new RegExp(`^(?:${separator}|$)`)], [TokenType.word, new RegExp(`^${word}`)], - [TokenType.modifier, new RegExp(`^${wordModifier}(?=${word})`)], + [TokenType.modifier, new RegExp(`^(?:${wordModifier})(?=${phrase}|\\(|${word})`)], [ TokenType.wildcard, new RegExp(`^(?:${wildcard}(?=${word})|${wildcard}(?=${separatorOrEnd}))`), @@ -88,8 +92,10 @@ export class BooleanQueryTokenizer { [TokenType.phrase, new RegExp(`^${phrase}`)], [ TokenType.booleanOperator, + // Cases like `NOT word`, `NOT(group...`, `AND"a phrase"` new RegExp( - `^(?:${andNotOperator}|${aroundOperator}|${simpleOperator})(?=${separatorOrEnd})` + `^(?:${andNotOperator}|${aroundOperator}|${simpleOperator})` + + `(?=${separatorOrEnd}|\\(|${phrase})` ), ], [TokenType.parentheses, new RegExp(`^${parentheses}`)], @@ -99,11 +105,11 @@ export class BooleanQueryTokenizer { * Tokenizer splits an input text into meaningful chunks and labels each chunk according to its * lexical meaning, suchwise producing a list of tokens. */ - public static tokenize(inputString: string): Token[] { + public tokenize(inputString: string): Token[] { const tokens: Token[] = []; let remainingInputString = inputString; while (remainingInputString.length > 0) { - const token = BooleanQueryTokenizer.tokenizeCurrentChunk(remainingInputString); + const token = this.tokenizeCurrentChunk(remainingInputString); if (token) { // If token has been found, reduce the `remainingInputString ` // on the length of the matched lexeme and add the token to the result. @@ -129,9 +135,90 @@ export class BooleanQueryTokenizer { return tokens; } + /** + * @deprecated use BooleanQueryTokenizer#extractWords instead + */ + public extractWordsAndPhrases(inputString: string): string[] { + return this.filterWordsAndPhrases(this.tokenize(inputString)); + } + + /** + * Extracts words and phrases from boolean query. Ignores negated words and phrases. + * Supports negation of words, phrases, and groups (by parentheses). + * If there is not enough closing parentheses, negation is not be applied. + * @param inputString - boolean query string + */ + public extractWords(inputString: string): Extract { + const tokens = this.tokenize(inputString); + for (let i = 0; i < tokens.length; i += 1) { + const token = tokens[i]; + if (this.isNegation(token) && i + 1 < tokens.length) { + let j = i + 1; + while (j < tokens.length) { + const lookAheadToken = tokens[j]; + if (lookAheadToken.type === TokenType.separator) { + j += 1; + } else if (this.isNegationable(lookAheadToken)) { + this.resetSubsequenceToUnknown(tokens, i, j); + i = j; + break; + } else if (this.isParenthesisGroupStart(lookAheadToken)) { + let parenthesisTracker = 0; + for (let k = j; k < tokens.length; k += 1) { + const parenthesisLoopToken = tokens[k]; + if (parenthesisLoopToken.type === TokenType.parentheses) { + if (parenthesisLoopToken.lexeme === '(') { + parenthesisTracker += 1; + } else if (parenthesisLoopToken.lexeme === ')') { + parenthesisTracker -= 1; + } + } + if (parenthesisTracker === 0) { + this.resetSubsequenceToUnknown(tokens, i, k); + i = k; + break; + } + } + break; + } else { + break; + } + } + } + } + // Filter only words and phrases + return { + wordsAndPhrases: this.filterWordsAndPhrases(tokens), + }; + } + + /** + * @deprecated use `BooleanQueryTokenizer#tokenize` + */ + public static tokenize(inputString: string): Token[] { + const booleanQueryTokenizer = new BooleanQueryTokenizer(); + return booleanQueryTokenizer.tokenize(inputString); + } + + /** + * @deprecated use `BooleanQueryTokenizer#extractWords` + */ public static extractWordsAndPhrases(inputString: string): string[] { + const booleanQueryTokenizer = new BooleanQueryTokenizer(); + return booleanQueryTokenizer.extractWordsAndPhrases(inputString); + } + + /** + * @deprecated use `BooleanQueryTokenizer#extractWords` + */ + public static extractWords(inputString: string): Extract { + const booleanQueryTokenizer = new BooleanQueryTokenizer(); + return booleanQueryTokenizer.extractWords(inputString); + } + + private filterWordsAndPhrases(tokens: Token[]): string[] { return ( - BooleanQueryTokenizer.tokenize(inputString) + tokens // Filter only words and phrases .filter((token) => token.type === TokenType.word || token.type === TokenType.phrase) .map((wordOrPhrase) => { @@ -148,11 +235,44 @@ export class BooleanQueryTokenizer { ); } + private resetSubsequenceToUnknown(tokens: Token[], from: number, to: number): Token[] { + tokens.forEach((_token, index) => { + if (index >= from && index <= to) { + // eslint-disable-next-line no-param-reassign + tokens[index] = { + type: TokenType.unknown, + lexeme: '', + }; + } + }); + return tokens; + } + + private isNegation(token: Token): boolean { + return ( + (token.type === TokenType.booleanOperator && + (token.lexeme === 'NOT' || token.lexeme === 'AND NOT')) || + (token.type === TokenType.modifier && (token.lexeme === '-' || token.lexeme === '!')) + ); + } + + private isNegationable(token: Token): boolean { + return ( + token.type === TokenType.word || + token.type === TokenType.wildcardWord || + token.type === TokenType.phrase + ); + } + + private isParenthesisGroupStart(token: Token): boolean { + return token.type === TokenType.parentheses && token.lexeme === '('; + } + /** * `tokenizeCurrentChunk` tries to find the best possible match for the current * part of the input string */ - private static tokenizeCurrentChunk(remainingInputString: string): Token | undefined { + private tokenizeCurrentChunk(remainingInputString: string): Token | undefined { let tokenCandidate: Token | undefined; BooleanQueryTokenizer.tokenPatterns.forEach((tokenPattern) => { const [name, tokenRegExp] = tokenPattern; diff --git a/src/packages/BooleanQueryTokenizer/__tests__/BooleanQueryTokenizer.spec.js b/src/packages/BooleanQueryTokenizer/__tests__/BooleanQueryTokenizer.spec.js index a16070008..14131ca02 100644 --- a/src/packages/BooleanQueryTokenizer/__tests__/BooleanQueryTokenizer.spec.js +++ b/src/packages/BooleanQueryTokenizer/__tests__/BooleanQueryTokenizer.spec.js @@ -3,9 +3,10 @@ import { BooleanQueryTokenizer, TokenType } from '..'; describe('modules/BooleanQueryTokenizer', () => { describe('#tokenize()', () => { it('should tokenize boolean query respecting all types of tokens', () => { + const booleanQueryTokenizer = new BooleanQueryTokenizer(); expect( - BooleanQueryTokenizer.tokenize( - '-php AND\t\n+c++ +--+ - java* NOT *script ' + + booleanQueryTokenizer.tokenize( + '-php AND\t\n+c++ +--+ - java* NOT *script C++ L-3 Yum! ' + 'NEAR (rust OR web*hueb) OR -.NET "TO BE OR NOT TO BE"' ) ).toEqual([ @@ -29,6 +30,12 @@ describe('modules/BooleanQueryTokenizer', () => { { type: TokenType.wildcard, lexeme: '*' }, { type: TokenType.word, lexeme: 'script' }, { type: TokenType.separator, lexeme: ' ' }, + { type: TokenType.word, lexeme: 'C++' }, + { type: TokenType.separator, lexeme: ' ' }, + { type: TokenType.word, lexeme: 'L-3' }, + { type: TokenType.separator, lexeme: ' ' }, + { type: TokenType.word, lexeme: 'Yum!' }, + { type: TokenType.separator, lexeme: ' ' }, { type: TokenType.booleanOperator, lexeme: 'NEAR' }, { type: TokenType.separator, lexeme: ' ' }, { type: TokenType.parentheses, lexeme: '(' }, @@ -46,11 +53,11 @@ describe('modules/BooleanQueryTokenizer', () => { { type: TokenType.separator, lexeme: ' ' }, { type: TokenType.phrase, lexeme: '"TO BE OR NOT TO BE"' }, ]); - expect(BooleanQueryTokenizer.tokenize('web*')).toEqual([ + expect(booleanQueryTokenizer.tokenize('web*')).toEqual([ { type: TokenType.word, lexeme: 'web' }, { type: TokenType.wildcard, lexeme: '*' }, ]); - expect(BooleanQueryTokenizer.tokenize('to_be OR')).toEqual([ + expect(booleanQueryTokenizer.tokenize('to_be OR')).toEqual([ { type: TokenType.word, lexeme: 'to_be' }, { type: TokenType.separator, lexeme: ' ' }, { type: TokenType.booleanOperator, lexeme: 'OR' }, @@ -60,21 +67,24 @@ describe('modules/BooleanQueryTokenizer', () => { describe('#buildBagOfWords()', () => { it('should handle words boundaries correctly', () => { - expect(BooleanQueryTokenizer.extractWordsAndPhrases('')).toEqual([]); - expect(BooleanQueryTokenizer.extractWordsAndPhrases(' ')).toEqual([]); + const booleanQueryTokenizer = new BooleanQueryTokenizer(); + expect(booleanQueryTokenizer.extractWordsAndPhrases('')).toEqual([]); + expect(booleanQueryTokenizer.extractWordsAndPhrases(' ')).toEqual([]); expect( - BooleanQueryTokenizer.extractWordsAndPhrases(' \n \n \t foo \t bar \n\n') + booleanQueryTokenizer.extractWordsAndPhrases(' \n \n \t foo \t bar \n\n') ).toEqual(['foo', 'bar']); }); it('should handle operators correctly', () => { - expect(BooleanQueryTokenizer.extractWordsAndPhrases('NEAROP')).toEqual(['NEAROP']); - expect(BooleanQueryTokenizer.extractWordsAndPhrases('NEAR')).toEqual([]); + const booleanQueryTokenizer = new BooleanQueryTokenizer(); + expect(booleanQueryTokenizer.extractWordsAndPhrases('NEAROP')).toEqual(['NEAROP']); + expect(booleanQueryTokenizer.extractWordsAndPhrases('NEAR')).toEqual([]); }); it('should tokenize words properly', () => { + const booleanQueryTokenizer = new BooleanQueryTokenizer(); expect( - BooleanQueryTokenizer.extractWordsAndPhrases(` + booleanQueryTokenizer.extractWordsAndPhrases(` php c++ canal+ 23AndMe U x THE.BEST.COMPANY 888.com 3M WD40 Forever21 360 7/11 1and1 L-3 37signals 20x200 _lodash Macy's Yum! `) @@ -103,16 +113,18 @@ describe('modules/BooleanQueryTokenizer', () => { }); it('should tokenize words that contain non-latin characters', () => { + const booleanQueryTokenizer = new BooleanQueryTokenizer(); expect( - BooleanQueryTokenizer.extractWordsAndPhrases( + booleanQueryTokenizer.extractWordsAndPhrases( 'Düsseldorf Köln "Набережные Челны" Москва 北京市 إسرائيل' ) ).toEqual(['Düsseldorf', 'Köln', 'Набережные Челны', 'Москва', '北京市', 'إسرائيل']); }); it('should tokenize words with modifiers', () => { + const booleanQueryTokenizer = new BooleanQueryTokenizer(); expect( - BooleanQueryTokenizer.extractWordsAndPhrases( + booleanQueryTokenizer.extractWordsAndPhrases( '-php +c++ d-- +++ -canal+ + Yo!!! -- +23AndMe --- !U !x - !Yum! +.NET' ) ).toEqual([ @@ -130,34 +142,35 @@ describe('modules/BooleanQueryTokenizer', () => { }); it('should respect quoted phrases', () => { - expect(BooleanQueryTokenizer.extractWordsAndPhrases('""')).toEqual([]); - expect(BooleanQueryTokenizer.extractWordsAndPhrases("''")).toEqual([]); + const booleanQueryTokenizer = new BooleanQueryTokenizer(); + expect(booleanQueryTokenizer.extractWordsAndPhrases('""')).toEqual([]); + expect(booleanQueryTokenizer.extractWordsAndPhrases("''")).toEqual([]); expect( - BooleanQueryTokenizer.extractWordsAndPhrases('"a double-quoted phrase"') + booleanQueryTokenizer.extractWordsAndPhrases('"a double-quoted phrase"') ).toEqual(['a double-quoted phrase']); expect( - BooleanQueryTokenizer.extractWordsAndPhrases("'another phrase in single quotes'") + booleanQueryTokenizer.extractWordsAndPhrases("'another phrase in single quotes'") ).toEqual(['another phrase in single quotes']); expect( - BooleanQueryTokenizer.extractWordsAndPhrases( + booleanQueryTokenizer.extractWordsAndPhrases( '" phrase with some leading and trailing word separators \t\n "' ) ).toEqual(['phrase with some leading and trailing word separators']); expect( - BooleanQueryTokenizer.extractWordsAndPhrases( + booleanQueryTokenizer.extractWordsAndPhrases( '"a +phrase OR with some NOT boolean operators AND"' ) ).toEqual(['a +phrase OR with some NOT boolean operators AND']); expect( - BooleanQueryTokenizer.extractWordsAndPhrases('"a +phrase* *with* *some wild*cards"') + booleanQueryTokenizer.extractWordsAndPhrases('"a +phrase* *with* *some wild*cards"') ).toEqual(['a +phrase with some wildcards']); expect( - BooleanQueryTokenizer.extractWordsAndPhrases( + booleanQueryTokenizer.extractWordsAndPhrases( '& "a phrase & with some - special | characters inside" and * outside' ) ).toEqual(['a phrase & with some - special | characters inside', 'and', 'outside']); expect( - BooleanQueryTokenizer.extractWordsAndPhrases( + booleanQueryTokenizer.extractWordsAndPhrases( 'not only -"a phrase"+ but also some other words AND stuff' ) ).toEqual([ @@ -174,9 +187,10 @@ describe('modules/BooleanQueryTokenizer', () => { }); it('should handle boolean operators and parentheses', () => { - expect(BooleanQueryTokenizer.extractWordsAndPhrases('OR java AND')).toEqual(['java']); + const booleanQueryTokenizer = new BooleanQueryTokenizer(); + expect(booleanQueryTokenizer.extractWordsAndPhrases('OR java AND')).toEqual(['java']); expect( - BooleanQueryTokenizer.extractWordsAndPhrases(` + booleanQueryTokenizer.extractWordsAndPhrases(` Rust AND c++ OR (D R Python) OR (web NEAR developer) NOT manager AND [java AND spring] AROUND 5 boot NOT OR ORGANIST NEAR NEARNESS AROUND 10 AROUNDNESS @@ -209,30 +223,153 @@ describe('modules/BooleanQueryTokenizer', () => { }); it('should respect wildcards', () => { - expect(BooleanQueryTokenizer.extractWordsAndPhrases('* * *')).toEqual([]); + const booleanQueryTokenizer = new BooleanQueryTokenizer(); + expect(booleanQueryTokenizer.extractWordsAndPhrases('* * *')).toEqual([]); expect( - BooleanQueryTokenizer.extractWordsAndPhrases('do you speak english mother* ?') + booleanQueryTokenizer.extractWordsAndPhrases('do you speak english mother* ?') ).toEqual(['do', 'you', 'speak', 'english', 'mother']); expect( - BooleanQueryTokenizer.extractWordsAndPhrases( + booleanQueryTokenizer.extractWordsAndPhrases( "Fasten your *belts. It's going to be a * night." ) ).toEqual(['Fasten', 'your', 'belts.', "It's", 'going', 'to', 'be', 'a', 'night.']); expect( - BooleanQueryTokenizer.extractWordsAndPhrases('people like to * each other') + booleanQueryTokenizer.extractWordsAndPhrases('people like to * each other') ).toEqual(['people', 'like', 'to', 'each', 'other']); }); it('should ignore wildcards inside quotes', () => { - expect(BooleanQueryTokenizer.extractWordsAndPhrases('"*"')).toEqual([]); - expect(BooleanQueryTokenizer.extractWordsAndPhrases('"yo* "')).toEqual(['yo']); - expect(BooleanQueryTokenizer.extractWordsAndPhrases('python "is a *"')).toEqual([ + const booleanQueryTokenizer = new BooleanQueryTokenizer(); + expect(booleanQueryTokenizer.extractWordsAndPhrases('"*"')).toEqual([]); + expect(booleanQueryTokenizer.extractWordsAndPhrases('"yo* "')).toEqual(['yo']); + expect(booleanQueryTokenizer.extractWordsAndPhrases('python "is a *"')).toEqual([ 'python', 'is a', ]); expect( - BooleanQueryTokenizer.extractWordsAndPhrases('"*prog*ramming* is fun."') + booleanQueryTokenizer.extractWordsAndPhrases('"*prog*ramming* is fun."') ).toEqual(['programming is fun.']); }); }); + + describe('#extractWords()', () => { + it('should filter out terms, phrases negated by NOT', () => { + const booleanQueryTokenizer = new BooleanQueryTokenizer(); + expect( + booleanQueryTokenizer.extractWords( + 'typescript AND NOT lua OR NOT php NOT NOT NOT kotlin java' + ) + ).toEqual({ + wordsAndPhrases: ['typescript', 'java'], + }); + expect( + booleanQueryTokenizer.extractWords( + 'typescript AND NOT "C sharp" OR NOT"java developer" java' + ) + ).toEqual({ + wordsAndPhrases: ['typescript', 'java'], + }); + }); + + it('should filter out groups negated by NOT', () => { + const booleanQueryTokenizer = new BooleanQueryTokenizer(); + expect( + booleanQueryTokenizer.extractWords( + 'typescript NOT(php AND java) java NOT (kotlin VBScript)' + ) + ).toEqual({ + wordsAndPhrases: ['typescript', 'java'], + }); + expect( + booleanQueryTokenizer.extractWords( + 'typescript NOT((php AND java NOT php) OR (php AND java NOT(foo OR bar))) java' + ) + ).toEqual({ + wordsAndPhrases: ['typescript', 'java'], + }); + expect( + booleanQueryTokenizer.extractWords('typescript NOT(((((php AND java))))) java') + ).toEqual({ + wordsAndPhrases: ['typescript', 'java'], + }); + }); + + it('should filter out terms, phrases negated by minus (-)', () => { + const booleanQueryTokenizer = new BooleanQueryTokenizer(); + expect( + booleanQueryTokenizer.extractWords('typescript AND -lua OR -php - - -kotlin java') + ).toEqual({ + wordsAndPhrases: ['typescript', 'java'], + }); + expect( + booleanQueryTokenizer.extractWords( + 'typescript AND -"C sharp" OR -"java developer" java' + ) + ).toEqual({ + wordsAndPhrases: ['typescript', 'java'], + }); + }); + + it('should filter out groups negated by minus (-)', () => { + const booleanQueryTokenizer = new BooleanQueryTokenizer(); + expect( + booleanQueryTokenizer.extractWords( + 'typescript -(php AND java) java -(kotlin VBScript)' + ) + ).toEqual({ + wordsAndPhrases: ['typescript', 'java'], + }); + expect( + booleanQueryTokenizer.extractWords( + 'typescript -((php AND java -php) OR (php AND java NOT(foo OR bar))) java' + ) + ).toEqual({ + wordsAndPhrases: ['typescript', 'java'], + }); + expect( + booleanQueryTokenizer.extractWords('typescript -(((((php AND java))))) java') + ).toEqual({ + wordsAndPhrases: ['typescript', 'java'], + }); + }); + + it('should filter out terms, phrases negated by exclamation mark (!)', () => { + const booleanQueryTokenizer = new BooleanQueryTokenizer(); + expect( + booleanQueryTokenizer.extractWords('typescript AND !lua OR !php ! ! !kotlin java') + ).toEqual({ + wordsAndPhrases: ['typescript', 'java'], + }); + expect( + booleanQueryTokenizer.extractWords( + 'typescript AND !"C sharp" OR !"java developer" java' + ) + ).toEqual({ + wordsAndPhrases: ['typescript', 'java'], + }); + }); + + it('should filter out groups negated by exclamation mark (!)', () => { + const booleanQueryTokenizer = new BooleanQueryTokenizer(); + expect( + booleanQueryTokenizer.extractWords( + 'typescript !(php AND java) java !(kotlin VBScript)' + ) + ).toEqual({ + wordsAndPhrases: ['typescript', 'java'], + }); + expect( + booleanQueryTokenizer.extractWords( + 'typescript !((php AND java !php) OR (php AND java NOT(foo OR bar))) java' + ) + ).toEqual({ + wordsAndPhrases: ['typescript', 'java'], + }); + expect( + booleanQueryTokenizer.extractWords('typescript !(((((php AND java))))) java') + ).toEqual({ + wordsAndPhrases: ['typescript', 'java'], + }); + }); + }); }); diff --git a/stories/packages/BooleanQueryTokenizer.tsx b/stories/packages/BooleanQueryTokenizer.tsx index 925af97ec..9f7a20586 100644 --- a/stories/packages/BooleanQueryTokenizer.tsx +++ b/stories/packages/BooleanQueryTokenizer.tsx @@ -18,12 +18,12 @@ storiesOf('packages|BooleanQueryTokenizer', module).add('BooleanQueryTokenizer', BooleanQueryTokenizer.tokenize(initialValue) ); const [wordsAndPhrases, setWordsAndPhrases] = React.useState( - BooleanQueryTokenizer.extractWordsAndPhrases(initialValue) + BooleanQueryTokenizer.extractWords(initialValue) ); const tokenizeBooleanQuery = (event) => { const { value } = event.target; setTokens(BooleanQueryTokenizer.tokenize(value)); - setWordsAndPhrases(BooleanQueryTokenizer.extractWordsAndPhrases(value)); + setWordsAndPhrases(BooleanQueryTokenizer.extractWords(value)); }; return (
@@ -45,7 +45,7 @@ storiesOf('packages|BooleanQueryTokenizer', module).add('BooleanQueryTokenizer',

Words and phrases

-
{JSON.stringify(wordsAndPhrases, null, 4)}
+
{JSON.stringify(wordsAndPhrases.wordsAndPhrases, null, 4)}

Tokens