Permalink
Browse files

HashSplitterSearchTokenizer handles wildcards

It's a precondition to generating a real wildcard query.
  • Loading branch information...
1 parent 84d70ed commit a80e891aab0627c9407c359c05615302ef085a2e @ofavre ofavre committed Apr 16, 2012
@@ -26,6 +26,8 @@
import java.io.IOException;
import java.io.Reader;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
/**
* Tokenizes the input to search against HashSplitter tokenized fields.
@@ -44,6 +46,8 @@
private char wildcardAny;
private boolean sizeIsVariable;
private int sizeValue;
+ private Matcher wildcardAnySuppresser;
+ private String allWildcardOnesChunk;
private int gramSize;
private int pos = 0;
@@ -133,6 +137,10 @@ private void init(int chunkLength, String prefixes, char wildcardOne, char wildc
this.wildcardAny = wildcardAny;
this.sizeIsVariable = sizeIsVariable;
this.sizeValue = sizeValue;
+ StringBuilder sb = new StringBuilder();
+ for (int i = 0 ; i < chunkLength ; ++i)
+ sb.append(this.wildcardOne);
+ this.allWildcardOnesChunk = sb.toString();
}
/** Advances to the next token in the stream; returns false at EOS. */
@@ -145,6 +153,47 @@ public final boolean incrementToken() throws IOException {
input.read(chars);
inStr = new String(chars).trim(); // remove any trailing empty strings
inLen = inStr.length();
+ // Check for wildcardAny
+ int posFirstAny = inStr.indexOf(wildcardAny);
+ if (posFirstAny != -1) {
+ if (posFirstAny < inLen - 1 && (sizeIsVariable || inStr.indexOf(wildcardAny, posFirstAny + 1) != -1)) {
+ // Invalid case:
+ // - either variable hash size, and "*" not at the end
+ // - or multiple "*"
+ // Treat them as matching a 0 length part at least...
+ if (wildcardAnySuppresser == null) {
+ wildcardAnySuppresser = Pattern.compile(Character.toString(wildcardAny), Pattern.LITERAL).matcher(inStr);
+ } else {
+ wildcardAnySuppresser.reset(inStr);
+ }
+ inStr = wildcardAnySuppresser.replaceAll("");
+ inLen = inStr.length();
+ } else if (posFirstAny == inLen - 1) {
+ // Remove final "*"
+ inStr = inStr.substring(0, inLen - 1);
+ inLen--;
+ } else { // We have a single, enclosed "*", and a fixed size
+ // Expand the "*" to the right number of "?"s
+ StringBuilder sbOnes = new StringBuilder();
+ for (int i = sizeValue - inLen + 1 ; i > 0 ; --i)
+ sbOnes.append(wildcardOne);
+ StringBuilder sbStr = new StringBuilder();
+ sbStr.append(inStr, 0, posFirstAny);
+ sbStr.append(sbOnes);
+ sbStr.append(inStr, posFirstAny+1, inLen);
+ inStr = sbStr.toString();
+ inLen = sizeValue;
+ }
+ }
+ if (inLen % chunkLength != 0) {
+ // Pad the last chunk with "?"s
+ StringBuilder sb = new StringBuilder(inLen + chunkLength - (inLen % chunkLength));
+ sb.append(inStr);
+ for (int i = chunkLength - (inLen % chunkLength) ; i > 0 ; --i)
+ sb.append(wildcardOne);
+ inStr = sb.toString();
+ inLen = inStr.length();
+ }
}
if (pos >= inLen) { // if we hit the end of the string
@@ -160,6 +209,10 @@ public final boolean incrementToken() throws IOException {
offsetAtt.setOffset(correctOffset(oldPos), correctOffset(oldPos+gramSize));
prefix++;
prefix %= prefixCount;
+ if (inStr.regionMatches(false, oldPos, allWildcardOnesChunk, 0, chunkLength)) {
+ // Blank token (all "???"s), skip to the next token
+ return incrementToken();
+ }
return true;
}
@@ -108,10 +108,152 @@ public void testPrefixes() throws Exception {
closeAnalysis();
}
- // TODO testSearchWildcardOne
- // TODO testSearchWildcardAny prefix variable size
- // TODO testSearchWildcardAny prefix fixed size
- // TODO testSearchWildcardAny suffix fixed size
- // TODO testSearchWildcardAny prefix and suffix fixed size
+ /**
+  * Checks that single-character wildcards ('?') are kept verbatim inside the
+  * emitted terms, and that a chunk made only of '?' yields no token.
+  */
+ @Test
+ public void testSearchWildcardOne() throws Exception {
+     // NOTE(review): arguments presumably are (reader, chunkLength, prefixes, wildcardOne,
+     // wildcardAny, sizeIsVariable, sizeValue) -- confirm against the constructor.
+     tokenizer = new HashSplitterSearchTokenizer(null, 4, "abcd", '?', '*', false, 12);
+
+     // Wildcards confined to the middle chunk: all three chunks are emitted.
+     analyze("00001??12222");
+     assertThat("at i = 0", tokenizer.incrementToken(), equalTo(true));
+     assertThat("at i = 0", termAttr.toString(), equalTo("a0000"));
+     assertThat("at i = 0", offAttr.startOffset(), equalTo(0));
+     assertThat("at i = 0", offAttr.endOffset(), equalTo(4));
+     assertThat("at i = 1", tokenizer.incrementToken(), equalTo(true));
+     assertThat("at i = 1", termAttr.toString(), equalTo("b1??1"));
+     assertThat("at i = 1", offAttr.startOffset(), equalTo(4));
+     assertThat("at i = 1", offAttr.endOffset(), equalTo(8));
+     assertThat("at i = 2", tokenizer.incrementToken(), equalTo(true));
+     assertThat("at i = 2", termAttr.toString(), equalTo("c2222"));
+     assertThat("at i = 2", offAttr.startOffset(), equalTo(8));
+     assertThat("at i = 2", offAttr.endOffset(), equalTo(12));
+     assertThat(tokenizer.incrementToken(), equalTo(false));
+     closeAnalysis();
+
+     // First and last chunks are entirely '?': only the middle chunk is emitted.
+     analyze("?????11?????");
+     // The first token returned is chunk 1 (chunk 0 is skipped), so it is labelled
+     // "at i = 1" like the assertions that inspect it.
+     assertThat("at i = 1", tokenizer.incrementToken(), equalTo(true));
+     assertThat("at i = 1", termAttr.toString(), equalTo("b?11?"));
+     assertThat("at i = 1", offAttr.startOffset(), equalTo(4));
+     assertThat("at i = 1", offAttr.endOffset(), equalTo(8));
+     assertThat("at i = 2", tokenizer.incrementToken(), equalTo(false));
+     closeAnalysis();
+ }
+
+ @Test
+ public void testSearchWildcardAnyPrefixVariableSize() throws Exception {
+ tokenizer = new HashSplitterSearchTokenizer(null, 4, "abcd", '?', '*', true, -1);
+
+ analyze("00001*");
+ assertThat("at i = 0", tokenizer.incrementToken(), equalTo(true));
+ assertThat("at i = 0", termAttr.toString(), equalTo("a0000"));
+ assertThat("at i = 0", offAttr.startOffset(), equalTo(0));
+ assertThat("at i = 0", offAttr.endOffset(), equalTo(4));
+ assertThat("at i = 1", tokenizer.incrementToken(), equalTo(true));
+ assertThat("at i = 1", termAttr.toString(), equalTo("b1???"));
+ assertThat("at i = 1", offAttr.startOffset(), equalTo(4));
+ assertThat("at i = 1", offAttr.endOffset(), equalTo(8));
+ assertThat(tokenizer.incrementToken(), equalTo(false));
+ closeAnalysis();
+
+ analyze("0000111*");
+ assertThat("at i = 0", tokenizer.incrementToken(), equalTo(true));
+ assertThat("at i = 0", termAttr.toString(), equalTo("a0000"));
+ assertThat("at i = 0", offAttr.startOffset(), equalTo(0));
+ assertThat("at i = 0", offAttr.endOffset(), equalTo(4));
+ assertThat("at i = 1", tokenizer.incrementToken(), equalTo(true));
+ assertThat("at i = 1", termAttr.toString(), equalTo("b111?"));
+ assertThat("at i = 1", offAttr.startOffset(), equalTo(4));
+ assertThat("at i = 1", offAttr.endOffset(), equalTo(8));
+ assertThat(tokenizer.incrementToken(), equalTo(false));
+ closeAnalysis();
+ }
+
+ @Test
+ public void testSearchWildcardAnyPrefixFixedSize() throws Exception {
+ tokenizer = new HashSplitterSearchTokenizer(null, 4, "abcd", '?', '*', false, 12);
+
+ analyze("00001*");
+ assertThat("at i = 0", tokenizer.incrementToken(), equalTo(true));
+ assertThat("at i = 0", termAttr.toString(), equalTo("a0000"));
+ assertThat("at i = 0", offAttr.startOffset(), equalTo(0));
+ assertThat("at i = 0", offAttr.endOffset(), equalTo(4));
+ assertThat("at i = 1", tokenizer.incrementToken(), equalTo(true));
+ assertThat("at i = 1", termAttr.toString(), equalTo("b1???"));
+ assertThat("at i = 1", offAttr.startOffset(), equalTo(4));
+ assertThat("at i = 1", offAttr.endOffset(), equalTo(8));
+ assertThat(tokenizer.incrementToken(), equalTo(false));
+ closeAnalysis();
+
+ analyze("0000111*");
+ assertThat("at i = 0", tokenizer.incrementToken(), equalTo(true));
+ assertThat("at i = 0", termAttr.toString(), equalTo("a0000"));
+ assertThat("at i = 0", offAttr.startOffset(), equalTo(0));
+ assertThat("at i = 0", offAttr.endOffset(), equalTo(4));
+ assertThat("at i = 1", tokenizer.incrementToken(), equalTo(true));
+ assertThat("at i = 1", termAttr.toString(), equalTo("b111?"));
+ assertThat("at i = 1", offAttr.startOffset(), equalTo(4));
+ assertThat("at i = 1", offAttr.endOffset(), equalTo(8));
+ assertThat(tokenizer.incrementToken(), equalTo(false));
+ closeAnalysis();
+ }
+
+ /**
+  * Suffix queries starting with '*' against a tokenizer built with a fixed size of 12:
+  * the '*' is expanded to '?'s, the leading all-'?' chunk yields no token,
+  * and the remaining chunks keep their positions (offsets 4-8 and 8-12).
+  */
+ @Test
+ public void testSearchWildcardAnySuffixFixedSize() throws Exception {
+     tokenizer = new HashSplitterSearchTokenizer(null, 4, "abcd", '?', '*', false, 12);
+
+     analyze("*12222");
+     assertThat("at i = 1", tokenizer.incrementToken(), equalTo(true));
+     assertThat("at i = 1", termAttr.toString(), equalTo("b???1"));
+     assertThat("at i = 1", offAttr.startOffset(), equalTo(4));
+     assertThat("at i = 1", offAttr.endOffset(), equalTo(8));
+     assertThat("at i = 2", tokenizer.incrementToken(), equalTo(true));
+     assertThat("at i = 2", termAttr.toString(), equalTo("c2222"));
+     assertThat("at i = 2", offAttr.startOffset(), equalTo(8));
+     assertThat("at i = 2", offAttr.endOffset(), equalTo(12));
+     assertThat(tokenizer.incrementToken(), equalTo(false));
+     closeAnalysis();
+
+     analyze("*1112222");
+     assertThat("at i = 1", tokenizer.incrementToken(), equalTo(true));
+     assertThat("at i = 1", termAttr.toString(), equalTo("b?111"));
+     assertThat("at i = 1", offAttr.startOffset(), equalTo(4));
+     assertThat("at i = 1", offAttr.endOffset(), equalTo(8));
+     assertThat("at i = 2", tokenizer.incrementToken(), equalTo(true));
+     assertThat("at i = 2", termAttr.toString(), equalTo("c2222"));
+     assertThat("at i = 2", offAttr.startOffset(), equalTo(8));
+     assertThat("at i = 2", offAttr.endOffset(), equalTo(12));
+     // Like every other scenario, make sure the stream is exhausted after the last chunk.
+     assertThat(tokenizer.incrementToken(), equalTo(false));
+     closeAnalysis();
+ }
+
+ @Test
+ public void testSearchWildcardAnyPrefixAndSuffixFixedSize() throws Exception {
+ tokenizer = new HashSplitterSearchTokenizer(null, 4, "abcd", '?', '*', false, 12);
+
+ analyze("0*12222");
+ assertThat("at i = 0", tokenizer.incrementToken(), equalTo(true));
+ assertThat("at i = 0", termAttr.toString(), equalTo("a0???"));
+ assertThat("at i = 0", offAttr.startOffset(), equalTo(0));
+ assertThat("at i = 0", offAttr.endOffset(), equalTo(4));
+ assertThat("at i = 1", tokenizer.incrementToken(), equalTo(true));
+ assertThat("at i = 1", termAttr.toString(), equalTo("b???1"));
+ assertThat("at i = 1", offAttr.startOffset(), equalTo(4));
+ assertThat("at i = 1", offAttr.endOffset(), equalTo(8));
+ assertThat("at i = 2", tokenizer.incrementToken(), equalTo(true));
+ assertThat("at i = 2", termAttr.toString(), equalTo("c2222"));
+ assertThat("at i = 2", offAttr.startOffset(), equalTo(8));
+ assertThat("at i = 2", offAttr.endOffset(), equalTo(12));
+ assertThat(tokenizer.incrementToken(), equalTo(false));
+ closeAnalysis();
+
+ analyze("0*2");
+ assertThat("at i = 0", tokenizer.incrementToken(), equalTo(true));
+ assertThat("at i = 0", termAttr.toString(), equalTo("a0???"));
+ assertThat("at i = 0", offAttr.startOffset(), equalTo(0));
+ assertThat("at i = 0", offAttr.endOffset(), equalTo(4));
+ assertThat("at i = 2", tokenizer.incrementToken(), equalTo(true));
+ assertThat("at i = 2", termAttr.toString(), equalTo("c???2"));
+ assertThat("at i = 2", offAttr.startOffset(), equalTo(8));
+ assertThat("at i = 2", offAttr.endOffset(), equalTo(12));
+ assertThat(tokenizer.incrementToken(), equalTo(false));
+ closeAnalysis();
+ }
}

0 comments on commit a80e891

Please sign in to comment.