Permalink
Browse files

add snippeting

  • Loading branch information...
1 parent 9a03c07 commit 52cb9204e233464cd8bb9f79924a40c1e46d1f69 @wmorgan committed Apr 6, 2012
Showing with 310 additions and 23 deletions.
  1. +31 −23 Makefile
  2. +125 −0 snippeter.c
  3. +10 −0 snippeter.h
  4. +143 −0 test-snippets.c
  5. +1 −0 whistlepig.h
View
@@ -23,8 +23,8 @@ CCLINK?= -pthread
CCOPT= $(CFLAGS) $(CCLINK) $(ARCH) $(PROF)
DEBUG?= -rdynamic -ggdb
-TESTFILES = test-segment.c test-stringmap.c test-stringpool.c test-termhash.c test-search.c test-labels.c test-tokenizer.c test-queries.c
-CSRCFILES = segment.c termhash.c stringmap.c error.c query.c search.c stringpool.c mmap-obj.c query-parser.c index.c entry.c lock.c
+TESTFILES = test-segment.c test-stringmap.c test-stringpool.c test-termhash.c test-search.c test-labels.c test-tokenizer.c test-queries.c test-snippets.c
+CSRCFILES = segment.c termhash.c stringmap.c error.c query.c search.c stringpool.c mmap-obj.c query-parser.c index.c entry.c lock.c snippeter.c
HEADERFILES = $(CSRCFILES:.c=.h) defaults.h whistlepig.h khash.h
LEXFILES = tokenizer.lex query-parser.lex
YFILES = query-parser.y
@@ -49,68 +49,72 @@ loc: $(CSRCFILES) $(LEXFILES) $(YFILES) $(HEADERFILES)
## deps (use `make dep` to generate this (in vi: :r !make dep)
batch-run-queries.o: batch-run-queries.c whistlepig.h defaults.h index.h \
segment.h stringmap.h stringpool.h error.h termhash.h query.h search.h \
- mmap-obj.h entry.h khash.h rarray.h query-parser.h lock.h timer.h
+ mmap-obj.h entry.h khash.h rarray.h query-parser.h lock.h snippeter.h \
+ timer.h
benchmark-queries.o: benchmark-queries.c whistlepig.h defaults.h index.h \
segment.h stringmap.h stringpool.h error.h termhash.h query.h search.h \
- mmap-obj.h entry.h khash.h rarray.h query-parser.h lock.h timer.h
+ mmap-obj.h entry.h khash.h rarray.h query-parser.h lock.h snippeter.h \
+ timer.h
dump.o: dump.c whistlepig.h defaults.h index.h segment.h stringmap.h \
stringpool.h error.h termhash.h query.h search.h mmap-obj.h entry.h \
- khash.h rarray.h query-parser.h lock.h
+ khash.h rarray.h query-parser.h lock.h snippeter.h
entry.o: entry.c whistlepig.h defaults.h index.h segment.h stringmap.h \
stringpool.h error.h termhash.h query.h search.h mmap-obj.h entry.h \
- khash.h rarray.h query-parser.h lock.h tokenizer.lex.h
+ khash.h rarray.h query-parser.h lock.h snippeter.h tokenizer.lex.h
error.o: error.c error.h
file-indexer.o: file-indexer.c timer.h whistlepig.h defaults.h index.h \
segment.h stringmap.h stringpool.h error.h termhash.h query.h search.h \
- mmap-obj.h entry.h khash.h rarray.h query-parser.h lock.h
+ mmap-obj.h entry.h khash.h rarray.h query-parser.h lock.h snippeter.h
index.o: index.c whistlepig.h defaults.h index.h segment.h stringmap.h \
stringpool.h error.h termhash.h query.h search.h mmap-obj.h entry.h \
- khash.h rarray.h query-parser.h lock.h
+ khash.h rarray.h query-parser.h lock.h snippeter.h
interactive.o: interactive.c whistlepig.h defaults.h index.h segment.h \
stringmap.h stringpool.h error.h termhash.h query.h search.h mmap-obj.h \
- entry.h khash.h rarray.h query-parser.h lock.h timer.h
+ entry.h khash.h rarray.h query-parser.h lock.h snippeter.h timer.h
lock.o: lock.c whistlepig.h defaults.h index.h segment.h stringmap.h \
stringpool.h error.h termhash.h query.h search.h mmap-obj.h entry.h \
- khash.h rarray.h query-parser.h lock.h
+ khash.h rarray.h query-parser.h lock.h snippeter.h
make-queries.o: make-queries.c tokenizer.lex.h segment.h defaults.h \
stringmap.h stringpool.h error.h termhash.h query.h search.h mmap-obj.h
mbox-indexer.o: mbox-indexer.c whistlepig.h defaults.h index.h segment.h \
stringmap.h stringpool.h error.h termhash.h query.h search.h mmap-obj.h \
- entry.h khash.h rarray.h query-parser.h lock.h timer.h
+ entry.h khash.h rarray.h query-parser.h lock.h snippeter.h timer.h
mmap-obj.o: mmap-obj.c whistlepig.h defaults.h index.h segment.h \
stringmap.h stringpool.h error.h termhash.h query.h search.h mmap-obj.h \
- entry.h khash.h rarray.h query-parser.h lock.h
+ entry.h khash.h rarray.h query-parser.h lock.h snippeter.h
query-parser.o: query-parser.c whistlepig.h defaults.h index.h segment.h \
stringmap.h stringpool.h error.h termhash.h query.h search.h mmap-obj.h \
- entry.h khash.h rarray.h query-parser.h lock.h query-parser.tab.h
+ entry.h khash.h rarray.h query-parser.h lock.h snippeter.h \
+ query-parser.tab.h
query-parser.lex.o: query-parser.lex.c whistlepig.h defaults.h index.h \
segment.h stringmap.h stringpool.h error.h termhash.h query.h search.h \
- mmap-obj.h entry.h khash.h rarray.h query-parser.h lock.h \
+ mmap-obj.h entry.h khash.h rarray.h query-parser.h lock.h snippeter.h \
query-parser.tab.h
query-parser.tab.o: query-parser.tab.c query.h segment.h defaults.h \
stringmap.h stringpool.h error.h termhash.h search.h mmap-obj.h \
query-parser.h query-parser.tab.h
query.o: query.c whistlepig.h defaults.h index.h segment.h stringmap.h \
stringpool.h error.h termhash.h query.h search.h mmap-obj.h entry.h \
- khash.h rarray.h query-parser.h lock.h
+ khash.h rarray.h query-parser.h lock.h snippeter.h
search.o: search.c whistlepig.h defaults.h index.h segment.h stringmap.h \
stringpool.h error.h termhash.h query.h search.h mmap-obj.h entry.h \
- khash.h rarray.h query-parser.h lock.h
+ khash.h rarray.h query-parser.h lock.h snippeter.h
segment.o: segment.c whistlepig.h defaults.h index.h segment.h \
stringmap.h stringpool.h error.h termhash.h query.h search.h mmap-obj.h \
- entry.h khash.h rarray.h query-parser.h lock.h
+ entry.h khash.h rarray.h query-parser.h lock.h snippeter.h
snippeter.o: snippeter.c whistlepig.h defaults.h index.h segment.h \
stringmap.h stringpool.h error.h termhash.h query.h search.h mmap-obj.h \
- entry.h khash.h rarray.h query-parser.h lock.h tokenizer.lex.h
+ entry.h khash.h rarray.h query-parser.h lock.h snippeter.h \
+ tokenizer.lex.h
stringmap.o: stringmap.c whistlepig.h defaults.h index.h segment.h \
stringmap.h stringpool.h error.h termhash.h query.h search.h mmap-obj.h \
- entry.h khash.h rarray.h query-parser.h lock.h
+ entry.h khash.h rarray.h query-parser.h lock.h snippeter.h
stringpool.o: stringpool.c whistlepig.h defaults.h index.h segment.h \
stringmap.h stringpool.h error.h termhash.h query.h search.h mmap-obj.h \
- entry.h khash.h rarray.h query-parser.h lock.h
+ entry.h khash.h rarray.h query-parser.h lock.h snippeter.h
termhash.o: termhash.c whistlepig.h defaults.h index.h segment.h \
stringmap.h stringpool.h error.h termhash.h query.h search.h mmap-obj.h \
- entry.h khash.h rarray.h query-parser.h lock.h
+ entry.h khash.h rarray.h query-parser.h lock.h snippeter.h
test-labels.o: test-labels.c test.h query.h segment.h defaults.h \
stringmap.h stringpool.h error.h termhash.h search.h mmap-obj.h \
query-parser.h index.h entry.h khash.h rarray.h
@@ -127,6 +131,9 @@ test-segment.o: test-segment.c test.h segment.h defaults.h stringmap.h \
stringpool.h error.h termhash.h query.h search.h mmap-obj.h \
tokenizer.lex.h index.h entry.h khash.h rarray.h
test-segment_main.o: test-segment_main.c error.h test.h
+test-snippets.o: test-snippets.c test.h whistlepig.h defaults.h index.h \
+ segment.h stringmap.h stringpool.h error.h termhash.h query.h search.h \
+ mmap-obj.h entry.h khash.h rarray.h query-parser.h lock.h snippeter.h
test-stringmap.o: test-stringmap.c stringmap.h stringpool.h error.h \
test.h
test-stringpool.o: test-stringpool.c stringpool.h error.h test.h
@@ -188,11 +195,11 @@ query-parser.lex.o: query-parser.lex.c
%.lex.c %.lex.h: %.lex
@$(ECHO) LEX $+
- @$(LEX) $<
+ $(LEX) $<
%.tab.c %.tab.h: %.y
@$(ECHO) YACC $+
- @$(YACC) $<
+ $(YACC) $<
clean:
rm -rf $(TESTBIN) *.o *.gcda *.gcno *.gcov $(GENFILES) $(ALLBIN)
@@ -208,6 +215,7 @@ test: $(TESTBIN)
./test-search
./test-labels
./test-queries
+ ./test-snippets
integration-tests/enron1m.index0.pr: integration-tests/enron1m.mbox $(MBOXADDBIN) $(OBJ)
rm -f integration-tests/enron1m.index*
View
@@ -0,0 +1,125 @@
+#include "whistlepig.h"
+#include "tokenizer.lex.h"
+
+typedef struct pword {
+ const char* token;
+ pos_t start;
+ pos_t end;
+} pword;
+RARRAY_DECLARE(pword);
+
+RAISING_STATIC(is_match(wp_query* query, const char* field, RARRAY(pword) words, uint32_t start, uint32_t* end, int* found)) {
+ wp_query* child;
+
+ *found = 0;
+ switch(query->type) {
+ // for these four guys, we never match on snippets
+ case WP_QUERY_LABEL:
+ case WP_QUERY_EMPTY:
+ case WP_QUERY_NEG:
+ case WP_QUERY_EVERY:
+ break;
+
+ // terms match only if it's an exact match
+ case WP_QUERY_TERM:
+ DEBUG("term: comparing %s:%s to %s:%s", query->field, query->word, field, RARRAY_GET(words, start).token);
+ if(!strcmp(field, query->field) && !strcmp(query->word, RARRAY_GET(words, start).token)) {
+ *end = start;
+ *found = 1;
+ }
+ break;
+
+ // for conjunctions AND disjunctions, we match if any of the subclauses
+ // match. this makes sense for conjunctions because the query "bob AND joe"
+ // should produce a snippet for occurrences of either bob or joe, even if
+ // the document semantics are different.
+ case WP_QUERY_CONJ:
+ case WP_QUERY_DISJ:
+ child = query->children;
+ while(child != NULL) {
+ RELAY_ERROR(is_match(child, field, words, start, end, found));
+ if(*found) break;
+ child = child->next;
+ }
+ break;
+ // phrases we have to do the hard way
+ case WP_QUERY_PHRASE:
+ child = query->children;
+ if(strcmp(child->field, field)) break; // just look at the first one
+ while(child != NULL) {
+ DEBUG("phrase: comparing %s:%s to %s:%s", child->field, child->word, field, RARRAY_GET(words, start).token);
+ if(strcmp(child->word, RARRAY_GET(words, start).token)) break;
+ start++;
+ child = child->next;
+ }
+ if(child == NULL) { // made it all the way through!
+ *end = start - 1;
+ *found = 1;
+ }
+ break;
+ }
+
+ return NO_ERROR;
+}
+
+RAISING_STATIC(snippetize_query(wp_query* query, const char* field, RARRAY(pword) words, uint32_t max_num_results, uint32_t* num_results, pos_t* start_offsets, pos_t* end_offsets)) {
+ uint32_t idx = 0;
+ *num_results = 0;
+
+ while((*num_results < max_num_results) && (idx < RARRAY_NELEM(words))) {
+ uint32_t final_idx;
+ int found;
+ RELAY_ERROR(is_match(query, field, words, idx, &final_idx, &found));
+ if(found) {
+ start_offsets[*num_results] = RARRAY_GET(words, idx).start;
+ end_offsets[*num_results] = RARRAY_GET(words, final_idx).end;
+ (*num_results)++;
+ idx = final_idx + 1;
+ }
+ else idx++;
+ }
+
+ return NO_ERROR;
+}
+
+RAISING_STATIC(snippetize_from_lexer(wp_query* query, lexinfo* charpos, yyscan_t* scanner, const char* field, uint32_t max_num_results, uint32_t* num_results, pos_t* start_offsets, pos_t* end_offsets)) {
+ RARRAY(pword) words;
+
+ RARRAY_INIT(pword, words);
+ while(yylex(*scanner) != TOK_DONE) {
+ pword pw = { strdup(yyget_text(*scanner)), charpos->start, charpos->end };
+ RARRAY_ADD(pword, words, pw);
+ }
+
+ RELAY_ERROR(snippetize_query(query, field, words, max_num_results, num_results, start_offsets, end_offsets));
+
+ return NO_ERROR;
+}
+
+// tokenizes and adds everything under a single field
+wp_error* wp_snippetize_string(wp_query* query, const char* field, const char* string, uint32_t max_num_results, uint32_t* num_results, pos_t* start_offsets, pos_t* end_offsets) {
+ yyscan_t scanner;
+ lexinfo charpos = {0, 0};
+
+ yylex_init_extra(&charpos, &scanner);
+ YY_BUFFER_STATE state = yy_scan_string(string, scanner);
+ RELAY_ERROR(snippetize_from_lexer(query, &charpos, &scanner, field, max_num_results, num_results, start_offsets, end_offsets));
+ yy_delete_buffer(state, scanner);
+ yylex_destroy(scanner);
+
+ return NO_ERROR;
+}
+
+// tokenizes and adds everything from a file under a single field
+wp_error* wp_snippetize_file(wp_query* query, const char* field, FILE* f, uint32_t max_num_results, uint32_t* num_results, pos_t* start_offsets, pos_t* end_offsets) {
+ yyscan_t scanner;
+ lexinfo charpos = {0, 0};
+
+ yylex_init_extra(&charpos, &scanner);
+ yyset_in(f, scanner);
+ RELAY_ERROR(snippetize_from_lexer(query, &charpos, &scanner, field, max_num_results, num_results, start_offsets, end_offsets));
+ yylex_destroy(scanner);
+
+ return NO_ERROR;
+}
+
View
@@ -0,0 +1,10 @@
+#ifndef SNIPPETER_H_
+#define SNIPPETER_H_
+
+#include "error.h"
+#include "query.h"
+
+wp_error* wp_snippetize_string(wp_query* query, const char* field, const char* string, uint32_t max_num_results, uint32_t* num_results, pos_t* start_offsets, pos_t* end_offsets);
+wp_error* wp_snippetize_file(wp_query* query, const char* field, FILE* f, uint32_t max_num_results, uint32_t* num_results, pos_t* start_offsets, pos_t* end_offsets);
+
+#endif
Oops, something went wrong.

0 comments on commit 52cb920

Please sign in to comment.