Permalink
Browse files

initial checkin

  • Loading branch information...
0 parents commit a79c565908c9a1c793efce44b20b75932a828d5d @wmorgan committed Feb 9, 2011
Showing with 10,734 additions and 0 deletions.
  1. +25 −0 .gitignore
  2. +192 −0 Makefile
  3. +86 −0 README
  4. +44 −0 batch-run-queries.c
  5. +44 −0 build/gen-test-main.rb
  6. +28 −0 defaults.h
  7. +65 −0 dump.c
  8. +181 −0 entry.c
  9. +66 −0 entry.h
  10. +24 −0 error.c
  11. +94 −0 error.h
  12. +63 −0 file-indexer.c
  13. +294 −0 index.c
  14. +88 −0 index.h
  15. +5 −0 integration-tests/README
  16. +39 −0 integration-tests/eval.rb
  17. +17 −0 integration-tests/testset1.txt
  18. +155 −0 interactive.c
  19. +316 −0 khash.h
  20. +92 −0 make-queries.c
  21. +111 −0 mbox-indexer.c
  22. +76 −0 mmap-obj.c
  23. +52 −0 mmap-obj.h
  24. +37 −0 query-parser.c
  25. +25 −0 query-parser.h
  26. +50 −0 query-parser.lex
  27. +88 −0 query-parser.y
  28. +194 −0 query.c
  29. +78 −0 query.h
  30. +27 −0 ruby/Rakefile
  31. +68 −0 ruby/bin/email-indexer
  32. +142 −0 ruby/bin/email-searcher
  33. +6 −0 ruby/ext/whistlepig/extconf.rb
  34. +119 −0 ruby/lib/whistlepig.rb
  35. +5 −0 ruby/lib/whistlepig/email.rb
  36. +129 −0 ruby/lib/whistlepig/email/decoder.rb
  37. +58 −0 ruby/lib/whistlepig/email/mbox-splitter.rb
  38. +162 −0 ruby/lib/whistlepig/email/message.rb
  39. +45 −0 ruby/lib/whistlepig/email/person.rb
  40. +145 −0 ruby/lib/whistlepig/email/store.rb
  41. +746 −0 search.c
  42. +76 −0 search.h
  43. +615 −0 segment.c
  44. +137 −0 segment.h
  45. +278 −0 stringmap.c
  46. +82 −0 stringmap.h
  47. +44 −0 stringpool.c
  48. +58 −0 stringpool.h
  49. +294 −0 termhash.c
  50. +79 −0 termhash.h
  51. +199 −0 test-labels.c
  52. +358 −0 test-queries.c
  53. +404 −0 test-segment.c
  54. +82 −0 test-stringmap.c
  55. +67 −0 test-stringpool.c
  56. +95 −0 test-termhash.c
  57. +38 −0 test.h
  58. +28 −0 timer.h
  59. +51 −0 tokenizer.lex
  60. +15 −0 whistlepig.h
  61. +199 −0 www/doc/README.html
  62. +167 −0 www/doc/Whistlepig.html
  63. +331 −0 www/doc/Whistlepig/Entry.html
  64. +160 −0 www/doc/Whistlepig/Error.html
  65. +838 −0 www/doc/Whistlepig/Index.html
  66. +160 −0 www/doc/Whistlepig/ParseError.html
  67. +413 −0 www/doc/Whistlepig/Query.html
  68. +4 −0 www/doc/created.rid
  69. +52 −0 www/doc/ext/whistlepig/whistlepigc_c.html
  70. +221 −0 www/doc/index.html
  71. +54 −0 www/doc/lib/whistlepig_rb.html
  72. +706 −0 www/doc/rdoc.css
  73. +148 −0 www/index.html
@@ -0,0 +1,25 @@
+.*.swp
+*.o
+dump
+test
+add
+addmbox
+mancorpus
+query
+query-parser.tab.[ch]
+query-tokenizer.[ch]
+query-parser.lex.[ch]
+tokenizer.lex.[ch]
+test-stringpool
+test-queries
+test-segment
+test-stringmap
+test-termhash
+test-labels
+batch-run-queries
+ruby/ext/whistlepig/*.[ch]
+ruby/ext/whistlepig/Makefile
+integration-tests/enron1m.mbox
+integration-tests/enron1m.index*
+ruby/pkg/
+ruby/README
192 Makefile
@@ -0,0 +1,192 @@
+## Whistlepig Makefile
+## Copyright (c) 2011 William Morgan <wmorgan@masanjin.net>
+## Whistlepig is released under the three-clause BSD license. See the COPYING
+## file for terms.
+
+LEX=flex
+YACC=bison
+CC=gcc
+RUBY=/usr/bin/ruby
+ECHO=/bin/echo
+
+## stolen from redis
+uname_S := $(shell sh -c 'uname -s 2>/dev/null || echo not')
+OPTIMIZATION?=-O3
+CFLAGS?= -std=c99 $(OPTIMIZATION) -Wall -Wextra -Wwrite-strings -Werror
+CCLINK?= #-lm -pthread
+CCOPT= $(CFLAGS) $(CCLINK) $(ARCH) $(PROF)
+DEBUG?= -rdynamic -ggdb
+
+TESTFILES = test-segment.c test-stringmap.c test-stringpool.c test-termhash.c test-queries.c test-labels.c
+CSRCFILES = segment.c termhash.c stringmap.c error.c query.c search.c stringpool.c mmap-obj.c query-parser.c index.c entry.c
+HEADERFILES = $(CSRCFILES:.c=.h) defaults.h whistlepig.h khash.h
+LEXFILES = tokenizer.lex query-parser.lex
+YFILES = query-parser.y
+GENFILES = $(LEXFILES:.lex=.lex.c) $(LEXFILES:.lex=.lex.h) $(YFILES:.y=.tab.c) $(YFILES:.y=.tab.h)
+OBJ = $(CSRCFILES:.c=.o) $(LEXFILES:.lex=.lex.o) $(YFILES:.y=.tab.o)
+
+QUERYBIN=query
+DUMPBIN=dump
+ADDBIN=add
+MBOXADDBIN=addmbox
+TESTBIN = $(TESTFILES:.c=)
+ALLBIN=$(QUERYBIN) $(DUMPBIN) $(ADDBIN) $(MBOXADDBIN) batch-run-queries
+
+## these targets name actions rather than files; declare them phony so that a
+## stray file or directory with the same name can never mask them
+.PHONY: all loc clean dep test test-integration debug rubygem
+all: $(ALLBIN)
+
+## remove implicit rules because they fuck with my shit
+.SUFFIXES:
+
+loc: $(CSRCFILES) $(LEXFILES) $(YFILES) $(HEADERFILES)
+ sloccount $+
+
+## deps (use `make dep` to generate this; in vi: :r !make dep)
+batch-run-queries.o: batch-run-queries.c whistlepig.h defaults.h index.h \
+ segment.h stringmap.h stringpool.h error.h termhash.h query.h search.h \
+ mmap-obj.h entry.h khash.h query-parser.h timer.h
+dump.o: dump.c whistlepig.h defaults.h index.h segment.h stringmap.h \
+ stringpool.h error.h termhash.h query.h search.h mmap-obj.h entry.h \
+ khash.h query-parser.h
+entry.o: entry.c whistlepig.h defaults.h index.h segment.h stringmap.h \
+ stringpool.h error.h termhash.h query.h search.h mmap-obj.h entry.h \
+ khash.h query-parser.h tokenizer.lex.h
+error.o: error.c error.h
+file-indexer.o: file-indexer.c timer.h whistlepig.h defaults.h index.h \
+ segment.h stringmap.h stringpool.h error.h termhash.h query.h search.h \
+ mmap-obj.h entry.h khash.h query-parser.h
+index.o: index.c whistlepig.h defaults.h index.h segment.h stringmap.h \
+ stringpool.h error.h termhash.h query.h search.h mmap-obj.h entry.h \
+ khash.h query-parser.h
+interactive.o: interactive.c whistlepig.h defaults.h index.h segment.h \
+ stringmap.h stringpool.h error.h termhash.h query.h search.h mmap-obj.h \
+ entry.h khash.h query-parser.h timer.h
+make-queries.o: make-queries.c tokenizer.lex.h segment.h defaults.h \
+ stringmap.h stringpool.h error.h termhash.h query.h search.h mmap-obj.h
+mbox-indexer.o: mbox-indexer.c whistlepig.h defaults.h index.h segment.h \
+ stringmap.h stringpool.h error.h termhash.h query.h search.h mmap-obj.h \
+ entry.h khash.h query-parser.h timer.h
+mmap-obj.o: mmap-obj.c whistlepig.h defaults.h index.h segment.h \
+ stringmap.h stringpool.h error.h termhash.h query.h search.h mmap-obj.h \
+ entry.h khash.h query-parser.h
+query-parser.o: query-parser.c whistlepig.h defaults.h index.h segment.h \
+ stringmap.h stringpool.h error.h termhash.h query.h search.h mmap-obj.h \
+ entry.h khash.h query-parser.h query-parser.tab.h
+query-parser.lex.o: query-parser.lex.c query-parser.h query.h segment.h \
+ defaults.h stringmap.h stringpool.h error.h termhash.h search.h \
+ mmap-obj.h query-parser.tab.h
+query-parser.tab.o: query-parser.tab.c query.h segment.h defaults.h \
+ stringmap.h stringpool.h error.h termhash.h search.h mmap-obj.h \
+ query-parser.h query-parser.tab.h
+query.o: query.c query.h segment.h defaults.h stringmap.h stringpool.h \
+ error.h termhash.h search.h mmap-obj.h
+search.o: search.c whistlepig.h defaults.h index.h segment.h stringmap.h \
+ stringpool.h error.h termhash.h query.h search.h mmap-obj.h entry.h \
+ khash.h query-parser.h
+segment.o: segment.c whistlepig.h defaults.h index.h segment.h \
+ stringmap.h stringpool.h error.h termhash.h query.h search.h mmap-obj.h \
+ entry.h khash.h query-parser.h
+stringmap.o: stringmap.c whistlepig.h defaults.h index.h segment.h \
+ stringmap.h stringpool.h error.h termhash.h query.h search.h mmap-obj.h \
+ entry.h khash.h query-parser.h
+stringpool.o: stringpool.c whistlepig.h defaults.h index.h segment.h \
+ stringmap.h stringpool.h error.h termhash.h query.h search.h mmap-obj.h \
+ entry.h khash.h query-parser.h
+termhash.o: termhash.c whistlepig.h defaults.h index.h segment.h \
+ stringmap.h stringpool.h error.h termhash.h query.h search.h mmap-obj.h \
+ entry.h khash.h query-parser.h
+test-labels.o: test-labels.c test.h query.h segment.h defaults.h \
+ stringmap.h stringpool.h error.h termhash.h search.h mmap-obj.h \
+ query-parser.h index.h entry.h khash.h
+test-queries.o: test-queries.c test.h query.h segment.h defaults.h \
+ stringmap.h stringpool.h error.h termhash.h search.h mmap-obj.h \
+ query-parser.h index.h entry.h khash.h
+test-segment.o: test-segment.c test.h segment.h defaults.h stringmap.h \
+ stringpool.h error.h termhash.h query.h search.h mmap-obj.h \
+ tokenizer.lex.h index.h entry.h khash.h
+test-stringmap.o: test-stringmap.c stringmap.h stringpool.h error.h \
+ test.h
+test-stringpool.o: test-stringpool.c stringpool.h error.h test.h
+test-termhash.o: test-termhash.c termhash.h error.h test.h
+tokenizer.lex.o: tokenizer.lex.c segment.h defaults.h stringmap.h \
+ stringpool.h error.h termhash.h query.h search.h mmap-obj.h
+
+batch-run-queries: batch-run-queries.o $(OBJ)
+ @$(ECHO) LINK $@
+ @$(CC) -o $@ $(CCOPT) $(DEBUG) $+
+
+$(QUERYBIN): $(OBJ) interactive.o
+ @$(ECHO) LINK $@
+ @$(CC) -o $@ $(CCOPT) $(DEBUG) $+
+
+$(ADDBIN): $(OBJ) file-indexer.o
+ @$(ECHO) LINK $@
+ @$(CC) -o $@ $(CCOPT) $(DEBUG) $+
+
+$(MBOXADDBIN): $(OBJ) mbox-indexer.o
+ @$(ECHO) LINK $@
+ @$(CC) -o $@ $(CCOPT) $(DEBUG) $+
+
+$(DUMPBIN): $(OBJ) dump.o
+ @$(ECHO) LINK $@
+ @$(CC) -o $@ $(CCOPT) $(DEBUG) $+
+
+test-%_main.c: test-%.c
+ @$(ECHO) MAGIC $<
+ @$(RUBY) build/gen-test-main.rb $+ > $@
+
+test-%: test-%.o test-%_main.o $(OBJ)
+ @$(ECHO) LINK $@
+ @$(CC) -o $@ $(CCOPT) $(DEBUG) $+
+
+## these next two rules are to ignore warnings in generated c code
+tokenizer.lex.o: tokenizer.lex.c
+ @$(ECHO) CC \(ignore warnings\) $<
+ @$(CC) -c $(CFLAGS) $(DEBUG) $(DEBUGOUTPUT) $(COMPILE_TIME) -w $<
+
+query-parser.lex.o: query-parser.lex.c
+ @$(ECHO) CC \(ignore warnings\) $<
+ @$(CC) -c $(CFLAGS) $(DEBUG) $(DEBUGOUTPUT) $(COMPILE_TIME) -w $<
+
+## object compilation
+%.o: %.c
+ @$(ECHO) CC $<
+ @$(CC) -c $(CFLAGS) $(DEBUG) $(DEBUGOUTPUT) $(COMPILE_TIME) $<
+
+%.lex.c %.lex.h: %.lex
+ @$(ECHO) LEX $+
+ @$(LEX) $<
+
+%.tab.c %.tab.h: %.y
+ @$(ECHO) YACC $+
+ @$(YACC) $<
+
+clean:
+ rm -rf $(TESTBIN) *.o *.gcda *.gcno *.gcov $(GENFILES) $(ALLBIN)
+
+dep: $(GENFILES)
+ $(CC) -MM *.c
+
+test: $(TESTBIN)
+ ./test-segment
+ ./test-stringmap
+ ./test-stringpool
+ ./test-termhash
+ ./test-queries
+ ./test-labels
+
+integration-tests/enron1m.index0.pr: integration-tests/enron1m.mbox $(MBOXADDBIN) $(OBJ)
+ rm -f integration-tests/enron1m.index*
+ ./$(MBOXADDBIN) integration-tests/enron1m.index integration-tests/enron1m.mbox
+
+test-integration: batch-run-queries integration-tests/enron1m.index0.pr
+ ruby integration-tests/eval.rb integration-tests/testset1.txt
+
+## rebuild with debug output compiled in; $(MAKE) (rather than a literal
+## `make`) re-invokes the same make program and passes its flags along
+debug:
+ $(MAKE) DEBUGOUTPUT=-DDEBUGOUTPUT
+
+EXPORTFILES=$(CSRCFILES) $(HEADERFILES) $(GENFILES)
+rubygem: $(EXPORTFILES)
+ cp README ruby
+ cp $+ ruby/ext/whistlepig
+ cd ruby && rake gem
+ @echo gem is in ruby/pkg/
86 README
@@ -0,0 +1,86 @@
+= Whistlepig
+
+Whistlepig is a minimalist realtime full-text search index. Its goal is to be
+as small and feature-free as possible, while still remaining useful, performant
+and scalable to large corpora. If you want realtime full-text search without
+the frills, Whistlepig may be for you.
+
+Whistlepig is written in ANSI C99. It currently provides a C API and Ruby
+bindings.
+
+Latest version: 0.1, released 2011-02-08.
+ Status: alpha
+ News: http://all-thing.net/label/whistlepig/
+ Homepage: http://masanjin.net/whistlepig/
+
+= Getting it
+
+ Tarball: whistlepig-0.1.tar.gz
+ Rubygem: gem install whistlepig
+ Git: git clone git://masanjin.net/whistlepig/
+
+= Realtime search
+
+Roughly speaking, realtime search means:
+- documents are available to queries immediately after indexing, without
+ any further index merging steps; and
+- later documents are more important than earlier documents.
+
+Whistlepig takes these principles to an extreme. In particular:
+- It only returns documents in the reverse of the order in which they were
+ added (i.e. LIFO order), and performs no ranking, reordering, or scoring.
+- It only supports incremental indexing. There is no notion of batch indexing
+ or index merging.
+- It does not support document deletion or modification (except in the
+ special case of labels; see below).
+- It only supports in-memory indexes.
+
+Features that Whistlepig does provide:
+- Incremental indexing. Updates to the index are immediately available to
+ readers.
+- Fielded terms with arbitrary fields.
+- A full query language and parser with conjunctions, disjunctions, phrases,
+ negations, grouping, and nesting.
+- Labels: arbitrary tokens which can be added to and removed from documents
+ at any point, and incorporated into search queries. (This is the only
+ mutable aspect of a document once it has been indexed.)
+- Early query termination.
+- Resumable queries.
+- A tiny, < 3 KLOC ANSI C99 implementation.
+
+== Synopsis (using Ruby bindings)
+
+ require 'rubygems'
+ require 'whistlepig'
+
+ include Whistlepig
+
+ index = Index.new "index"
+
+ entry1 = Entry.new
+ entry1.add_string "body", "hello there bob"
+ docid1 = index.add_entry entry1 # => 1
+
+ entry2 = Entry.new
+ entry2.add_string "body", "goodbye bob"
+ docid2 = index.add_entry entry2 # => 2
+
+ q1 = Query.new "body", "bob"
+ results1 = index.search q1 # => [2, 1]
+
+ q2 = q1.and Query.new("body", "hello")
+ results2 = index.search q2 # => [1]
+
+ index.add_label docid2, "funny"
+
+ q3 = Query.new "body", "bob ~funny"
+ results3 = index.search q3 # => [2]
+
+== A note on concurrency:
+
+Whistlepig is currently single-process and single-thread only. However, it is
+built with multi-process access in mind. Per-segment single-writer,
+multi-reader support is planned in the near future. Multi-writer support can be
+accomplished via index striping and is planned for the distant future.
+
+Please send bug reports and comments to: wmorgan-whistlepig-design@masanjin.net.
@@ -0,0 +1,44 @@
+#include <stdio.h>
+#include "whistlepig.h"
+#include "timer.h"
+
+/*
+ * Batch query runner.
+ *
+ * Loads the index at the base path given in argv[1], then repeatedly prompts
+ * on stdout, reads one query per line from stdin, and prints the result count
+ * reported by wp_index_count_results() together with the elapsed time. The
+ * loop ends when fgets() returns NULL or leaves the buffer empty; the index
+ * is then unloaded.
+ *
+ * Returns 0 once the loop has ended, or -1 if the argument count is wrong.
+ */
+int main(int argc, char* argv[]) {
+  wp_index* index;
+  wp_error* e;
+
+  if(argc != 2) {
+    fprintf(stderr, "Usage: %s <index basepath>\n", argv[0]);
+    return -1;
+  }
+
+  DIE_IF_ERROR(wp_index_load(&index, argv[1]));
+
+  while(1) {
+    char input[1024];
+    TIMER(query);
+    uint32_t total_num_results;
+    wp_query* query;
+
+    printf("query: ");
+    fflush(stdout);
+
+    input[0] = 0;
+    if(fgets(input, sizeof input, stdin) == NULL) break;
+    if(input[0] == '\0') break;
+
+    // a query that fails to parse is reported and skipped
+    e = wp_query_parse(input, "body", &query);
+    if(e != NULL) {
+      PRINT_ERROR(e, stdout);
+      wp_error_free(e);
+      continue;
+    }
+    if(query == NULL) continue;
+
+    RESET_TIMER(query);
+    e = wp_index_count_results(index, query, &total_num_results);
+    if(e != NULL) {
+      PRINT_ERROR(e, stdout);
+      wp_error_free(e);
+      wp_query_free(query); // the parsed query must not leak on this path
+      continue;
+    }
+    MARK_TIMER(query);
+    wp_query_free(query);
+
+    // total_num_results is a uint32_t: widen it explicitly instead of
+    // handing an unsigned value to a signed %d conversion
+    printf("found %lu results in %.1fms\n", (unsigned long)total_num_results, (float)TIMER_MS(query));
+  }
+
+  DIE_IF_ERROR(wp_index_unload(index));
+
+  return 0;
+}
@@ -0,0 +1,44 @@
+#!/usr/bin/env ruby
+
+# Writes to stdout the C source of a test driver: one TEST(...) declaration
+# and one RUNTEST(...) invocation for each line of the input file that
+# contains a TEST(name) occurrence. The input file is named by the single
+# command-line argument.
+
+abort "expecting one argument: the filename" unless ARGV.size == 1
+source_file = ARGV.shift
+
+# first TEST(...) argument of every line that has one, in file order
+test_names = []
+IO.foreach(source_file) do |line|
+  found = line[/TEST\((.+?)\)/, 1]
+  test_names << found unless found.nil?
+end
+
+declarations = test_names.map { |name| "TEST(#{name});" }
+invocations = test_names.map { |name| "RUNTEST(#{name});" }
+
+# headers needed by the generated driver
+puts %q!
+#include <stdio.h>
+#include "error.h"
+#include "test.h"
+!
+
+puts declarations
+
+# opening of main() and its counters
+puts %q!
+int main(int argc, char* argv[]) {
+ (void) argc; (void) argv;
+ int failures = 0, errors = 0, asserts = 0, tests = 0;
+
+ //printf("Running tests...\n\n");
+
+!
+puts invocations
+
+# summary line and exit status
+puts %q!
+ printf("%d tests, %d assertions, %d failures, %d errors\n", tests, asserts, failures, errors);
+
+ if((errors == 0) && (failures == 0)) {
+ // printf("Tests passed.\n");
+ return 0;
+ }
+
+ else {
+ //printf("Tests FAILED.\n");
+ return -1;
+ }
+}
+!
+
Oops, something went wrong.

0 comments on commit a79c565

Please sign in to comment.