Permalink
Browse files

First commit

  • Loading branch information...
0 parents commit 699565fb468ddd462f2257320727603d35da291e @zooie committed Jul 5, 2009
Showing with 3,921 additions and 0 deletions.
  1. +15 −0 README
  2. BIN exp/software/ohsumed/.DS_Store
  3. +11 −0 exp/software/ohsumed/README
  4. +9 −0 exp/software/ohsumed/avg_query_length.py
  5. +22 −0 exp/software/ohsumed/etl.py
  6. +19 −0 exp/software/ohsumed/etl_queries.py
  7. +11 −0 exp/software/ohsumed/fsize.py
  8. BIN exp/software/ohsumed/lucene/.DS_Store
  9. +37 −0 exp/software/ohsumed/lucene/Index.java
  10. +16 −0 exp/software/ohsumed/lucene/README
  11. +42 −0 exp/software/ohsumed/lucene/Search.java
  12. +33 −0 exp/software/ohsumed/lucene/perf.txt
  13. +630 −0 exp/software/ohsumed/lucene/results.txt
  14. +74 −0 exp/software/ohsumed/scorer.py
  15. BIN exp/software/ohsumed/sphinx/.DS_Store
  16. +30 −0 exp/software/ohsumed/sphinx/README
  17. +35 −0 exp/software/ohsumed/sphinx/Search.java
  18. +29 −0 exp/software/ohsumed/sphinx/codegen.py
  19. +27 −0 exp/software/ohsumed/sphinx/perf.txt
  20. +630 −0 exp/software/ohsumed/sphinx/results.txt
  21. +21 −0 exp/software/ohsumed/sphinx/sphinx.conf
  22. BIN exp/software/ohsumed/sqlite/.DS_Store
  23. +20 −0 exp/software/ohsumed/sqlite/README
  24. +17 −0 exp/software/ohsumed/sqlite/codegen_build.py
  25. +16 −0 exp/software/ohsumed/sqlite/codegen_query.py
  26. +24 −0 exp/software/ohsumed/sqlite/perf.txt
  27. +620 −0 exp/software/ohsumed/sqlite/results.txt
  28. BIN exp/software/ohsumed/xapian/.DS_Store
  29. +38 −0 exp/software/ohsumed/xapian/README
  30. +36 −0 exp/software/ohsumed/xapian/perf.txt
  31. +630 −0 exp/software/ohsumed/xapian/results.txt
  32. +49 −0 exp/software/ohsumed/xapian/simpleindex.cc
  33. +71 −0 exp/software/ohsumed/xapian/simplesearch.cc
  34. BIN exp/software/ohsumed/zettair/.DS_Store
  35. +24 −0 exp/software/ohsumed/zettair/README
  36. +8 −0 exp/software/ohsumed/zettair/codegen.py
  37. +27 −0 exp/software/ohsumed/zettair/perf.txt
  38. +630 −0 exp/software/ohsumed/zettair/results.txt
  39. +20 −0 exp/software/ohsumed/zettair/results_parser.py
15 README
@@ -0,0 +1,15 @@
+
+@author Vik Singh (viksi@yahoo-inc.com)
+
+This is a public git repository for hosting opensearch projects, libraries, examples, and benchmarks
+
+Main topics:
+
+Open Search Services ([main|exp]/services/)
+===========================================
+BOSS, Bing, AJAX Search API, Twitter Search API, Mashups
+
+Open Source Vertical Indexing Platforms ([main|exp]/software/)
+==============================================================
+Lucene, sphinx, zettair, RDBMS, Xapian, Terrier, etc.
+Benchmarks, unifying interfaces
Binary file not shown.
@@ -0,0 +1,11 @@
+
+# tested with python2.6
+
+# download filtering.tar.gz from http://trec.nist.gov/data/t9_filtering.html
+# cp ohsumed.88-91 qrels.ohsu.88-91 query.ohsu.1-63 into this dir
+
+# to generate ohsumed.flat and queries.txt
+python etl.py
+python etl_queries.py
+
+# then cd into any of the vertical search directories (e.g. lucene/) and follow its README
@@ -0,0 +1,9 @@
+
+# computes average query length (# of tokens)
+
+total = 0
+count = 0
+for line in open("queries.txt", "r"):
+ total += len(line.split())
+ count += 1
+print "AVG Query Length", total / float(count)
@@ -0,0 +1,22 @@
+
+# output format: docid\tcontent\n
+
+out = open("ohsumed.flat", "w")
+
+toindex = False
+buffer = {}
+
+for line in open("ohsumed.88-91", "r"):
+
+ if toindex:
+ buffer[toindex] = line.strip()
+ toindex = False
+ continue
+
+ if line.startswith(".") and not line.startswith(".I") and not line.startswith(".P"):
+ if len(buffer) == 6:
+ out.write(buffer[".U"] + "\t" + " ".join([buffer[k] for k in buffer if k != ".U"]) + "\n")
+ buffer = {}
+ toindex = line.strip()
+
+out.close()
@@ -0,0 +1,19 @@
+
+# output format: query\n
+
+d = {}
+topicid = None
+
+for line in open("query.ohsu.1-63", "r"):
+ if line.startswith("<num>"):
+ topicid = line.split()[-1].strip()
+ if line.startswith("<title>"):
+ query = line[7:].strip()
+ d[query] = topicid
+
+out = open("queries.txt", "w")
+
+for key in d:
+ out.write(key + "\n")
+
+out.close()
@@ -0,0 +1,11 @@
+
+# total size in bytes of a dir
+# used for measuring index sizes
+
+import os
+import sys
+
+size = 0
+for f in os.listdir(sys.argv[1]):
+ size += os.path.getsize(sys.argv[1].rstrip("/") + "/" + f)
+print "Size", size
Binary file not shown.
@@ -0,0 +1,37 @@
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Scanner;
+
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.store.FSDirectory;
+
+/**
+ * Builds a Lucene index in the {@code index} directory from
+ * {@code ../ohsumed.flat}.
+ *
+ * <p>Each input line is expected to have the form {@code docid<TAB>content}.
+ * The doc id is stored but not indexed; the content is analyzed and indexed
+ * but not stored.
+ */
+public class Index {
+
+    /**
+     * Reads {@code ../ohsumed.flat} line by line and adds one document to the
+     * index for every well-formed line. Blank lines are skipped; lines without
+     * a tab-separated content part are reported on standard error and skipped.
+     *
+     * @param args ignored
+     * @throws IOException if the input cannot be read or the index cannot be written
+     */
+    public static void main(String[] args) throws IOException {
+
+        IndexWriter indexWriter = new IndexWriter(FSDirectory.getDirectory("index"),
+                new StandardAnalyzer(),
+                IndexWriter.MaxFieldLength.LIMITED);
+        try {
+            Scanner scan = new Scanner(new File("../ohsumed.flat"));
+            try {
+                while (scan.hasNextLine()) {
+                    String line = scan.nextLine().trim();
+                    if (line.length() == 0) {
+                        continue;
+                    }
+
+                    // limit 2: everything after the first tab is content, even
+                    // if the content itself contains further tabs
+                    String[] data = line.split("\t", 2);
+                    if (data.length < 2) {
+                        System.err.println("Skipping line without content: " + line);
+                        continue;
+                    }
+
+                    Document doc = new Document();
+                    doc.add(new Field("id", data[0], Field.Store.YES, Field.Index.NO));
+                    doc.add(new Field("content", data[1], Field.Store.NO, Field.Index.ANALYZED));
+                    indexWriter.addDocument(doc);
+                }
+
+                // Scanner swallows read errors and simply reports "no more
+                // lines"; surface them so a truncated index is not mistaken
+                // for a complete one.
+                IOException readError = scan.ioException();
+                if (readError != null) {
+                    throw readError;
+                }
+            } finally {
+                scan.close();
+            }
+
+            // optimization is intentionally left disabled; enable to merge segments
+            //indexWriter.optimize();
+        } finally {
+            // always close the writer, even when indexing fails
+            indexWriter.close();
+        }
+    }
+
+}
@@ -0,0 +1,16 @@
+
+# edit the jar path in the commands below to point to your lucene core jar
+
+javac -cp /Users/viksi/lucene-2.4.1/lucene-core-2.4.1.jar:. Index.java
+
+# remove any existing indices
+
+rm -fr index/
+time java -cp /Users/viksi/lucene-2.4.1/lucene-core-2.4.1.jar:. Index
+
+time java -cp /Users/viksi/lucene-2.4.1/lucene-core-2.4.1.jar:. Search > results.txt
+
+python ../fsize.py index
+
+cd ..
+cat lucene/results.txt | python scorer.py
@@ -0,0 +1,42 @@
+import java.io.File;
+import java.io.IOException;
+import java.util.Scanner;
+
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.Searcher;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.queryParser.QueryParser;
+import org.apache.lucene.queryParser.ParseException;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+
+/**
+ * Runs every query in {@code ../queries.txt} against the Lucene index in the
+ * {@code index} directory and prints the ids of the top 10 hits per query.
+ *
+ * <p>One output line is printed per hit: the query text, a space, and the
+ * stored {@code id} field of the matching document.
+ */
+public class Search {
+
+    /**
+     * Reads one query per line, searches the {@code content} field and prints
+     * the hits to standard output. Blank lines are skipped.
+     *
+     * @param args ignored
+     * @throws IOException if the query file or the index cannot be read
+     * @throws ParseException if a query cannot be parsed
+     */
+    public static void main(String[] args) throws IOException, ParseException {
+        Scanner scan = new Scanner(new File("../queries.txt"));
+        try {
+            IndexReader idx = IndexReader.open("index");
+            try {
+                Searcher searcher = new IndexSearcher(idx);
+                QueryParser qp = new QueryParser("content", new StandardAnalyzer());
+
+                while (scan.hasNextLine()) {
+                    String queryString = scan.nextLine().trim();
+                    if (queryString.length() == 0) {
+                        continue;
+                    }
+
+                    // The queries are plain text, not Lucene query syntax:
+                    // escape them so characters such as '?', ':' or '(' are
+                    // matched literally instead of being read as operators.
+                    Query query = qp.parse(QueryParser.escape(queryString));
+                    TopDocs results = searcher.search(query, null, 10);
+                    for (ScoreDoc sd : results.scoreDocs) {
+                        Document doc = searcher.doc(sd.doc);
+                        System.out.println(queryString + " " + doc.get("id"));
+                    }
+                }
+
+                // Scanner swallows read errors and simply reports "no more
+                // lines"; surface them so a partial run is not mistaken for a
+                // complete one.
+                IOException readError = scan.ioException();
+                if (readError != null) {
+                    throw readError;
+                }
+            } finally {
+                idx.close();
+            }
+        } finally {
+            scan.close();
+        }
+    }
+}
@@ -0,0 +1,33 @@
+
+index peak memory
+37m
+
+index time
+2m4.572s
+
+index time (w/ optimization)
+2m15.294s
+
+index size (w/o optimization)
+90722553
+
+index size
+89269733
+
+data size
+300030495
+
+records
+196403
+
+search peak memory
+18m
+
+search time
+1.366s
+
+search relevancy
+5.59715145439
+
+queries size
+63
Oops, something went wrong.

0 comments on commit 699565f

Please sign in to comment.