Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Browse files

Added retrieval code

  • Loading branch information...
commit b91f6c7988d563a15809ae9d777b48e263e27e75 1 parent 564dfde
@turian authored
Showing with 48 additions and 1 deletion.
  1. +2 −0  .hgignore
  2. +1 −1  index-sentences.py
  3. +45 −0 retrieve-sentences.py
View
2  .hgignore
@@ -0,0 +1,2 @@
+syntax: glob
+lucene.ukwac
View
2  index-sentences.py
@@ -55,7 +55,7 @@
doc.add(Field("text", l, Field.Store.YES, Field.Index.ANALYZED))
writer.addDocument(doc)
- if i % 1000 == 0:
+ if i % 10000 == 0:
print >> sys.stderr, "Read %d lines from stdin (%d documents in index)..." % (i, writer.numDocs())
print >> sys.stderr, stats()
# if i > 100000: break
View
45 retrieve-sentences.py
@@ -0,0 +1,45 @@
+#!/usr/bin/env python
+"""
+Search a Lucene index whose documents carry a stored "text" field.
+Print the score, doc id and text of the top hits for a hard-coded query.
+
+TODO:
+ * Take the query string and the index directory from the command line.
+
+USAGE:
+ ./retrieve-sentences.py    (expects an existing index at indexDir)
+
+NB we use the StandardAnalyzer, but should try the SnowballAnalyzer
+"""
+
+import sys
+import string
+
+from common.stats import stats
+
+import lucene
+from lucene import \
+ SimpleFSDirectory, System, File, \
+ Document, Field, StandardAnalyzer, IndexSearcher, Version, QueryParser
+
+if __name__ == "__main__":
+ lucene.initVM()
+ # open the index at a hard-coded path (NOTE(review): presumably the one written by index-sentences.py -- confirm)
+# indexDir = os.path.join(System.getProperty('java.io.tmpdir', 'tmp'),
+# 'index-dir')
+ indexDir = "/Tmp/REMOVEME.index-dir"
+# indexDir = "lucene.ukwac"
+ dir = SimpleFSDirectory(File(indexDir))
+ analyzer = StandardAnalyzer(Version.LUCENE_30)
+ searcher = IndexSearcher(dir)
+
+ query = QueryParser(Version.LUCENE_30, "text", analyzer).parse("Find this sentence please")
+# query = QueryParser(analyzer).parse("Find this sentence please")
+ hits = searcher.search(query, 1000)
+
+ print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
+
+ for hit in hits.scoreDocs:
+ print hit.score, hit.doc, hit.toString()
+ doc = searcher.doc(hit.doc)
+ print doc.get("text").encode("utf-8")
Please sign in to comment.
Something went wrong with that request. Please try again.