In [1]:
import sys, os, csv, lucene
import numpy as np

In [2]:
from java.io import File
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.index import DirectoryReader, IndexWriter, IndexWriterConfig
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.document import Document, Field, FieldType, StringField, TextField
from org.apache.lucene.store import SimpleFSDirectory, RAMDirectory
from org.apache.lucene.util import Version

In [3]:
lucene.initVM(vmargs=['-Djava.awt.headless=true'])

<jcc.JCCEnv at 0x7f0027092900>

In [4]:
fsDir = SimpleFSDirectory(File("term_index"))
reader = DirectoryReader.open(fsDir)
searcher = IndexSearcher(reader)

In [5]:
analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
parser = QueryParser(Version.LUCENE_CURRENT, "question", analyzer)

In [6]:
query = parser.parse('When athletes begin to exercise, their heart rates and respiration rates increase.  At what level of organization does the human body coordinate these functions?')

In [7]:
result = searcher.search(query, 10)
print result.getMaxScore()
scoreDocs = result.scoreDocs

6.00224876404


In [8]:
doc = scoreDocs[0].doc
print doc

234203


In [9]:
answer_vector = reader.getTermVector(doc, "answer")
print answer_vector

org.apache.lucene.codecs.compressing.CompressingTermVectorsReader$TVTerms@30432a9e


In [10]:
def score_answer(answer, answerA):
    directory = RAMDirectory()
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    #analyzer = LimitTokenCountAnalyzer(analyzer, 10000)
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    writer = IndexWriter(directory, config)

    doc = Document()
    doc.add(Field("answer", answer, TextField.TYPE_STORED))
    writer.addDocument(doc)

    writer.commit()
    writer.close()
    
    reader = DirectoryReader.open(directory)
    searcher = IndexSearcher(reader)

    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    parser = QueryParser(Version.LUCENE_CURRENT, "answer", analyzer)
    query = parser.parse(QueryParser.escape(answerA))
    result = searcher.search(query, 1)

    scoreDocs = result.scoreDocs
    #for scoreDoc in scoreDocs:
    #    doc = searcher.doc(scoreDoc.doc)
    #    print answerA, doc.get("answer"), scoreDoc.score

    return result.getMaxScore() if len(scoreDocs) > 0 else 0

print score_answer("at the tissue level", "at the system level")
print score_answer("at the organ level", "at the system level")
print score_answer("at the system level", "at the system level")
print score_answer("at the cellular level", "at the system level")

0.0281300246716
0.0281300246716
0.271222114563
0.0281300246716


In [None]:
correct = 0
results = []
with open("../data/training_set.tsv") as f:
    csv_reader = csv.reader(f, delimiter="\t", strict=True, quoting=csv.QUOTE_NONE)
    header = next(csv_reader)  # ignore header
    is_train_set = (len(header) == 7)
    i = 0
    for id, question, right, answerA, answerB, answerC, answerD in list(csv_reader):
        query = parser.parse(QueryParser.escape(question))
        result = searcher.search(query, 500)

        #print question, answerA, answerB, answerC, answerD
        
        scoreA = 0
        scoreB = 0
        scoreC = 0
        scoreD = 0
        scoreDocs = result.scoreDocs
        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            question = doc.get("question")
            answer = doc.get("answer")
            docscore = scoreDoc.score
            #print question, answer, docscore

            scoreA += docscore * score_answer(answer, answerA)
            scoreB += docscore * score_answer(answer, answerB)
            scoreC += docscore * score_answer(answer, answerC)
            scoreD += docscore * score_answer(answer, answerD)
        
        scores = [scoreA, scoreB, scoreC, scoreD]
        best = np.argmax(scores)
        best = chr(ord('A') + best)
        results.append([id, best] + scores)

        #print scoreA, scoreB, scoreC, scoreD
        #print right, best
        i += 1
        if i % 100 == 0:
            print i

        if best == right:
            correct += 1

accuracy = float(correct) / len(results)
print "Correct: %d Total: %d Accuracy: %f" % (correct, len(results), accuracy)

In [141]:
with open("lucene500_predictions_training.csv", "w") as f:
    writer = csv.writer(f, delimiter=',', quoting=csv.QUOTE_MINIMAL)
    for i in xrange(len(results)):
        writer.writerow(results[i])

In [None]:
correct = 0
results = []
with open("../data/test_set.tsv") as f:
    csv_reader = csv.reader(f, delimiter="\t", strict=True, quoting=csv.QUOTE_NONE)
    header = next(csv_reader)  # ignore header
    is_train_set = (len(header) == 7)
    i = 0
    for id, question, answerA, answerB, answerC, answerD in list(csv_reader):
        query = parser.parse(QueryParser.escape(question))
        result = searcher.search(query, 1000)

        #print question, answerA, answerB, answerC, answerD
        
        scoreA = 0
        scoreB = 0
        scoreC = 0
        scoreD = 0
        scoreDocs = result.scoreDocs
        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            question = doc.get("question")
            answer = doc.get("answer")
            docscore = scoreDoc.score
            #print question, answer, docscore

            scoreA += docscore * score_answer(answer, answerA)
            scoreB += docscore * score_answer(answer, answerB)
            scoreC += docscore * score_answer(answer, answerC)
            scoreD += docscore * score_answer(answer, answerD)
        
        scores = [scoreA, scoreB, scoreC, scoreD]
        best = np.argmax(scores)
        best = chr(ord('A') + best)
        results.append([id, best] + scores)

        #print scoreA, scoreB, scoreC, scoreD
        #print right, best
        i += 1
        if i % 100 == 0:
            print i


100
200
300
400
500
600
2000
2100
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
8100
8200
8300
18300

In [None]:
with open("lucene1000_predictions_test.csv", "w") as f:
    writer = csv.writer(f, delimiter=',', quoting=csv.QUOTE_MINIMAL)
    for i in xrange(len(results)):
        writer.writerow(results[i])

In [None]:
print "ok"