In [1]:
import lucene
from java.nio.file import Paths
from org.apache.lucene.analysis.core import KeywordAnalyzer
from org.apache.lucene.index import IndexWriter, IndexWriterConfig
from org.apache.lucene.document import Document, Field, StoredField, StringField, TextField
from org.apache.lucene.store import SimpleFSDirectory
import csv
import time
import string

In [2]:
PARSED_DATA = "/data/data/parsed_data/test/combined.csv"
INDEX_OUTPUT = "/data/data/indexes/index_test"

In [3]:
lucene.initVM(vmargs=['-Djava.awt.headless=true'])

<jcc.JCCEnv at 0x7faec48738d0>

In [4]:
store = SimpleFSDirectory(Paths.get(INDEX_OUTPUT))
analyzer = KeywordAnalyzer()
config = IndexWriterConfig(analyzer)
config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
config.setRAMBufferSizeMB(float(1000))
writer = IndexWriter(store, config)

In [5]:
def format_time(seconds):
    min, sec = divmod(seconds, 60)
    hour, min = divmod(min, 60)
    
    return "%d:%02d:%02d" % (hour, min, sec)

In [6]:
def get_director(text):
    text = text.split("|")
    text = text[0].strip()
    text = text.split("=")
    if len(text) > 1:
        text = text[1]
    else:
        return None

    return text

In [7]:
def get_producer(text):
    text = text.split("|")
    if len(text) > 1:
        text = text[1].strip()
    else:
        return None
    text = text.split("=")
    if len(text) > 1:
        text = text[1]
    else:
        return None

    return text

In [8]:
def get_screenplay(text):
    try:
        text = text.split("screenplay=",1)[1]
        text = text.split("|")
        if len(text) > 1:
            text = text[0].strip()
        else:
            return None
    except:
        text = ""
    return text

In [9]:
def get_cast(text):
    try:
        cast = text.split("|movie cast|",1)[1] 
    except:
        cast = ""
    return cast

In [10]:
def clean_title(title):
    result = title
    
    if '(' in title:
        pos = title.find('(')
        result = title[:pos]
        
    return result.lower()

In [11]:
parsed_file = open(PARSED_DATA, "r", encoding="utf-8")
csv_reader = csv.DictReader(parsed_file)

start = time.time()

for row in csv_reader:
    title = clean_title(row['title'])

    doc = Document()
    doc.add(Field("title", title, TextField.TYPE_STORED))
    
    director = get_director(row['text'])
    if director:
        doc.add(Field("director", director.lower(), TextField.TYPE_STORED))
    
    producer = get_producer(row['text'])
    if producer:
        doc.add(Field("producer", producer.lower(), TextField.TYPE_STORED))
        
    screenplay = get_screenplay(row['text'])
    if screenplay:
        doc.add(Field("screenplay", screenplay.lower(), TextField.TYPE_STORED))
        
    movie_cast = get_cast(row['text'])
    if movie_cast:
        doc.add(Field("movie_cast", movie_cast, TextField.TYPE_STORED))
    
    writer.addDocument(doc)

writer.commit()
writer.close()
print("\nDONE")

end = time.time()
print('Elapsed time: ', format_time(end - start))


DONE
Elapsed time:  0:01:51


In [12]:
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.queryparser.classic import QueryParser

In [24]:
directory = SimpleFSDirectory(Paths.get(INDEX_OUTPUT))
searcher = IndexSearcher(DirectoryReader.open(directory))
analyzer = KeywordAnalyzer()

query = QueryParser("title", analyzer).parse("1941 ")
scoreDocs = searcher.search(query, 50).scoreDocs

print(len(scoreDocs))

for scoreDoc in scoreDocs:
    doc = searcher.doc(scoreDoc.doc)
    print("title: ", doc.get("title"))
    print("cast: ", doc.get("movie_cast"))

0
