In [1]:
import json
import lucene
import os
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.document import Document, Field, FieldType
from org.apache.lucene.index import IndexWriter, IndexWriterConfig, IndexOptions
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.search import IndexSearcher
from java.nio.file import Paths
from lucene import getVMEnv


In [2]:

def load_reddit_data(filepath):
    with open(filepath, 'r') as file:
        for line in file:
            yield json.loads(line)


In [3]:
import time
base_dir = 'reddit_lucene_index'
def create_index(dir, data):
    writer = None
    total_time = 0
    #num_documents = 0
    try:
        if not os.path.exists(dir):
            os.mkdir(dir)
        store = SimpleFSDirectory(Paths.get(dir))
        analyzer = StandardAnalyzer()
        config = IndexWriterConfig(analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(store, config)
    
        
        metaType = FieldType()
        metaType.setStored(True)
        metaType.setTokenized(False)
    
        contextType = FieldType()
        contextType.setStored(True)
        contextType.setTokenized(True)
        contextType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    
        for sample in data:
            start_time=time.time()
            subreddit = sample.get('subreddit', '')
            post_title = sample.get('post_title', '')
            post_body = sample.get('post_body', '')  
            parent_comment_id = sample.get('parent_comment_id', '')
            comment_id = sample.get('comment_id', '')
            comment_body = sample.get('body', '')  
    
            doc = Document()
            doc.add(Field('Subreddit', subreddit, metaType))
            doc.add(Field('PostTitle', post_title, metaType))
            doc.add(Field('PostBody', post_body, contextType))  
            doc.add(Field('ParentCommentId', parent_comment_id, metaType))
            doc.add(Field('CommentId', comment_id, metaType))
            doc.add(Field('CommentBody', comment_body, contextType))
            writer.addDocument(doc)
            end_time=time.time()
            total_time+=(end_time - start_time)
            #num_documents += 1
    finally:
        if writer is not None:
            writer.close()
    #avg_time_per_doc = total_time/num_documents if num_documents else 0
    print("total time taken to index(in seconds):", total_time )
    writer.close()

In [4]:
def retrieve(storedir, query):
    searchDir = SimpleFSDirectory(Paths.get(storedir))
    searcher = IndexSearcher(DirectoryReader.open(searchDir))
    parser = QueryParser('CommentBody', StandardAnalyzer())  
    parsed_query = parser.parse(query)

    topDocs = searcher.search(parsed_query, 5).scoreDocs
    topkdocs = []
    for hit in topDocs:
        doc = searcher.doc(hit.doc)
        topkdocs.append({
            "score": hit.score,
            "subreddit": doc.get("Subreddit"),
            "post_title": doc.get("PostTitle"),
            "post_body": doc.get("PostBody"),  
            "parent_comment_id": doc.get("ParentCommentId"),  
            "comment_id": doc.get("CommentId"),  
            "comment_body": doc.get("CommentBody")
        })
    
    return topkdocs

In [5]:
reddit_data = load_reddit_data('./reddit_comments_duplicates_final.json')
lucene.initVM(vmargs=['-Djava.awt.headless=true'])
create_index('reddit_lucene_index_copy/', reddit_data)
results = retrieve('reddit_lucene_index_copy/', 'search query here')
print(results)

total time taken to index(in seconds): 55.85971021652222
[{'score': 6.4062042236328125, 'subreddit': 'Tronix', 'post_title': 'Tron Protocol Inc. Article of Incorporation. SF Office confirmed', 'post_body': '', 'parent_comment_id': 't1_ds4a4x1', 'comment_id': 'ds4btxk', 'comment_body': 'Search Tron Protocol here https://businesssearch.sos.ca.gov/'}, {'score': 5.895445823669434, 'subreddit': 'OMGnetwork', 'post_title': 'Daily Discussion - February 03, 2021', 'post_body': '## [OMG Network Daily Discussion](https://i.imgur.com/jiwfY8e.png)\n\n**Rules**\n\n* Please read and follow the subreddit rules in the sidebar.\n* Please read the [disclaimer](https://omg.eco/disclaimer).\n\n**OMG Network Resources**\n\n* [Official OMG Network Telegram](https://t.me/OmiseGo)\n* [What is OMG Network?](https://youtu.be/w-tKyhA4QzE)\n* [Website](https://omg.network/)\n* [Network Documentation](https://docs.omg.network/)\n* [Block Explorer](https://omg.eco/blockexplorer)\n* [Web Wallet](https://omg.eco/webw

In [28]:
document_counts = [100, 500, 1000, 5000] 
average_times = []
reddit_data = load_reddit_data('./reddit_comments_duplicates_final.json')
for count in document_counts:
    subset = list(reddit_data)[:count]  
    avg_time, total_time, num_docs = create_index('reddit_lucene_index/', subset)
    average_times.append(avg_time)
    print(f"Indexed {num_docs} documents. Average time per document: {avg_time} seconds")

Indexed 100 documents. Average time per document: 5.214929580688476e-05 seconds
Indexed 0 documents. Average time per document: 0 seconds
Indexed 0 documents. Average time per document: 0 seconds
Indexed 0 documents. Average time per document: 0 seconds
