In [3]:
import lucene
from java.io import StringReader
from org.apache.lucene.analysis.ja import JapaneseAnalyzer
from org.apache.lucene.analysis.standard import StandardAnalyzer, StandardTokenizer
from org.apache.lucene.analysis.tokenattributes import CharTermAttribute

In [4]:
from java.nio.file import Path, Paths
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.index import IndexWriter, IndexWriterConfig
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.util import Version
from org.apache.lucene.document import Document, Field, TextField, StringField, LongPoint
from org.apache.lucene.search import IndexSearcher

In [5]:
# Start the JVM that PyLucene runs in; the vmarg puts Java AWT into headless mode.
lucene.initVM(vmargs=['-Djava.awt.headless=true'])

<jcc.JCCEnv at 0x7f336ad3bd70>

## 1. Tokenizers

### 1.1 Standard Tokenizer

In [4]:
# Sample sentence; it is reused by the StandardAnalyzer example further down.
test = "This is how we do it."
# A bare StandardTokenizer that reads its input from the Reader handed to setReader().
tokenizer = StandardTokenizer()
tokenizer.setReader(StringReader(test))

In [5]:
# Term-text attribute of the tokenizer; its value is read after each incrementToken() call.
charTermAttrib = tokenizer.getAttribute(CharTermAttribute.class_)
# The stream is reset before the first incrementToken() call in the next cell.
tokenizer.reset()

In [6]:
# Drain the tokenizer, copying each term's text, then finish the TokenStream
# workflow: end() after the last token and close() to release the reader.
tokens = []
try:
    while tokenizer.incrementToken():
        tokens.append(charTermAttrib.toString())
    tokenizer.end()
finally:
    tokenizer.close()

In [7]:
# Show the collected terms: the tokenizer keeps the original case and drops the final period.
print(tokens)

['This', 'is', 'how', 'we', 'do', 'it']


In [8]:
# StandardAnalyzer example: same sentence, but the analyzer also lower-cases the terms.
analyzer = StandardAnalyzer()
stream = analyzer.tokenStream("", StringReader(test))
# Look the attribute up once instead of on every loop iteration.
term_attrib = stream.getAttribute(CharTermAttribute.class_)
tokens = []
try:
    stream.reset()
    while stream.incrementToken():
        tokens.append(term_attrib.toString())
    # end() and close() complete the TokenStream workflow started by reset().
    stream.end()
finally:
    stream.close()
print(tokens)

['this', 'is', 'how', 'we', 'do', 'it']


## Index Documents in a Folder

In [13]:
from org.apache.lucene.util import Version

# Show which Lucene version this PyLucene build reports.
print(Version.LUCENE_CURRENT)

8.9.0


In [6]:
from java.nio.file import Path, Paths
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.index import IndexWriter, IndexWriterConfig
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.util import Version
from org.apache.lucene.document import Document, Field, TextField
from org.apache.lucene.search import IndexSearcher


def get_index_writer(store, create=False):
  """Build an IndexWriter over ``store`` that analyzes text with StandardAnalyzer.

  Args:
    store: Lucene Directory the index is written to.
    create: when True, the writer is opened in ``OpenMode.CREATE``; otherwise
      the IndexWriterConfig's default open mode is left untouched.

  Returns:
    The new IndexWriter. It is not closed here; the caller closes it.
  """
  writer_config = IndexWriterConfig(StandardAnalyzer())
  if create:
    writer_config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
  return IndexWriter(store, writer_config)

def create_document(title: str, abstract: str) -> Document:
  """Return a Document with stored text fields "title" and "abstract".

  Args:
    title: text placed in the "title" field.
    abstract: text placed in the "abstract" field.
  """
  doc = Document()
  # Both fields share the same field type, so add them in one loop.
  for field_name, field_text in (("title", title), ("abstract", abstract)):
    doc.add(Field(field_name, field_text, TextField.TYPE_STORED))
  return doc


store = SimpleFSDirectory(Paths.get('./index'))

titles = [
  'toy story 1',
  'toy story 2',
  'finding nemo0'
]

abstracts = [
  'This is the text to be indexed.',
  'Some other text',
  "It's friday at the lab"
]

# Close the writer and the directory even when indexing fails part-way.
try:
  # create=True opens the writer in OpenMode.CREATE.
  index_writer = get_index_writer(store, create=True)
  try:
    # Titles and abstracts are paired by position.
    for title, abstract in zip(titles, abstracts):
      document = create_document(title, abstract)
      index_writer.addDocument(document)
  finally:
    index_writer.close()
finally:
  store.close()



In [54]:
# https://stackoverflow.com/questions/2005084/how-to-specify-two-fields-in-lucene-queryparser
# https://lucene.apache.org/core/8_0_0/queryparser/org/apache/lucene/queryparser/classic/package-summary.html#package.description
from org.apache.lucene.search import IndexSearcher, BooleanClause
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.queryparser.classic import QueryParser, MultiFieldQueryParser

directory = SimpleFSDirectory(Paths.get('./index'))
# Nested try/finally: the reader and the directory are both released even if
# opening the reader, parsing the query or searching raises.
try:
  dir_reader = DirectoryReader.open(directory)
  try:
    index_searcher = IndexSearcher(dir_reader)

    # Search the same text in both fields, each as a SHOULD clause.
    text_query = "1"
    fieldnames = ['title', 'abstract']
    SHOULD = BooleanClause.Occur.SHOULD
    query = MultiFieldQueryParser.parse(text_query,
                                        fieldnames,
                                        [SHOULD, SHOULD],
                                        StandardAnalyzer())

    # Ask for at most 10 hits.
    hits = index_searcher.search(query, 10).scoreDocs

    for hit in hits:
      hitDoc = index_searcher.doc(hit.doc)
      print(hitDoc.get("abstract"))
  finally:
    dir_reader.close()
finally:
  directory.close()



This is the text to be indexed.


# CORD-19 Test

In [9]:
import pandas as pd
# Path of the CORD-19 sample CSV, relative to the notebook's working directory.
cord_19_path = './small_cord_19.csv'

# Load the sample into a DataFrame; later cells index its rows with Lucene.
df = pd.read_csv(cord_19_path)

In [10]:
# Peek at the first rows to see which columns are available.
df.head()

Unnamed: 0,cord_uid,source_x,title,abstract,publish_time,journal,authors,url
0,ug7v899j,PMC,Clinical features of culture-proven Mycoplasma...,OBJECTIVE: This retrospective chart review des...,2001-07-04,BMC Infect Dis,"Madani, Tariq A; Al-Ghamdi, Aisha A",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...
1,02tnwd4m,PMC,Nitric oxide: a pro-inflammatory mediator in l...,Inflammatory diseases of the respiratory tract...,2000-08-15,Respir Res,"Vliet, Albert van der; Eiserich, Jason P; Cros...",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...
2,ejv2xln0,PMC,Surfactant protein-D and pulmonary host defense,Surfactant protein-D (SP-D) participates in th...,2000-08-25,Respir Res,"Crouch, Erika C",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...


In [118]:
from org.apache.lucene.document import Document, Field, TextField, LongPoint
from datetime import datetime

# Maps each CSV column to the Lucene field type used when indexing it.
# The LongPoint class itself (not a field type) marks the date column, which
# index_from_dataframe handles in its own branch.
fields = {
  'cord_uid': StringField.TYPE_NOT_STORED,  # indexed only, not stored
  'source_x': TextField.TYPE_STORED,
  'title': TextField.TYPE_STORED,
  'abstract': TextField.TYPE_STORED,
  'publish_time': LongPoint,  # date column, indexed as a long point
  'journal': TextField.TYPE_STORED,
  'authors': TextField.TYPE_STORED,
  'url': TextField.TYPE_STORED
}

def date2long(date):
  """Turn a 'YYYY-MM-DD' string into the integer YYYYMMDD (e.g. 20000815).

  Raises:
    ValueError: if ``date`` does not match the '%Y-%m-%d' format.
  """
  parsed = datetime.strptime(date, '%Y-%m-%d')
  # Same digits that strftime('%Y%m%d') would produce, assembled arithmetically.
  return parsed.year * 10000 + parsed.month * 100 + parsed.day

def get_index_writer(store, create=False):
  """Return an IndexWriter on ``store`` configured with a StandardAnalyzer.

  This re-declares the helper of the same name from the earlier indexing cell.

  Args:
    store: Lucene Directory that will hold the index.
    create: if True, the open mode is set to ``OpenMode.CREATE`` before the
      writer is built; if False, the config's open mode is not changed.

  Returns:
    An open IndexWriter that the caller must close.
  """
  config = IndexWriterConfig(StandardAnalyzer())
  if not create:
    return IndexWriter(store, config)
  config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
  return IndexWriter(store, config)

def index_from_dataframe(index_writer, df, fields):
  """Add one Lucene document per DataFrame row to ``index_writer``.

  Args:
    index_writer: open IndexWriter that receives the documents.
    df: DataFrame with one column per key of ``fields``.
    fields: maps a column name to the field type used to index it, or to the
      ``LongPoint`` class for a 'YYYY-MM-DD' date column.

  Date columns are indexed as a LongPoint holding the YYYYMMDD integer from
  date2long, so they can be range-queried. The original date string is also
  kept in a stored "publish" field so it can be printed with search hits.

  Cells that pandas read as missing (NaN/None) are skipped instead of being
  passed to ``Field``, which only accepts text values.

  Raises:
    ValueError: from date2long when a date cell is not in 'YYYY-MM-DD' form.
  """
  for _, row in df.iterrows():
    document = Document()
    for key, field_type in fields.items():
      value = row[key]
      if pd.isna(value):
        # Empty CSV cell: leave this field out of the document.
        continue
      if field_type == LongPoint:
        document.add(LongPoint(key, date2long(value)))
        document.add(Field("publish", value, StringField.TYPE_STORED))
      else:
        # str() is a no-op for text and makes numeric columns indexable.
        document.add(Field(key, str(value), field_type))
    index_writer.addDocument(document)

# Each resource is opened just before the try block that closes it, so a
# failure while opening never reaches a close() on an undefined name or on a
# stale object left over from an earlier cell.
store = SimpleFSDirectory(Paths.get('./test_cord'))
try:
  index_writer = get_index_writer(store, create=True)
  try:
    index_from_dataframe(index_writer, df, fields)
  finally:
    index_writer.close()
finally:
  store.close()

  

In [119]:
from org.apache.lucene.search import IndexSearcher, BooleanClause
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.queryparser.classic import QueryParser, MultiFieldQueryParser

directory = SimpleFSDirectory(Paths.get('./test_cord'))
# Nested try/finally so that both the reader and the directory are closed,
# including when opening the reader fails.
try:
  dir_reader = DirectoryReader.open(directory)
  try:
    index_searcher = IndexSearcher(dir_reader)

    # Look for the term in the title and the abstract, each as a SHOULD clause.
    text_query = "respiratory"
    fieldnames = ['title', 'abstract']
    SHOULD = BooleanClause.Occur.SHOULD
    query = MultiFieldQueryParser.parse(text_query,
                                        fieldnames,
                                        [SHOULD, SHOULD],
                                        StandardAnalyzer())

    # Ask for at most 10 hits.
    hits = index_searcher.search(query, 10).scoreDocs

    for hit in hits:
      hitDoc = index_searcher.doc(hit.doc)
      print(hitDoc.get("title"))
      print(hitDoc.get("abstract"))
      # "publish" is the stored copy of the date string written at index time.
      print(hitDoc.get("publish"))
      print()
  finally:
    dir_reader.close()
finally:
  directory.close()


# index_searcher.close()




Nitric oxide: a pro-inflammatory mediator in lung disease?
Inflammatory diseases of the respiratory tract are commonly associated with elevated production of nitric oxide (NO•) and increased indices of NO• -dependent oxidative stress. Although NO• is known to have anti-microbial, anti-inflammatory and anti-oxidant properties, various lines of evidence support the contribution of NO• to lung injury in several disease models. On the basis of biochemical evidence, it is often presumed that such NO• -dependent oxidations are due to the formation of the oxidant peroxynitrite, although alternative mechanisms involving the phagocyte-derived heme proteins myeloperoxidase and eosinophil peroxidase might be operative during conditions of inflammation. Because of the overwhelming literature on NO• generation and activities in the respiratory tract, it would be beyond the scope of this commentary to review this area comprehensively. Instead, it focuses on recent evidence and concepts of the presum

In [111]:
from org.apache.lucene.search import IndexSearcher, BooleanClause, BooleanQuery
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.queryparser.classic import QueryParser, MultiFieldQueryParser

# Open the CORD-19 index written to ./test_cord and wrap it in a searcher.
directory = SimpleFSDirectory(Paths.get('./test_cord'))
dir_reader = DirectoryReader.open(directory)
index_searcher = IndexSearcher(dir_reader)

def time_to_millis(date):
  """Convert a 'YYYY-MM-DD' string to integer milliseconds since the Unix epoch.

  The parsed datetime is naive, so ``timestamp()`` interprets it in the
  machine's local timezone and the result differs between timezones.

  Raises:
    ValueError: if ``date`` does not match the '%Y-%m-%d' format.
  """
  parsed = datetime.strptime(date, '%Y-%m-%d')
  return int(parsed.timestamp() * 1000)

try:
  text_query = "respiratory"
  date_query1 = "2000-08-15"
  date_query2 = "2000-08-25"

  # Dates are indexed as YYYYMMDD longs, so the bounds go through date2long too.
  rangeQuery = LongPoint.newRangeQuery("publish_time", date2long(date_query1), date2long(date_query2))

  # Every term in the query string names its field explicitly.
  query_parser = QueryParser("default", StandardAnalyzer())
  special_query = f"title:{text_query} OR abstract:{text_query}"
  searchQuery = query_parser.parse(special_query)

  # Both clauses are MUST: a hit has to fall in the date range and match the text.
  booleanQuery = BooleanQuery.Builder().add(rangeQuery, BooleanClause.Occur.MUST).add(searchQuery, BooleanClause.Occur.MUST).build()

  # Ask for at most 10 hits.
  hits = index_searcher.search(booleanQuery, 10).scoreDocs

  for hit in hits:
    hitDoc = index_searcher.doc(hit.doc)
    print(hitDoc.get("title"))
    print(hitDoc.get("abstract"))
    # The date string is stored under "publish"; "publish_time" is only a
    # LongPoint, which is not stored, so reading it from a hit yields None.
    print(hitDoc.get("publish"))
    print()
finally:
  # Release the reader and then the directory it was opened on.
  try:
    dir_reader.close()
  finally:
    directory.close()


# index_searcher.close()


Nitric oxide: a pro-inflammatory mediator in lung disease?
Inflammatory diseases of the respiratory tract are commonly associated with elevated production of nitric oxide (NO•) and increased indices of NO• -dependent oxidative stress. Although NO• is known to have anti-microbial, anti-inflammatory and anti-oxidant properties, various lines of evidence support the contribution of NO• to lung injury in several disease models. On the basis of biochemical evidence, it is often presumed that such NO• -dependent oxidations are due to the formation of the oxidant peroxynitrite, although alternative mechanisms involving the phagocyte-derived heme proteins myeloperoxidase and eosinophil peroxidase might be operative during conditions of inflammation. Because of the overwhelming literature on NO• generation and activities in the respiratory tract, it would be beyond the scope of this commentary to review this area comprehensively. Instead, it focuses on recent evidence and concepts of the presum

In [10]:
# Column name -> Lucene field type; the LongPoint class marks the date column.
fields = {
  'cord_uid': StringField.TYPE_NOT_STORED,
  'source_x': TextField.TYPE_STORED,
  'title': TextField.TYPE_STORED,
  'abstract': TextField.TYPE_STORED,
  'publish_time': LongPoint,
  'journal': TextField.TYPE_STORED,
  'authors': TextField.TYPE_STORED,
  'url': TextField.TYPE_STORED
}

# Print every field type, in column order, to inspect its index options.
for field_type in fields.values():
  print(field_type)

indexed,omitNorms,indexOptions=DOCS
stored,indexed,tokenized
stored,indexed,tokenized
stored,indexed,tokenized
<class 'org.apache.lucene.document.LongPoint'>
stored,indexed,tokenized
stored,indexed,tokenized
stored,indexed,tokenized


In [71]:
# Check the epoch-millisecond conversion on a literal date so the cell does not
# depend on a variable left over in the kernel (result varies with the local timezone).
time_to_millis("2001-07-04")

994222800000

In [55]:
# Build a LongPoint from an epoch-millisecond value computed in this cell,
# rather than from a variable left over in the kernel.
LongPoint("key", time_to_millis("2001-07-04"))

<LongPoint: LongPoint <key:994222800000>>

In [77]:
# Inline check of the YYYYMMDD integer encoding that date2long implements.
int(datetime.strptime("2010-01-02", '%Y-%m-%d').strftime('%Y%m%d'))

20100102

In [94]:
# date2long applied to date_query1, the lower bound set in the range-search cell.
date2long(date_query1)

20000815