In [1]:
import psycopg
from dotenv import dotenv_values
from retrieval.utils import connect
from retrieval.prepare_cord19_indexing import get_documents, get_documents_modality_info
from retrieval.models import LuceneDocument
import pandas as pd
from dataclasses import asdict
from datetime import datetime

# Load key/value settings from the dotenv file one directory above the
# working directory; the resulting mapping is handed to the retrieval helpers.
# NOTE(review): presumably holds database connection settings — confirm.
env_file = '../.db.env'
config = dotenv_values(env_file)

In [4]:
def get_documents_to_index(config) -> list[LuceneDocument]:
  """Assemble the LuceneDocument records to be indexed.

  Rows come from ``get_documents(config)`` and are read positionally as
  (docId, source, title, abstract, pub_date, journal, authors, url, pmcid,
  num_figures). Modality labels come from
  ``get_documents_modality_info(config)`` as (docId, labels) pairs.

  Args:
    config: settings object forwarded unchanged to both retrieval helpers.

  Returns:
    One LuceneDocument per row, in row order. ``authors`` and ``modalities``
    are space-joined strings ("" when a document has none) and ``pub_date``
    is formatted as YYYY-MM-DD.
  """
  document_tuples = get_documents(config)
  modality_tuples = get_documents_modality_info(config)
  # docId -> modality labels, built once so every document can be completed
  # in the same pass that creates it.
  id_2_modalities = {row[0]: row[1] for row in modality_tuples}

  lucene_docs = []
  for document in document_tuples:
    (doc_id, source, title, abstract, pub_date,
     journal, authors, url, pmcid, num_figures) = document[:10]
    modalities = id_2_modalities.get(doc_id)
    lucene_docs.append(LuceneDocument(
      docId=doc_id,
      source=source,
      title=title,
      abstract=abstract,
      # Instance-method call accepts both date and datetime values.
      pub_date=pub_date.strftime("%Y-%m-%d"),
      journal=journal,
      authors=" ".join(authors) if authors else "",
      url=url,
      pmcid=pmcid,
      num_figures=num_figures,
      # Always a string, mirroring ``authors``: a document without modality
      # info gets "" instead of a list, so the field has a single type.
      modalities=" ".join(modalities) if modalities else "",
    ))
  return lucene_docs


In [5]:
# Build the full list of LuceneDocument records from the loaded settings.
documents_to_index = get_documents_to_index(config)

In [6]:
# Flatten the dataclass records into one row per document. The records are
# materialised as a list because json_normalize is documented for a dict or a
# list of dicts, and pandas releases before the GH35923 fix dropped the first
# element of generator input.
df = pd.json_normalize([asdict(obj) for obj in documents_to_index])

In [7]:
# Preview the first rows to sanity-check the flattened columns.
df.head()

Unnamed: 0,docId,source,title,abstract,pub_date,journal,authors,pmcid,num_figures,modalities,url
0,6569263,PMC,Technical Description of RODS: A Real-time Pub...,This report describes the design and implement...,2003-09-01,Journal of the American Medical Informatics As...,"Tsui, Fu-Chiang Espino, Jeremy U. Dato, Virgin...",PMC212776,6,oth oth oth oth oth oth oth,cord19-uic/PMC212776/main.pdf
1,6569266,PMC,A Method to Identify p62's UBA Domain Interact...,The UBA domain is a conserved sequence motif a...,2003-12-12,Biol Proced Online,"Pridgeon, Julia W. Geetha, Thangiah Wooten, Ma...",PMC302190,8,oth exp.gel gra exp.gel exp oth exp exp exp ex...,cord19-uic/PMC302190/main.pdf
2,6569271,PMC,Logistics of community smallpox control throug...,BACKGROUND: Previous smallpox ring vaccination...,2004-08-06,BMC Public Health,"Porco, Travis C Holbrook, Karen A Fernyak, Sus...",PMC520756,9,gra.sca gra.lin gra.lin gra.lin oth gra.lin gr...,cord19-uic/PMC520756/main.pdf
3,6569273,PMC,Bioinformatic mapping of AlkB homology domains...,BACKGROUND: AlkB-like proteins are members of ...,2005-01-03,BMC Genomics,"Bratlie, Marit S Drabløs, Finn",PMC544882,5,gra.sca gra.sca gra mol.dna mol gra mol.dna gr...,cord19-uic/PMC544882/main.pdf
4,6569276,PMC,Detection and characterization of horizontal t...,Horizontal DNA transfer is an important factor...,2005-01-13,Nucleic Acids Res,"Dufraigne, Christine Fertil, Bernard Lespinats...",PMC546175,4,mol gra.sca mol oth oth oth oth mol gra,cord19-uic/PMC546175/main.pdf


In [13]:
from index_writer import Indexer

LuceneDocument(docId=6569263, source='PMC', title='Technical Description of RODS: A Real-time Public Health Surveillance System', abstract='This report describes the design and implementation of the Real-time Outbreak and Disease Surveillance (RODS) system, a computer-based public health surveillance system for early detection of disease outbreaks. Hospitals send RODS data from clinical encounters over virtual private networks and leased lines using the Health Level 7 (HL7) message protocol. The data are sent in real time. RODS automatically classifies the registration chief complaint from the visit into one of seven syndrome categories using Bayesian classifiers. It stores the data in a relational database, aggregates the data for analysis using data warehousing techniques, applies univariate and multivariate statistical detection algorithms to the data, and alerts users of when the algorithms identify anomalous patterns in the syndrome counts. RODS also has a Web-based user interface