# Solr Baseline

The easiest way to run ad hoc Solr is to use Docker. Make sure you have Docker installed and do following:

```
docker pull solr
docker run -d -p 8983:8983 -t solr
docker exec -it solr /bin/bash
bin/solr create -c cord_19_2020_06_19_abstract

```

Unlike Anserini where we use pre-built index. Here we have to index it by ourselves. Make sure you already downloaded the CORD-19 dataset from [here](https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/historical_releases.html), use 2020-06-19 version.



In [1]:
# Edit this variable to your path to CORD-19

PATH_TO_CORD_19_DATA = "../../CORD-19/2020-06-19/"

import pysolr
import json
from xml.etree import ElementTree
import os
import csv

In [3]:
solr = pysolr.Solr('http://0.0.0.0:8983/solr/cord_19_2020_06_19_abstract' , timeout=10, always_commit=True )

# test connection to Solr
print(solr.ping())

{
  "responseHeader":{
    "zkConnected":null,
    "status":0,
    "QTime":0,
    "params":{
      "q":"{!lucene}*:*",
      "distrib":"false",
      "df":"_text_",
      "rows":"10",
      "echoParams":"all"}},
  "status":"OK"}



In [12]:
# get abstract from dataset

def get_abstract(path):
    abstracts = dict()
    with open(path + "metadata.csv") as f_in:
        reader = csv.DictReader(f_in)
        for row in reader:
            cord_uid = row['cord_uid']
            abstract = row['abstract']
            abstracts[cord_uid] = abstract
            
    return abstracts


abstracts_dict = get_abstract(PATH_TO_CORD_19_DATA)

In [18]:
# build index

def index_solr(data_dict, solr):

    solr_payloads = list()
    for uid, data in data_dict.items():
        solr_payload = {
            "id": uid,
            "text": data,
        }
            
        solr_payloads.append(solr_payload)
        if len(solr_payloads) == 1000:
            solr.add(solr_payloads)
            solr_payloads = list()


index_solr(abstracts_dict, solr)

In [14]:
# get query list

def read_topics(path_to_topic):
    tree = ElementTree.parse(path_to_topic)
    topics = list()
    for topic in tree.getroot():
        d = dict()
        d["number"] = topic.attrib["number"]
        for field in topic:
            d[field.tag] = field.text
        topics.append(d)

    return topics

topics = read_topics("../tmp/topics-rnd4.xml")

In [22]:
# test query

query = "text:" + topics[0]["query"]
print("query", query)

solr_query_param = {
    "fl": "id,score",
    "rows": 10
}
results = solr.search(query, **solr_query_param).docs
print("results", "\n".join(str(e) for e in results))

query text:coronavirus origin
results {'id': '5tb29n9s', 'score': 1.3778645}
{'id': 'wbtaoo0o', 'score': 1.3755488}
{'id': 'lajzpk2c', 'score': 1.3659077}
{'id': 'e53w0ext', 'score': 1.3659077}
{'id': 'hix57xwa', 'score': 1.3620061}
{'id': 'r2w5csll', 'score': 1.3617227}
{'id': 'azir1gvm', 'score': 1.3606712}
{'id': 'x6bryq9d', 'score': 1.3510695}
{'id': 'uc0vp5pr', 'score': 1.3485974}
{'id': 'xhf8yg6o', 'score': 1.3483565}


# Run Solr on abstract

In [29]:
lines = list()
template = "{} Q0 {} {} {} solr_baseline_abstract"

for topic_id, query in enumerate(topics):
    solr_query = "text:" + query["query"]
    
    solr_query_param = {
        "fl": "id,score",
        "rows": 1000
    }
    
    results = solr.search(solr_query, **solr_query_param).docs
    
    for j, hit in enumerate(results):
        lines.append(template.format(topic_id + 1, hit["id"], j +1, hit["score"]))
        

In [34]:
# see example
print(lines[0])

1 Q0 5tb29n9s 1 1.3778645 solr_baseline_abstract


In [35]:
with open("../tmp/baseline.txt", 'w') as f:
    f.write("\n".join(lines))

# Evalutation

Prerequisite: Have [trec_eval](https://github.com/usnistgov/trec_eval) installed. Alternatively, use [Python](https://github.com/cvangysel/pytrec_eval) version.

Install original trec_eval:
1. Go to parent directory of this repo
2. git clone https://github.com/usnistgov/trec_eval.git
3. make


In [32]:
# If follow above instruction, no need to change
PATH_TO_TREC = "../../trec_eval/trec_eval"


In [41]:

os.system(PATH_TO_TREC + " -c -m all_trec ../tmp/qrels-covid_d4_j0.5-4.txt  ../tmp/baseline.txt > ../tmp/out.txt")

with open("../tmp/out.txt", "r") as f:
    print(f.read())

runid                 	all	solr_baseline_abstract
num_q                 	all	45
num_ret               	all	43448
num_rel               	all	15765
num_rel_ret           	all	980
map                   	all	0.0192
gm_map                	all	0.0005
Rprec                 	all	0.0417
bpref                 	all	0.0685
recip_rank            	all	0.1259
iprec_at_recall_0.00  	all	0.1494
iprec_at_recall_0.10  	all	0.0535
iprec_at_recall_0.20  	all	0.0444
iprec_at_recall_0.30  	all	0.0302
iprec_at_recall_0.40  	all	0.0227
iprec_at_recall_0.50  	all	0.0058
iprec_at_recall_0.60  	all	0.0000
iprec_at_recall_0.70  	all	0.0000
iprec_at_recall_0.80  	all	0.0000
iprec_at_recall_0.90  	all	0.0000
iprec_at_recall_1.00  	all	0.0000
P_5                   	all	0.0711
P_10                  	all	0.0578
P_15                  	all	0.0652
P_20                  	all	0.0711
P_30                  	all	0.0681
P_100                 	all	0.0529
P_200                 	all	0.0433
P_500                 	all	0.0296
P_1000 