In [1]:
import json
from evaluate import load
import pandas as pd
from tqdm.auto import tqdm
import pickle

In [2]:
base_path = "."
topics = "trec-covid"
expand_doc = "generated_expansion.pkl"

In [3]:
# Mudar o formato de jsonl para tsv
with open(f'{base_path}/data/queries.tsv','w') as output:
    with open(f'{base_path}/data/queries.jsonl', 'r') as file:
        for line in file:
            data = json.loads(line)
            id = data['_id']
            text = data['text']
            output.write(f'{id}\t{text}\n')

In [4]:
!python -m pyserini.search.lucene \
  --index ~/.cache/pyserini/indexes/lucene-index.beir-v1.0.0-trec-covid.flat.20221116.505594 \
  --topics {base_path}/data/queries.tsv \
  --output {base_path}/runs/run.{topics}.bm25tuned.txt \
  --hits 1000 \
  --bm25 --k1 0.82 --b 0.68

Setting BM25 parameters: k1=0.82, b=0.68
Running ./data/queries.tsv topics, saving to ./runs/run.trec-covid.bm25tuned.txt...
100%|███████████████████████████████████████████| 50/50 [00:13<00:00,  3.58it/s]


In [5]:
qrel = pd.read_csv(f"{base_path}/data/test.tsv", sep="\t", header=None, 
                   skiprows=1, names=["query", "docid", "rel"])
qrel["q0"] = "q0"
qrel = qrel.to_dict(orient="list")

In [6]:
run = pd.read_csv(f"{base_path}/runs/run.{topics}.bm25tuned.txt", sep="\s+", 
                  names=["query", "q0", "docid", "rank", "score", "system"])
run = run.to_dict(orient="list")

In [7]:
trec_eval = load("trec_eval")
results = trec_eval.compute(predictions=[run], references=[qrel])

In [8]:
results['NDCG@10']

0.5963435398557583

In [9]:
with open(expand_doc, "rb") as file:
    exp_df = pd.DataFrame(pickle.load(file))

In [10]:
exp_df.head()

Unnamed: 0,doc_indexes,doc_expansion
0,0,what were some of the common symptoms of pneum...
1,1,NO• and lung inflammation what is NO• in respi...
2,2,does sp-d interact with proinflammatory molecu...
3,3,what is the biology of et-1 endothelin-1 role ...
4,4,what does respiratory syncytial virus do respi...


In [11]:
trec_covid_corpus_df = pd.read_csv(f"{base_path}/data/corpus.tsv", sep='\t', names=["id", "text"])
trec_covid_corpus_df = trec_covid_corpus_df.dropna()

In [12]:
exp_df.set_index(exp_df['doc_indexes'])

Unnamed: 0_level_0,doc_indexes,doc_expansion
doc_indexes,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0,what were some of the common symptoms of pneum...
1,1,NO• and lung inflammation what is NO• in respi...
2,2,does sp-d interact with proinflammatory molecu...
3,3,what is the biology of et-1 endothelin-1 role ...
4,4,what does respiratory syncytial virus do respi...
...,...,...
171325,171325,what factors affect health in the global popul...
171326,171326,how does a small business affect its employees...
171328,171328,what is the functional function of the imm in ...
171329,171329,what is the sarse antibody where does phenylal...


In [13]:
trec_covid_expanded_df = trec_covid_corpus_df.merge(exp_df.set_index(exp_df['doc_indexes']), left_index=True, right_index=True, how='left')

In [14]:
trec_covid_expanded_df['text'] = trec_covid_expanded_df['text'] + trec_covid_expanded_df['doc_expansion'].fillna("")

In [15]:
trec_covid_expanded_df.rename(columns={'id': '_id'}, inplace=True)

In [16]:
trec_covid_expanded_df.head()

Unnamed: 0,_id,text,doc_indexes,doc_expansion
0,ug7v899j,OBJECTIVE: This retrospective chart review des...,0.0,what were some of the common symptoms of pneum...
1,02tnwd4m,Inflammatory diseases of the respiratory tract...,1.0,NO• and lung inflammation what is NO• in respi...
2,ejv2xln0,Surfactant protein-D (SP-D) participates in th...,2.0,does sp-d interact with proinflammatory molecu...
3,2b73a28n,Endothelin-1 (ET-1) is a 21 amino acid peptide...,3.0,what is the biology of et-1 endothelin-1 role ...
4,9785vg6d,Respiratory syncytial virus (RSV) and pneumoni...,4.0,what does respiratory syncytial virus do respi...


In [17]:
trec_covid_expanded_df[['_id', 'text']].dropna().to_csv(f"{base_path}/data/trec_covid_expanded.tsv", sep="\t", index=False, header=False)

### Indexing

In [18]:
PYSERINI_TOOLS_FOLDER = "/home/manoel/Documents/Doutorado/P_IA368DD_2023S1/Exercicio1/tools/anserini-tools-master/scripts/msmarco/"
TREC_COVID_EXPANDED_FILENAME = f"{base_path}/data/trec_covid_expanded.tsv"
TREC_COVID_EXPANDED_FOLDER = f"{base_path}/data/trec_covid_expanded"

In [19]:
!python {PYSERINI_TOOLS_FOLDER}convert_collection_to_jsonl.py \
    --collection-path {TREC_COVID_EXPANDED_FILENAME} \
    --output-folder {TREC_COVID_EXPANDED_FOLDER}

Converting collection...
Converted 0 docs, writing into file 1
Converted 100,000 docs, writing into file 1
Done!


In [20]:
!python -m pyserini.index.lucene \
  --collection JsonCollection \
  --input {base_path}/data/trec_covid_expanded \
  --index indexes/extended_trec_covid \
  --generator DefaultLuceneDocumentGenerator \
  --threads 9 \
  --storePositions --storeDocvectors --storeRaw

2023-04-11 14:18:34,867 INFO  [main] index.IndexCollection (IndexCollection.java:380) - Setting log level to INFO
2023-04-11 14:18:34,869 INFO  [main] index.IndexCollection (IndexCollection.java:383) - Starting indexer...
2023-04-11 14:18:34,869 INFO  [main] index.IndexCollection (IndexCollection.java:385) - DocumentCollection path: ./data/trec_covid_expanded
2023-04-11 14:18:34,870 INFO  [main] index.IndexCollection (IndexCollection.java:386) - CollectionClass: JsonCollection
2023-04-11 14:18:34,870 INFO  [main] index.IndexCollection (IndexCollection.java:387) - Generator: DefaultLuceneDocumentGenerator
2023-04-11 14:18:34,870 INFO  [main] index.IndexCollection (IndexCollection.java:388) - Threads: 9
2023-04-11 14:18:34,870 INFO  [main] index.IndexCollection (IndexCollection.java:389) - Language: en
2023-04-11 14:18:34,871 INFO  [main] index.IndexCollection (IndexCollection.java:390) - Stemmer: porter
2023-04-11 14:18:34,871 INFO  [main] index.IndexCollection (IndexCollection.java:391

In [21]:
!python -m pyserini.search.lucene \
  --index indexes/extended_trec_covid \
  --topics {base_path}/data/queries.tsv \
  --output {base_path}/runs/run.{topics}.bm25tuned-expanded.txt \
  --hits 1000 \
  --bm25 --k1 0.82 --b 0.68

Setting BM25 parameters: k1=0.82, b=0.68
Running ./data/queries.tsv topics, saving to ./runs/run.trec-covid.bm25tuned-expanded.txt...
100%|███████████████████████████████████████████| 50/50 [00:03<00:00, 15.02it/s]


In [22]:
run = pd.read_csv(f"{base_path}/runs/run.{topics}.bm25tuned-expanded.txt", sep="\s+", 
                  names=["query", "q0", "docid", "rank", "score", "system"])
run = run.to_dict(orient="list")

In [23]:
trec_eval = load("trec_eval")
results = trec_eval.compute(predictions=[run], references=[qrel])

In [24]:
results['NDCG@10']

0.6447870307110422

In [26]:
print(round(0.5963435398557583, 4))

0.5963
