# Evaluation with pytrec_eval
Evaluation and output runs in TREC format

In [19]:
import pytrec_eval
import pandas as pd
import os
import json
import pyterrier as pt
if not pt.started():
    pt.init(boot_packages=["com.github.terrierteam:terrier-prf:-SNAPSHOT"])  # Initialisation package for RM3


# Loading indexes
indexref_np = pt.IndexRef.of('./indexes_p/iterindex_noprocess/data.properties')
index_np = pt.IndexFactory.of(indexref_np)
indexref1 = pt.IndexRef.of('./indexes_p/iterindex/data.properties')
index1 = pt.IndexFactory.of(indexref1)
indexref2 = pt.IndexRef.of('./indexes_p/iterindex_opt/data.properties')
index2 = pt.IndexFactory.of(indexref2)

# Load qrels and queries
topics_df = pd.read_csv("NIR2022 dataset/train_query.csv", dtype=str)
print(topics_df.shape)
topics_df
qrels_df = pd.read_csv("NIR2022 dataset/train_qrel.csv", dtype=str)
qrels_df.label = qrels_df.label.astype('int')
print(qrels_df.shape)
qrels_df.head()

(200, 2)
(247569, 4)


Unnamed: 0,qid,docno,label,iteration
0,302,FBIS3-10615,0,0
1,302,FBIS3-10855,0,0
2,302,FBIS3-11418,0,0
3,302,FBIS3-14832,0,0
4,302,FBIS3-20548,1,0


In [4]:
# Load qrels in a dictionary
qrels_dict = dict()
for _, r in qrels_df.iterrows():
    qid, docno, label, iteration = r
    if qid not in qrels_dict:
        qrels_dict[qid] = dict()
    qrels_dict[qid][docno] = int(label)

# Build evaluator based on the qrels and metrics
metrics = {"map", "ndcg_cut_5", "ndcg_cut_10", "ndcg_cut_20", "P_5", "P_10", "P_20"}
my_qrel = {q: d for q, d in qrels_dict.items()}
evaluator = pytrec_eval.RelevanceEvaluator(my_qrel, metrics)

In [5]:
# Load run
# MODELavg = "cross_avg1"
# MODELsum = "cross_sum1"
MODELmax = "cross_max1"

def parse(MODEL):
    PATH_RUN = f"outputs/{MODEL}.run"   
    with open(PATH_RUN, 'r') as f_run:
        cross_run = pytrec_eval.parse_run(f_run)
        return cross_run

# cross_run_avg = parse(MODELavg)
# cross_run_sum = parse(MODELsum)
cross_run_max = parse(MODELmax)

In [6]:
# Evaluate model
def eva(cross_run):    
    cross_evals = evaluator.evaluate(cross_run)

    # Compute performance in different metrics for each query
    cross_metric2vals = {m: [] for m in metrics}
    for q, d in cross_evals.items():
        for m, val in d.items():
            cross_metric2vals[m].append(val)

    # Average results by query
    cross_metric2avg = dict()
    for m in metrics:
        val = pytrec_eval.compute_aggregated_measure(m, cross_metric2vals[m])
        cross_metric2avg[m] = val
        print(m, '\t', val)
    return cross_metric2avg
# cross_metric2avg_avg = eva(cross_run_avg)
# cross_metric2avg_sum = eva(cross_run_sum)
cross_metric2avg_max = eva(cross_run_max)

P_5 	 0.5266331658291458
ndcg_cut_20 	 0.4782812244217364
ndcg_cut_10 	 0.49689150400582993
P_20 	 0.4155778894472361
P_10 	 0.4934673366834171
ndcg_cut_5 	 0.49941187065785936
map 	 0.31450085372827585


In [7]:
b1 = pt.BatchRetrieve(index1, wmodel='BM25', controls={'c':0.3,'bm25.k_1':0.8})
b0 = pt.BatchRetrieve(index_np, wmodel='BM25', controls={'c':0.3,'bm25.k_1':0.8})
b2 = pt.BatchRetrieve(index2, wmodel='BM25', controls={'c':0.3,'bm25.k_1':0.8})
dlm = pt.BatchRetrieve(index2, wmodel="DirichletLM", controls={'c' : 275})
rm3 = pt.rewrite.RM3(index2, fb_lambda=0.6)
rm3_pipe = b2 >> rm3 >> b2

0         0
1         0
2         0
3         0
4         1
         ..
247564    0
247565    0
247566    0
247567    0
247568    0
Name: label, Length: 247569, dtype: int64

In [20]:
# Compare system performance
experiment =  pt.Experiment([b0,b1,b2,rm3_pipe,dlm], topics_df, qrels_df, ["map", "ndcg_cut_5", "ndcg_cut_10", "ndcg_cut_20", "P_5", "P_10", "P_20"])



cross_metric2avg_max['name'] = 'BM25 >> Cross-Encoder'
experiment.append(cross_metric2avg_max, ignore_index=True)

  experiment.append(cross_metric2avg_max, ignore_index=True)


Unnamed: 0,name,map,ndcg_cut_5,ndcg_cut_10,ndcg_cut_20,P_5,P_10,P_20
0,BR(BM25),0.231402,0.448493,0.423982,0.40391,0.473367,0.409548,0.342211
1,BR(BM25),0.260289,0.466882,0.447841,0.422747,0.492462,0.437186,0.357538
2,BR(BM25),0.261404,0.473442,0.449995,0.424825,0.500503,0.438191,0.359045
3,"Compose(Compose(BR(BM25), QueryExpansion(/User...",0.294065,0.470086,0.460263,0.44653,0.511558,0.457789,0.38995
4,BR(DirichletLM),0.239717,0.446968,0.423664,0.402892,0.460302,0.407538,0.339196
5,BM25 >> Cross-Encoder,0.314501,0.499412,0.496892,0.478281,0.526633,0.493467,0.415578


### Output runs

In [None]:
!mkdir ./outputs_test

In [None]:
test_topics_df = pd.read_csv("NIR2022 dataset/test_query.csv", dtype=str)
print(test_topics_df.shape)
test_topics_df.head()

In [None]:
bm25_idx0 = pt.BatchRetrieve(index_np, wmodel='BM25', controls={'c':0.3,'bm25.k_1':0.8,'bm25.k_3':0.5})
bm25_idx1 = pt.BatchRetrieve(index1, wmodel='BM25', controls={'c':0.3,'bm25.k_1':0.8,'bm25.k_3':0.5})
DLM_idx2 = pt.BatchRetrieve(index2, wmodel='DirichletLM', controls={'c': 275})

In [None]:
w2v_model = api.load('word2vec-google-news-300')

k=2

test_topics_qe_df = test_topics_df.copy()
for i in range(len(test_topics_qe_df)):
    q = test_topics_qe_df.iloc[i]['query']
    qe = []
    for word in q.split(' '):
        # OOV
        try:
            expanded_words = [pair[0] for pair in w2v_model.most_similar(word, topn=k) if pair[0].isalnum()]
        except:
            expanded_words = []
        expanded_words.append(word)
        qe.append(expanded_words)
    topics_qe_df.iloc[i]['query'] = gensim.parsing.preprocessing.remove_stopwords(" ".join([e for l in qe for e in l]))
    
DPH = pt.BatchRetrieve(index2, wmodel="DPH")

def w2v_rewrite(topics):
    return test_topics_qe_df

pipeline_w2v = pt.apply.generic(w2v_rewrite) >> DPH

pipeline_weight_w2v = 1.0*DPH + 0.2*pipeline_w2v
# bm25 model with optimized parameters
bm25 = pt.BatchRetrieve(index2, wmodel='BM25', controls={'c':0.3,'bm25.k_1':0.8,'bm25.k_3':0.5})


bo1 = pt.rewrite.Bo1QueryExpansion(index2)
bm25_bo1 = bm25 >> bo1 >> bm25
dph_bo1 = DPH >> bo1 >> DPH

In [None]:
# change model name and model
MODEL_NAME = 'MODEL_NAME'
MODEL = rm3_pipe
model_run = []
for _, row in test_topics_df.iterrows():
    qid, query = row
    res_df = MODEL.search(query)
    for _, res_row in res_df.iterrows():
        _, docid, docno, rank, score, query= res_row
        row_str = f"{qid} Q0 {docno} {rank+1} {score} {MODEL_NAME}"
        model_run.append(row_str)
with open(f"outputs_test/{MODEL_NAME}.txt", "w") as f:
    for l in model_run:
        f.write(l + "\n")

### Analysis
Generating results for analysis

In [21]:
b1 = pt.BatchRetrieve(index1, wmodel='BM25', controls={'c':0.3,'bm25.k_1':0.8})
b0 = pt.BatchRetrieve(index_np, wmodel='BM25', controls={'c':0.3,'bm25.k_1':0.8})
b2 = pt.BatchRetrieve(index2, wmodel='BM25', controls={'c':0.3,'bm25.k_1':0.8})
rm3 = pt.rewrite.RM3(index2)
rm3_pipe = b2 >> rm3 >> b2
bo1 = pt.rewrite.Bo1QueryExpansion(index2)
bm25_bo1 = b2 >> bo1 >> b2

In [80]:
# Query specific evaluation
exp = pt.Experiment([b0,b2,rm3_pipe,bm25_bo1],
 topics_df, 
 qrels_df, 
 ["map","ndcg_cut_10"],
 names=['b0','b2','rm3','bo1'],
 perquery=True)

  warn(f'{backfill_count} topic(s) not found in qrels. Scores for these topics are given as NaN and should not contribute to averages.')
  warn(f'{backfill_count} topic(s) not found in qrels. Scores for these topics are given as NaN and should not contribute to averages.')
  warn(f'{backfill_count} topic(s) not found in qrels. Scores for these topics are given as NaN and should not contribute to averages.')
  warn(f'{backfill_count} topic(s) not found in qrels. Scores for these topics are given as NaN and should not contribute to averages.')


In [60]:
exp

Unnamed: 0,name,qid,measure,value
0,b0,301,map,0.059936
1,b0,301,ndcg_cut_5,0.491260
2,b0,302,map,0.615075
3,b0,302,ndcg_cut_5,0.830420
4,b0,303,map,0.247897
...,...,...,...,...
1193,rm3,697,ndcg_cut_5,0.000000
1194,rm3,698,map,0.470321
1195,rm3,698,ndcg_cut_5,0.626181
1196,rm3,700,map,0.133874


In [82]:
map = exp[exp['measure']=='map'].copy()
b0map = map[map['name']=='b0'].copy()
b2map = map[map['name']=='b2'].copy()
rm3map = map[map['name']=='rm3'].copy()
bo1map = map[map['name']=='bo1'].copy()

In [81]:
ndcg5 = exp[exp['measure']=='ndcg_cut_10'].copy()
b2ndcg = ndcg5[ndcg5['name']=='b2'].copy()

In [24]:
rm3_hard = set(rm3map.sort_values(by='value').iloc[:10].qid)
bo1_hard = set(bo1map.sort_values(by='value').iloc[:10].qid)
b2_hard = set(b2map.sort_values(by='value').iloc[:10].qid)

In [None]:
hard = rm3_hard.intersection(b2_hard).intersection(bo1_hard)
hard_query = topics_df[topics_df['qid'].isin(hard)]

In [None]:
x_ = evaluator.evaluate(cross_run_max)

ds={'qid':[],'map':[]}
for id,m in x_.items():
    ds['qid'].append(id)
    ds['map'].append(m['map'])
x_map = pd.DataFrame(ds)

In [None]:
# Venn diagram

import matplotlib.pyplot as plt
from matplotlib_venn import venn2,venn2_circles,venn3
import matplotlib.pyplot as plt

k=10
x_hard = set(x_map.sort_values(by='map').iloc[:k].qid)
rm3_hard = set(rm3map.sort_values(by='value').iloc[:k].qid)
bo1_hard = set(bo1map.sort_values(by='value').iloc[:k].qid)
b2_hard = set(b2map.sort_values(by='value').iloc[:k].qid)

my_dpi=150
plt.figure(figsize=(600/my_dpi, 600/my_dpi), dpi=my_dpi)
g=venn3(subsets = [rm3_hard, b2_hard, x_hard], 
        set_labels = ('RM3', 'BM25-NR','CE'), 
        set_colors=("#01a2d9", "#31A354", "#c72e29"),
        alpha=0.8,
        normalize_to=1.0,
       )
plt.show()

In [None]:
# sort query by map
sort_x_map = x_map.sort_values(by='map',ascending=False)
sort_rm3map = rm3map.sort_values(by='value',ascending=False)
sort_rm3map=sort_rm3map.dropna()
sort_bo1map = bo1map.sort_values(by='value',ascending=False)
sort_bo1map=sort_bo1map.dropna()
sort_b2_map = b2map.sort_values(by='value',ascending=False)

In [None]:
# plot
linx = np.linspace(1,199,num=199)
plt.plot(linx, sort_x_map.map, label='CE')
plt.plot(linx, sort_rm3map.value, label='RM3')
plt.plot(linx, sort_bo1map.value, label='Bo1')
fig = plt.gcf()
plt.legend()
fig.set_size_inches(5, 5)
plt.xlabel('Query Difficulty for Model')
plt.ylabel('MAP')
plt.savefig('1.png',dpi=250)

In [None]:
k=10
x_easy = set(sort_x_map.iloc[:k].qid)
rm3_easy = set(sort_rm3map.iloc[:k].qid)
bo1_easy = set(sort_bo1map.iloc[:k].qid)

easy=x_easy.intersection(rm3_easy).intersection(bo1_easy)

easy_query = topics_df[topics_df['qid'].isin(easy)]
easy_query

In [84]:
b2map = b2map.dropna(subset=['value'])
b2ndcg = b2ndcg.dropna(subset=['value'])

In [85]:
list(b2map['qid']) == list(b2ndcg['qid'])

True

Analysis was done based on the hard and easy queries obtained above.

In [86]:

ap = []
ndcg = []
nrels = []
nass = []
nas_rates = []
for _,i in b2map.iterrows():
    ap.append(i['value'])
    nas = len(qrels_df[qrels_df['qid']==i['qid']])
    nrel = len(qrels_df[(qrels_df['qid']==i['qid']) & (qrels_df['label']>0)])
    nas_rate = nrel / nas
    nrels.append(nrel)
    nass.append(nas)
    nas_rates.append(nas_rate)

for _,i in b2ndcg.iterrows():
    ndcg.append(i['value'])

In [97]:
from scipy.stats import pearsonr
pearsonr(ndcg, nas_rates)

(0.07336708430516711, 0.30308571023460146)