In [None]:
from trectools import TrecQrel, TrecRun, TrecEval

# A typical evaluation workflow: load a run, load the relevance judgements,
# then score the run against those judgements.
r1 = TrecRun("./robust03/runs/input.aplrob03a.gz")
# A bare expression in the middle of a cell is never displayed, so print it explicitly.
print(r1.topics()[:5])  # Shows the first 5 topics: 601, 602, 603, 604, 605

qrels = TrecQrel("./robust03/qrel/robust03_qrels.txt")

te = TrecEval(r1, qrels)
rbp, residuals = te.getRBP()           # RBP: 0.474, Residuals: 0.001
p100 = te.getPrecision(depth=100)     # P@100: 0.186

# Three decimals throughout: with %.2f a residual of 0.001 would be printed as 0.00.
print("RBP: %.3f, Residuals: %.3f, P@100: %.3f" % (rbp, residuals, p100))

In [None]:
# Judgement coverage: how many of the documents retrieved by the system
# have a relevance judgement in the qrels, averaged over topics.
cover10 = r1.get_mean_coverage(qrels, topX=10)      # 9.99
cover1000 = r1.get_mean_coverage(qrels, topX=1000)  # 481.390
# I.e. for system 'input.aplrob03a' participating in robust03, about 481 of the
# top 1000 documents were judged on average.

print(
    "Average number of documents judged among top 10: %.2f, among top 1000: %.2f"
    % (cover10, cover1000)
)


In [None]:
# Load a second run so the two systems can be compared.
r2 = TrecRun("./robust03/runs/input.UIUC03Rd1.gz")

# Overlap between the runs: on average, how many of the top 10 documents of r1
# also appear in the top 10 of r2.
retrievedBYBothRuns = r1.check_run_coverage(r2, topX=10)  # 3.64

print(
    "Average number of documents retrieved by Run1 and Run2: %.3f"
    % retrievedBYBothRuns
)

In [None]:
# Evaluates r1 and r2 using all implemented evaluation metrics, keeping per-query scores
result_r1 = r1.evaluate_run(qrels, per_query=True)
result_r2 = r2.evaluate_run(qrels, per_query=True)

# Inspect for statistically significant differences between the two runs for P_10 using two-tailed Student t-test.
# NOTE(review): `pvalue` is an indexable result, not a bare float; element [1] is what is
# reported as the p-value below (presumably the result is (statistic, p-value) — confirm).
pvalue = result_r1.compare_with(result_r2, metric="P_10")  # pvalue[1]: 0.0167

print("P-value for P@10 between r1 and r2: %.3f" % pvalue[1])