# In this notebook we compare different runs from a real-world scenario

In [1]:
from ranx import Qrels, Run, compare

In [2]:
# Let's load the relevance judgments (qrels) that every run is evaluated against
qrels = Qrels.from_file("data/qrels.txt")

# ...and the five runs to compare, loaded in order from one file each
run_1, run_2, run_3, run_4, run_5 = [
    Run.from_file(run_path)
    for run_path in (
        "data/run_1.txt",
        "data/run_2.txt",
        "data/run_3.txt",
        "data/run_4.txt",
        "data/run_5.txt",
    )
]

Parsing Qrels: 100%|██████████| 5541/5541 [00:00<00:00, 404072.58it/s]
Parsing Run: 100%|██████████| 550500/550500 [00:00<00:00, 728397.16it/s]
Parsing Run: 100%|██████████| 550500/550500 [00:00<00:00, 853845.99it/s]
Parsing Run: 100%|██████████| 550500/550500 [00:00<00:00, 859440.54it/s]
Parsing Run: 100%|██████████| 550500/550500 [00:00<00:00, 854441.91it/s]
Parsing Run: 100%|██████████| 550500/550500 [00:00<00:00, 742908.83it/s]


In [3]:
# While parsing run files, the run names are copied into the `name` attribute of each Run.
# These names are used later on to label the rows of the comparison tables.
# You can rename a run just by assigning a new value to the attribute.
run_1.name

'model_1'

#### Hint: Custom names can be given to each run by simply changing the value of its `name` attribute

## Let's compare all of our runs!

In [4]:
# Every run loaded above takes part in the comparison
runs_to_compare = [run_1, run_2, run_3, run_4, run_5]

# Metrics to compute for each run (the `@k` suffix is the cutoff)
metric_names = ["map@100", "mrr@100", "ndcg@10"]

# Evaluate all runs against the qrels and collect the results in a single report
report = compare(
    qrels,
    runs=runs_to_compare,
    metrics=metric_names,
    max_p=0.01,  # P-value threshold
)

## We can now print the report of our comparison
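A superscript letter next to a score indicates a statistically significant difference with respect to the model labelled with that letter, according to Fisher's Randomization Test with p <= max_p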

In [5]:
# A bare expression on the last line of a cell displays the report as the table below
report

#    Model    MAP@100     MRR@100     NDCG@10
---  -------  ----------  ----------  ----------
a    model_1  0.3202ᵇ     0.3207ᵇ     0.3684ᵇᶜ
b    model_2  0.2332      0.2339      0.239
c    model_3  0.3082ᵇ     0.3089ᵇ     0.3295ᵇ
d    model_4  0.3664ᵃᵇᶜ   0.3668ᵃᵇᶜ   0.4078ᵃᵇᶜ
e    model_5  0.4053ᵃᵇᶜᵈ  0.4061ᵃᵇᶜᵈ  0.4512ᵃᵇᶜᵈ

In [6]:
# Raw (unrounded) scores can be accessed through `report.results`;
# the output below is keyed by run name and then by metric name.
dict(report.results)

{'model_1': {'map@100': 0.3202013385892648,
  'mrr@100': 0.3206643638092855,
  'ndcg@10': 0.368354920783379},
 'model_2': {'map@100': 0.2331730155338538,
  'mrr@100': 0.23393300857674673,
  'ndcg@10': 0.23899376288638746},
 'model_3': {'map@100': 0.30815542768136706,
  'mrr@100': 0.308894877341305,
  'ndcg@10': 0.32946881898869673},
 'model_4': {'map@100': 0.36636770946634367,
  'mrr@100': 0.36678226339878717,
  'ndcg@10': 0.40777140890090235},
 'model_5': {'map@100': 0.40529287119619145,
  'mrr@100': 0.40606597777040787,
  'ndcg@10': 0.451200401254887}}

## By inspecting the `win_tie_loss` attribute of the report object, we can get further insights into the models' performance

In [7]:
# Index `win_tie_loss` with a pair of run names; the output below holds W/T/L counts for each metric.
# NOTE(review): presumably W/T/L are per-query wins/ties/losses of the first model against the second — confirm in the ranx docs.
report.win_tie_loss["model_5", "model_4"]

{'map@100': {'W': 2213, 'T': 1412, 'L': 1880},
 'mrr@100': {'W': 2213, 'T': 1420, 'L': 1872},
 'ndcg@10': {'W': 1820, 'T': 2191, 'L': 1494}}

## Finally, we can export the report in LaTeX format, ready to be used in scientific publications.

In [8]:
# print() shows the generated LaTeX as plain, copy-pasteable text instead of an escaped string repr
# (assumes `to_latex()` returns a str — confirm)
print(report.to_latex())

% Add in preamble
\usepackage{graphicx}
\setlength{\tabcolsep}{6pt}
\begin{table*}[ht]
\centering
\caption{
Overall effectiveness of the models.
The best results are highlighted in boldface.
Superscripts denote significant differences in Fisher's randomization test with $p\le0.01$.
}
\resizebox{1.0\textwidth}{!}{
\begin{tabular}{c|l|l|l|l}
\toprule
\textbf{\#}
& \textbf{Model}
& \textbf{MAP@100}
& \textbf{MRR@100}
& \textbf{NDCG@10} \\ 
\midrule
a &
model\_1 &
0.3202$^{b}$ &
0.3207$^{b}$ &
0.3684$^{bc}$ \\
b &
model\_2 &
0.2332 &
0.2339 &
0.239 \\
c &
model\_3 &
0.3082$^{b}$ &
0.3089$^{b}$ &
0.3295$^{b}$ \\
d &
model\_4 &
0.3664$^{abc}$ &
0.3668$^{abc}$ &
0.4078$^{abc}$ \\
e &
model\_5 &
\textbf{0.4053}$^{abcd}$ &
\textbf{0.4061}$^{abcd}$ &
\textbf{0.4512}$^{abcd}$ \\
\bottomrule
\end{tabular}
}
\label{tab:results}
\end{table*}
