# Evaluation on the Dev Set

In [2]:
import os

import pyterrier as pt

# Initialise PyTerrier only if it has not been started yet, so re-running
# this cell in a live kernel does not call pt.init() a second time.
if not pt.started():
    pt.init()

# Dataset handle that supplies the topics and qrels for the experiments below.
pt_dataset = pt.get_dataset('irds:msmarco-passage/dev/small')

In [4]:
# Read the stored run file of each distillation training version (0-9) and
# turn it into a PyTerrier transformer that pt.Experiment can evaluate.
# The dictionary keys double as the system names in the results table.
runs = {
    f'tiny-distill-v{version}': pt.transformer.get_transformer(
        pt.io.read_results(f'lightning-ir/output-runs/version_{version}/run.txt')
    )
    for version in range(10)
}
    

In [6]:
# Evaluate every loaded run against the dataset's topics and qrels.
# The table is sorted ascending by recip_rank, so the best system is the last row.
pt.Experiment(
    [*runs.values()],
    pt_dataset.get_topics(),
    pt_dataset.get_qrels(),
    ["ndcg_cut.10", "recip_rank"],
    names=[*runs],
).sort_values(by='recip_rank')

Unnamed: 0,name,ndcg_cut.10,recip_rank
5,tiny-distill-v5,0.284908,0.243308
9,tiny-distill-v9,0.284838,0.243871
7,tiny-distill-v7,0.288263,0.24632
4,tiny-distill-v4,0.289347,0.247558
1,tiny-distill-v1,0.289272,0.247807
3,tiny-distill-v3,0.290621,0.249226
6,tiny-distill-v6,0.29955,0.256591
0,tiny-distill-v0,0.30037,0.257119
8,tiny-distill-v8,0.345675,0.300634
2,tiny-distill-v2,0.347036,0.30225


We select `tiny-distill-v2` because it achieves the highest nDCG@10 (0.347) and reciprocal rank (0.302) on the dev set.

## Evaluation for Plain TinyBERT

In [14]:
# Collect one run per model sub-directory of the plain TinyBERT output folders.
directories = ['output-runs-tiny-bert-plain', 'output-runs-without-injection']
runs = {}

for d in directories:
    # List the directory in pure Python (sorted, dot-entries skipped) instead of
    # shelling out to `ls`, so the cell does not depend on IPython shell magic.
    models = sorted(entry for entry in os.listdir(d) if not entry.startswith('.'))

    for model in models:
        # The '<directory>/<model>' key is also the system name in the results table.
        runs[f'{d}/{model}'] = pt.transformer.get_transformer(pt.io.read_results(f'{d}/{model}/run.txt'))


In [17]:
# Evaluate all collected runs on the dataset's topics and qrels; keep the
# result table in `df` so the following cell can pick the best row from it.
df = pt.Experiment(
    [*runs.values()],
    pt_dataset.get_topics(),
    pt_dataset.get_qrels(),
    ["ndcg_cut.10", "recip_rank"],
    names=[*runs],
)

# Ascending by recip_rank: the strongest run ends up in the last row.
df.sort_values(by='recip_rank')

Unnamed: 0,name,ndcg_cut.10,recip_rank
8,output-runs-tiny-bert-plain/cross-encoder-32-1...,0.269564,0.226535
2,output-runs-tiny-bert-plain/cross-encoder-32-1...,0.302782,0.255971
1,output-runs-tiny-bert-plain/cross-encoder-16-1...,0.309662,0.261662
11,output-runs-tiny-bert-plain/cross-encoder-32-5...,0.314039,0.269219
3,output-runs-tiny-bert-plain/cross-encoder-32-1...,0.314637,0.269465
5,output-runs-tiny-bert-plain/cross-encoder-32-1...,0.320254,0.273075
4,output-runs-tiny-bert-plain/cross-encoder-32-1...,0.321083,0.274048
10,output-runs-tiny-bert-plain/cross-encoder-32-2...,0.321365,0.274258
7,output-runs-tiny-bert-plain/cross-encoder-32-1...,0.321624,0.275191
6,output-runs-tiny-bert-plain/cross-encoder-32-1...,0.321856,0.275352


In [21]:
# The best run is the last row of the ascending sort. Indexing from the end
# keeps this correct when the number of evaluated runs changes.
df.sort_values('recip_rank').iloc[-1].to_dict()

{'name': 'output-runs-without-injection/train-cross-encoder-without-kd-bm25cat-bert-tiny-pretrained-1gb-2024-06-16_10-42-15-latest',
 'ndcg_cut.10': 0.3631568523686784,
 'recip_rank': 0.3171244932051073}

We select `output-runs-without-injection/train-cross-encoder-without-kd-bm25cat-bert-tiny-pretrained-1gb-2024-06-16_10-42-15-latest` because it achieves the highest nDCG@10 (0.363) and reciprocal rank (0.317) on the dev set.

## Evaluation for TinyBERTCatBM25

In [24]:
# Collect one run per model sub-directory of the BM25-injection output folders.
directories = ['output-runs-with-injection', 'output-runs-with-injection-and-early-stopping']
runs = {}

for d in directories:
    # List the directory in pure Python (sorted, dot-entries skipped) instead of
    # shelling out to `ls`, so the cell does not depend on IPython shell magic.
    models = sorted(entry for entry in os.listdir(d) if not entry.startswith('.'))

    for model in models:
        # The '<directory>/<model>' key is also the system name in the results table.
        runs[f'{d}/{model}'] = pt.transformer.get_transformer(pt.io.read_results(f'{d}/{model}/run.txt'))


In [25]:
# Evaluate all collected runs on the dataset's topics and qrels; keep the
# result table in `df` so the following cell can pick the best row from it.
df = pt.Experiment(
    [*runs.values()],
    pt_dataset.get_topics(),
    pt_dataset.get_qrels(),
    ["ndcg_cut.10", "recip_rank"],
    names=[*runs],
)

# Ascending by recip_rank: the strongest run ends up in the last row.
df.sort_values(by='recip_rank')

Unnamed: 0,name,ndcg_cut.10,recip_rank
14,output-runs-with-injection-and-early-stopping/...,0.303435,0.259237
15,output-runs-with-injection-and-early-stopping/...,0.312888,0.26705
18,output-runs-with-injection-and-early-stopping/...,0.318038,0.271632
16,output-runs-with-injection-and-early-stopping/...,0.318039,0.272076
19,output-runs-with-injection-and-early-stopping/...,0.322276,0.275939
10,output-runs-with-injection-and-early-stopping/...,0.322557,0.276763
17,output-runs-with-injection-and-early-stopping/...,0.323641,0.277891
11,output-runs-with-injection-and-early-stopping/...,0.324403,0.278559
12,output-runs-with-injection-and-early-stopping/...,0.335259,0.289827
6,output-runs-with-injection/train-cross-encoder...,0.339761,0.294066


In [28]:
# The best run is the last row of the ascending sort. Indexing from the end
# keeps this correct when the number of evaluated runs changes.
df.sort_values('recip_rank').iloc[-1].to_dict()

{'name': 'output-runs-with-injection/train-cross-encoder-kd-bm25cat-bert-tiny-pretrained-1gb-2024-06-11_08-13-34-latest',
 'ndcg_cut.10': 0.36547428280370114,
 'recip_rank': 0.3190072719181832}

We select `output-runs-with-injection/train-cross-encoder-kd-bm25cat-bert-tiny-pretrained-1gb-2024-06-11_08-13-34-latest` because it achieves the highest reciprocal rank (0.319) and nDCG@10 (0.365) on the dev set.