# Evaluation With PyTerrier

In [2]:
!pip install python-terrier

Defaulting to user installation because normal site-packages is not writeable
[0m

In [13]:
import pandas as pd
import pyterrier as pt
# Initialise PyTerrier only when pt.started() reports it is not running yet,
# so re-executing this cell does not call pt.init() a second time.
if not pt.started():
    pt.init()

def load_data(data_type, topics_to_exclude):
    """Load the relevance judgments and the matching topics for one stance.

    Reads ``task3-qrels.txt`` from the working directory as a
    whitespace-separated table with the columns ``qid``, ``stance``,
    ``docno`` and ``label``. Only rows whose ``stance`` equals ``data_type``
    are kept, and every topic listed in ``topics_to_exclude`` is dropped.

    Args:
        data_type: One of ``'PRO'``, ``'CON'`` or ``'ONTOPIC'``.
        topics_to_exclude: Iterable of integer topic ids to leave out.

    Returns:
        A ``(topics, qrels)`` tuple of DataFrames. ``qrels`` holds the
        filtered judgments with ``qid`` converted to ``str``. ``topics`` has
        one row per remaining topic with the columns ``qid`` and ``query``,
        where the query text is simply the topic id.

    Raises:
        ValueError: If ``data_type`` is not one of the supported stances.
    """
    if data_type not in ('PRO', 'CON', 'ONTOPIC'):
        raise ValueError(
            f"data_type must be one of 'PRO', 'CON', 'ONTOPIC'; got {data_type!r}"
        )

    # r'\s+' accepts any run of blanks/tabs between fields; a single '\s'
    # would turn doubled separators into spurious empty columns.
    qrels = pd.read_csv(
        'task3-qrels.txt',
        sep=r'\s+',
        names=['qid', 'stance', 'docno', 'label'],
    )
    qrels = qrels[qrels['stance'] == data_type]
    excluded = qrels['qid'].astype(int).isin(list(topics_to_exclude))
    # Copy before assigning so the write never targets a view of the parsed frame.
    qrels = qrels[~excluded].copy()
    qrels['qid'] = qrels['qid'].astype(str)

    # Explicit columns keep the schema stable even when no topic survives the filter.
    qids = [str(qid) for qid in qrels['qid'].unique()]
    topics = pd.DataFrame({'qid': qids, 'query': qids}, columns=['qid', 'query'])

    return topics, qrels

def load_run(run_name, data_type, topics_to_exclude):
    """Load one run file and wrap it as a PyTerrier source transformer.

    The file at ``run_name`` is read as a space-separated table with the
    columns ``qid``, ``q0``, ``docno``, ``rank``, ``score`` and ``system``.
    Rows for topics in ``topics_to_exclude`` are dropped. Unless
    ``data_type`` is ``'ONTOPIC'``, only rows whose ``q0`` column equals
    ``data_type`` are kept.

    Args:
        run_name: Path of the run file.
        data_type: One of ``'PRO'``, ``'CON'`` or ``'ONTOPIC'``.
        topics_to_exclude: Iterable of integer topic ids to leave out.

    Returns:
        ``pt.transformer.SourceTransformer`` built from the filtered frame,
        with ``qid`` converted to ``str``.

    Raises:
        ValueError: If ``data_type`` is not one of the supported stances.
    """
    if data_type not in ('PRO', 'CON', 'ONTOPIC'):
        raise ValueError(
            f"data_type must be one of 'PRO', 'CON', 'ONTOPIC'; got {data_type!r}"
        )

    # The file is parsed exactly once; an earlier extra read whose result was
    # immediately overwritten has been removed.
    ret = pd.read_csv(
        run_name,
        names=['qid', 'q0', 'docno', 'rank', 'score', 'system'],
        sep=' ',
    )

    keep = ~ret['qid'].astype(int).isin(list(topics_to_exclude))
    if data_type != 'ONTOPIC':
        # In these run files the q0 column carries the stance of each entry.
        keep = keep & (ret['q0'] == data_type)

    # Copy before assigning so the write never targets a view of the parsed frame.
    ret = ret[keep].copy()
    ret['qid'] = ret['qid'].astype(str)

    return pt.transformer.SourceTransformer(ret)

def load_report(stance, measure, models=['google', 'boromir-touche23', 'dummy', 'random', 'afinn', 'bert', 'formula',
          'oracle', 'V3-model_1', 'V6-model_all_topic_w0', 'V6-model_all_topic_w5',
          'V8-model_all_topic', 'V9-model_all_topic'], topics_to_exclude = [1, 2, 4, 8, 9, 10, 15, 20, 21, 22, 27, 31, 33, 36, 37, 40, 43, 45, 47, 48],
          run_dir='../out/elastic/V3-model_1'):
    """Evaluate every model's run for one stance and one measure.

    Loads topics and qrels with ``load_data`` and one run per model with
    ``load_run`` from ``<run_dir>/<model>/run.txt``, then runs
    ``pt.Experiment`` with ``baseline=0`` (the first entry of ``models``)
    and ``correction='bonferroni'``.

    Args:
        stance: ``'PRO'``, ``'CON'`` or ``'ONTOPIC'``; forwarded to
            ``load_data`` and ``load_run``.
        measure: Name of the evaluation measure handed to ``pt.Experiment``
            (for example ``'P_10'`` or ``'ndcg_cut_10'``).
        models: Names of the model sub-directories to evaluate; also used as
            the system names in the report.
        topics_to_exclude: Integer topic ids removed from qrels and runs.
        run_dir: Directory that contains one sub-directory per model.

    Returns:
        The DataFrame produced by ``pt.Experiment``, sorted by ``measure``
        in descending order.
    """
    # Work on private copies so the shared default lists can never be
    # modified by code further down the call chain.
    models = list(models)
    topics_to_exclude = list(topics_to_exclude)

    topics, qrels = load_data(stance, topics_to_exclude)
    runs = [
        load_run(f'{run_dir}/{model}/run.txt', stance, topics_to_exclude)
        for model in models
    ]

    report = pt.Experiment(
        runs,
        topics,
        qrels,
        eval_metrics=[measure],
        names=models,
        baseline=0,
        correction='bonferroni'
    )
    return report.sort_values(measure, ascending=False)

### Evaluation with Precision@10

In [14]:
# PRO stance, measure P_10
load_report(stance='PRO', measure='P_10')

  del sys.path[0]


Unnamed: 0,name,P_10,P_10 +,P_10 -,P_10 p-value,P_10 reject,P_10 p-value corrected
7,oracle,1.0,28.0,0.0,8.389231e-10,True,1.0906e-08
5,bert,0.713333,14.0,7.0,0.01924885,False,0.2502351
6,formula,0.706667,15.0,6.0,0.01697733,False,0.2207053
2,dummy,0.686667,13.0,7.0,0.07049986,False,0.9164982
9,V6-model_all_topic_w0,0.676667,11.0,8.0,0.1675025,False,1.0
3,random,0.666667,14.0,9.0,0.1868092,False,1.0
1,boromir-touche23,0.636667,15.0,10.0,0.6772122,False,1.0
0,google,0.623333,,,,False,
10,V6-model_all_topic_w5,0.62,14.0,13.0,0.9287943,False,1.0
4,afinn,0.603333,9.0,13.0,0.5651799,False,1.0


In [15]:
# CON stance, measure P_10
load_report(stance='CON', measure='P_10')

  del sys.path[0]


Unnamed: 0,name,P_10,P_10 +,P_10 -,P_10 p-value,P_10 reject,P_10 p-value corrected
7,oracle,0.8,30.0,0.0,3.970421e-15,True,5.161547e-14
1,boromir-touche23,0.243333,19.0,6.0,0.007544304,False,0.09807595
5,bert,0.233333,19.0,6.0,0.005822195,False,0.07568853
10,V6-model_all_topic_w5,0.22,19.0,7.0,0.01262878,False,0.1641742
9,V6-model_all_topic_w0,0.213333,15.0,8.0,0.03458249,False,0.4495723
4,afinn,0.193333,13.0,9.0,0.2008691,False,1.0
2,dummy,0.19,15.0,10.0,0.2363492,False,1.0
8,V3-model_1,0.18,10.0,9.0,0.3431365,False,1.0
6,formula,0.18,13.0,10.0,0.286104,False,1.0
3,random,0.16,10.0,10.0,0.7245199,False,1.0


In [16]:
# ONTOPIC (no stance filter on the runs), measure P_10
load_report(stance='ONTOPIC', measure='P_10')

  del sys.path[0]


Unnamed: 0,name,P_10,P_10 +,P_10 -,P_10 p-value,P_10 reject,P_10 p-value corrected
7,oracle,1.0,24.0,0.0,2.732014e-07,True,4e-06
3,random,0.916667,15.0,6.0,0.0120323,False,0.15642
2,dummy,0.913333,16.0,6.0,0.01368112,False,0.177855
5,bert,0.91,14.0,5.0,0.01003009,False,0.130391
9,V6-model_all_topic_w0,0.906667,15.0,6.0,0.0219983,False,0.285978
6,formula,0.9,13.0,5.0,0.02642456,False,0.343519
10,V6-model_all_topic_w5,0.893333,17.0,8.0,0.04201138,False,0.546148
1,boromir-touche23,0.89,13.0,5.0,0.02684599,False,0.348998
4,afinn,0.886667,12.0,8.0,0.07293518,False,0.948157
8,V3-model_1,0.883333,14.0,5.0,0.08307467,False,1.0


### Evaluation with nDCG@10

In [17]:
# PRO stance, measure ndcg_cut_10
load_report(stance='PRO', measure='ndcg_cut_10')

  del sys.path[0]


Unnamed: 0,name,ndcg_cut_10,ndcg_cut_10 +,ndcg_cut_10 -,ndcg_cut_10 p-value,ndcg_cut_10 reject,ndcg_cut_10 p-value corrected
7,oracle,1.0,28.0,0.0,4.594535e-10,True,5.972896e-09
5,bert,0.71793,20.0,9.0,0.01010455,False,0.1313592
6,formula,0.701332,16.0,13.0,0.02654518,False,0.3450873
2,dummy,0.677439,16.0,13.0,0.1104133,False,1.0
9,V6-model_all_topic_w0,0.661777,16.0,13.0,0.2960805,False,1.0
3,random,0.658674,17.0,12.0,0.2409449,False,1.0
1,boromir-touche23,0.65013,19.0,11.0,0.3612749,False,1.0
10,V6-model_all_topic_w5,0.628625,15.0,14.0,0.8117163,False,1.0
0,google,0.61867,,,,False,
4,afinn,0.612545,14.0,15.0,0.8649907,False,1.0


In [18]:
# CON stance, measure ndcg_cut_10
load_report(stance='CON', measure='ndcg_cut_10')

  del sys.path[0]


Unnamed: 0,name,ndcg_cut_10,ndcg_cut_10 +,ndcg_cut_10 -,ndcg_cut_10 p-value,ndcg_cut_10 reject,ndcg_cut_10 p-value corrected
7,oracle,0.923982,30.0,0.0,1.099686e-19,True,1.429592e-18
1,boromir-touche23,0.253411,18.0,10.0,0.1025575,False,1.0
5,bert,0.246923,20.0,10.0,0.07230263,False,0.9399341
10,V6-model_all_topic_w5,0.242584,18.0,8.0,0.05777903,False,0.7511274
9,V6-model_all_topic_w0,0.210703,17.0,11.0,0.3344683,False,1.0
4,afinn,0.203745,16.0,11.0,0.4885242,False,1.0
6,formula,0.196157,14.0,12.0,0.5809207,False,1.0
2,dummy,0.195056,16.0,12.0,0.6189308,False,1.0
11,V8-model_all_topic,0.182068,18.0,9.0,0.8690045,False,1.0
8,V3-model_1,0.179505,12.0,12.0,0.8910344,False,1.0


In [19]:
# ONTOPIC (no stance filter on the runs), measure ndcg_cut_10
load_report(stance='ONTOPIC', measure='ndcg_cut_10')

  del sys.path[0]


Unnamed: 0,name,ndcg_cut_10,ndcg_cut_10 +,ndcg_cut_10 -,ndcg_cut_10 p-value,ndcg_cut_10 reject,ndcg_cut_10 p-value corrected
7,oracle,1.0,24.0,0.0,2e-06,True,2.1e-05
3,random,0.917632,17.0,8.0,0.021312,False,0.277054
2,dummy,0.913244,17.0,9.0,0.027756,False,0.360826
5,bert,0.910986,16.0,9.0,0.023198,False,0.301576
9,V6-model_all_topic_w0,0.909476,17.0,9.0,0.042492,False,0.552396
6,formula,0.901319,16.0,10.0,0.039568,False,0.514378
1,boromir-touche23,0.896755,15.0,10.0,0.082797,False,1.0
10,V6-model_all_topic_w5,0.893703,17.0,9.0,0.153663,False,1.0
8,V3-model_1,0.885314,15.0,10.0,0.140424,False,1.0
4,afinn,0.885099,17.0,10.0,0.139966,False,1.0


### Evaluation with nDCG@5

In [20]:
# PRO stance, measure ndcg_cut_5
load_report(stance='PRO', measure='ndcg_cut_5')

  del sys.path[0]


Unnamed: 0,name,ndcg_cut_5,ndcg_cut_5 +,ndcg_cut_5 -,ndcg_cut_5 p-value,ndcg_cut_5 reject,ndcg_cut_5 p-value corrected
7,oracle,1.0,26.0,0.0,6.747387e-09,True,8.771603e-08
5,bert,0.719356,18.0,9.0,0.04480012,False,0.5824016
6,formula,0.694508,16.0,10.0,0.1649737,False,1.0
1,boromir-touche23,0.685982,16.0,12.0,0.143517,False,1.0
2,dummy,0.674001,17.0,11.0,0.2955085,False,1.0
10,V6-model_all_topic_w5,0.662608,15.0,11.0,0.4111669,False,1.0
12,V9-model_all_topic,0.656439,15.0,13.0,0.4304924,False,1.0
9,V6-model_all_topic_w0,0.648135,14.0,13.0,0.5684454,False,1.0
3,random,0.63271,15.0,10.0,0.7022114,False,1.0
4,afinn,0.620702,12.0,16.0,0.9118888,False,1.0


In [21]:
# CON stance, measure ndcg_cut_5
load_report(stance='CON', measure='ndcg_cut_5')

  del sys.path[0]


Unnamed: 0,name,ndcg_cut_5,ndcg_cut_5 +,ndcg_cut_5 -,ndcg_cut_5 p-value,ndcg_cut_5 reject,ndcg_cut_5 p-value corrected
7,oracle,0.945136,29.0,0.0,5.585033e-16,True,7.260543e-15
10,V6-model_all_topic_w5,0.261281,13.0,8.0,0.2056678,False,1.0
5,bert,0.217228,14.0,11.0,0.7386162,False,1.0
1,boromir-touche23,0.21508,12.0,12.0,0.8061944,False,1.0
11,V8-model_all_topic,0.214848,14.0,10.0,0.8292244,False,1.0
6,formula,0.211495,13.0,10.0,0.8589605,False,1.0
12,V9-model_all_topic,0.202154,8.0,11.0,0.9917279,False,1.0
0,google,0.201644,,,,False,
9,V6-model_all_topic_w0,0.176333,9.0,12.0,0.6058586,False,1.0
2,dummy,0.175452,10.0,12.0,0.6381958,False,1.0


In [22]:
# ONTOPIC (no stance filter on the runs), measure ndcg_cut_5
load_report(stance='ONTOPIC', measure='ndcg_cut_5')

  del sys.path[0]


Unnamed: 0,name,ndcg_cut_5,ndcg_cut_5 +,ndcg_cut_5 -,ndcg_cut_5 p-value,ndcg_cut_5 reject,ndcg_cut_5 p-value corrected
7,oracle,1.0,15.0,0.0,0.000743,True,0.00966
5,bert,0.917092,13.0,5.0,0.122846,False,1.0
3,random,0.916174,13.0,5.0,0.151717,False,1.0
2,dummy,0.914199,13.0,6.0,0.160096,False,1.0
9,V6-model_all_topic_w0,0.911594,13.0,7.0,0.225945,False,1.0
6,formula,0.909531,12.0,5.0,0.167139,False,1.0
1,boromir-touche23,0.909196,10.0,9.0,0.285577,False,1.0
8,V3-model_1,0.896256,12.0,6.0,0.3485,False,1.0
10,V6-model_all_topic_w5,0.892238,12.0,7.0,0.530021,False,1.0
4,afinn,0.886719,12.0,6.0,0.497547,False,1.0


### Test various measures

In [23]:
# CON stance, measure Rprec
load_report(stance='CON', measure='Rprec')

  del sys.path[0]


Unnamed: 0,name,Rprec,Rprec +,Rprec -,Rprec p-value,Rprec reject,Rprec p-value corrected
7,oracle,0.583001,30.0,0.0,2.729715e-11,True,3.54863e-10
1,boromir-touche23,0.148586,19.0,6.0,0.009012128,False,0.1171577
10,V6-model_all_topic_w5,0.127064,18.0,5.0,0.01793807,False,0.2331949
5,bert,0.123816,18.0,6.0,0.04459797,False,0.5797737
9,V6-model_all_topic_w0,0.111398,13.0,6.0,0.05019196,False,0.6524955
4,afinn,0.107881,11.0,8.0,0.2009507,False,1.0
8,V3-model_1,0.099971,9.0,7.0,0.2665608,False,1.0
11,V8-model_all_topic,0.092088,16.0,10.0,0.4123296,False,1.0
2,dummy,0.087734,13.0,10.0,0.5230354,False,1.0
3,random,0.087357,9.0,10.0,0.5859933,False,1.0


In [24]:
# CON stance, measure map
load_report(stance='CON', measure='map')

  del sys.path[0]


Unnamed: 0,name,map,map +,map -,map p-value,map reject,map p-value corrected
7,oracle,0.583607,30.0,0.0,6.035878e-12,True,7.846642e-11
1,boromir-touche23,0.076442,17.0,11.0,0.1664866,False,1.0
5,bert,0.067165,19.0,10.0,0.1582054,False,1.0
10,V6-model_all_topic_w5,0.062586,18.0,8.0,0.1455579,False,1.0
4,afinn,0.057519,15.0,12.0,0.3896967,False,1.0
9,V6-model_all_topic_w0,0.052025,17.0,11.0,0.4700898,False,1.0
12,V9-model_all_topic,0.047969,10.0,15.0,0.753582,False,1.0
3,random,0.046349,13.0,13.0,0.7915391,False,1.0
11,V8-model_all_topic,0.045661,18.0,9.0,0.7937385,False,1.0
2,dummy,0.043108,16.0,12.0,0.9087765,False,1.0
