# Create Help/Harm Evaluation Dataframe

Since the evaluation takes a while (approximately 1 hour), we create the evaluation dataframe once in this notebook and store it in `../resources/help-harm-results.jsonl`.

In [2]:
from glob import glob
from tqdm import tqdm
import pandas as pd

# Root directory that contains the run files to evaluate. The trailing slash
# matters: the glob patterns in later cells are built by plain string
# concatenation (RUN_DIR + 'hmi-...').
RUN_DIR='/mnt/ceph/storage/data-in-progress/data-research/web-search/romcir22-keyquery/runs/'


def compatibility(qrels, run):
    compatibility_output = !python3 ../../../../third-party/compatibility.py "{qrels}" "{run}"
    ret = {}
    for line in compatibility_output:
        line = line.split()
        assert len(line) == 3 and line[0] == 'compatibility'
        topic = line[1]
        if topic != 'all':
            topic = int(topic)
        
        ret[topic] = float(line[2])
    
    return ret

def evaluate_run(year, run):
    """Compute the helpful and harmful compatibility scores of one run.

    Args:
        year: Two-digit track year used to locate the
            ``third-party/health-misinfo-<year>`` directory.
        run: Path of the run file to score.

    Returns:
        dict: ``{'helpful': ..., 'harmful': ...}`` where each value is the
        per-topic result of ``compatibility`` for the matching graded qrels.
    """
    qrels_prefix = '../../../../third-party/health-misinfo-' + str(year) + '/misinfo-qrels-graded.'

    return {
        kind: compatibility(qrels_prefix + kind + '-only', run)
        for kind in ('helpful', 'harmful')
    }
    

# Example: evaluate a single 2019 run to show the structure evaluate_run returns.
# RUN_DIR already ends with '/', so this path contains a doubled slash.
evaluate_run(19, RUN_DIR + '/hmi-19/run.hmi19.bm25_bm25(k1=0.7,b=0.35)_default')

{'helpful': {1: 0.0018,
  4: 0.0003,
  5: 0.6091,
  6: 0.0392,
  8: 0.0612,
  9: 0.0547,
  10: 0.4313,
  11: 0.0079,
  12: 0.1569,
  13: 0.0105,
  15: 0.1562,
  16: 0.0,
  19: 0.7326,
  20: 0.4236,
  21: 0.3954,
  23: 0.1397,
  25: 0.173,
  26: 0.0399,
  27: 0.272,
  28: 0.3773,
  29: 0.366,
  31: 0.0,
  34: 0.1664,
  36: 0.931,
  37: 0.1565,
  38: 0.1247,
  39: 0.6595,
  40: 0.0018,
  41: 0.0362,
  42: 0.0,
  43: 0.3963,
  44: 0.0085,
  45: 0.3356,
  46: 0.4736,
  47: 0.1247,
  48: 0.0202,
  49: 0.1431,
  50: 0.1195,
  51: 0.019,
  'all': 0.2094},
 'harmful': {1: 0.0855,
  2: 0.5059,
  3: 0.4817,
  4: 0.5332,
  5: 0.2746,
  6: 0.7982,
  7: 0.4073,
  8: 0.0438,
  9: 0.3189,
  10: 0.2742,
  11: 0.0002,
  12: 0.1926,
  13: 0.6148,
  15: 0.5872,
  16: 0.4107,
  17: 0.4437,
  18: 0.3304,
  19: 0.0156,
  20: 0.1383,
  21: 0.2036,
  22: 0.2109,
  23: 0.2573,
  24: 0.0264,
  25: 0.2887,
  26: 0.0758,
  27: 0.4058,
  28: 0.5825,
  29: 0.1741,
  30: 0.1381,
  31: 0.3912,
  32: 0.1593,
  33: 0.9

In [4]:
# Evaluate every run of every year and collect one record per run.
help_harm_records = []

for year in (19, 20, 21):
    for run in tqdm(glob(RUN_DIR + 'hmi-' + str(year) + '*/*')):
        scores = evaluate_run(year, run)
        help_harm_records.append({
            'year': year,
            'run': run,
            'helpful': scores['helpful'],
            'harmful': scores['harmful'],
        })

# Build the dataframe once from the collected records.
df = pd.DataFrame(help_harm_records)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 1823/1823 [13:43<00:00,  2.21it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 1823/1823 [16:32<00:00,  1.84it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 1823/1823 [16:17<00:00,  1.86it/s]


In [7]:
# Persist the evaluation results as JSON Lines: one record (row) per line.
df.to_json('../resources/help-harm-results.jsonl', lines=True, orient='records')

In [9]:
# Inspect the run path stored in the first row of the evaluation dataframe.
df.iloc[0].run

'/mnt/ceph/storage/data-in-progress/data-research/web-search/romcir22-keyquery/runs/hmi-19-rm3/run.hmi19.bm25_bm25(k1=0.9,b=0.4)_sequential-relevance-feedback(Rm3RelevanceFeedback(fbDocs=10,fbTerms=9,originalQueryWeight:0.0);rel=3)-default'

In [13]:
from trectools import TrecRun

# Exploration: load one run with trectools and list the attributes that the
# resulting TrecRun object offers.
run = TrecRun('/mnt/ceph/storage/data-in-progress/data-research/web-search/romcir22-keyquery/runs/hmi-19-rm3/run.hmi19.bm25_bm25(k1=0.9,b=0.4)_sequential-relevance-feedback(Rm3RelevanceFeedback(fbDocs=10,fbTerms=9,originalQueryWeight:0.0);rel=3)-default')
dir(run)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'check_qrel_coverage',
 'check_run_coverage',
 'evaluate_run',
 'filename',
 'get_filename',
 'get_full_filename_path',
 'get_mean_coverage',
 'get_runid',
 'get_top_documents',
 'print_subset',
 'read_run',
 'rename_runid',
 'run_data',
 'topics',
 'topics_intersection_with']

In [63]:
def load_qrels(year):
    """Load the relevance judgments (qrels) of one track year.

    Args:
        year: Two-digit track year; 19, 20 and 21 are supported.

    Returns:
        The ``qrels_data`` attribute of the ``TrecQrel`` object read from that
        year's qrels file below ``third-party/health-misinfo-<year>``.

    Raises:
        ValueError: If ``year`` is not one of the supported years.
    """
    # Qrels file of each supported year, relative to that year's directory.
    qrels_files = {
        19: '/2019qrels_relevance.txt',
        20: '/qrels/misinfo-2020-qrels-del-me',
        21: '/qrels/qrels-35topics-del-me.txt',
    }

    # Validate before importing trectools or touching the file system, so a
    # wrong year is reported immediately and with a useful message.
    try:
        qrels_file = qrels_files[year]
    except (KeyError, TypeError):
        raise ValueError(
            f'Unsupported year {year!r}; expected one of {sorted(qrels_files)}.'
        ) from None

    from trectools import TrecQrel

    return TrecQrel('../../../../third-party/health-misinfo-' + str(year) + qrels_file).qrels_data

def remove_unjudged_runs(year, run_file, depth=250):
    """Write a copy of a run that contains only judged documents.

    The run is joined with the qrels of ``year``; documents without a judgment
    are dropped. The remaining documents are ordered per query by descending
    score (ties broken by descending docid), cut off after ``depth`` documents
    per query and re-ranked starting at 1. The result is written, space
    separated and without header, to the path obtained by replacing
    ``/runs/`` with ``/runs/only-judged/`` in ``run_file``.

    Args:
        year: Track year passed on to ``load_qrels``.
        run_file: Path of the run file to filter.
        depth: Maximum number of judged documents kept per query.

    Raises:
        ValueError: If the output file already exists. This is also the case
            when ``run_file`` contains no ``/runs/`` segment, because the
            output path then equals the input path.
    """
    import os
    # Imported here so the function does not depend on another cell having
    # imported TrecRun first.
    from trectools import TrecRun

    run_columns = ["query", "q0", "docid", "rank", "score", "system"]

    # Fail fast: check the target before doing any expensive loading/merging.
    target_file = run_file.replace('/runs/', '/runs/only-judged/')
    if os.path.isfile(target_file):
        raise ValueError('Already exists...')

    run = TrecRun(run_file).run_data
    qrels = load_qrels(year)

    # Left merge on the shared columns, then keep only rows that received a
    # judgment, i.e. drop every unjudged document.
    judged = pd.merge(run, qrels[["query", "docid", "rel"]], how="left")
    judged = judged[~judged["rel"].isnull()][run_columns]

    ordered = judged.sort_values(["query", "score", "docid"], ascending=[True, False, False])

    # Explicit copy so that assigning the new ranks never writes into a view.
    top = ordered.groupby("query").head(depth).copy()
    top["rank"] = top.groupby("query").cumcount() + 1

    # Make sure the target directory exists before writing.
    target_dir = os.path.dirname(target_file)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    top.to_csv(target_file, sep=' ', header=False, index=False)

In [64]:
# Create the only-judged variant of every evaluated run.
for _, row in tqdm(df.iterrows()):
    remove_unjudged_runs(year=row['year'], run_file=row['run'])

5469it [14:13,  6.41it/s]


In [3]:
# Evaluate the only-judged variant of every run and collect one record per run.
only_judged_records = []

for year in (19, 20, 21):
    for run in tqdm(glob(RUN_DIR + '/only-judged/hmi-' + str(year) + '*/*')):
        scores = evaluate_run(year, run)
        only_judged_records.append({
            'year': year,
            'run': run,
            'helpful': scores['helpful'],
            'harmful': scores['harmful'],
        })

# Build the dataframe once from the collected records.
df_only_judged = pd.DataFrame(only_judged_records)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 1823/1823 [09:20<00:00,  3.25it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 1823/1823 [11:14<00:00,  2.70it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 1823/1823 [10:49<00:00,  2.81it/s]


In [4]:
# Persist the only-judged evaluation results as JSON Lines: one record per line.
df_only_judged.to_json('../resources/help-harm-results-only-judged.jsonl', lines=True, orient='records')

# Trectools evaluation

In [3]:
def load_all_qrels():
    """Load the helpful-only and harmful-only graded qrels of every year.

    Returns:
        dict: ``{year: {'helpful': TrecQrel, 'harmful': TrecQrel}}`` for the
        years 19, 20 and 21.
    """
    from trectools import TrecQrel

    def qrels_for(year, annotation_type):
        # Read the graded qrels file of one year and one annotation type.
        return TrecQrel('../../../../third-party/health-misinfo-' + str(year) + '/misinfo-qrels-graded.' + annotation_type + '-only')

    return {
        year: {
            annotation_type: qrels_for(year, annotation_type)
            for annotation_type in ('helpful', 'harmful')
        }
        for year in (19, 20, 21)
    }

# Load all qrels once; trec_eval looks them up in this global by year and
# annotation type.
QRELS = load_all_qrels()

In [4]:
def trec_eval(year, run):
    """Evaluate one run with trectools against the helpful and harmful qrels.

    Args:
        year: Track year used to select the qrels from the global ``QRELS``.
        run: Path of the run file to evaluate.

    Returns:
        dict: For each annotation type, the unjudged measure at depth 10 and
        20, the reciprocal rank (with and without unjudged documents) and
        nDCG at depth 10 and 20 (with and without unjudged documents). Every
        key carries the annotation type as suffix.
    """
    from trectools import TrecEval, TrecRun

    loaded_run = TrecRun(run)
    measures = {}

    for annotation_type in ('helpful', 'harmful'):
        evaluator = TrecEval(loaded_run, QRELS[year][annotation_type])

        # The literal keeps insertion order, so keys (and the measure calls)
        # appear in exactly this sequence.
        measures.update({
            'unjudged-at-10-' + annotation_type: evaluator.get_unjudged(depth=10),
            'unjudged-at-20-' + annotation_type: evaluator.get_unjudged(depth=20),
            'reciprocal-rank-' + annotation_type: evaluator.get_reciprocal_rank(),
            'reciprocal-rank-unjudged-removed-' + annotation_type: evaluator.get_reciprocal_rank(removeUnjudged=True),
            'ndcg-at-10-rank-' + annotation_type: evaluator.get_ndcg(depth=10),
            'ndcg-at-20-rank-' + annotation_type: evaluator.get_ndcg(depth=20),
            'ndcg-unjudged-removed-at-10-rank-' + annotation_type: evaluator.get_ndcg(depth=10, removeUnjudged=True),
            'ndcg-unjudged-removed-at-20-rank-' + annotation_type: evaluator.get_ndcg(depth=20, removeUnjudged=True),
        })

    return measures

In [5]:
# Example: evaluate a single 2019 run to show the measures trec_eval returns.
trec_eval(19, '/mnt/ceph/storage/data-in-progress/data-research/web-search/romcir22-keyquery/runs/hmi-19-rm3/run.hmi19.bm25_bm25(k1=0.9,b=0.4)_sequential-relevance-feedback(Rm3RelevanceFeedback(fbDocs=10,fbTerms=9,originalQueryWeight:0.0);rel=3)-default')

{'unjudged-at-10-helpful': 1.0153846153846153,
 'unjudged-at-20-helpful': 1.0756410256410258,
 'reciprocal-rank-helpful': 0.6590336134453781,
 'reciprocal-rank-unjudged-removed-helpful': 0.6590336134453781,
 'ndcg-at-10-rank-helpful': 0.36060401672001835,
 'ndcg-at-20-rank-helpful': 0.3456352621269209,
 'ndcg-unjudged-removed-at-10-rank-helpful': 0.6502097166760744,
 'ndcg-unjudged-removed-at-20-rank-helpful': 0.644234309907788,
 'unjudged-at-10-harmful': 0.8653061224489798,
 'unjudged-at-20-harmful': 0.8765306122448979,
 'reciprocal-rank-harmful': 0.20496173874334603,
 'reciprocal-rank-unjudged-removed-harmful': 0.20496173874334603,
 'ndcg-at-10-rank-harmful': 0.1296690535470495,
 'ndcg-at-20-rank-harmful': 0.13651146752795193,
 'ndcg-unjudged-removed-at-10-rank-harmful': 0.5877882420072763,
 'ndcg-unjudged-removed-at-20-rank-harmful': 0.5798899403463031}

In [6]:
# Run the trectools evaluation for every run and collect one record per run.
trec_eval_records = []

for year in (19, 20, 21):
    for run in tqdm(glob(RUN_DIR + 'hmi-' + str(year) + '*/*')):
        # Year and run come first; the measures follow in trec_eval's order.
        trec_eval_records.append({'year': year, 'run': run, **trec_eval(year, run)})

# Build the dataframe once from the collected records.
df_trec_eval = pd.DataFrame(trec_eval_records)

  1%|█▏                                                                                                         | 21/1823 [00:17<24:32,  1.22it/s]


KeyboardInterrupt: 

In [20]:
# Display the trectools evaluation dataframe.
df_trec_eval

Unnamed: 0,year,run,unjudged-at-10-helpful,unjudged-at-20-helpful,reciprocal-rank-helpful,reciprocal-rank-unjudged-removed-helpful,ndcg-at-10-rank-helpful,ndcg-at-20-rank-helpful,ndcg-unjudged-removed-at-10-rank-helpful,ndcg-unjudged-removed-at-20-rank-helpful,unjudged-at-10-harmful,unjudged-at-20-harmful,reciprocal-rank-harmful,reciprocal-rank-unjudged-removed-harmful,ndcg-at-10-rank-harmful,ndcg-at-20-rank-harmful,ndcg-unjudged-removed-at-10-rank-harmful,ndcg-unjudged-removed-at-20-rank-harmful
0,19,/mnt/ceph/storage/data-in-progress/data-resear...,1.015385,1.075641,0.659034,0.659034,0.360604,0.345635,0.65021,0.644234,0.865306,0.876531,0.204962,0.204962,0.129669,0.136511,0.587788,0.57989
1,20,/mnt/ceph/storage/data-in-progress/data-resear...,0.528261,0.643478,0.786177,0.786177,0.475133,0.409722,0.727792,0.717157,1.49375,1.484375,0.083502,0.083502,0.043692,0.056363,0.391297,0.368424
2,21,/mnt/ceph/storage/data-in-progress/data-resear...,1.0,1.018571,0.368872,0.368872,0.195928,0.194378,0.433639,0.444448,1.271875,1.28125,0.254788,0.254788,0.127707,0.133032,0.401823,0.398021
