# Create Help/Harm Evaluation Dataframe

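Since the evaluation takes a while (approximately 1 hour), we create the evaluation dataframe once and persist it as JSON lines under `../resources/`, so that the results can be loaded later instead of being recomputed.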

In [1]:
from glob import glob
from tqdm import tqdm
import pandas as pd

# Directory that contains one sub-directory of run files per collection (hmi-19*, hmi-20*, hmi-21*).
# The value ends with a trailing slash; the cells below concatenate sub-paths onto it.
RUN_DIR = '/mnt/ceph/storage/data-in-progress/data-research/web-search/romcir22-keyquery/runs/'


def compatibility(qrels, run):
    """Run the third-party compatibility script and parse its per-topic scores.

    The script ``../../../../third-party/compatibility.py`` (relative to the
    current working directory) is executed with ``python3`` and is expected to
    print one whitespace-separated line per topic in the form
    ``compatibility <topic> <score>``, where ``<topic>`` is either an integer
    topic id or the literal ``all``.

    The arguments are handed to the process as an argument list, not through a
    shell, so file names containing characters such as ``(``, ``;``, ``$`` or
    quotes are passed verbatim.

    Args:
        qrels: Path of the qrels file passed as first argument to the script.
        run: Path of the run file passed as second argument to the script.

    Returns:
        dict mapping each topic (``int``, or the string ``'all'``) to its
        score as ``float``.

    Raises:
        subprocess.CalledProcessError: If the script exits with a non-zero
            status.
        ValueError: If a non-empty output line does not have the expected
            three-field ``compatibility <topic> <score>`` shape.
    """
    import subprocess

    completed = subprocess.run(
        ['python3', '../../../../third-party/compatibility.py', qrels, run],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True,
        check=True,
    )

    ret = {}
    for line in completed.stdout.splitlines():
        fields = line.split()
        if not fields:
            # Tolerate blank lines instead of failing on them.
            continue
        if len(fields) != 3 or fields[0] != 'compatibility':
            raise ValueError('Unexpected line in compatibility output: ' + repr(line))

        topic = fields[1]
        if topic != 'all':
            # Per-topic rows carry a numeric id; the aggregate row keeps the key 'all'.
            topic = int(topic)

        ret[topic] = float(fields[2])

    return ret

def evaluate_run(year, run):
    """Score one run file against the helpful-only and harmful-only qrels of a year.

    Args:
        year: Two-digit year used to select the ``health-misinfo-<year>`` directory.
        run: Path of the run file handed to ``compatibility``.

    Returns:
        dict with the keys ``'helpful'`` and ``'harmful'``, each holding the
        result of ``compatibility`` for the respective graded qrels file.
    """
    qrels_dir = '../../../../third-party/health-misinfo-' + str(year)
    helpful_qrels = qrels_dir + '/misinfo-qrels-graded.helpful-only'
    harmful_qrels = qrels_dir + '/misinfo-qrels-graded.harmful-only'

    scores = {}
    # Helpful first, then harmful: same evaluation and key order as before.
    scores['helpful'] = compatibility(helpful_qrels, run)
    scores['harmful'] = compatibility(harmful_qrels, run)
    return scores
    

# Smoke test: evaluate a single hmi-19 BM25 run and display the resulting nested dict.
# NOTE(review): RUN_DIR already ends with '/', so this concatenation yields '//' before 'hmi-19'.
evaluate_run(19, RUN_DIR + '/hmi-19/run.hmi19.bm25_bm25(k1=0.7,b=0.35)_default')

{'helpful': {1: 0.0018,
  4: 0.0003,
  5: 0.6091,
  6: 0.0392,
  8: 0.0612,
  9: 0.0547,
  10: 0.4313,
  11: 0.0079,
  12: 0.1569,
  13: 0.0105,
  15: 0.1562,
  16: 0.0,
  19: 0.7326,
  20: 0.4236,
  21: 0.3954,
  23: 0.1397,
  25: 0.173,
  26: 0.0399,
  27: 0.272,
  28: 0.3773,
  29: 0.366,
  31: 0.0,
  34: 0.1664,
  36: 0.931,
  37: 0.1565,
  38: 0.1247,
  39: 0.6595,
  40: 0.0018,
  41: 0.0362,
  42: 0.0,
  43: 0.3963,
  44: 0.0085,
  45: 0.3356,
  46: 0.4736,
  47: 0.1247,
  48: 0.0202,
  49: 0.1431,
  50: 0.1195,
  51: 0.019,
  'all': 0.2094},
 'harmful': {1: 0.0855,
  2: 0.5059,
  3: 0.4817,
  4: 0.5332,
  5: 0.2746,
  6: 0.7982,
  7: 0.4073,
  8: 0.0438,
  9: 0.3189,
  10: 0.2742,
  11: 0.0002,
  12: 0.1926,
  13: 0.6148,
  15: 0.5872,
  16: 0.4107,
  17: 0.4437,
  18: 0.3304,
  19: 0.0156,
  20: 0.1383,
  21: 0.2036,
  22: 0.2109,
  23: 0.2573,
  24: 0.0264,
  25: 0.2887,
  26: 0.0758,
  27: 0.4058,
  28: 0.5825,
  29: 0.1741,
  30: 0.1381,
  31: 0.3912,
  32: 0.1593,
  33: 0.9

In [4]:
# Collect one record per run file; the frame is built once after the loop.
help_harm_records = []

for year in [19, 20, 21]:
    # The trailing '*' also matches sibling directories such as 'hmi-19-rm3'.
    run_files = glob(RUN_DIR + 'hmi-' + str(year) + '*/*')
    for run in tqdm(run_files):
        scores = evaluate_run(year, run)
        help_harm_records.append({
            'year': year,
            'run': run,
            'helpful': scores['helpful'],
            'harmful': scores['harmful'],
        })

df = pd.DataFrame(help_harm_records)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 1823/1823 [13:43<00:00,  2.21it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 1823/1823 [16:32<00:00,  1.84it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 1823/1823 [16:17<00:00,  1.86it/s]


In [7]:
# Persist the help/harm results as JSON lines (one record per run).
df.to_json('../resources/help-harm-results.jsonl', orient='records', lines=True)

In [9]:
# Show the run path stored in the first row.
df['run'].iloc[0]

'/mnt/ceph/storage/data-in-progress/data-research/web-search/romcir22-keyquery/runs/hmi-19-rm3/run.hmi19.bm25_bm25(k1=0.9,b=0.4)_sequential-relevance-feedback(Rm3RelevanceFeedback(fbDocs=10,fbTerms=9,originalQueryWeight:0.0);rel=3)-default'

In [13]:
from trectools import TrecRun

# Exploratory cell: load a single run file with trectools and list the attributes
# a TrecRun object exposes (see the output below).
run = TrecRun('/mnt/ceph/storage/data-in-progress/data-research/web-search/romcir22-keyquery/runs/hmi-19-rm3/run.hmi19.bm25_bm25(k1=0.9,b=0.4)_sequential-relevance-feedback(Rm3RelevanceFeedback(fbDocs=10,fbTerms=9,originalQueryWeight:0.0);rel=3)-default')
dir(run)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'check_qrel_coverage',
 'check_run_coverage',
 'evaluate_run',
 'filename',
 'get_filename',
 'get_full_filename_path',
 'get_mean_coverage',
 'get_runid',
 'get_top_documents',
 'print_subset',
 'read_run',
 'rename_runid',
 'run_data',
 'topics',
 'topics_intersection_with']

In [63]:
def load_qrels(year):
    """Load the relevance qrels of one year as a trectools ``qrels_data`` frame.

    Args:
        year: Two-digit year; one of ``19``, ``20`` or ``21``.

    Returns:
        The ``qrels_data`` attribute of the ``TrecQrel`` created from the
        year-specific qrels file below
        ``../../../../third-party/health-misinfo-<year>``.

    Raises:
        ValueError: If ``year`` is not one of the supported years.
    """
    # Location of the qrels file inside each year's directory.
    relative_paths = {
        19: '/2019qrels_relevance.txt',
        20: '/qrels/misinfo-2020-qrels-del-me',
        21: '/qrels/qrels-35topics-del-me.txt',
    }

    if year not in relative_paths:
        raise ValueError('Unsupported year %r: expected one of 19, 20, 21' % (year,))

    # Imported only after validation so that a bad year fails fast.
    from trectools import TrecQrel

    qrels_file = '../../../../third-party/health-misinfo-' + str(year) + relative_paths[year]
    return TrecQrel(qrels_file).qrels_data

def remove_unjudged_runs(year, run_file, depth=250):
    """Write a copy of a run file that keeps only judged documents.

    The run is joined with the qrels of ``year`` on the columns both frames
    share, rows without a ``rel`` value are dropped, the remaining rows are
    sorted per query by descending score (ties broken by descending docid),
    truncated to ``depth`` rows per query, and re-ranked starting at 1. The
    result is written space-separated, without header, to the path obtained by
    replacing ``'/runs/'`` with ``'/runs/only-judged/'`` in ``run_file``.

    Args:
        year: Year passed to ``load_qrels``.
        run_file: Path of the run file to filter.
        depth: Maximum number of rows kept per query (default 250).

    Raises:
        ValueError: If the target file already exists. This is checked before
            any data is loaded, so an existing result is never overwritten and
            no work is wasted.
    """
    import os.path

    target_file = run_file.replace('/runs/', '/runs/only-judged/')
    if os.path.isfile(target_file):
        raise ValueError('Already exists: ' + target_file)

    # Imported locally so the function does not depend on an earlier notebook cell.
    from trectools import TrecRun

    columns = ["query", "q0", "docid", "rank", "score", "system"]
    run = TrecRun(run_file).run_data
    qrels = load_qrels(year)

    # Left join followed by the null filter keeps exactly the judged rows.
    judged = pd.merge(run, qrels[["query", "docid", "rel"]], how="left")
    judged = judged[judged["rel"].notnull()][columns]

    judged = judged.sort_values(["query", "score", "docid"], ascending=[True, False, False])
    # .copy() so the rank assignment below works on an independent frame.
    top = judged.groupby("query").head(depth).copy()
    top["rank"] = top.groupby("query").cumcount() + 1

    target_dir = os.path.dirname(target_file)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    top.to_csv(target_file, sep=' ', header=False, index=False)

In [64]:
# Write an only-judged copy for every run listed in df.
for run_year, run_file in tqdm(zip(df['year'], df['run'])):
    remove_unjudged_runs(run_year, run_file)

5469it [14:13,  6.41it/s]


In [3]:
# Same evaluation as for the full runs, but over the files below 'only-judged'.
only_judged_records = []

for year in [19, 20, 21]:
    run_files = glob(RUN_DIR + '/only-judged/hmi-' + str(year) + '*/*')
    for run in tqdm(run_files):
        scores = evaluate_run(year, run)
        only_judged_records.append({
            'year': year,
            'run': run,
            'helpful': scores['helpful'],
            'harmful': scores['harmful'],
        })

df_only_judged = pd.DataFrame(only_judged_records)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 1823/1823 [09:20<00:00,  3.25it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 1823/1823 [11:14<00:00,  2.70it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 1823/1823 [10:49<00:00,  2.81it/s]


In [4]:
# Persist the only-judged help/harm results as JSON lines (one record per run).
df_only_judged.to_json('../resources/help-harm-results-only-judged.jsonl', orient='records', lines=True)

# Trectools evaluation

In [5]:
def load_all_qrels():
    """Load the helpful-only and harmful-only graded qrels for the years 19, 20 and 21.

    Returns:
        Nested dict ``{year: {annotation_type: TrecQrel}}`` where
        ``annotation_type`` is ``'helpful'`` or ``'harmful'``.
    """
    from trectools import TrecQrel

    def qrels_path(year, annotation_type):
        # Path of the graded qrels file for one year and annotation type.
        return '../../../../third-party/health-misinfo-' + str(year) + '/misinfo-qrels-graded.' + annotation_type + '-only'

    return {
        year: {
            annotation_type: TrecQrel(qrels_path(year, annotation_type))
            for annotation_type in ['helpful', 'harmful']
        }
        for year in [19, 20, 21]
    }

# Load all qrels once; trec_eval looks them up as QRELS[year][annotation_type].
QRELS = load_all_qrels()

In [45]:
def per_query(results):
    """Convert a single-column frame into a dict keyed by its index labels.

    Args:
        results: DataFrame with exactly one value per row and unique index labels.

    Returns:
        dict mapping each index label to the only value of its row.

    Raises:
        AssertionError: If a row holds more than one value or an index label
            occurs twice.
    """
    scores = {}
    for label, row in results.iterrows():
        assert len(row.values) == 1
        assert label not in scores

        scores[label] = row.values[0]
    return scores
            
def trec_eval(year, run):
    """Compute per-query trectools measures of a run for both annotation types.

    Args:
        year: Key into the module-level ``QRELS`` mapping.
        run: Path of the run file, loaded with ``TrecRun``.

    Returns:
        dict whose keys are a measure prefix followed by ``'helpful'`` or
        ``'harmful'`` and whose values are the ``per_query`` dicts of the
        corresponding ``TrecEval`` result.
    """
    from trectools import TrecEval, TrecRun

    # (key prefix, TrecEval method, extra keyword arguments), in output order.
    measures = [
        ('unjudged-at-10-', 'get_unjudged', {'depth': 10}),
        ('unjudged-at-20-', 'get_unjudged', {'depth': 20}),
        ('reciprocal-rank-', 'get_reciprocal_rank', {}),
        ('reciprocal-rank-unjudged-removed-', 'get_reciprocal_rank', {'removeUnjudged': True}),
        ('ndcg-at-10-rank-', 'get_ndcg', {'depth': 10}),
        ('ndcg-at-20-rank-', 'get_ndcg', {'depth': 20}),
        ('ndcg-unjudged-removed-at-10-rank-', 'get_ndcg', {'depth': 10, 'removeUnjudged': True}),
        ('ndcg-unjudged-removed-at-20-rank-', 'get_ndcg', {'depth': 20, 'removeUnjudged': True}),
    ]

    trec_run = TrecRun(run)
    scores = {}

    for annotation_type in ['helpful', 'harmful']:
        evaluator = TrecEval(trec_run, QRELS[year][annotation_type])

        for prefix, method_name, extra in measures:
            frame = getattr(evaluator, method_name)(per_query=True, **extra)
            scores[prefix + annotation_type] = per_query(frame)

    return scores

In [46]:
# Smoke test: run the trectools evaluation on a single hmi-19-rm3 run and display the per-query dicts.
trec_eval(19, '/mnt/ceph/storage/data-in-progress/data-research/web-search/romcir22-keyquery/runs/hmi-19-rm3/run.hmi19.bm25_bm25(k1=0.9,b=0.4)_sequential-relevance-feedback(Rm3RelevanceFeedback(fbDocs=10,fbTerms=9,originalQueryWeight:0.0);rel=3)-default')

{'unjudged-at-10-helpful': {1: 0.9,
  2: 1.0,
  3: 1.0,
  4: 0.9,
  5: 0.2,
  6: 0.9,
  7: 1.0,
  8: 0.9,
  9: 0.8,
  10: 0.8,
  11: 0.3,
  12: 0.5,
  13: 0.8,
  14: 1.0,
  15: 0.9,
  16: 0.7,
  17: 1.0,
  18: 1.0,
  19: 0.2,
  20: 0.6,
  21: 0.6,
  22: 1.0,
  23: 0.9,
  24: 1.0,
  25: 0.7,
  26: 0.9,
  27: 0.8,
  28: 0.6,
  29: 0.3,
  30: 1.0,
  31: 0.9,
  32: 1.0,
  33: 1.0,
  34: 0.8,
  35: 1.0,
  36: 0.0,
  37: 0.9,
  38: 0.8,
  39: 0.6,
  40: 0.8,
  41: 0.9,
  42: 0.9,
  43: 0.6,
  44: 0.9,
  45: 0.8,
  46: 0.3,
  47: 0.9,
  48: 0.9,
  49: 0.6,
  50: 0.9,
  51: 0.9},
 'unjudged-at-20-helpful': {1: 0.95,
  2: 1.0,
  3: 1.0,
  4: 0.9,
  5: 0.4,
  6: 0.85,
  7: 1.0,
  8: 0.95,
  9: 0.8,
  10: 0.75,
  11: 0.55,
  12: 0.45,
  13: 0.9,
  14: 1.0,
  15: 0.95,
  16: 0.85,
  17: 1.0,
  18: 1.0,
  19: 0.3,
  20: 0.75,
  21: 0.6,
  22: 1.0,
  23: 0.95,
  24: 1.0,
  25: 0.75,
  26: 0.95,
  27: 0.9,
  28: 0.7,
  29: 0.35,
  30: 1.0,
  31: 0.95,
  32: 1.0,
  33: 1.0,
  34: 0.75,
  35: 1.0,
  36

In [47]:
# Collect one record per run file; only year 20 is evaluated in this cell.
trec_eval_records = []

for year in [20]:
    run_files = glob(RUN_DIR + 'hmi-' + str(year) + '*/*')
    for run in tqdm(run_files):
        # 'year' and 'run' come first, followed by all measures returned by trec_eval.
        trec_eval_records.append({'year': year, 'run': run, **trec_eval(year, run)})

df_trec_eval = pd.DataFrame(trec_eval_records)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 1823/1823 [31:12<00:00,  1.03s/it]


In [8]:
# Display the trectools evaluation frame (the rendering below is truncated).
df_trec_eval

Unnamed: 0,year,run,unjudged-at-10-helpful,unjudged-at-20-helpful,reciprocal-rank-helpful,reciprocal-rank-unjudged-removed-helpful,ndcg-at-10-rank-helpful,ndcg-at-20-rank-helpful,ndcg-unjudged-removed-at-10-rank-helpful,ndcg-unjudged-removed-at-20-rank-helpful,unjudged-at-10-harmful,unjudged-at-20-harmful,reciprocal-rank-harmful,reciprocal-rank-unjudged-removed-harmful,ndcg-at-10-rank-harmful,ndcg-at-20-rank-harmful,ndcg-unjudged-removed-at-10-rank-harmful,ndcg-unjudged-removed-at-20-rank-harmful
0,20,/mnt/ceph/storage/data-in-progress/data-resear...,0.528261,0.643478,0.786177,0.786177,0.475133,0.409722,0.727792,0.717157,1.493750,1.484375,0.083502,0.083502,0.043692,0.056363,0.391297,0.368424
1,20,/mnt/ceph/storage/data-in-progress/data-resear...,0.682609,0.765217,0.812917,0.812917,0.363659,0.316569,0.660678,0.637511,1.512500,1.525000,0.081855,0.081855,0.031701,0.029740,0.355916,0.324030
2,20,/mnt/ceph/storage/data-in-progress/data-resear...,0.763043,0.759783,0.427535,0.427535,0.244732,0.250308,0.723645,0.722660,1.506250,1.503125,0.057235,0.057235,0.029543,0.034183,0.436775,0.420891
3,20,/mnt/ceph/storage/data-in-progress/data-resear...,0.602174,0.689130,0.849167,0.849167,0.424052,0.374922,0.694562,0.668822,1.487500,1.504687,0.104929,0.104929,0.047625,0.044169,0.450264,0.440082
4,20,/mnt/ceph/storage/data-in-progress/data-resear...,0.617391,0.707609,0.849524,0.849524,0.419353,0.367220,0.692696,0.669025,1.503125,1.512500,0.100293,0.100293,0.040914,0.039048,0.416806,0.406558
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1818,20,/mnt/ceph/storage/data-in-progress/data-resear...,0.763043,0.759783,0.437502,0.437502,0.246798,0.251617,0.725059,0.723067,1.506250,1.503125,0.043899,0.043899,0.027477,0.032849,0.436775,0.420891
1819,20,/mnt/ceph/storage/data-in-progress/data-resear...,0.560870,0.606522,0.852500,0.852500,0.448034,0.420327,0.727532,0.723806,1.475000,1.476562,0.085949,0.085949,0.048061,0.055263,0.425276,0.412987
1820,20,/mnt/ceph/storage/data-in-progress/data-resear...,0.763043,0.759783,0.437502,0.437502,0.246798,0.251617,0.725059,0.723067,1.506250,1.503125,0.043899,0.043899,0.027477,0.032849,0.436775,0.420891
1821,20,/mnt/ceph/storage/data-in-progress/data-resear...,0.482609,0.568478,0.838038,0.838038,0.504304,0.459274,0.720313,0.720571,1.465625,1.471875,0.083922,0.083922,0.057737,0.065443,0.457158,0.446341


In [48]:
# Persist the per-query trectools results for year 20 as JSON lines (one record per run).
df_trec_eval.to_json('../resources/trec-eval-results-per-query-20.jsonl', orient='records', lines=True)

In [10]:
import pyserini