# Create all qrel files for explicit relevance feedback

### Qrels: least_difficult and most_difficult

In [1]:
DIR = '../../../../third-party/health-misinfo-'

def initialize_query_difficulty(year):
    """Prepare an empty rank collector for the helpful documents of one year.

    Reads ``misinfo-qrels-graded.helpful-only`` from the directory
    ``DIR + str(year)`` and keeps only judgments whose ``rel`` is above zero.

    Args:
        year: Suffix that is appended to ``DIR`` to locate the qrels file.

    Returns:
        A nested dict ``{query: {docid: []}}``. Every list starts out empty.
    """
    from trectools import TrecQrel

    qrels = TrecQrel(DIR + str(year) + '/misinfo-qrels-graded.helpful-only').qrels_data
    positive = qrels[qrels['rel'] > 0]

    collector = {}
    for query, docid in zip(positive['query'], positive['docid']):
        # A repeated (query, docid) pair simply resets the list, as before.
        collector.setdefault(query, {})[docid] = []

    return collector

def calculate_difficulty_of_documents(year):
    """Collect the ranks that every helpful document obtained in the runs of a year.

    If ``ranks-of-helpful-documents.jsonl`` exists in the directory of the
    year, it is loaded instead of parsing all run files again. This function
    only reads that file; it does not write it.

    Args:
        year: Suffix that is appended to ``DIR`` to locate the runs and qrels.

    Returns:
        A DataFrame with one row per (query, helpful document). On the
        freshly computed path the columns are ``query``, ``q0``, ``docid``,
        ``rel`` (list of the ranks found in the run files) and ``approaches``
        (number of run files). On the cached path the columns are those of
        the JSONL file plus ``approaches``.
    """
    from glob import glob
    from trectools import TrecRun
    from tqdm import tqdm
    import pandas as pd
    import os

    year_dir = DIR + str(year)
    run_files = glob(year_dir + '/runs/*')
    cache_file = year_dir + '/ranks-of-helpful-documents.jsonl'

    if os.path.isfile(cache_file):
        print('Reuse pre-calculated document ranks. Delete the file if you want to calculate it from scratch.')
        ret = pd.read_json(cache_file, lines=True)
        ret['approaches'] = len(run_files)

        return ret

    ranks_per_query = initialize_query_difficulty(year)

    for run_file in tqdm(run_files):
        run_data = TrecRun(run_file).run_data
        # Walk the three needed columns directly instead of materialising a
        # Series per row with iterrows().
        for query, docid, rank in zip(run_data['query'], run_data['docid'], run_data['rank']):
            docs = ranks_per_query.get(query)
            if docs is not None and docid in docs:
                docs[docid].append(rank)

    rows = [
        {'query': query, 'q0': 0, 'docid': docid, 'rel': ranks}
        for query, docs in ranks_per_query.items()
        for docid, ranks in docs.items()
    ]

    ret = pd.DataFrame(rows)
    # map_to_mrr reads 'approaches' from every row; the cached path sets it,
    # so the freshly computed frame has to provide it as well.
    ret['approaches'] = len(run_files)

    return ret

def map_to_mrr(i):
    """Average the ranks of one document over all approaches.

    Despite the name, no reciprocal is taken: the result is the mean of the
    ranks in ``i['rel']``, each capped at 1000. Approaches for which no rank
    is available (positions beyond the end of ``i['rel']``) count as 1000.
    Only the first ``i['approaches']`` ranks are considered.

    Args:
        i: Mapping-like row with the keys ``rel`` (sequence of ranks) and
            ``approaches`` (number of approaches).

    Returns:
        The mean as computed by ``statistics.mean``.
    """
    from statistics import mean

    observed = i['rel']
    per_approach = [
        min(1000, observed[slot]) if slot < len(observed) else 1000
        for slot in range(i['approaches'])
    ]

    return mean(per_approach)

def write_difficulty_qrels(year):
    """Write ``qrels.least_difficult.txt`` and ``qrels.most_difficult.txt`` for a year.

    Every row gets an ``mrr`` value from ``map_to_mrr`` and two dense ranks of
    that value within its query: ``least_difficult`` (descending, so the
    smallest ``mrr`` receives the highest grade) and ``most_difficult``
    (ascending). Each rank is written as the ``rel`` column of its own
    space-separated file without header, ordered by query and descending grade.

    Args:
        year: Suffix that is appended to ``DIR`` to locate the target directory.

    Returns:
        The frame sorted by ``query`` and ``mrr``, still containing the helper
        columns ``mrr``, ``least_difficult``, ``most_difficult`` and ``approaches``.
    """
    frame = calculate_difficulty_of_documents(year)
    frame['mrr'] = frame.apply(map_to_mrr, axis=1)
    frame = frame.sort_values(['query', 'mrr'], ascending=[True, True])

    for grade_column, ascending in (('least_difficult', False), ('most_difficult', True)):
        frame[grade_column] = frame.groupby("query")["mrr"].rank(method="dense", ascending=ascending)

    helper_columns = ['mrr', 'least_difficult', 'most_difficult', 'approaches']
    for grade_column in ('least_difficult', 'most_difficult'):
        qrels = frame.assign(rel=frame[grade_column].astype(int)).drop(columns=helper_columns)
        qrels = qrels.sort_values(['query', 'rel'], ascending=[True, False])

        target = DIR + str(year) + '/qrels.' + grade_column + '.txt'
        qrels.to_csv(target, sep=' ', header=False, index=False)

    return frame

# Write the least_difficult / most_difficult qrels for the years 19, 20 and 21.
for i in [19, 20, 21]:
    write_difficulty_qrels(i)

Reuse pre-calculated document ranks. Delete the file if you want to calculate it from scratch.
Reuse pre-calculated document ranks. Delete the file if you want to calculate it from scratch.
Reuse pre-calculated document ranks. Delete the file if you want to calculate it from scratch.


### Qrels: Random

In [13]:
def write_random_qrels(year, seed=None):
    """Write ``qrels.random.txt`` with a random grade order per query.

    Every row receives a random number; the dense rank of that number within
    its query becomes the ``rel`` value. The result is written as a
    space-separated file without header, ordered by query and descending grade.

    The input frame comes from ``write_difficulty_qrels``, so calling this
    function also rewrites the two difficulty qrel files of the year.

    Args:
        year: Suffix that is appended to ``DIR`` to locate the target directory.
        seed: Optional seed for a private random generator, which makes the
            written file reproducible. With the default ``None`` the
            module-level ``random`` generator is used, as before.
    """
    import random

    # Keep using the shared module-level generator unless a seed is requested,
    # so callers that seeded ``random`` themselves see no difference.
    rng = random if seed is None else random.Random(seed)

    ret = write_difficulty_qrels(year)

    ret['rand'] = [rng.random() for _ in range(len(ret))]
    ret['rel'] = ret.groupby("query")["rand"].rank(method="dense", ascending=False).astype(int)
    ret = ret.drop(columns=['mrr', 'least_difficult', 'most_difficult', 'approaches', 'rand'])
    ret = ret.sort_values(['query', 'rel'], ascending=[True, False])

    ret.to_csv(DIR + str(year) + '/qrels.random.txt', sep=' ', header=False, index=False)

# Write the random qrels for the years 19, 20 and 21.
for i in [19, 20, 21]:
    write_random_qrels(i)

### Qrels: longest and shortest

In [2]:
from pyserini.search import SimpleSearcher
# Directory that contains the Lucene indexes used for the document lookups below.
INDEX_DIR='/mnt/ceph/storage/data-in-progress/data-research/web-search/romcir22-keyquery/indexes-judged-only/'

# One searcher per year key (19, 20, 21), each opened on its own index below INDEX_DIR.
# NOTE(review): newer Pyserini releases expose this class as LuceneSearcher — confirm against the installed version.
SEARCHERS = {
    19: SimpleSearcher(INDEX_DIR + 'lucene-index.cw12-judged-only-raw'),
    20: SimpleSearcher(INDEX_DIR + 'lucene-index.cc-news-20-judged-only-raw'),
    21: SimpleSearcher(INDEX_DIR + 'lucene-index.c4noclean'),
}

def get_document_text(year, doc_id):
    """Look up a document in the index of the given year.

    Args:
        year: Key into ``SEARCHERS``.
        doc_id: Identifier passed to the searcher's ``doc`` lookup.

    Returns:
        The document contents with every run of whitespace collapsed to a
        single space, or ``None`` if the lookup result is falsy.
    """
    document = SEARCHERS[year].doc(doc_id)
    if not document:
        return None

    tokens = document.contents().split()
    return ' '.join(tokens)

def get_document_text_length(year, doc_id):
    """Count the whitespace-separated tokens of a document.

    Args:
        year: Key into ``SEARCHERS``.
        doc_id: Identifier of the document to measure.

    Returns:
        The number of tokens, or ``None`` if ``get_document_text`` yields
        ``None`` or an empty string.
    """
    text = get_document_text(year, doc_id)
    if not text:
        return None

    return len(text.split())

In [33]:
### Some example tests:
# For one document per year, print the first 150 characters of its
# whitespace-normalized text, followed by its whitespace-token count.

print(get_document_text(19, 'clueweb12-0109wb-48-31622')[:150] + '\n\n')
print(get_document_text_length(19, 'clueweb12-0109wb-48-31622'), '\n\n')
print(get_document_text(20, 'e971787d-4b8c-4c8d-b4ef-890749d996ab')[:150] + '\n\n')
print(get_document_text_length(20, 'e971787d-4b8c-4c8d-b4ef-890749d996ab'), '\n\n')
print(get_document_text(21, 'en.noclean.c4-train.02221-of-07168.33922')[:150] + '\n\n')
print(get_document_text_length(21, 'en.noclean.c4-train.02221-of-07168.33922'), '\n\n')

Urinary Tract Infection,Its Symptoms,Causes, Remedcy & Treatment Urinary Tract Infection Urinary tract infection is a serious problem. It affects mill


1064 


Vitamin D supplements do not cure or prevent COVID-19 Home News World News Entertainment TV Movies Music People Health Hi-Tech Mobile Gaming Internet 


603 


Achilles Tendon Disorders - Sarasota, FL: Florida Orthopedic Foot & Ankle Center 941-924-8777 Request Appt Home About Practice Meet the Doctors Clinic


3175 




In [47]:
def df_with_length_as_field_qrels(year):
    """Return the helpful documents of a year together with their text length.

    The input frame comes from ``write_difficulty_qrels``, so calling this
    function also rewrites the two difficulty qrel files of the year.

    Args:
        year: Key into ``SEARCHERS`` and suffix appended to ``DIR``.

    Returns:
        The frame without the helper columns ``mrr``, ``least_difficult``,
        ``most_difficult`` and ``approaches``, extended by a ``length`` column
        (whitespace-token count). Rows whose length could not be determined
        are removed.
    """
    ret = write_difficulty_qrels(year)

    ret['length'] = ret['docid'].apply(lambda doc_id: get_document_text_length(year, doc_id))
    # Only a missing length disqualifies a row; a bare dropna() would also
    # discard rows that merely have a gap in some unrelated column.
    ret = ret.dropna(subset=['length'])

    return ret.drop(columns=['mrr', 'least_difficult', 'most_difficult', 'approaches'])

def write_length_qrels(year):
    """Write ``qrels.longest.txt`` and ``qrels.shortest.txt`` for a year.

    The grade of a document is the dense rank of its length within its query:
    ascending for ``longest`` (the longest document receives the highest
    grade) and descending for ``shortest``. Both files are space-separated
    without header, ordered by query and descending grade.

    Args:
        year: Key into ``SEARCHERS`` and suffix appended to ``DIR``.
    """
    # The length lookup does not depend on the ranking direction, so it is
    # done once and shared by both output files.
    with_length = df_with_length_as_field_qrels(year)

    for name, ascending in [('longest', True), ('shortest', False)]:
        ret = with_length.copy()
        ret['rel'] = ret.groupby('query')['length'].rank(method="dense", ascending=ascending).astype(int)
        del ret['length']

        ret = ret.sort_values(['query', 'rel'], ascending=[True, False])

        ret.to_csv(DIR + str(year) + '/qrels.' + name + '.txt', sep=' ', header=False, index=False)

# Write the longest / shortest qrels for the years 19, 20 and 21.
for i in [19, 20, 21]:
    write_length_qrels(i)

Reuse pre-calculated document ranks. Delete the file if you want to calculate it from scratch.
Reuse pre-calculated document ranks. Delete the file if you want to calculate it from scratch.
Reuse pre-calculated document ranks. Delete the file if you want to calculate it from scratch.
Reuse pre-calculated document ranks. Delete the file if you want to calculate it from scratch.
Reuse pre-calculated document ranks. Delete the file if you want to calculate it from scratch.
Reuse pre-calculated document ranks. Delete the file if you want to calculate it from scratch.


### Test the length qrels

In [49]:
!head ../../../../third-party/health-misinfo-19/qrels.longest.txt


# Spot-check for qrels.longest (year 19): print the token counts of three listed
# documents; a higher grade is expected to go with a larger count.
print(get_document_text_length(19, 'clueweb12-0109wb-48-31622'), '\n\n')
print(get_document_text_length(19, 'clueweb12-0907wb-18-23499'), '\n\n')
print(get_document_text_length(19, 'clueweb12-0201wb-90-00814'), '\n\n')

1 0 clueweb12-0109wb-48-31622 8
1 0 clueweb12-0907wb-18-23499 7
1 0 clueweb12-1005wb-96-06520 6
1 0 clueweb12-0201wb-90-00814 5
1 0 clueweb12-1702wb-42-00868 4
1 0 clueweb12-1505wb-80-00851 3
1 0 clueweb12-0201wb-90-00828 2
1 0 clueweb12-1309wb-66-04887 1
4 0 clueweb12-1316wb-37-32089 6
4 0 clueweb12-0705wb-97-24678 5
1064 


808 


732 




In [51]:
!head ../../../../third-party/health-misinfo-19/qrels.shortest.txt


# Spot-check for qrels.shortest (year 19): print the token counts of three listed
# documents; a higher grade is expected to go with a smaller count.
print(get_document_text_length(19, 'clueweb12-1309wb-66-04887'), '\n\n')
print(get_document_text_length(19, 'clueweb12-0201wb-90-00828'), '\n\n')
print(get_document_text_length(19, 'clueweb12-1505wb-80-00851'), '\n\n')

1 0 clueweb12-1309wb-66-04887 8
1 0 clueweb12-0201wb-90-00828 7
1 0 clueweb12-1505wb-80-00851 6
1 0 clueweb12-1702wb-42-00868 5
1 0 clueweb12-0201wb-90-00814 4
1 0 clueweb12-1005wb-96-06520 3
1 0 clueweb12-0907wb-18-23499 2
1 0 clueweb12-0109wb-48-31622 1
4 0 clueweb12-0304wb-45-11445 6
4 0 clueweb12-0916wb-93-23970 5
434 


547 


583 




In [53]:
!head ../../../../third-party/health-misinfo-20/qrels.longest.txt


# Spot-check for qrels.longest (year 20): print the token counts of three listed
# documents; a higher grade is expected to go with a larger count.
print(get_document_text_length(20, '83bde5b0-ddd1-48ab-b9b6-49d248a6c12f'), '\n\n')
print(get_document_text_length(20, '906c45cb-358f-4d1e-9ce5-b7850ce22e2c'), '\n\n')
print(get_document_text_length(20, '85be03f1-e06e-41f0-80d2-42d26c758f35'), '\n\n')

1 0 83bde5b0-ddd1-48ab-b9b6-49d248a6c12f 250
1 0 906c45cb-358f-4d1e-9ce5-b7850ce22e2c 249
1 0 85be03f1-e06e-41f0-80d2-42d26c758f35 248
1 0 107e038c-9baa-4a60-87a1-36bdbdfcb7c1 247
1 0 10bbe0d7-8034-4167-a5d8-be8084a0ec0e 246
1 0 4e40c027-18d6-43ab-a540-9bc6a762cd61 245
1 0 9116281f-c30a-4a0e-99b7-07482d13cb82 244
1 0 aa6d6172-e34d-473c-9c76-6c44c405bda9 243
1 0 13fa1521-83ea-40ea-b22c-48480e81d546 242
1 0 ccfd40a2-4dc8-4861-8021-f04756bd1ad5 241
12928 


9812 


5767 




In [55]:
!head ../../../../third-party/health-misinfo-20/qrels.shortest.txt


# Spot-check for qrels.shortest (year 20): print the token counts of three listed
# documents; a higher grade is expected to go with a smaller count.
print(get_document_text_length(20, '8bd01f64-e4af-41cc-9742-ad041c9497b8'), '\n\n')
print(get_document_text_length(20, 'b5d2b341-0c1f-4f20-bcf9-855d57155001'), '\n\n')
print(get_document_text_length(20, '9d28e1bf-cb0a-4975-b1a6-5842a41d350e'), '\n\n')

1 0 8bd01f64-e4af-41cc-9742-ad041c9497b8 250
1 0 b5d2b341-0c1f-4f20-bcf9-855d57155001 249
1 0 9d28e1bf-cb0a-4975-b1a6-5842a41d350e 248
1 0 a0eb055a-0c70-42e8-a487-ac71bf7a02c7 247
1 0 c771f702-822d-4ad4-bd90-a7acc9aa80d9 246
1 0 a6f304ed-ca1a-4a65-9836-b1b7e45c07a6 245
1 0 c8ad91b6-705b-46fa-9673-c3cad1327954 244
1 0 60277b81-8d83-4897-9b1d-ab5fda1025c1 243
1 0 330e1951-617a-47b6-8d63-1e4ee4789bfd 242
1 0 3800ef9f-1654-4485-a6b3-71d60da385ed 241
355 


429 


433 




In [57]:
!head ../../../../third-party/health-misinfo-21/qrels.longest.txt


# Spot-check for qrels.longest (year 21): print the token counts of three listed
# documents; a higher grade is expected to go with a larger count.
print(get_document_text_length(21, 'en.noclean.c4-train.00871-of-07168.71056'), '\n\n')
print(get_document_text_length(21, 'en.noclean.c4-train.05309-of-07168.84366'), '\n\n')
print(get_document_text_length(21, 'en.noclean.c4-train.02836-of-07168.76315'), '\n\n')

101 0 en.noclean.c4-train.00871-of-07168.71056 53
101 0 en.noclean.c4-train.05309-of-07168.84366 52
101 0 en.noclean.c4-train.02836-of-07168.76315 51
101 0 en.noclean.c4-train.06990-of-07168.103435 50
101 0 en.noclean.c4-train.03364-of-07168.45760 49
101 0 en.noclean.c4-train.06983-of-07168.46260 48
101 0 en.noclean.c4-train.00922-of-07168.50887 47
101 0 en.noclean.c4-train.01681-of-07168.38406 46
101 0 en.noclean.c4-train.06476-of-07168.102082 45
101 0 en.noclean.c4-train.00444-of-07168.126346 44
14396 


14219 


12988 




In [59]:
!head ../../../../third-party/health-misinfo-21/qrels.shortest.txt


# Spot-check for qrels.shortest (year 21): print the token counts of three listed
# documents; a higher grade is expected to go with a smaller count.
print(get_document_text_length(21, 'en.noclean.c4-train.00619-of-07168.117364'), '\n\n')
print(get_document_text_length(21, 'en.noclean.c4-train.05248-of-07168.12130'), '\n\n')
print(get_document_text_length(21, 'en.noclean.c4-train.04820-of-07168.134534'), '\n\n')

101 0 en.noclean.c4-train.00619-of-07168.117364 53
101 0 en.noclean.c4-train.05248-of-07168.12130 52
101 0 en.noclean.c4-train.04820-of-07168.134534 51
101 0 en.noclean.c4-train.00119-of-07168.41683 50
101 0 en.noclean.c4-train.01604-of-07168.53354 49
101 0 en.noclean.c4-train.03685-of-07168.54919 48
101 0 en.noclean.c4-train.01864-of-07168.39631 47
101 0 en.noclean.c4-train.05096-of-07168.19901 47
101 0 en.noclean.c4-train.05198-of-07168.12643 47
101 0 en.noclean.c4-train.06643-of-07168.136880 47
384 


599 


607 


