# Table: Retrieved Documents per query

In [1]:
from trectools import TrecRun
import pandas as pd
from tqdm import tqdm
from glob import glob
RUN_DIR='/mnt/ceph/storage/data-in-progress/data-teaching/theses/wstud-thesis-probst/retrievalExperiments/runs-ecir22/runs-retrieval-homogenity/'
topics = {int(i.split()[0]): i.split('\t')[1].strip() for i in open('../../Data/topics.retrieval-homogenity.tsv')}

def documents_per_topic(file_name):
    parts = file_name.split('.')
    index = parts[1]
    retrieval_model = parts[2]
    run = TrecRun(RUN_DIR + file_name).run_data
    covered_queries = []
    for _, i in run.groupby('query').size().reset_index(name='count').iterrows():
        covered_queries += [i['query']]
        yield {
            'index': index,
            'retrieval_model': retrieval_model,
            'topic': i['query'],
            'query': topics[i['query']],
            'retrieved_docs': i['count'],
        }
    
    covered_queries = set(covered_queries)
    for topic in topics.keys():
        if topic not in covered_queries:
            yield {
                'index': index,
                'retrieval_model': retrieval_model,
                'topic': topic,
                'query': topics[topic],
                'retrieved_docs': 0,
            }

In [2]:
df = []

for run_file in tqdm(glob(RUN_DIR + '*.txt')):
    df += [i for i in documents_per_topic(run_file.split('/')[-1])]

df = pd.DataFrame(df)
df

100%|██████████| 31/31 [13:38<00:00, 26.41s/it]


Unnamed: 0,index,retrieval_model,topic,query,retrieved_docs
0,cc-21-04-anchortext,qld-default,2004033,good woods for carving,1000
1,cc-21-04-anchortext,qld-default,2004062,chance the rapper networth,1000
2,cc-21-04-anchortext,qld-default,2006789,mlb advanced media,1000
3,cc-21-04-anchortext,qld-default,2006876,eclectic energies test,1000
4,cc-21-04-anchortext,qld-default,2007147,best netflix shows of all time,1000
...,...,...,...,...,...
309964,cc-20-05-anchortext,bm25+rm3-default,10909435,wvlpnboard,0
309965,cc-20-05-anchortext,bm25+rm3-default,9744257,www.spellcity.com,0
309966,cc-20-05-anchortext,bm25+rm3-default,3838516,kambuta,0
309967,cc-20-05-anchortext,bm25+rm3-default,10115744,richback,0


In [3]:
df_tmp = df[df['retrieval_model'] == 'bm25-conjunction-default'].groupby(['index', 'topic']).sum().reset_index()
df_tmp['terms'] = df.topic.apply(lambda i: len(topics[i].split()))
df_tmp

Unnamed: 0,index,topic,retrieved_docs,terms
0,cc-19-47-anchortext,2004033,1,4
1,cc-19-47-anchortext,2004062,0,4
2,cc-19-47-anchortext,2006789,0,3
3,cc-19-47-anchortext,2006876,2,3
4,cc-19-47-anchortext,2007147,8,6
...,...,...,...,...
39991,orcas,12852544,5,1
39992,orcas,12853262,1,1
39993,orcas,12853353,20,1
39994,orcas,12853486,3,1


In [4]:
term_to_query_count = {i['terms']:i['retrieved_docs'] for _, i in df_tmp[df_tmp['index'] == 'ms-marco-content'].groupby('terms').count().reset_index().iterrows()}

In [5]:
df_tmp['index'].unique()

array(['cc-19-47-anchortext', 'ms-marco-content',
       'msmarco-document-v1-title-only', 'orcas'], dtype=object)

In [6]:
df_queries_with_empty_result_set = df_tmp[df_tmp['retrieved_docs'] <= 0].groupby(['index', 'terms']).count().reset_index()[['index', 'terms', 'retrieved_docs']].copy()

df_queries_with_empty_result_set['percentage_empty_results'] = df_queries_with_empty_result_set.apply(lambda i: 1-(i['retrieved_docs']/term_to_query_count[i['terms']]), axis=1)
df_queries_with_empty_result_set

Unnamed: 0,index,terms,retrieved_docs,percentage_empty_results
0,cc-19-47-anchortext,1,601,0.476936
1,cc-19-47-anchortext,2,1120,0.520137
2,cc-19-47-anchortext,3,1337,0.494518
3,cc-19-47-anchortext,4,993,0.481733
4,cc-19-47-anchortext,5,603,0.496661
5,cc-19-47-anchortext,6,187,0.527778
6,cc-19-47-anchortext,7,100,0.52381
7,cc-19-47-anchortext,8,49,0.443182
8,cc-19-47-anchortext,9,18,0.470588
9,cc-19-47-anchortext,10,8,0.5


In [100]:
# 1000 Random documents for homogenity evaluation

In [43]:
MARCO_TOPICS = [i['topic'] for _, i in df_tmp[(df_tmp['retrieved_docs'] >= 800) & (df_tmp['index'] == 'ms-marco-content')].iterrows()]
ANCHOR_TOPICS = [i['topic'] for _, i in df_tmp[(df_tmp['retrieved_docs'] >= 800) & (df_tmp['index'] == 'cc-19-47-anchortext')].iterrows()]

topics_conjunction = set(ANCHOR_TOPICS).intersection(set(MARCO_TOPICS))

json.dump([i for i in topics_conjunction], open('topics-for-homogenity-evaluation_conjunction.json', 'w'))

In [41]:
len(ANCHOR_TOPICS)

44

In [98]:
from random import shuffle
import json
df_tmp = df[df['retrieval_model'] == 'bm25-default'].groupby('topic').sum().reset_index()
random_topics = [i for i in df_tmp[df_tmp['retrieved_docs'] >= 8000].topic]

shuffle(random_topics)
json.dump(random_topics[:1000], open('topics-for-homogenity-evaluation.json', 'w'))

In [99]:
!cat topics-for-homogenity-evaluation.json

[2689528, 5252102, 11574750, 9059837, 2658813, 7956181, 2576073, 5830387, 8281267, 11208713, 10848880, 8255800, 12285000, 11740115, 10414970, 6970416, 6053354, 7159891, 9433994, 10681608, 9733926, 12522869, 11816304, 2028315, 11959893, 3288833, 10436098, 4595179, 4216769, 3960392, 4878201, 11456573, 7292700, 4181272, 11052419, 11416630, 9557090, 6071639, 8381438, 2237527, 9098907, 9602955, 3289137, 12180237, 5346076, 7887782, 3960987, 11094636, 4885932, 7458982, 12277572, 3317142, 4938566, 12278496, 8383624, 7025065, 2942309, 11798147, 9758943, 6774687, 8960295, 7571218, 5725131, 4555319, 2704695, 6099117, 10589927, 12842711, 6925733, 3958146, 11926146, 8142819, 3901245, 7995938, 2618400, 5752218, 4106057, 9159829, 2798261, 3583479, 11216005, 10853742, 12048756, 8053964, 7253672, 4314960, 12685633, 9737489, 4364725, 4763745, 7873592, 6689522, 7851559, 11820637, 12446296, 12844550, 5770320, 11306972, 11752202, 11004867, 10059874, 5683920, 9478551, 4255979, 9813413, 10525362, 7185532, 97

In [66]:
df[df['retrieved_docs'] <= 0].groupby('index').size()

index
cc-16-07-anchortext               2100
cc-17-04-anchortext               1980
cc-18-13-anchortext               1935
cc-19-47-anchortext               1962
cc-20-05-anchortext               1917
cc-21-04-anchortext               1959
ms-marco-content                  1251
msmarco-document-v1-title-only    2430
orcas                                6
dtype: int64

In [70]:
2430/30000

0.081

In [39]:
del_run.run_data.groupby('query').size().reset_index(name='count').sort_values('count')

Unnamed: 0,query,count
6225,9348986,1
5872,8917600,1
5322,8298567,1
7008,10268261,1
5806,8850508,1
...,...,...
3100,5728387,1000
3101,5729069,1000
3102,5730324,1000
3122,5755588,1000
