In [1]:
# Imports
from tira.third_party_integrations import ensure_pyterrier_is_loaded, persist_and_normalize_run
from tira.rest_api_client import Client
import pyterrier as pt
import pandas as pd

In [2]:
# Create a REST client to the TIRA platform for retrieving the pre-indexed data.
ensure_pyterrier_is_loaded()
tira = Client()

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [8]:
# The dataset: the union of the IR Anthology and the ACL Anthology
# This line creates an IRDSDataset object and registers it under the name provided as an argument.
pt_dataset = pt.get_dataset('irds:ir-lab-sose-2024/ir-acl-anthology-20240504-training')
pt_dataset.get_topics('query').head(4)

Unnamed: 0,qid,query
0,1,retrieval system improving effectiveness
1,2,machine learning language identification
2,3,social media detect self harm
3,4,stemming for arabic languages


In [4]:
# This assumes we have execited the ../baseline-retrieval-system/baseline-retrieval-system.ipynb notebook before.
bm25 = pt.io.read_results('../runs/runbm25.txt')
ngrams = pt.io.read_results('../runs/runngram.txt')
pt.Experiment(
    [bm25, ngrams],
    pt_dataset.get_topics(),
    pt_dataset.get_qrels(),
    ["ndcg_cut.10", "recip_rank", "recall_1000"],
    names=["BM25", "Ngrams"]
)

There are multiple query fields available: ('text', 'title', 'query', 'description', 'narrative'). To use with pyterrier, provide variant or modify dataframe to add query column.


Unnamed: 0,name,ndcg_cut.10,recip_rank,recall_1000
0,BM25,0.374041,0.579877,0.825376
1,Ngrams,0.274593,0.469564,0.703116


In [5]:
# This assumes we have execited the ../baseline-retrieval-system/baseline-retrieval-system.ipynb notebook before.
bm25 = pt.io.read_results('../runs/runbm25.txt')
ngrams = pt.io.read_results('../runs/runngram.txt')
res_dict = pt.Experiment(
    [bm25, ngrams],
    pt_dataset.get_topics(),
    pt_dataset.get_qrels(),
    ["ndcg_cut.10", "recip_rank", "recall_1000", "map"],
    names=["BM25", "Ngrams"],
    #baseline=0,
    perquery = True,
    save_dir = "./",
    save_mode="overwrite",
    dataframe = True
)
print(res_dict)

There are multiple query fields available: ('text', 'title', 'query', 'description', 'narrative'). To use with pyterrier, provide variant or modify dataframe to add query column.
       name qid      measure     value
0      BM25   1          map  0.404455
1      BM25   1   recip_rank  1.000000
2      BM25   1  recall_1000  0.966667
3      BM25   1  ndcg_cut.10  0.835780
36     BM25  10          map  0.001188
..      ...  ..          ...       ...
303  Ngrams   8  ndcg_cut.10  0.000000
304  Ngrams   9          map  0.087581
305  Ngrams   9   recip_rank  0.166667
306  Ngrams   9  recall_1000  1.000000
307  Ngrams   9  ndcg_cut.10  0.151668

[544 rows x 4 columns]


In [6]:
import pandas as pd
df = pd.DataFrame(res_dict)
print(df)
# Filter criteria
filter_criteria = (
    (df['measure'] == 'map') &
    (df['name'].isin(['Ngrams', 'BM25']))
)

# Apply the filter
filtered_df = df[filter_criteria]

# Group by 'qid' and filter groups that have both 'NGRAMS' and 'BM25' entries
groups = filtered_df.groupby('qid')
valid_pairs = []
valid_qids = []
for qid, group in groups:
    if len(group) == 2 and set(group['name']) == {'Ngrams', 'BM25'}:
        ngrams_row = group[group['name'] == 'Ngrams'].iloc[0]
        bm25_row = group[group['name'] == 'BM25'].iloc[0]
        
        if ngrams_row['value'] > bm25_row['value']:
            valid_pairs.append((ngrams_row, bm25_row))
            valid_qids.append(qid)
# Print the valid pairs
for ngrams_row, bm25_row in valid_pairs:
    print(f"Pair found for qid {ngrams_row['qid']}:")
    print(f"Ngrams: {ngrams_row}")
    print(f"BM25: {bm25_row}")
    print(valid_qids)


       name qid      measure     value
0      BM25   1          map  0.404455
1      BM25   1   recip_rank  1.000000
2      BM25   1  recall_1000  0.966667
3      BM25   1  ndcg_cut.10  0.835780
36     BM25  10          map  0.001188
..      ...  ..          ...       ...
303  Ngrams   8  ndcg_cut.10  0.000000
304  Ngrams   9          map  0.087581
305  Ngrams   9   recip_rank  0.166667
306  Ngrams   9  recall_1000  1.000000
307  Ngrams   9  ndcg_cut.10  0.151668

[544 rows x 4 columns]
Pair found for qid 10:
Ngrams: name         Ngrams
qid              10
measure         map
value      0.001308
Name: 308, dtype: object
BM25: name           BM25
qid              10
measure         map
value      0.001188
Name: 36, dtype: object
['10', '18', '20', '28', '29', '3', '31', '33', '39', '49', '51', '56', '62', '65', '68']
Pair found for qid 18:
Ngrams: name         Ngrams
qid              18
measure         map
value      0.237411
Name: 340, dtype: object
BM25: name           BM25
qid       

In [7]:
import pandas as pd

#print(pt_dataset.get_topics())
#valid_qids_normal = ['10', '18', '20', '28', '29', '3', '31', '33', '39', '4', '49', '51', '56', '62', '65', '68']
df = pd.DataFrame(pt_dataset.get_topics())
#print(df)

filtered_df = df[df['qid'].isin(valid_qids)]
# Printing the filtered DataFrame
print(filtered_df)

There are multiple query fields available: ('text', 'title', 'query', 'description', 'narrative'). To use with pyterrier, provide variant or modify dataframe to add query column.
   qid                                               text  \
2    3                      social media detect self-harm   
9   10  How to represent natural conversations in word...   
18  20           Crawling websites using machine learning   
26  28   medical studies that use information retrieval ?   
27  29  information retrieval on different language so...   
29  31   risks of information retrieval in social media ?   
31  33                                fake news detection   
37  39      informational retrieval using neural networks   
47  49                             exhaustivity of index    
49  51                          Cosine similarity vector    
54  56   What makes Natural Language Processing natural?    
60  62                          How to avoid spam results   
63  65                  info