In [1]:
# Imports
from tira.third_party_integrations import ensure_pyterrier_is_loaded, persist_and_normalize_run
from tira.rest_api_client import Client
import pyterrier as pt

In [2]:
# Initialise PyTerrier first (the cell output reports the PyTerrier/Terrier versions
# that were loaded), then create the REST client to the TIRA platform that is used
# below to retrieve the pre-indexed data.
ensure_pyterrier_is_loaded()
tira = Client()

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [3]:
# Identifiers of the resources used throughout the notebook.
# Dataset: the union of the IR Anthology and the ACL Anthology (training split).
DATASET_ID = 'irds:ir-lab-sose-2024/ir-acl-anthology-20240504-training'
# Pre-built PyTerrier index published on TIRA.
TIRA_INDEX_ID = 'ir-lab-sose-2024/tira-ir-starter/Index (tira-ir-starter-pyterrier)'

# Resolve the 'irds:' identifier to a PyTerrier dataset object.
pt_dataset = pt.get_dataset(DATASET_ID)

# Load the matching pre-built index from TIRA for that dataset.
index = tira.pt.index(TIRA_INDEX_ID, pt_dataset)

In [4]:
# BM25 retriever over the pre-built index; it is applied to the topics frame below
# to produce the run.
bm25 = pt.BatchRetrieve(index, wmodel="BM25")

In [5]:
print('First, we have a short look at the first three topics:')

# Fetch the topics once, then display only the leading three rows as the cell output.
topics_preview = pt_dataset.get_topics('text')
topics_preview.head(3)

First, we have a short look at the first three topics:


Unnamed: 0,qid,query
0,1,retrieval system improving effectiveness
1,2,machine learning language identification
2,3,social media detect self harm


In [6]:
print('Now we do the retrieval...')
# Run BM25 over every topic; `run` is the resulting ranking frame
# (columns qid, docid, docno, rank, score, query in the recorded output).
run = bm25(pt_dataset.get_topics('text'))

print('Done. Here are the first 10 entries of the run')
# Show exactly as many rows as the message above announces (was head(20)).
run.head(10)

Now we do the retrieval...
Done. Here are the first 10 entries of the run


Unnamed: 0,qid,docid,docno,rank,score,query
0,1,94858,2004.cikm_conference-2004.47,0,15.681777,retrieval system improving effectiveness
1,1,125137,1989.ipm_journal-ir0volumeA25A4.2,1,15.04738,retrieval system improving effectiveness
2,1,125817,2005.ipm_journal-ir0volumeA41A5.11,2,14.144223,retrieval system improving effectiveness
3,1,5868,W05-0704,3,14.025748,retrieval system improving effectiveness
4,1,84876,2016.ntcir_conference-2016.90,4,13.947994,retrieval system improving effectiveness
5,1,82472,1998.sigirconf_conference-98.15,5,13.901647,retrieval system improving effectiveness
6,1,94415,2008.cikm_conference-2008.183,6,13.808208,retrieval system improving effectiveness
7,1,17496,O01-2005,7,13.749449,retrieval system improving effectiveness
8,1,82490,1998.sigirconf_conference-98.33,8,13.735541,retrieval system improving effectiveness
9,1,124801,2006.ipm_journal-ir0volumeA42A3.2,9,13.569263,retrieval system improving effectiveness


In [7]:
# Print every topic's query text together with its zero-based position.
topic_queries = pt_dataset.get_topics('query')['query']
for position, query_text in enumerate(topic_queries):
    print(position, query_text)

0 retrieval system improving effectiveness
1 machine learning language identification
2 social media detect self harm
3 stemming for arabic languages
4 audio based animal recognition
5 comparison different retrieval models
6 cache architecture
7 document scoping formula
8 pseudo relevance feedback
9 how to represent natural conversations in word nets
10 algorithm acceleration with nvidia cuda
11 mention of algorithm
12 at least three authors
13 german domain
14 mention of open source
15 inclusion of text mining
16 the ethics of artificial intelligence
17 machine learning for more relevant results
18 crawling websites using machine learning
19 recommenders influence on users
20 search engine caching effects
21 consumer product reviews
22 limitations machine learning
23 medicine related research
24 natural language processing
25 graph based ranking
26 medical studies that use information retrieval
27 information retrieval on different language sources
28 papers that compare multiple info

In [62]:
# Positional window of ten rows from the run; in the recorded output these are
# ranks 0-9 for qid 10.
# NOTE(review): the name `slice` shadows the Python builtin; it is kept here because
# a later cell reads `slice['docno']`.
slice = run.iloc[9000:9010]
print(slice)

     qid  docid             docno  rank      score  \
9000  10   2925          E83-1021     0  20.916117   
9001  10  23308          W03-1403     1  19.225459   
9002  10  32141  2020.figlang-1.8     2  19.015074   
9003  10  56494          W19-7510     3  18.394361   
9004  10  39091          W12-4804     4  17.761019   
9005  10  49784          C80-1002     5  17.743728   
9006  10  56442      2019.gwc-1.7     6  17.493578   
9007  10  66935          E99-1009     7  17.345538   
9008  10  65142          J77-4003     8  17.336473   
9009  10  27993          P18-1014     9  17.163196   

                                                  query  
9000  how to represent natural conversations in word...  
9001  how to represent natural conversations in word...  
9002  how to represent natural conversations in word...  
9003  how to represent natural conversations in word...  
9004  how to represent natural conversations in word...  
9005  how to represent natural conversations in word...  

In [55]:
# Positional window of ten rows from the run; in the recorded output these are
# ranks 0-9 for qid 44 ("information retrieval").
slice2 = run.iloc[42000:42010]
print(slice2)

      qid   docid                                        docno  rank  \
42000  44   84131            2003.sigirconf_conference-2003.73     0   
42001  44  101568                 2015.ictir_conference-2015.2     1   
42002  44  126596           2011.tois_journal-ir0volumeA29A2.0     2   
42003  44   19891                                     C94-2169     3   
42004  44  122130  2012.sigirjournals_journal-ir0volumeA46A1.9     4   
42005  44  124838            2006.ipm_journal-ir0volumeA42A1.2     5   
42006  44   83081            2005.sigirconf_conference-2005.64     6   
42007  44   82931           2013.sigirconf_conference-2013.125     7   
42008  44   93970                  2014.cikm_conference-2014.9     8   
42009  44   80905           2008.sigirconf_conference-2008.170     9   

          score                  query  
42000  6.025493  information retrieval  
42001  5.912126  information retrieval  
42002  5.868038  information retrieval  
42003  5.845946  information retrieval  
42

In [58]:
# Positional window of ten rows from the run; in the recorded output these are
# ranks 0-9 for qid 60 ("what is information retrieval").
slice3 = run.iloc[56283:56293]
print(slice3)

      qid   docid                                        docno  rank  \
56283  60   84131            2003.sigirconf_conference-2003.73     0   
56284  60  101568                 2015.ictir_conference-2015.2     1   
56285  60  126596           2011.tois_journal-ir0volumeA29A2.0     2   
56286  60   19891                                     C94-2169     3   
56287  60  122130  2012.sigirjournals_journal-ir0volumeA46A1.9     4   
56288  60  124838            2006.ipm_journal-ir0volumeA42A1.2     5   
56289  60   83081            2005.sigirconf_conference-2005.64     6   
56290  60   82931           2013.sigirconf_conference-2013.125     7   
56291  60   93970                  2014.cikm_conference-2014.9     8   
56292  60   80905           2008.sigirconf_conference-2008.170     9   

          score                          query  
56283  6.025493  what is information retrieval  
56284  5.912126  what is information retrieval  
56285  5.868038  what is information retrieval  
56286  5.84

In [63]:
# Document numbers of the rows selected in `slice`, as a plain Python list in run order.
docnos = slice['docno'].tolist()
print(docnos)

['E83-1021', 'W03-1403', '2020.figlang-1.8', 'W19-7510', 'W12-4804', 'C80-1002', '2019.gwc-1.7', 'E99-1009', 'J77-4003', 'P18-1014']


In [64]:
# Collect "<docno>\n<text>" for every corpus document whose docno was selected above.
# A set gives O(1) membership checks, so the corpus is scanned once without an inner
# loop over `docnos` for each document.
# Entries are appended in the order the corpus iterator yields them, which is not
# necessarily the rank order of `docnos`.
wanted_docnos = set(docnos)
text = [
    doc['docno'] + '\n' + doc['text']
    for doc in pt_dataset.get_corpus_iter()
    if doc['docno'] in wanted_docnos
]

ir-lab-sose-2024/ir-acl-anthology-20240504-training documents: 100%|██████████| 126958/126958 [00:02<00:00, 53931.97it/s]


In [66]:
# Print each collected document with its position, followed by one blank line.
for position, entry in enumerate(text):
    print(position, entry, end='\n\n')

0 E83-1021
An Approach to Natural Language in the {SI-N}ets Paradigm


 Thls article deals with the interpretation of conceptual operations underlying the communicative use of natural language (NL) within the Structured Inheritance Network (Sl-Nets) paradigm. The operations are reduced to functions of a fo~al language, thus changing the level of abstraction of the operations to be performed on SI-Nets. In this sense, operations on SI-Nets are not merely isomorphic to single epistemologleal objects, but can be viewed as a simulation of processes on a different level, that pertaining to the conceptual system of NL. For this purpose, we have designed a version of KL-ONE which represents

1 W03-1403
Is There a Way to Represent Metaphors in {W}ord{N}ets? Insights from the {H}amburg Metaphor Database


 This paper addresses the question whether metaphors can be represented in Word-Nets. For this purpose, domain-centered data is collected from the Hamburg Metaphor Database, an online source c