In [22]:
import numpy as np
import pandas as pd
import os

from tira.third_party_integrations import ensure_pyterrier_is_loaded, persist_and_normalize_run

# Load and start PyTerrier through the TIRA helper so that it also works
# inside the TIRA sandbox.
ensure_pyterrier_is_loaded()

# PyTerrier must be imported only after ensure_pyterrier_is_loaded() has been
# called when running in TIRA, which is why this import is not at the top.
import pyterrier as pt

Due to execution in TIRA, I have patched ir_datasets to always return the single input dataset mounted to the sandbox.


### Herausfinden des Schwellwerts

In [2]:
# Parse the anonymized topics XML file into a DataFrame.
df = pd.read_xml(path_or_buffer="topics-anonymized.xml")

In [3]:
# Preview the first five topics to check that the columns were parsed as expected.
df.head(n=5)

Unnamed: 0,number,title,description,narrative
0,1,the frequency of solar storms with impact on e...,\n How often do solar storms occur which ...,\n Scientific publications or articles wh...
1,2,popular pastries in germany,\n Which are the best-selling pastries in...,"\n Find statistics, news articles or othe..."
2,3,flights Frankfurt to Rome,\n Show flight connections from Frankfurt...,\n Relevant are all future flight connect...
3,4,remove wine stains,\n How can wine stains be removed?\n,\n Relevant is all information about wine...
4,5,tipping in us,\n What tipping is considered appropriate...,\n We ask for some guidelines of how to c...


In [4]:
# The topic titles are used as the query strings.
queries = df.loc[:, "title"]

In [5]:
# Title length in characters.
df["length"] = df["title"].map(len)

In [6]:
# Number of whitespace-separated words per title. Splitting on whitespace
# (instead of counting single spaces) keeps the count correct when a title has
# leading/trailing whitespace or several consecutive spaces.
df["word_count"] = df["title"].str.split().str.len()

In [15]:
# Keep the topics ordered by their title length in words.
df = df.sort_values(by="word_count")
# Show how many topics exist for each word count.
df.groupby(by="word_count").count()

Unnamed: 0_level_0,number,title,description,narrative,length
word_count,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,2,2,2,2,2
2,7,7,7,7,7
3,16,16,16,16,16
4,17,17,17,17,17
5,7,7,7,7,7
6,2,2,2,2,2
7,3,3,3,3,3
12,1,1,1,1,1


In [13]:
# Median number of words per title; used as the candidate threshold.
df.loc[:, "word_count"].median()

4.0

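Der Median der Wortanzahl pro Titel liegt bei 4. Diesen Wert verwenden wir als Schwellwert, um kurze von langen Anfragen zu trennen.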

### Index bauen

In [21]:
# Identifier of the ir_datasets collection to load through PyTerrier.
dataset_id = 'irds:ir-lab-jena-leipzig-wise-2023/validation-20231104-training'
data = pt.get_dataset(dataset_id)

Load ir_dataset "ir-lab-jena-leipzig-wise-2023/validation-20231104-training" from tira.


In [23]:
print('Build index:')
# Indexer that writes its index to /tmp/index.
# NOTE(review): meta={'docno': 100} presumably sets the maximum stored length of
# the docno metadata field to 100 characters — confirm against the PyTerrier docs.
iter_indexer = pt.IterDictIndexer("/tmp/index", meta={'docno': 100}, verbose=True)
# Remove any index left over from a previous run before indexing starts.
!rm -Rf /tmp/index
# Index the documents yielded by the dataset's corpus iterator.
indexref = iter_indexer.index(data.get_corpus_iter())
print('Done. Index is created')

Build index:
No settings given in /Users/dominicwild/.tira/.tira-settings.json. I will use defaults.
No settings given in /Users/dominicwild/.tira/.tira-settings.json. I will use defaults.


ir-lab-jena-leipzig-wise-2023/validation-20231104-training documents:   0%|          | 0/61307 [00:00<?, ?it/s]

No settings given in /Users/dominicwild/.tira/.tira-settings.json. I will use defaults.
No settings given in /Users/dominicwild/.tira/.tira-settings.json. I will use defaults.


ir-lab-jena-leipzig-wise-2023/validation-20231104-training documents: 100%|██████████| 61307/61307 [00:24<00:00, 2502.74it/s]


Done. Index is created


### Unsere Retrieval-Pipeline

In [50]:
# BM25 retriever and Bo1 query-expansion rewriter, both backed by the same index.
bm25 = pt.BatchRetrieve(indexref, wmodel="BM25", verbose=True)
bo1 = pt.rewrite.Bo1QueryExpansion(indexref, verbose=True)

# With expansion: retrieve, rewrite the query from the results, retrieve again.
# Without expansion: a single BM25 retrieval.
# NOTE(review): the unary `~` presumably wraps each pipeline in PyTerrier's
# result cache — confirm, and check cached results are not reused after the
# index has been rebuilt.
qe_pipeline = ~(bm25 >> bo1 >> bm25)
non_qe_pipeline = ~bm25


In [31]:
# Load the topics of the dataset as a DataFrame.
topics = data.get_topics()
# Number of whitespace-separated words per title. Splitting on whitespace
# (instead of counting single spaces) is robust against leading/trailing
# whitespace and repeated spaces.
topics["word_count"] = topics["title"].str.split().str.len()
# TODO: use PyTerrier's tokenisation instead of our own

There are multiple query fields available: ('text', 'title', 'query', 'description', 'narrative'). To use with pyterrier, provide variant or modify dataframe to add query column.


In [51]:
# Number of whitespace-separated words per title (robust against repeated or
# surrounding whitespace, unlike counting single spaces).
topics["word_count"] = topics["title"].str.split().str.len()

# The median word count separates short from long queries.
threshold = topics["word_count"].median()

# Topics that sit exactly on the threshold are assigned to whichever side is
# smaller, so that the two groups end up as balanced as possible.
n_above = (topics["word_count"] > threshold).sum()
n_below = (topics["word_count"] < threshold).sum()
if n_above > n_below:
    # More topics above than below: threshold-length topics count as short.
    topics["is_long"] = topics["word_count"] > threshold
else:
    # Otherwise threshold-length topics count as long.
    topics["is_long"] = topics["word_count"] >= threshold

In [56]:
# Compare both pipelines per query on nDCG@5.
# NOTE: the name `eval` shadows the Python builtin; it is kept because the
# following cells refer to the result under this name.
eval = pt.Experiment(
    retr_systems=[qe_pipeline, non_qe_pipeline],
    topics=topics,
    qrels=data.get_qrels(),
    eval_metrics=["ndcg_cut_5"],
    names=["mit query expansion", "ohne query expansion"],
    perquery=True,
)

In [59]:
# Attach the topic attributes (including `is_long`) to the per-query results.
# Topic columns left over from an earlier run of this cell are dropped first, so
# re-running the cell does not produce suffixed duplicate columns
# (e.g. `is_long_x` / `is_long_y`) that would break the lookup of `is_long`.
topic_columns = [column for column in topics.columns if column != "qid"]
eval = eval.drop(columns=topic_columns, errors="ignore").merge(topics, on="qid")

In [62]:
# Mean nDCG@5 per query-length group (short/long) and per pipeline.
eval.groupby(["is_long", "name"])[["value"]].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,value
is_long,name,Unnamed: 2_level_1
False,mit query expansion,0.131476
False,ohne query expansion,0.141077
True,mit query expansion,0.14828
True,ohne query expansion,0.14972
