# Query Segmentation in PyTerrier from TIREx executions

### Set up the environment

In [1]:
from tira.local_client import Client
import pyterrier as pt
import pandas as pd

# Name of the test collection; reused for the PyTerrier dataset here and for
# the TIRA lookups in later cells.
dataset = "trec-robust-2004"

tira = Client()

# Start PyTerrier only when it is not already running in this kernel.
if not pt.started():
    pt.init()

pt_dataset = pt.get_dataset(dataset)


PyTerrier 0.9.2 has loaded Terrier 5.7 (built by craigm on 2022-11-10 18:30) and terrier-helper 0.0.7

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [3]:
# Keep only topic 303 from the dataset's topics; the qrels are filtered to the
# same topic in a later cell.
dev_topics = pt_dataset.get_topics().loc[lambda topics: topics['qid'].astype(int) == 303]
dev_topics

Unnamed: 0,qid,query
2,303,hubble telescope achievements


### Prepare Retrieval Components

In [5]:
# Transformer created from the TIRA submission "PyTerrierBM25" for the chosen dataset.
bm25 = tira.pt.from_retriever_submission(
    "ir-benchmarks/tira-ir-starter/PyTerrierBM25",
    dataset,
)

In [6]:
# BM25 text scorer that operates on the "text" attribute of the documents it receives.
bm25_score = pt.text.scorer(
    takes="docs",
    body_attr="text",
    wmodel="BM25",
)

# Pipeline: results of the TIRA submission, followed by the BM25 text scorer.
bm25_retrieval = bm25 >> bm25_score

In [8]:
# Relevance judgments of the dataset, restricted to topic 303.
dev_qrels = pt_dataset.get_qrels().loc[lambda qrels: qrels['qid'].astype(int) == 303]
dev_qrels

Unnamed: 0,qid,docno,label
2386,303,FBIS3-16217,0
2387,303,FBIS3-19093,0
2388,303,FBIS3-21007,0
2389,303,FBIS3-21026,0
2390,303,FBIS3-23561,0
...,...,...,...
3142,303,LA122589-0068,0
3143,303,LA122590-0113,0
3144,303,LA122790-0152,0
3145,303,LA122990-0029,1


In [9]:
# Show only the judgments with a label above zero.
dev_qrels.loc[dev_qrels['label'].gt(0)]

Unnamed: 0,qid,docno,label
2519,303,FT921-7107,1
2595,303,FT924-286,1
2627,303,FT931-6554,1
2731,303,FT941-15661,1
2739,303,FT941-17652,1
2818,303,FT944-128,1
2918,303,LA051290-0079,1
3109,303,LA110590-0076,1
3119,303,LA112190-0043,1
3145,303,LA122990-0029,1


In [23]:
# Evaluate the BM25 pipeline on the selected topic against its judgments.
pt.Experiment(
    [bm25_retrieval],
    dev_topics,
    dev_qrels,
    eval_metrics=["recip_rank", "P_3", "P_5", "P_10", "ndcg_cut_10", "ndcg_cut_5"],
    names=['BM25'],
)



Unnamed: 0,name,recip_rank,P_3,P_5,P_10,ndcg_cut_10,ndcg_cut_5
0,BM25,0.052632,0.0,0.0,0.0,0.0,0.0


In [49]:
def count_phrase_occurrences(row):
    """Score a document by how often the query occurs verbatim in its text.

    Matching is case-insensitive: the document text and the query are both
    lower-cased before counting. (Lower-casing only the text, as before, makes
    every query that contains an upper-case letter score 0.)

    Args:
        row: one result row providing the ``'text'`` and ``'query'`` entries.

    Returns:
        The number of non-overlapping occurrences of the query in the text,
        or 0 when the query is empty.
    """
    phrase = row['query'].lower()
    if not phrase:
        # str.count('') returns len(text) + 1, which would reward long documents.
        return 0
    return row['text'].lower().count(phrase)


# Replace the score of every BM25 result by its number of exact phrase matches.
phrase_search = bm25_retrieval >> pt.apply.doc_score(count_phrase_occurrences)

# NOTE: this overwrites the topic frame built earlier; every later cell that
# reads `dev_topics` sees this single, shortened query.
dev_topics = pd.DataFrame([{'qid': '303', 'query': 'hubble telescope'}])

# NOTE(review): the run is still labelled 'BM25' in the result table although
# it evaluates `phrase_search`; the label is kept so recorded outputs stay valid.
pt.Experiment(
    [phrase_search],
    dev_topics,
    dev_qrels,
    eval_metrics=["recip_rank", "P_3", "P_5", "P_10", "ndcg_cut_10", "ndcg_cut_5"],
    names=['BM25'],
)



Unnamed: 0,name,recip_rank,P_3,P_5,P_10,ndcg_cut_10,ndcg_cut_5
0,BM25,1.0,0.333333,0.4,0.2,0.31488,0.485229


In [None]:


# Same evaluation call as in the previous cell: phrase_search on the current dev_topics.
pt.Experiment(
    [phrase_search],
    dev_topics,
    dev_qrels,
    eval_metrics=["recip_rank", "P_3", "P_5", "P_10", "ndcg_cut_10", "ndcg_cut_5"],
    names=['BM25'],
)

In [11]:
# Query transformers created from two TIRA query-segmentation submissions
# ("wt-snp-baseline" and "hyb-a") for the chosen dataset.
wt_query_segmentation = tira.pt.transform_queries(
    'ir-benchmarks/webis-query-segmentation/wt-snp-baseline',
    dataset,
)
hyp_a_query_segmentation = tira.pt.transform_queries(
    'ir-benchmarks/webis-query-segmentation/hyb-a',
    dataset,
)

### Set Up Retrieval Pipelines

In [12]:
# Apply the "wt-snp-baseline" segmentation and show each query next to its segments.
wt_query_segmentation(dev_topics).loc[:, ['query', 'segmentation']]

Unnamed: 0,query,segmentation
0,international organized crime,[international organized crime]
1,poliomyelitis and post polio,"[poliomyelitis, and, post-polio]"
2,hubble telescope achievements,[hubble telescope achievements]
3,endangered species mammals,"[endangered species, mammals]"
4,most dangerous vehicles,"[most, dangerous vehicles]"
5,african civilian deaths,"[african, civilian deaths]"
6,new hydroelectric projects,[new hydroelectric projects]
7,implant dentistry,"[implant, dentistry]"
8,rap and crime,"[rap, and, crime]"
9,radio waves and brain cancer,"[radio waves, and, brain cancer]"


In [13]:
# Apply the "hyb-a" segmentation and show each query next to its segments.
hyp_a_query_segmentation(dev_topics).loc[:, ['query', 'segmentation']]

Unnamed: 0,query,segmentation
0,international organized crime,"[international, organized crime]"
1,poliomyelitis and post polio,"[poliomyelitis, and, post-polio]"
2,hubble telescope achievements,"[hubble telescope, achievements]"
3,endangered species mammals,"[endangered species, mammals]"
4,most dangerous vehicles,"[most, dangerous, vehicles]"
5,african civilian deaths,"[african, civilian deaths]"
6,new hydroelectric projects,"[new, hydroelectric projects]"
7,implant dentistry,"[implant, dentistry]"
8,rap and crime,"[rap, and, crime]"
9,radio waves and brain cancer,"[radio waves, and, brain cancer]"
