# Keyphrase Extraction in PyTerrier from TIREx executions

In [1]:
from tira.local_client import Client
import pyterrier as pt
import pandas as pd

if not pt.started():
  pt.init()

tira = Client()
dataset = "trec-robust-2004"
pt_dataset = pt.get_dataset(dataset)

bce_5_keyphrase_extractor = tira.pt.transform_documents('ir-benchmarks/webis-keyphrase-extraction/BCExtractorFO-5', dataset)
bm25 = tira.pt.from_retriever_submission("ir-benchmarks/tira-ir-starter/PyTerrierBM25", dataset)
bm25_score = pt.text.scorer(takes="docs", body_attr="text", wmodel="BM25")
bm25_retrieval = bm25 >> bm25_score

PyTerrier 0.9.2 has loaded Terrier 5.7 (built by craigm on 2022-11-10 18:30) and terrier-helper 0.0.7

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [2]:
dev_topics = pt_dataset.get_topics()
dev_topics = dev_topics[dev_topics['qid'].astype(int) <= 305]
dev_topics

Unnamed: 0,qid,query
0,301,international organized crime
1,302,poliomyelitis and post polio
2,303,hubble telescope achievements
3,304,endangered species mammals
4,305,most dangerous vehicles


In [3]:
(bm25 >> bce_5_keyphrase_extractor)(dev_topics).head(5)[['qid', 'query', 'text', 'keyphrases']]

Unnamed: 0,qid,query,text,keyphrases
0,301,international organized crime,Text of Presidential Edict on Organized Crime ...,"[aforementioned crimes, federation additional ..."
1,301,international organized crime,Seminar on Criminology Held 1990-1993 Crime Fi...,"[real state, further increase, public control,..."
2,301,international organized crime,Freeh Visit To Focus on Organized Crime BFN\n[...,"[organized postcommunist crime, organized crim..."
3,301,international organized crime,Government Ties to Organized Crime Examined CS...,"[characteristic organized crime, organized cri..."
4,301,international organized crime,North Caucasus Anticrime Chief Views Current T...,"[territorial administrations, malbakhova crime..."


In [4]:
def append_keyphrases(df):
    df['text'] = df['text'] + df['keyphrases'].apply(lambda i: ' '.join(i))
    return df

append_keyphrases = pt.apply.generic(append_keyphrases)


In [5]:
bm_25_bce_5 = (bm25 >> bce_5_keyphrase_extractor >> append_keyphrases >> bm25_score)

bm_25_bce_5(dev_topics).head(5)[['qid', 'query', 'text', 'score', 'rank']]



Unnamed: 0,qid,query,text,score,rank
0,301,international organized crime,Text of Presidential Edict on Organized Crime ...,8.279819,0
1,301,international organized crime,Seminar on Criminology Held 1990-1993 Crime Fi...,8.035928,5
2,301,international organized crime,Freeh Visit To Focus on Organized Crime BFN\n[...,8.039895,4
3,301,international organized crime,Government Ties to Organized Crime Examined CS...,7.92872,8
4,301,international organized crime,North Caucasus Anticrime Chief Views Current T...,8.065075,2


In [6]:
pt.Experiment([bm25_retrieval, bm_25_bce_5], pt_dataset.get_topics(), pt_dataset.get_qrels(), eval_metrics=["recip_rank", "P_3", "P_5", "P_10", "ndcg_cut_10", "ndcg_cut_5"], names=['BM25', 'BM25 >> BCE 5'])



Unnamed: 0,name,recip_rank,P_3,P_5,P_10,ndcg_cut_10,ndcg_cut_5
0,BM25,0.654809,0.496653,0.465863,0.409237,0.421258,0.444258
1,BM25 >> BCE 5,0.657757,0.506024,0.471486,0.410442,0.42408,0.449429
